// Simple Extensible Binary Markup Language (ebml) reader and writer on a // cursor model. See the specification here: // http://www.matroska.org/technical/specs/rfc/index.html import core::option; import option::{some, none}; export doc; export doc_at; export maybe_get_doc; export get_doc; export docs; export tagged_docs; export doc_data; export doc_as_str; export doc_as_u8; export doc_as_u16; export doc_as_u32; export doc_as_u64; export doc_as_i8; export doc_as_i16; export doc_as_i32; export doc_as_i64; export writer; export serializer; export ebml_deserializer; export deserializer; export with_doc_data; export get_doc; export extensions; type ebml_tag = {id: uint, size: uint}; type ebml_state = {ebml_tag: ebml_tag, tag_pos: uint, data_pos: uint}; // FIXME (#2739): When we have module renaming, make "reader" and "writer" // separate modules within this file. // ebml reading type doc = {data: @~[u8], start: uint, end: uint}; type tagged_doc = {tag: uint, doc: doc}; impl doc: ops::index { pure fn index(&&tag: uint) -> doc { unchecked { get_doc(self, tag) } } } fn vuint_at(data: &[u8], start: uint) -> {val: uint, next: uint} { let a = data[start]; if a & 0x80u8 != 0u8 { return {val: (a & 0x7fu8) as uint, next: start + 1u}; } if a & 0x40u8 != 0u8 { return {val: ((a & 0x3fu8) as uint) << 8u | (data[start + 1u] as uint), next: start + 2u}; } else if a & 0x20u8 != 0u8 { return {val: ((a & 0x1fu8) as uint) << 16u | (data[start + 1u] as uint) << 8u | (data[start + 2u] as uint), next: start + 3u}; } else if a & 0x10u8 != 0u8 { return {val: ((a & 0x0fu8) as uint) << 24u | (data[start + 1u] as uint) << 16u | (data[start + 2u] as uint) << 8u | (data[start + 3u] as uint), next: start + 4u}; } else { error!{"vint too big"}; fail; } } fn doc(data: @~[u8]) -> doc { return {data: data, start: 0u, end: vec::len::(*data)}; } fn doc_at(data: @~[u8], start: uint) -> tagged_doc { let elt_tag = vuint_at(*data, start); let elt_size = vuint_at(*data, elt_tag.next); let end = elt_size.next + elt_size.val; return {tag: elt_tag.val, doc: {data: data, start: elt_size.next, end: end}}; } fn maybe_get_doc(d: doc, tg: uint) -> option { let mut pos = d.start; while pos < d.end { let elt_tag = vuint_at(*d.data, pos); let elt_size = vuint_at(*d.data, elt_tag.next); pos = elt_size.next + elt_size.val; if elt_tag.val == tg { return some::({ data: d.data, start: elt_size.next, end: pos }); } } return none::; } fn get_doc(d: doc, tg: uint) -> doc { match maybe_get_doc(d, tg) { some(d) => return d, none => { error!{"failed to find block with tag %u", tg}; fail; } } } fn docs(d: doc, it: fn(uint, doc) -> bool) { let mut pos = d.start; while pos < d.end { let elt_tag = vuint_at(*d.data, pos); let elt_size = vuint_at(*d.data, elt_tag.next); pos = elt_size.next + elt_size.val; if !it(elt_tag.val, {data: d.data, start: elt_size.next, end: pos}) { break; } } } fn tagged_docs(d: doc, tg: uint, it: fn(doc) -> bool) { let mut pos = d.start; while pos < d.end { let elt_tag = vuint_at(*d.data, pos); let elt_size = vuint_at(*d.data, elt_tag.next); pos = elt_size.next + elt_size.val; if elt_tag.val == tg { if !it({data: d.data, start: elt_size.next, end: pos}) { break; } } } } fn doc_data(d: doc) -> ~[u8] { vec::slice::(*d.data, d.start, d.end) } fn with_doc_data(d: doc, f: fn(x: &[u8]) -> T) -> T { return f(vec::view(*d.data, d.start, d.end)); } fn doc_as_str(d: doc) -> ~str { return str::from_bytes(doc_data(d)); } fn doc_as_u8(d: doc) -> u8 { assert d.end == d.start + 1u; return (*d.data)[d.start]; } fn doc_as_u16(d: doc) -> u16 { assert d.end == d.start + 2u; return io::u64_from_be_bytes(*d.data, d.start, 2u) as u16; } fn doc_as_u32(d: doc) -> u32 { assert d.end == d.start + 4u; return io::u64_from_be_bytes(*d.data, d.start, 4u) as u32; } fn doc_as_u64(d: doc) -> u64 { assert d.end == d.start + 8u; return io::u64_from_be_bytes(*d.data, d.start, 8u); } fn doc_as_i8(d: doc) -> i8 { doc_as_u8(d) as i8 } fn doc_as_i16(d: doc) -> i16 { doc_as_u16(d) as i16 } fn doc_as_i32(d: doc) -> i32 { doc_as_u32(d) as i32 } fn doc_as_i64(d: doc) -> i64 { doc_as_u64(d) as i64 } // ebml writing type writer_ = {writer: io::Writer, mut size_positions: ~[uint]}; enum writer { writer_(writer_) } fn write_sized_vuint(w: io::Writer, n: uint, size: uint) { match size { 1u => w.write(&[0x80u8 | (n as u8)]), 2u => w.write(&[0x40u8 | ((n >> 8_u) as u8), n as u8]), 3u => w.write(&[0x20u8 | ((n >> 16_u) as u8), (n >> 8_u) as u8, n as u8]), 4u => w.write(&[0x10u8 | ((n >> 24_u) as u8), (n >> 16_u) as u8, (n >> 8_u) as u8, n as u8]), _ => fail fmt!{"vint to write too big: %?", n} }; } fn write_vuint(w: io::Writer, n: uint) { if n < 0x7f_u { write_sized_vuint(w, n, 1u); return; } if n < 0x4000_u { write_sized_vuint(w, n, 2u); return; } if n < 0x200000_u { write_sized_vuint(w, n, 3u); return; } if n < 0x10000000_u { write_sized_vuint(w, n, 4u); return; } fail fmt!{"vint to write too big: %?", n}; } fn writer(w: io::Writer) -> writer { let size_positions: ~[uint] = ~[]; return writer_({writer: w, mut size_positions: size_positions}); } // FIXME (#2741): Provide a function to write the standard ebml header. impl writer { fn start_tag(tag_id: uint) { debug!{"Start tag %u", tag_id}; // Write the enum ID: write_vuint(self.writer, tag_id); // Write a placeholder four-byte size. vec::push(self.size_positions, self.writer.tell()); let zeroes: &[u8] = &[0u8, 0u8, 0u8, 0u8]; self.writer.write(zeroes); } fn end_tag() { let last_size_pos = vec::pop::(self.size_positions); let cur_pos = self.writer.tell(); self.writer.seek(last_size_pos as int, io::SeekSet); let size = (cur_pos - last_size_pos - 4u); write_sized_vuint(self.writer, size, 4u); self.writer.seek(cur_pos as int, io::SeekSet); debug!{"End tag (size = %u)", size}; } fn wr_tag(tag_id: uint, blk: fn()) { self.start_tag(tag_id); blk(); self.end_tag(); } fn wr_tagged_bytes(tag_id: uint, b: &[u8]) { write_vuint(self.writer, tag_id); write_vuint(self.writer, vec::len(b)); self.writer.write(b); } fn wr_tagged_u64(tag_id: uint, v: u64) { do io::u64_to_be_bytes(v, 8u) |v| { self.wr_tagged_bytes(tag_id, v); } } fn wr_tagged_u32(tag_id: uint, v: u32) { do io::u64_to_be_bytes(v as u64, 4u) |v| { self.wr_tagged_bytes(tag_id, v); } } fn wr_tagged_u16(tag_id: uint, v: u16) { do io::u64_to_be_bytes(v as u64, 2u) |v| { self.wr_tagged_bytes(tag_id, v); } } fn wr_tagged_u8(tag_id: uint, v: u8) { self.wr_tagged_bytes(tag_id, &[v]); } fn wr_tagged_i64(tag_id: uint, v: i64) { do io::u64_to_be_bytes(v as u64, 8u) |v| { self.wr_tagged_bytes(tag_id, v); } } fn wr_tagged_i32(tag_id: uint, v: i32) { do io::u64_to_be_bytes(v as u64, 4u) |v| { self.wr_tagged_bytes(tag_id, v); } } fn wr_tagged_i16(tag_id: uint, v: i16) { do io::u64_to_be_bytes(v as u64, 2u) |v| { self.wr_tagged_bytes(tag_id, v); } } fn wr_tagged_i8(tag_id: uint, v: i8) { self.wr_tagged_bytes(tag_id, &[v as u8]); } fn wr_tagged_str(tag_id: uint, v: ~str) { // Lame: can't use str::as_bytes() here because the resulting // vector is NULL-terminated. Annoyingly, the underlying // writer interface doesn't permit us to write a slice of a // vector. We need first-class slices, I think. // str::as_bytes(v) {|b| self.wr_tagged_bytes(tag_id, b); } self.wr_tagged_bytes(tag_id, str::bytes(v)); } fn wr_bytes(b: &[u8]) { debug!{"Write %u bytes", vec::len(b)}; self.writer.write(b); } fn wr_str(s: ~str) { debug!{"Write str: %?", s}; self.writer.write(str::bytes(s)); } } // FIXME (#2743): optionally perform "relaxations" on end_tag to more // efficiently encode sizes; this is a fixed point iteration // Set to true to generate more debugging in EBML serialization. // Totally lame approach. const debug: bool = true; enum ebml_serializer_tag { es_uint, es_u64, es_u32, es_u16, es_u8, es_int, es_i64, es_i32, es_i16, es_i8, es_bool, es_str, es_f64, es_f32, es_float, es_enum, es_enum_vid, es_enum_body, es_vec, es_vec_len, es_vec_elt, es_label // Used only when debugging } trait serializer_priv { fn _emit_tagged_uint(t: ebml_serializer_tag, v: uint); fn _emit_label(label: ~str); } impl ebml::writer: serializer_priv { // used internally to emit things like the vector length and so on fn _emit_tagged_uint(t: ebml_serializer_tag, v: uint) { assert v <= 0xFFFF_FFFF_u; self.wr_tagged_u32(t as uint, v as u32); } fn _emit_label(label: ~str) { // There are various strings that we have access to, such as // the name of a record field, which do not actually appear in // the serialized EBML (normally). This is just for // efficiency. When debugging, though, we can emit such // labels and then they will be checked by deserializer to // try and check failures more quickly. if debug { self.wr_tagged_str(es_label as uint, label) } } } impl ebml::writer: serialization::serializer { fn emit_nil() {} fn emit_uint(v: uint) { self.wr_tagged_u64(es_uint as uint, v as u64); } fn emit_u64(v: u64) { self.wr_tagged_u64(es_u64 as uint, v); } fn emit_u32(v: u32) { self.wr_tagged_u32(es_u32 as uint, v); } fn emit_u16(v: u16) { self.wr_tagged_u16(es_u16 as uint, v); } fn emit_u8(v: u8) { self.wr_tagged_u8 (es_u8 as uint, v); } fn emit_int(v: int) { self.wr_tagged_i64(es_int as uint, v as i64); } fn emit_i64(v: i64) { self.wr_tagged_i64(es_i64 as uint, v); } fn emit_i32(v: i32) { self.wr_tagged_i32(es_i32 as uint, v); } fn emit_i16(v: i16) { self.wr_tagged_i16(es_i16 as uint, v); } fn emit_i8(v: i8) { self.wr_tagged_i8 (es_i8 as uint, v); } fn emit_bool(v: bool) { self.wr_tagged_u8(es_bool as uint, v as u8) } // FIXME (#2742): implement these fn emit_f64(_v: f64) { fail ~"Unimplemented: serializing an f64"; } fn emit_f32(_v: f32) { fail ~"Unimplemented: serializing an f32"; } fn emit_float(_v: float) { fail ~"Unimplemented: serializing a float"; } fn emit_str(v: ~str) { self.wr_tagged_str(es_str as uint, v) } fn emit_enum(name: ~str, f: fn()) { self._emit_label(name); self.wr_tag(es_enum as uint, f) } fn emit_enum_variant(_v_name: ~str, v_id: uint, _cnt: uint, f: fn()) { self._emit_tagged_uint(es_enum_vid, v_id); self.wr_tag(es_enum_body as uint, f) } fn emit_enum_variant_arg(_idx: uint, f: fn()) { f() } fn emit_vec(len: uint, f: fn()) { do self.wr_tag(es_vec as uint) { self._emit_tagged_uint(es_vec_len, len); f() } } fn emit_vec_elt(_idx: uint, f: fn()) { self.wr_tag(es_vec_elt as uint, f) } fn emit_box(f: fn()) { f() } fn emit_uniq(f: fn()) { f() } fn emit_rec(f: fn()) { f() } fn emit_rec_field(f_name: ~str, _f_idx: uint, f: fn()) { self._emit_label(f_name); f() } fn emit_tup(_sz: uint, f: fn()) { f() } fn emit_tup_elt(_idx: uint, f: fn()) { f() } } type ebml_deserializer_ = {mut parent: ebml::doc, mut pos: uint}; enum ebml_deserializer { ebml_deserializer_(ebml_deserializer_) } fn ebml_deserializer(d: ebml::doc) -> ebml_deserializer { ebml_deserializer_({mut parent: d, mut pos: d.start}) } priv impl ebml_deserializer { fn _check_label(lbl: ~str) { if self.pos < self.parent.end { let {tag: r_tag, doc: r_doc} = ebml::doc_at(self.parent.data, self.pos); if r_tag == (es_label as uint) { self.pos = r_doc.end; let str = ebml::doc_as_str(r_doc); if lbl != str { fail fmt!{"Expected label %s but found %s", lbl, str}; } } } } fn next_doc(exp_tag: ebml_serializer_tag) -> ebml::doc { debug!{". next_doc(exp_tag=%?)", exp_tag}; if self.pos >= self.parent.end { fail ~"no more documents in current node!"; } let {tag: r_tag, doc: r_doc} = ebml::doc_at(self.parent.data, self.pos); debug!{"self.parent=%?-%? self.pos=%? r_tag=%? r_doc=%?-%?", copy self.parent.start, copy self.parent.end, copy self.pos, r_tag, r_doc.start, r_doc.end}; if r_tag != (exp_tag as uint) { fail fmt!{"expected EMBL doc with tag %? but found tag %?", exp_tag, r_tag}; } if r_doc.end > self.parent.end { fail fmt!{"invalid EBML, child extends to 0x%x, parent to 0x%x", r_doc.end, self.parent.end}; } self.pos = r_doc.end; return r_doc; } fn push_doc(d: ebml::doc, f: fn() -> T) -> T{ let old_parent = self.parent; let old_pos = self.pos; self.parent = d; self.pos = d.start; let r = f(); self.parent = old_parent; self.pos = old_pos; return r; } fn _next_uint(exp_tag: ebml_serializer_tag) -> uint { let r = ebml::doc_as_u32(self.next_doc(exp_tag)); debug!{"_next_uint exp_tag=%? result=%?", exp_tag, r}; return r as uint; } } impl ebml_deserializer: serialization::deserializer { fn read_nil() -> () { () } fn read_u64() -> u64 { ebml::doc_as_u64(self.next_doc(es_u64)) } fn read_u32() -> u32 { ebml::doc_as_u32(self.next_doc(es_u32)) } fn read_u16() -> u16 { ebml::doc_as_u16(self.next_doc(es_u16)) } fn read_u8 () -> u8 { ebml::doc_as_u8 (self.next_doc(es_u8 )) } fn read_uint() -> uint { let v = ebml::doc_as_u64(self.next_doc(es_uint)); if v > (core::uint::max_value as u64) { fail fmt!{"uint %? too large for this architecture", v}; } return v as uint; } fn read_i64() -> i64 { ebml::doc_as_u64(self.next_doc(es_i64)) as i64 } fn read_i32() -> i32 { ebml::doc_as_u32(self.next_doc(es_i32)) as i32 } fn read_i16() -> i16 { ebml::doc_as_u16(self.next_doc(es_i16)) as i16 } fn read_i8 () -> i8 { ebml::doc_as_u8 (self.next_doc(es_i8 )) as i8 } fn read_int() -> int { let v = ebml::doc_as_u64(self.next_doc(es_int)) as i64; if v > (int::max_value as i64) || v < (int::min_value as i64) { fail fmt!{"int %? out of range for this architecture", v}; } return v as int; } fn read_bool() -> bool { ebml::doc_as_u8(self.next_doc(es_bool)) as bool } fn read_f64() -> f64 { fail ~"read_f64()"; } fn read_f32() -> f32 { fail ~"read_f32()"; } fn read_float() -> float { fail ~"read_float()"; } fn read_str() -> ~str { ebml::doc_as_str(self.next_doc(es_str)) } // Compound types: fn read_enum(name: ~str, f: fn() -> T) -> T { debug!{"read_enum(%s)", name}; self._check_label(name); self.push_doc(self.next_doc(es_enum), f) } fn read_enum_variant(f: fn(uint) -> T) -> T { debug!{"read_enum_variant()"}; let idx = self._next_uint(es_enum_vid); debug!{" idx=%u", idx}; do self.push_doc(self.next_doc(es_enum_body)) { f(idx) } } fn read_enum_variant_arg(idx: uint, f: fn() -> T) -> T { debug!{"read_enum_variant_arg(idx=%u)", idx}; f() } fn read_vec(f: fn(uint) -> T) -> T { debug!{"read_vec()"}; do self.push_doc(self.next_doc(es_vec)) { let len = self._next_uint(es_vec_len); debug!{" len=%u", len}; f(len) } } fn read_vec_elt(idx: uint, f: fn() -> T) -> T { debug!{"read_vec_elt(idx=%u)", idx}; self.push_doc(self.next_doc(es_vec_elt), f) } fn read_box(f: fn() -> T) -> T { debug!{"read_box()"}; f() } fn read_uniq(f: fn() -> T) -> T { debug!{"read_uniq()"}; f() } fn read_rec(f: fn() -> T) -> T { debug!{"read_rec()"}; f() } fn read_rec_field(f_name: ~str, f_idx: uint, f: fn() -> T) -> T { debug!{"read_rec_field(%s, idx=%u)", f_name, f_idx}; self._check_label(f_name); f() } fn read_tup(sz: uint, f: fn() -> T) -> T { debug!{"read_tup(sz=%u)", sz}; f() } fn read_tup_elt(idx: uint, f: fn() -> T) -> T { debug!{"read_tup_elt(idx=%u)", idx}; f() } } // ___________________________________________________________________________ // Testing #[test] fn test_option_int() { fn serialize_1(s: S, v: int) { s.emit_i64(v as i64); } fn serialize_0(s: S, v: option) { do s.emit_enum(~"core::option::t") { match v { none => s.emit_enum_variant( ~"core::option::none", 0u, 0u, || { } ), some(v0) => { do s.emit_enum_variant(~"core::option::some", 1u, 1u) { s.emit_enum_variant_arg(0u, || serialize_1(s, v0)); } } } } } fn deserialize_1(s: S) -> int { s.read_i64() as int } fn deserialize_0(s: S) -> option { do s.read_enum(~"core::option::t") { do s.read_enum_variant |i| { match check i { 0u => none, 1u => { let v0 = do s.read_enum_variant_arg(0u) { deserialize_1(s) }; some(v0) } } } } } fn test_v(v: option) { debug!{"v == %?", v}; let mbuf = io::mem_buffer(); let ebml_w = ebml::writer(io::mem_buffer_writer(mbuf)); serialize_0(ebml_w, v); let ebml_doc = ebml::doc(@io::mem_buffer_buf(mbuf)); let deser = ebml_deserializer(ebml_doc); let v1 = deserialize_0(deser); debug!{"v1 == %?", v1}; assert v == v1; } test_v(some(22)); test_v(none); test_v(some(3)); }