.bson file format: Nim parsing library

BSON, short for Binary JSON, is a binary-encoded serialization of JSON-like documents. Like JSON, BSON supports the embedding of documents and arrays within other documents and arrays. BSON also contains extensions that allow representation of data types that are not part of the JSON spec. For example, BSON has a Date type and a BinData type. BSON can be compared to binary interchange formats, like Protocol Buffers. BSON is more "schemaless" than Protocol Buffers, which can give it an advantage in flexibility but also a slight disadvantage in space efficiency (BSON has overhead for field names within the serialized data). BSON was designed to have the following three characteristics:

  • Lightweight. Keeping spatial overhead to a minimum is important for any data representation format, especially when used over the network.
  • Traversable. BSON is designed to be traversed easily. This is a vital property in its role as the primary data representation for MongoDB.
  • Efficient. Encoding data to BSON and decoding from BSON can be performed very quickly in most languages due to the use of C data types.

File extension

bson

KS implementation details

License: CC0-1.0

References

This page hosts a formal specification of the .bson file format using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.

Nim source code to parse the .bson file format

bson.nim

import kaitai_struct_nim_runtime
import options

type
  # A BSON document: a length prefix, the element list (parsed from a bounded
  # sub-stream whose raw bytes are kept in `rawFields`) and a trailing
  # terminator.
  Bson* = ref object of KaitaiStruct
    `len`*: int32                  # total number of bytes comprising the document
    `fields`*: Bson_ElementsList
    `terminator`*: seq[byte]
    `parent`*: KaitaiStruct
    `rawFields`*: seq[byte]
  # Special internal type used by MongoDB replication and sharding: an
  # increment followed by a timestamp, 4 bytes each.
  Bson_Timestamp* = ref object of KaitaiStruct
    `increment`*: uint32
    `timestamp`*: uint32
    `parent`*: Bson_Element
  # Binary ("BinData") element body: length, subtype tag and the payload.
  # `rawContent` is only filled in when the payload is parsed from a
  # sub-stream (the byte_array_deprecated subtype).
  Bson_BinData* = ref object of KaitaiStruct
    `len`*: int32
    `subtype`*: Bson_BinData_Subtype
    `content`*: KaitaiStruct
    `parent`*: Bson_Element
    `rawContent`*: seq[byte]
  # Subtype tag of a binary value. Per the format description, 0..127 are
  # predefined or reserved and 128..255 are user-defined.
  Bson_BinData_Subtype* = enum
    generic = 0
    function = 1
    byte_array_deprecated = 2
    uuid_deprecated = 3
    uuid = 4
    md5 = 5
    custom = 128
  # Payload of the byte_array_deprecated subtype: its own length prefix
  # followed by that many bytes.
  Bson_BinData_ByteArrayDeprecated* = ref object of KaitaiStruct
    `len`*: int32
    `content`*: seq[byte]
    `parent`*: Bson_BinData
  # All elements of one document, read until the enclosing stream is exhausted.
  Bson_ElementsList* = ref object of KaitaiStruct
    `elements`*: seq[Bson_Element]
    `parent`*: Bson
  # Zero-terminated string; MUST NOT contain '\x00'.
  Bson_Cstring* = ref object of KaitaiStruct
    `str`*: string
    `parent`*: KaitaiStruct
  # Length-prefixed string: `len - 1` bytes of text followed by a 1-byte
  # terminator.
  Bson_String* = ref object of KaitaiStruct
    `len`*: int32
    `str`*: string
    `terminator`*: seq[byte]
    `parent`*: KaitaiStruct
  # One document element: a type tag, the element name and a body whose
  # concrete type depends on the tag.
  Bson_Element* = ref object of KaitaiStruct
    `typeByte`*: Bson_Element_BsonType
    `name`*: Bson_Cstring
    `content`*: KaitaiStruct
    `parent`*: Bson_ElementsList
  # Element type tag, decoded from the first byte of every element.
  Bson_Element_BsonType* = enum
    min_key = -1
    end_of_object = 0
    number_double = 1
    string = 2
    document = 3
    array = 4
    bin_data = 5
    undefined = 6
    object_id = 7
    boolean = 8
    utc_datetime = 9
    jst_null = 10
    reg_ex = 11
    db_pointer = 12
    javascript = 13
    symbol = 14
    code_with_scope = 15
    number_int = 16
    timestamp = 17
    number_long = 18
    number_decimal = 19
    max_key = 127
  # DBPointer element body: a namespace string followed by an object id.
  Bson_DbPointer* = ref object of KaitaiStruct
    `namespace`*: Bson_String
    `id`*: Bson_ObjectId
    `parent`*: Bson_Element
  # Unsigned 24-bit (3 byte) integer stored as three separate bytes.
  # `valueInst` / `valueInstFlag` cache the result of the `value` accessor.
  Bson_U3* = ref object of KaitaiStruct
    `b1`*: uint8
    `b2`*: uint8
    `b3`*: uint8
    `parent`*: Bson_ObjectId
    `valueInst`: int
    `valueInstFlag`: bool
  # "Code with scope" element body: an int32, the source string and a scope
  # document (mapping from identifiers to values).
  Bson_CodeWithScope* = ref object of KaitaiStruct
    `id`*: int32
    `source`*: Bson_String
    `scope`*: Bson
    `parent`*: Bson_Element
  # 128-bit IEEE 754-2008 decimal floating point, kept as raw bit fields.
  # NOTE(review): `str` is a single bit read as a bool - presumably the sign
  # bit; confirm against the format specification.
  Bson_F16* = ref object of KaitaiStruct
    `str`*: bool
    `exponent`*: uint64
    `significandHi`*: uint64
    `significandLo`*: uint64
    `parent`*: Bson_Element
  # Object id: epoch time, machine id, process id and a counter.
  Bson_ObjectId* = ref object of KaitaiStruct
    `epochTime`*: uint32           # seconds since the Unix epoch
    `machineId`*: Bson_U3
    `processId`*: uint16
    `counter`*: Bson_U3            # counter, starting with a random value
    `parent`*: KaitaiStruct
  # Regular expression element body: pattern and options, both zero-terminated.
  Bson_RegEx* = ref object of KaitaiStruct
    `pattern`*: Bson_Cstring
    `options`*: Bson_Cstring
    `parent`*: Bson_Element

# Forward declarations. The readers are mutually recursive (a document reads
# an element list, an element list reads elements, and an element may read a
# nested document), so every `read` overload is declared before any body.
# Each takes the stream to read from, the root of the parsed tree (nil when
# the object being read is itself the root) and the parent node.
proc read*(_: typedesc[Bson], io: KaitaiStream, root: KaitaiStruct, parent: KaitaiStruct): Bson
proc read*(_: typedesc[Bson_Timestamp], io: KaitaiStream, root: KaitaiStruct, parent: Bson_Element): Bson_Timestamp
proc read*(_: typedesc[Bson_BinData], io: KaitaiStream, root: KaitaiStruct, parent: Bson_Element): Bson_BinData
proc read*(_: typedesc[Bson_BinData_ByteArrayDeprecated], io: KaitaiStream, root: KaitaiStruct, parent: Bson_BinData): Bson_BinData_ByteArrayDeprecated
proc read*(_: typedesc[Bson_ElementsList], io: KaitaiStream, root: KaitaiStruct, parent: Bson): Bson_ElementsList
proc read*(_: typedesc[Bson_Cstring], io: KaitaiStream, root: KaitaiStruct, parent: KaitaiStruct): Bson_Cstring
proc read*(_: typedesc[Bson_String], io: KaitaiStream, root: KaitaiStruct, parent: KaitaiStruct): Bson_String
proc read*(_: typedesc[Bson_Element], io: KaitaiStream, root: KaitaiStruct, parent: Bson_ElementsList): Bson_Element
proc read*(_: typedesc[Bson_DbPointer], io: KaitaiStream, root: KaitaiStruct, parent: Bson_Element): Bson_DbPointer
proc read*(_: typedesc[Bson_U3], io: KaitaiStream, root: KaitaiStruct, parent: Bson_ObjectId): Bson_U3
proc read*(_: typedesc[Bson_CodeWithScope], io: KaitaiStream, root: KaitaiStruct, parent: Bson_Element): Bson_CodeWithScope
proc read*(_: typedesc[Bson_F16], io: KaitaiStream, root: KaitaiStruct, parent: Bson_Element): Bson_F16
proc read*(_: typedesc[Bson_ObjectId], io: KaitaiStream, root: KaitaiStruct, parent: KaitaiStruct): Bson_ObjectId
proc read*(_: typedesc[Bson_RegEx], io: KaitaiStream, root: KaitaiStruct, parent: Bson_Element): Bson_RegEx

# Accessor for the integer value assembled from the three bytes of a Bson_U3;
# the result is computed on first use and cached on the object.
proc value*(this: Bson_U3): int


##[
BSON, short for Binary JSON, is a binary-encoded serialization of JSON-like documents. Like JSON, BSON supports the embedding of documents and arrays within other documents and arrays. BSON also contains extensions that allow representation of data types that are not part of the JSON spec. For example, BSON has a Date type and a BinData type. BSON can be compared to binary interchange formats, like Protocol Buffers. BSON is more "schemaless" than Protocol Buffers, which can give it an advantage in flexibility but also a slight disadvantage in space efficiency (BSON has overhead for field names within the serialized data). BSON was designed to have the following three characteristics:
  * Lightweight. Keeping spatial overhead to a minimum is important for any data representation format, especially when used over the network.
  * Traversable. BSON is designed to be traversed easily. This is a vital property in its role as the primary data representation for MongoDB.
  * Efficient. Encoding data to BSON and decoding from BSON can be performed very quickly in most languages due to the use of C data types.

]##
proc read*(_: typedesc[Bson], io: KaitaiStream, root: KaitaiStruct, parent: KaitaiStruct): Bson =
  ## Parses one BSON document from `io`.
  ##
  ## `root` is the root of the tree being built; when it is nil the new
  ## document becomes its own root. `parent` is stored as-is.
  result = new(Bson)
  result.io = io
  result.root = if root == nil: cast[Bson](result) else: cast[Bson](root)
  result.parent = parent

  # Total number of bytes comprising the document.
  result.len = io.readS4le()

  # The element list is parsed from its own bounded stream of `len - 5` bytes
  # (presumably everything except the 4-byte length and 1-byte terminator).
  result.rawFields = io.readBytes(int(result.len - 5))
  result.fields = Bson_ElementsList.read(newKaitaiStream(result.rawFields), result.root, result)

  result.terminator = io.readBytes(1)

proc fromFile*(_: typedesc[Bson], filename: string): Bson =
  ## Opens `filename` as a Kaitai file stream and parses a Bson from it,
  ## with no root and no parent.
  let stream = newKaitaiFileStream(filename)
  result = Bson.read(stream, nil, nil)


##[
Special internal type used by MongoDB replication and sharding. First 4 bytes are an increment, second 4 are a timestamp.
]##
proc read*(_: typedesc[Bson_Timestamp], io: KaitaiStream, root: KaitaiStruct, parent: Bson_Element): Bson_Timestamp =
  ## Parses a timestamp body from `io`: two consecutive `readU4le` values,
  ## the increment first and the timestamp second.
  result = new(Bson_Timestamp)
  result.io = io
  result.root = if root == nil: cast[Bson](result) else: cast[Bson](root)
  result.parent = parent

  result.increment = io.readU4le()
  result.timestamp = io.readU4le()

proc fromFile*(_: typedesc[Bson_Timestamp], filename: string): Bson_Timestamp =
  ## Opens `filename` as a Kaitai file stream and parses a Bson_Timestamp
  ## from it, with no root and no parent.
  let stream = newKaitaiFileStream(filename)
  result = Bson_Timestamp.read(stream, nil, nil)


##[
The BSON "binary" or "BinData" datatype is used to represent arrays of bytes. It is somewhat analogous to the Java notion of a ByteArray. BSON binary values have a subtype. This is used to indicate what kind of data is in the byte array. Subtypes from zero to 127 are predefined or reserved. Subtypes from 128-255 are user-defined.
]##
proc read*(_: typedesc[Bson_BinData], io: KaitaiStream, root: KaitaiStruct, parent: Bson_Element): Bson_BinData =
  # Parses a binary element body from `io`: a length, a one-byte subtype tag
  # and `len` bytes of payload whose interpretation depends on the subtype.
  # `root` is the tree root (nil means "use this object"); `parent` is stored
  # unchanged.
  template this: untyped = result
  this = new(Bson_BinData)
  let root = if root == nil: cast[Bson](this) else: cast[Bson](root)
  this.io = io
  this.root = root
  this.parent = parent

  let lenExpr = this.io.readS4le()
  this.len = lenExpr
  # NOTE(review): the subtype enum has gaps (6..127 and above 128); how an
  # undeclared byte value behaves in this conversion should be confirmed.
  let subtypeExpr = Bson_BinData_Subtype(this.io.readU1())
  this.subtype = subtypeExpr
  block:
    let on = this.subtype
    if on == bson.byte_array_deprecated:
      # The deprecated byte-array subtype is parsed from its own bounded
      # sub-stream; the raw bytes are kept in `rawContent`.
      let rawContentExpr = this.io.readBytes(int(this.len))
      this.rawContent = rawContentExpr
      let rawContentIo = newKaitaiStream(rawContentExpr)
      let contentExpr = Bson_BinData_ByteArrayDeprecated.read(rawContentIo, this.root, this)
      this.content = contentExpr
    else:
      # Every other subtype keeps the payload as plain bytes.
      # NOTE(review): `contentExpr` is the result of readBytes while `content`
      # is declared as KaitaiStruct - confirm this assignment type-checks.
      let contentExpr = this.io.readBytes(int(this.len))
      this.content = contentExpr

proc fromFile*(_: typedesc[Bson_BinData], filename: string): Bson_BinData =
  ## Opens `filename` as a Kaitai file stream and parses a Bson_BinData from
  ## it, with no root and no parent.
  let stream = newKaitaiFileStream(filename)
  result = Bson_BinData.read(stream, nil, nil)


##[
The BSON "binary" or "BinData" datatype is used to represent arrays of bytes. It is somewhat analogous to the Java notion of a ByteArray. BSON binary values have a subtype. This is used to indicate what kind of data is in the byte array. Subtypes from zero to 127 are predefined or reserved. Subtypes from 128-255 are user-defined.
]##
proc read*(_: typedesc[Bson_BinData_ByteArrayDeprecated], io: KaitaiStream, root: KaitaiStruct, parent: Bson_BinData): Bson_BinData_ByteArrayDeprecated =
  ## Parses the payload of a deprecated byte-array binary value from `io`:
  ## a length read with `readS4le` followed by that many content bytes.
  result = new(Bson_BinData_ByteArrayDeprecated)
  result.io = io
  result.root = if root == nil: cast[Bson](result) else: cast[Bson](root)
  result.parent = parent

  result.len = io.readS4le()
  result.content = io.readBytes(int(result.len))

proc fromFile*(_: typedesc[Bson_BinData_ByteArrayDeprecated], filename: string): Bson_BinData_ByteArrayDeprecated =
  ## Opens `filename` as a Kaitai file stream and parses a
  ## Bson_BinData_ByteArrayDeprecated from it, with no root and no parent.
  let stream = newKaitaiFileStream(filename)
  result = Bson_BinData_ByteArrayDeprecated.read(stream, nil, nil)

proc read*(_: typedesc[Bson_ElementsList], io: KaitaiStream, root: KaitaiStruct, parent: Bson): Bson_ElementsList =
  ## Parses elements from `io` one after another until the stream reports
  ## end-of-stream, appending each to `elements` in the order read.
  result = new(Bson_ElementsList)
  result.io = io
  result.root = if root == nil: cast[Bson](result) else: cast[Bson](root)
  result.parent = parent

  while not io.isEof:
    result.elements.add(Bson_Element.read(io, result.root, result))

proc fromFile*(_: typedesc[Bson_ElementsList], filename: string): Bson_ElementsList =
  ## Opens `filename` as a Kaitai file stream and parses a Bson_ElementsList
  ## from it, with no root and no parent.
  let stream = newKaitaiFileStream(filename)
  result = Bson_ElementsList.read(stream, nil, nil)

proc read*(_: typedesc[Bson_Cstring], io: KaitaiStream, root: KaitaiStruct, parent: KaitaiStruct): Bson_Cstring =
  ## Parses a zero-terminated string from `io` and stores it, passed through
  ## `encode(..., "UTF-8")`, in `str`.
  result = new(Bson_Cstring)
  result.io = io
  result.root = if root == nil: cast[Bson](result) else: cast[Bson](root)
  result.parent = parent

  # MUST NOT contain '\x00', hence it is not full UTF-8.
  # NOTE(review): the three flags are presumably include=false, consume=true,
  # eosError=true as in other Kaitai runtimes - confirm against the Nim runtime.
  result.str = encode(io.readBytesTerm(0, false, true, true), "UTF-8")

proc fromFile*(_: typedesc[Bson_Cstring], filename: string): Bson_Cstring =
  ## Opens `filename` as a Kaitai file stream and parses a Bson_Cstring from
  ## it, with no root and no parent.
  let stream = newKaitaiFileStream(filename)
  result = Bson_Cstring.read(stream, nil, nil)

proc read*(_: typedesc[Bson_String], io: KaitaiStream, root: KaitaiStruct, parent: KaitaiStruct): Bson_String =
  ## Parses a length-prefixed string from `io`: a length read with
  ## `readS4le`, then `len - 1` bytes of text, then a single terminator byte.
  result = new(Bson_String)
  result.io = io
  result.root = if root == nil: cast[Bson](result) else: cast[Bson](root)
  result.parent = parent

  result.len = io.readS4le()
  # The stored length counts the terminator, so the text itself is one shorter.
  result.str = encode(io.readBytes(int(result.len - 1)), "UTF-8")
  result.terminator = io.readBytes(1)

proc fromFile*(_: typedesc[Bson_String], filename: string): Bson_String =
  ## Opens `filename` as a Kaitai file stream and parses a Bson_String from
  ## it, with no root and no parent.
  let stream = newKaitaiFileStream(filename)
  result = Bson_String.read(stream, nil, nil)

proc read*(_: typedesc[Bson_Element], io: KaitaiStream, root: KaitaiStruct, parent: Bson_ElementsList): Bson_Element =
  # Parses one document element from `io`: a one-byte type tag, the element
  # name as a zero-terminated string, then a body selected by the tag.
  # `root` is the tree root (nil means "use this object"); `parent` is stored
  # unchanged. Tags with no branch below (for example undefined, jst_null,
  # min_key, max_key) read no body and leave `content` unset.
  template this: untyped = result
  this = new(Bson_Element)
  let root = if root == nil: cast[Bson](this) else: cast[Bson](root)
  this.io = io
  this.root = root
  this.parent = parent

  # NOTE(review): readU1 yields an unsigned byte, so `min_key = -1` looks
  # unreachable here and byte 0xFF may fall outside the enum - confirm.
  let typeByteExpr = Bson_Element_BsonType(this.io.readU1())
  this.typeByte = typeByteExpr
  let nameExpr = Bson_Cstring.read(this.io, this.root, this)
  this.name = nameExpr
  # Dispatch on the type tag. Structured bodies are parsed by their own
  # readers with this element as parent.
  # NOTE(review): the scalar branches (number_double, number_int,
  # utc_datetime, boolean, number_long) wrap a plain number in
  # KaitaiStruct(...) - confirm that conversion is accepted by the compiler.
  block:
    let on = this.typeByte
    if on == bson.code_with_scope:
      let contentExpr = Bson_CodeWithScope.read(this.io, this.root, this)
      this.content = contentExpr
    elif on == bson.reg_ex:
      let contentExpr = Bson_RegEx.read(this.io, this.root, this)
      this.content = contentExpr
    elif on == bson.number_double:
      let contentExpr = KaitaiStruct(this.io.readF8le())
      this.content = contentExpr
    elif on == bson.symbol:
      let contentExpr = Bson_String.read(this.io, this.root, this)
      this.content = contentExpr
    elif on == bson.timestamp:
      let contentExpr = Bson_Timestamp.read(this.io, this.root, this)
      this.content = contentExpr
    elif on == bson.number_int:
      let contentExpr = KaitaiStruct(this.io.readS4le())
      this.content = contentExpr
    elif on == bson.document:
      # Nested document: parsed by the top-level document reader.
      let contentExpr = Bson.read(this.io, this.root, this)
      this.content = contentExpr
    elif on == bson.object_id:
      let contentExpr = Bson_ObjectId.read(this.io, this.root, this)
      this.content = contentExpr
    elif on == bson.javascript:
      let contentExpr = Bson_String.read(this.io, this.root, this)
      this.content = contentExpr
    elif on == bson.utc_datetime:
      let contentExpr = KaitaiStruct(this.io.readS8le())
      this.content = contentExpr
    elif on == bson.boolean:
      let contentExpr = KaitaiStruct(this.io.readU1())
      this.content = contentExpr
    elif on == bson.number_long:
      let contentExpr = KaitaiStruct(this.io.readS8le())
      this.content = contentExpr
    elif on == bson.bin_data:
      let contentExpr = Bson_BinData.read(this.io, this.root, this)
      this.content = contentExpr
    elif on == bson.string:
      let contentExpr = Bson_String.read(this.io, this.root, this)
      this.content = contentExpr
    elif on == bson.db_pointer:
      let contentExpr = Bson_DbPointer.read(this.io, this.root, this)
      this.content = contentExpr
    elif on == bson.array:
      # Arrays use the same reader as documents.
      let contentExpr = Bson.read(this.io, this.root, this)
      this.content = contentExpr
    elif on == bson.number_decimal:
      let contentExpr = Bson_F16.read(this.io, this.root, this)
      this.content = contentExpr

proc fromFile*(_: typedesc[Bson_Element], filename: string): Bson_Element =
  ## Opens `filename` as a Kaitai file stream and parses a Bson_Element from
  ## it, with no root and no parent.
  let stream = newKaitaiFileStream(filename)
  result = Bson_Element.read(stream, nil, nil)

proc read*(_: typedesc[Bson_DbPointer], io: KaitaiStream, root: KaitaiStruct, parent: Bson_Element): Bson_DbPointer =
  ## Parses a DBPointer body from `io`: a length-prefixed namespace string
  ## followed by an object id, both with the new object as parent.
  result = new(Bson_DbPointer)
  result.io = io
  result.root = if root == nil: cast[Bson](result) else: cast[Bson](root)
  result.parent = parent

  result.namespace = Bson_String.read(io, result.root, result)
  result.id = Bson_ObjectId.read(io, result.root, result)

proc fromFile*(_: typedesc[Bson_DbPointer], filename: string): Bson_DbPointer =
  ## Opens `filename` as a Kaitai file stream and parses a Bson_DbPointer
  ## from it, with no root and no parent.
  let stream = newKaitaiFileStream(filename)
  result = Bson_DbPointer.read(stream, nil, nil)


##[
Implements unsigned 24-bit (3 byte) integer.

]##
proc read*(_: typedesc[Bson_U3], io: KaitaiStream, root: KaitaiStruct, parent: Bson_ObjectId): Bson_U3 =
  ## Parses the three raw bytes of a 24-bit integer from `io` into `b1`,
  ## `b2` and `b3`, in stream order. The combined number is not computed
  ## here; it is produced on demand by `value`.
  result = new(Bson_U3)
  result.io = io
  result.root = if root == nil: cast[Bson](result) else: cast[Bson](root)
  result.parent = parent

  result.b1 = io.readU1()
  result.b2 = io.readU1()
  result.b3 = io.readU1()

proc value(this: Bson_U3): int =
  ## Returns the unsigned 24-bit integer formed by the three stored bytes,
  ## with `b1` as the least significant byte and `b3` as the most significant.
  ##
  ## The result is computed once and cached in `valueInst`; `valueInstFlag`
  ## records that the cache is populated.
  if this.valueInstFlag:
    return this.valueInst
  # Widen each byte to `int` BEFORE shifting. `shl` on a uint8 operand yields
  # a uint8, so shifting by 8 or 16 would discard every bit and the value
  # would collapse to `b1` alone.
  let valueInstExpr =
    int(this.b1) or (int(this.b2) shl 8) or (int(this.b3) shl 16)
  this.valueInst = valueInstExpr
  this.valueInstFlag = true
  return this.valueInst

proc fromFile*(_: typedesc[Bson_U3], filename: string): Bson_U3 =
  ## Opens `filename` as a Kaitai file stream and parses a Bson_U3 from it,
  ## with no root and no parent.
  let stream = newKaitaiFileStream(filename)
  result = Bson_U3.read(stream, nil, nil)

proc read*(_: typedesc[Bson_CodeWithScope], io: KaitaiStream, root: KaitaiStruct, parent: Bson_Element): Bson_CodeWithScope =
  ## Parses a "code with scope" body from `io`: a value read with `readS4le`
  ## into `id`, the source as a length-prefixed string, then the scope as a
  ## nested document.
  result = new(Bson_CodeWithScope)
  result.io = io
  result.root = if root == nil: cast[Bson](result) else: cast[Bson](root)
  result.parent = parent

  result.id = io.readS4le()
  result.source = Bson_String.read(io, result.root, result)

  # mapping from identifiers to values, representing the scope in which the
  # string should be evaluated.
  result.scope = Bson.read(io, result.root, result)

proc fromFile*(_: typedesc[Bson_CodeWithScope], filename: string): Bson_CodeWithScope =
  ## Opens `filename` as a Kaitai file stream and parses a
  ## Bson_CodeWithScope from it, with no root and no parent.
  let stream = newKaitaiFileStream(filename)
  result = Bson_CodeWithScope.read(stream, nil, nil)


##[
128-bit IEEE 754-2008 decimal floating point
]##
proc read*(_: typedesc[Bson_F16], io: KaitaiStream, root: KaitaiStruct, parent: Bson_Element): Bson_F16 =
  ## Parses the raw bit fields of a decimal value from `io`: a 1-bit flag
  ## (`str`), a 15-bit exponent and a 49-bit high significand read with
  ## `readBitsIntBe`, then, after aligning to a byte boundary, the low
  ## significand read with `readU8le`.
  ##
  ## NOTE(review): the bit fields add up to 1 + 15 + 49 = 65 bits before the
  ## alignment, which does not obviously fit a 128-bit value together with
  ## the following 8-byte word - confirm the widths against the upstream
  ## format specification.
  result = new(Bson_F16)
  result.io = io
  result.root = if root == nil: cast[Bson](result) else: cast[Bson](root)
  result.parent = parent

  result.str = io.readBitsIntBe(1) != 0
  result.exponent = io.readBitsIntBe(15)
  result.significandHi = io.readBitsIntBe(49)
  # Discard any partially consumed byte before the byte-oriented read.
  alignToByte(io)
  result.significandLo = io.readU8le()

proc fromFile*(_: typedesc[Bson_F16], filename: string): Bson_F16 =
  ## Opens `filename` as a Kaitai file stream and parses a Bson_F16 from it,
  ## with no root and no parent.
  let stream = newKaitaiFileStream(filename)
  result = Bson_F16.read(stream, nil, nil)


##[
@see <a href="https://www.mongodb.com/docs/manual/reference/method/ObjectId/">Source</a>
]##
proc read*(_: typedesc[Bson_ObjectId], io: KaitaiStream, root: KaitaiStruct, parent: KaitaiStruct): Bson_ObjectId =
  ## Parses an object id from `io`, in stream order: epoch time
  ## (`readU4le`), machine id (3-byte integer), process id (`readU2le`) and
  ## counter (3-byte integer).
  result = new(Bson_ObjectId)
  result.io = io
  result.root = if root == nil: cast[Bson](result) else: cast[Bson](root)
  result.parent = parent

  # seconds since the Unix epoch
  result.epochTime = io.readU4le()
  result.machineId = Bson_U3.read(io, result.root, result)
  result.processId = io.readU2le()

  # counter, starting with a random value.
  result.counter = Bson_U3.read(io, result.root, result)

proc fromFile*(_: typedesc[Bson_ObjectId], filename: string): Bson_ObjectId =
  ## Opens `filename` as a Kaitai file stream and parses a Bson_ObjectId
  ## from it, with no root and no parent.
  let stream = newKaitaiFileStream(filename)
  result = Bson_ObjectId.read(stream, nil, nil)

proc read*(_: typedesc[Bson_RegEx], io: KaitaiStream, root: KaitaiStruct, parent: Bson_Element): Bson_RegEx =
  ## Parses a regular-expression body from `io`: two zero-terminated strings,
  ## the pattern first and the options second.
  result = new(Bson_RegEx)
  result.io = io
  result.root = if root == nil: cast[Bson](result) else: cast[Bson](root)
  result.parent = parent

  result.pattern = Bson_Cstring.read(io, result.root, result)
  result.options = Bson_Cstring.read(io, result.root, result)

proc fromFile*(_: typedesc[Bson_RegEx], filename: string): Bson_RegEx =
  ## Opens `filename` as a Kaitai file stream and parses a Bson_RegEx from
  ## it, with no root and no parent.
  let stream = newKaitaiFileStream(filename)
  result = Bson_RegEx.read(stream, nil, nil)