ruby_marshal: Python parsing library

Ruby's Marshal module allows serialization and deserialization of many standard and arbitrary Ruby objects in a compact binary format. It is relatively fast, available in the standard library, and allows conservation of language-specific properties (such as symbols or encoding-aware strings).

Feature-wise, it is comparable to other language-specific implementations, such as Java's Serializable, .NET's BinaryFormatter, and Python's marshal, pickle and shelve.

Internally, a serialized stream consists of a simple magic header followed by a record.

KS implementation details

License: CC0-1.0

This page hosts a formal specification of ruby_marshal using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.

Usage

Runtime library

All parsing code for Python generated by Kaitai Struct depends on the Python runtime library. You have to install it before you can parse data.

The Python runtime library can be installed from PyPI:

python3 -m pip install kaitaistruct

Code

Parse a local file and get structure in memory:

data = RubyMarshal.from_file("path/to/local/file.bin")

Or parse the structure from a bytes object:

from kaitaistruct import KaitaiStream, BytesIO

raw = b"\x00\x01\x02..."
data = RubyMarshal(KaitaiStream(BytesIO(raw)))

After that, one can get various attributes from the structure by accessing them directly, like:

data.version # => get version

Python source code to parse ruby_marshal

ruby_marshal.py

# This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild

import kaitaistruct
from kaitaistruct import KaitaiStruct, KaitaiStream, BytesIO
from enum import Enum


# Refuse to load against a Kaitai Struct runtime older than the API this
# generated module targets. A runtime that exposes no API_VERSION attribute
# is treated as (0, 9).
if getattr(kaitaistruct, 'API_VERSION', (0, 9)) < (0, 9):
    raise Exception(
        "Incompatible Kaitai Struct Python API: 0.9 or later is required, but you have %s"
        % (kaitaistruct.__version__)
    )

class RubyMarshal(KaitaiStruct):
    """Ruby's Marshal module allows serialization and deserialization of
    many standard and arbitrary Ruby objects in a compact binary
    format. It is relatively fast, available in the standard library, and
    allows conservation of language-specific properties (such as symbols
    or encoding-aware strings).
    
    Feature-wise, it is comparable to other language-specific
    implementations, such as:
    
    * Java's
      [Serializable](https://docs.oracle.com/javase/8/docs/api/java/io/Serializable.html)
    * .NET
      [BinaryFormatter](https://learn.microsoft.com/en-us/dotnet/api/system.runtime.serialization.formatters.binary.binaryformatter?view=net-7.0)
    * Python's
      [marshal](https://docs.python.org/3/library/marshal.html),
      [pickle](https://docs.python.org/3/library/pickle.html) and
      [shelve](https://docs.python.org/3/library/shelve.html)
    
    From internal perspective, serialized stream consists of a simple
    magic header and a record.
    
    .. seealso::
       Source - https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-Stream+Format
    """

    class Codes(Enum):
        """Type-tag byte values for a record; each value is an ASCII character code."""
        ruby_string = 0x22       # '"'
        const_nil = 0x30         # '0'
        ruby_symbol = 0x3A       # ':'
        ruby_symbol_link = 0x3B  # ';'
        ruby_object_link = 0x40  # '@'
        const_false = 0x46       # 'F'
        instance_var = 0x49      # 'I'
        ruby_struct = 0x53       # 'S'
        const_true = 0x54        # 'T'
        ruby_array = 0x5B        # '['
        packed_int = 0x69        # 'i'
        bignum = 0x6C            # 'l'
        ruby_hash = 0x7B         # '{'
    def __init__(self, _io, _parent=None, _root=None):
        """Store the stream and tree links, then parse immediately.

        ``_root`` falls back to this object when a falsy value is passed.
        """
        self._io, self._parent = _io, _parent
        self._root = _root or self
        self._read()

    def _read(self):
        """Read and validate the 2-byte version header, then parse the record.

        Raises kaitaistruct.ValidationNotEqualError when the header bytes
        are not 04 08.
        """
        expected_version = b"\x04\x08"
        self.version = self._io.read_bytes(2)
        if self.version != expected_version:
            raise kaitaistruct.ValidationNotEqualError(expected_version, self.version, self._io, u"/seq/0")
        self.records = RubyMarshal.Record(self._io, self, self._root)

    class RubyArray(KaitaiStruct):
        """Array body: a packed element count followed by that many records."""

        def __init__(self, _io, _parent=None, _root=None):
            self._io, self._parent = _io, _parent
            self._root = _root or self
            self._read()

        def _read(self):
            """Parse ``num_elements``, then append one Record per element to ``elements``."""
            stream, root = self._io, self._root
            self.num_elements = RubyMarshal.PackedInt(stream, self, root)
            self.elements = []
            # Append one at a time so the list fills incrementally, as before.
            for _ in range(self.num_elements.value):
                self.elements.append(RubyMarshal.Record(stream, self, root))



    class Bignum(KaitaiStruct):
        """Bignum body: a sign byte, a packed length counted in 2-byte units,
        and the raw body bytes.

        .. seealso::
           Source - https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-Bignum
        """
        def __init__(self, _io, _parent=None, _root=None):
            self._io, self._parent = _io, _parent
            self._root = _root or self
            self._read()

        def _read(self):
            """Parse ``sign``, ``len_div_2`` and ``body`` from the stream."""
            stream = self._io
            self.sign = stream.read_u1()
            self.len_div_2 = RubyMarshal.PackedInt(stream, self, self._root)
            # The stored length is half the body size in bytes.
            body_size = self.len_div_2.value * 2
            self.body = stream.read_bytes(body_size)


    class RubyStruct(KaitaiStruct):
        """Struct body: a name record, a packed member count, and that many pairs.

        .. seealso::
           Source - https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-Struct
        """
        def __init__(self, _io, _parent=None, _root=None):
            self._io, self._parent = _io, _parent
            self._root = _root or self
            self._read()

        def _read(self):
            """Parse ``name``, ``num_members`` and the ``members`` list."""
            stream, root = self._io, self._root
            self.name = RubyMarshal.Record(stream, self, root)
            self.num_members = RubyMarshal.PackedInt(stream, self, root)
            self.members = []
            # Append one at a time so the list fills incrementally, as before.
            for _ in range(self.num_members.value):
                self.members.append(RubyMarshal.Pair(stream, self, root))



    class RubySymbol(KaitaiStruct):
        """Symbol body: a packed byte length followed by the name, decoded as UTF-8.

        .. seealso::
           Source - https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-Symbols+and+Byte+Sequence
        """
        def __init__(self, _io, _parent=None, _root=None):
            self._io, self._parent = _io, _parent
            self._root = _root or self
            self._read()

        def _read(self):
            """Parse ``len`` and then ``name`` (``len.value`` bytes, UTF-8 decoded)."""
            stream = self._io
            self.len = RubyMarshal.PackedInt(stream, self, self._root)
            raw_name = stream.read_bytes(self.len.value)
            self.name = raw_name.decode(u"UTF-8")


    class PackedInt(KaitaiStruct):
        """Ruby uses sophisticated system to pack integers: first `code`
        byte either determines packing scheme or carries encoded
        immediate value (thus allowing smaller values from -123 to 122
        (inclusive) to take only one byte). There are 11 encoding schemes
        in total:
        
        * 0 is encoded specially (as 0)
        * 1..122 are encoded as immediate value with a shift
        * 123..255 are encoded with code of 0x01 and 1 extra byte
        * 0x100..0xffff are encoded with code of 0x02 and 2 extra bytes
        * 0x10000..0xffffff are encoded with code of 0x03 and 3 extra
          bytes
        * 0x1000000..0xffffffff are encoded with code of 0x04 and 4
          extra bytes
        * -123..-1 are encoded as immediate value with another shift
        * -256..-124 are encoded with code of 0xff and 1 extra byte
        * -0x10000..-257 are encoded with code of 0xfe and 2 extra bytes
        * -0x1000000..-0x10001 are encoded with code of 0xfd and 3 extra
           bytes
        * -0x40000000..-0x1000001 are encoded with code of 0xfc and 4
           extra bytes
        
        Values beyond that are serialized as bignum (even if they
        technically might be not Bignum class in Ruby implementation,
        i.e. if they fit into 64 bits on a 64-bit platform).
        
        Attributes set by parsing:
        
        * `code` - the leading byte
        * `encoded` - little-endian payload; present only for codes
          1..4 and 0xfc..0xff
        * `encoded2` - third (most significant) payload byte; present
          only for the 3-byte codes 0x03 and 0xfd
        
        .. seealso::
           Source - https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-Fixnum+and+long
        """
        def __init__(self, _io, _parent=None, _root=None):
            self._io = _io
            self._parent = _parent
            self._root = _root if _root else self
            self._read()

        def _read(self):
            """Read the code byte and whatever payload bytes that code calls for."""
            self.code = self._io.read_u1()
            code = self.code
            if code in (1, 255):
                self.encoded = self._io.read_u1()
            elif code in (2, 3, 253, 254):
                # 3-byte codes (3 and 253) read their low 16 bits here.
                self.encoded = self._io.read_u2le()
            elif code in (4, 252):
                self.encoded = self._io.read_u4le()
            if code in (3, 253):
                # High byte of a 3-byte payload.
                self.encoded2 = self._io.read_u1()

        @property
        def is_immediate(self):
            """True when `code` itself carries the value (codes 5..251)."""
            if hasattr(self, '_m_is_immediate'):
                return self._m_is_immediate

            self._m_is_immediate = ((self.code > 4) and (self.code < 252))
            return self._m_is_immediate

        @property
        def value(self):
            """Decoded signed integer value (computed once, then cached).

            For the negative codes the payload holds the low bytes of the
            number, so the matching power of two is subtracted to restore
            the sign.
            """
            if hasattr(self, '_m_value'):
                return self._m_value

            code = self.code
            if self.is_immediate:
                decoded = (code - 5) if code < 128 else (4 - (~code & 127))
            elif code == 0:
                decoded = 0
            elif code == 255:
                decoded = self.encoded - 0x100
            elif code == 254:
                decoded = self.encoded - 0x10000
            elif code == 253:
                decoded = ((self.encoded2 << 16) | self.encoded) - 0x1000000
            elif code == 252:
                # Previously returned the raw unsigned payload, turning every
                # 4-byte negative integer into a large positive one.
                decoded = self.encoded - 0x100000000
            elif code == 3:
                decoded = (self.encoded2 << 16) | self.encoded
            else:
                # Codes 1, 2 and 4: the payload is the non-negative value.
                decoded = self.encoded

            self._m_value = decoded
            return self._m_value


    class Pair(KaitaiStruct):
        """Two consecutive records: ``key`` followed by ``value``."""

        def __init__(self, _io, _parent=None, _root=None):
            self._io, self._parent = _io, _parent
            self._root = _root or self
            self._read()

        def _read(self):
            """Parse ``key`` first, then ``value``, from the stream."""
            stream, root = self._io, self._root
            self.key = RubyMarshal.Record(stream, self, root)
            self.value = RubyMarshal.Record(stream, self, root)


    class InstanceVar(KaitaiStruct):
        """Instance-variable wrapper: a record, a packed variable count, and
        that many pairs.

        .. seealso::
           Source - https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-Instance+Variables
        """
        def __init__(self, _io, _parent=None, _root=None):
            self._io, self._parent = _io, _parent
            self._root = _root or self
            self._read()

        def _read(self):
            """Parse ``obj``, ``num_vars`` and the ``vars`` list."""
            stream, root = self._io, self._root
            self.obj = RubyMarshal.Record(stream, self, root)
            self.num_vars = RubyMarshal.PackedInt(stream, self, root)
            self.vars = []
            # Append one at a time so the list fills incrementally, as before.
            for _ in range(self.num_vars.value):
                self.vars.append(RubyMarshal.Pair(stream, self, root))



    class Record(KaitaiStruct):
        """Each record starts with a single byte that determines its type
        (`code`) and contents. If necessary, additional info is parsed
        as `body`, to be determined by `code`.
        """
        def __init__(self, _io, _parent=None, _root=None):
            self._io, self._parent = _io, _parent
            self._root = _root or self
            self._read()

        def _read(self):
            """Read ``code`` and, for codes that carry a payload, parse ``body``.

            ``body`` is left unset for any code without an entry in the
            table below.
            """
            stream = self._io
            self.code = KaitaiStream.resolve_enum(RubyMarshal.Codes, stream.read_u1())

            codes = RubyMarshal.Codes
            body_types = {
                codes.packed_int: RubyMarshal.PackedInt,
                codes.bignum: RubyMarshal.Bignum,
                codes.ruby_array: RubyMarshal.RubyArray,
                codes.ruby_symbol_link: RubyMarshal.PackedInt,
                codes.ruby_struct: RubyMarshal.RubyStruct,
                codes.ruby_string: RubyMarshal.RubyString,
                codes.instance_var: RubyMarshal.InstanceVar,
                codes.ruby_hash: RubyMarshal.RubyHash,
                codes.ruby_symbol: RubyMarshal.RubySymbol,
                codes.ruby_object_link: RubyMarshal.PackedInt,
            }
            body_type = body_types.get(self.code)
            if body_type is not None:
                self.body = body_type(stream, self, self._root)


    class RubyHash(KaitaiStruct):
        """Hash body: a packed pair count followed by that many key/value pairs.

        .. seealso::
           Source - https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-Hash+and+Hash+with+Default+Value
        """
        def __init__(self, _io, _parent=None, _root=None):
            self._io, self._parent = _io, _parent
            self._root = _root or self
            self._read()

        def _read(self):
            """Parse ``num_pairs``, then append one Pair per entry to ``pairs``."""
            stream, root = self._io, self._root
            self.num_pairs = RubyMarshal.PackedInt(stream, self, root)
            self.pairs = []
            # Append one at a time so the list fills incrementally, as before.
            for _ in range(self.num_pairs.value):
                self.pairs.append(RubyMarshal.Pair(stream, self, root))



    class RubyString(KaitaiStruct):
        """String body: a packed byte length followed by that many raw bytes.

        .. seealso::
           Source - https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-String
        """
        def __init__(self, _io, _parent=None, _root=None):
            self._io, self._parent = _io, _parent
            self._root = _root or self
            self._read()

        def _read(self):
            """Parse ``len`` and then ``body`` (``len.value`` undecoded bytes)."""
            stream = self._io
            self.len = RubyMarshal.PackedInt(stream, self, self._root)
            byte_count = self.len.value
            self.body = stream.read_bytes(byte_count)