ruby_marshal: Ruby parsing library

Ruby's Marshal module allows serialization and deserialization of many standard and arbitrary Ruby objects in a compact binary format. It is relatively fast, available in the standard library, and preserves language-specific properties (such as symbols or encoding-aware strings).

Feature-wise, it is comparable to other language-specific implementations, such as Java's Serializable, .NET's BinaryFormatter, and Python's marshal, pickle and shelve.

Internally, a serialized stream consists of a simple magic header followed by a record.

KS implementation details

License: CC0-1.0

This page hosts a formal specification of ruby_marshal using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.

Usage

Runtime library

All parsing code for Ruby generated by Kaitai Struct depends on the Ruby runtime library. You have to install it before you can parse data.

The Ruby runtime library can be installed from RubyGems:

gem install kaitai-struct

Code

Parse a local file and get structure in memory:

data = RubyMarshal.from_file("path/to/local/file.bin")

Or parse structure from a string of bytes:

bytes = "\x00\x01\x02..."
data = RubyMarshal.new(Kaitai::Struct::Stream.new(bytes))

After that, one can get various attributes from the structure by invoking getter methods like:

data.version # => get version

Ruby source code to parse ruby_marshal

ruby_marshal.rb

# This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild

require 'kaitai/struct/struct'

# Fail fast at load time when the installed Kaitai Struct runtime is older
# than the 0.9 API this generated parser targets.
if Gem::Version.new(Kaitai::Struct::VERSION) < Gem::Version.new('0.9')
  raise "Incompatible Kaitai Struct Ruby API: 0.9 or later is required, but you have #{Kaitai::Struct::VERSION}"
end


##
# Ruby's Marshal module allows serialization and deserialization of
# many standard and arbitrary Ruby objects in a compact binary
# format. It is relatively fast, available in the standard library, and
# preserves language-specific properties (such as symbols or
# encoding-aware strings).
# 
# Feature-wise, it is comparable to other language-specific
# implementations, such as:
# 
# * Java's
#   [Serializable](https://docs.oracle.com/javase/8/docs/api/java/io/Serializable.html)
# * .NET
#   [BinaryFormatter](https://learn.microsoft.com/en-us/dotnet/api/system.runtime.serialization.formatters.binary.binaryformatter?view=net-7.0)
# * Python's
#   [marshal](https://docs.python.org/3/library/marshal.html),
#   [pickle](https://docs.python.org/3/library/pickle.html) and
#   [shelve](https://docs.python.org/3/library/shelve.html)
# 
# Internally, a serialized stream consists of a simple magic header
# followed by a record.
# @see https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-Stream+Format Source
class RubyMarshal < Kaitai::Struct::Struct

  # Maps each supported record type byte to its symbolic name. The trailing
  # comment on every entry shows the ASCII character for that byte value.
  # Frozen so the shared lookup table cannot be mutated by accident.
  CODES = {
    34 => :codes_ruby_string,        # '"'
    48 => :codes_const_nil,          # '0'
    58 => :codes_ruby_symbol,        # ':'
    59 => :codes_ruby_symbol_link,   # ';'
    64 => :codes_ruby_object_link,   # '@'
    70 => :codes_const_false,        # 'F'
    73 => :codes_instance_var,       # 'I'
    83 => :codes_ruby_struct,        # 'S'
    84 => :codes_const_true,         # 'T'
    91 => :codes_ruby_array,         # '['
    105 => :codes_packed_int,        # 'i'
    108 => :codes_bignum,            # 'l'
    123 => :codes_ruby_hash,         # '{'
  }.freeze

  # Reverse lookup: symbolic name => type byte.
  I__CODES = CODES.invert.freeze
  # Hands `_io`, `_parent` and `_root` to the Kaitai base class unchanged,
  # then parses the stream immediately via `_read`.
  def initialize(_io, _parent = nil, _root = self)
    super
    _read
  end

  # Reads the 2-byte header, checks that it equals the bytes 4, 8, and then
  # parses the single top-level record that follows.
  #
  # @return [self]
  # @raise [Kaitai::Struct::ValidationNotEqualError] if the header bytes differ
  def _read
    expected_magic = [4, 8].pack('C*')
    @version = @_io.read_bytes(2)
    unless version == expected_magic
      raise Kaitai::Struct::ValidationNotEqualError.new(expected_magic, version, _io, "/seq/0")
    end
    @records = Record.new(@_io, self, @_root)
    self
  end
  ##
  # Body of an array record: a packed element count followed by that many
  # nested records.
  class RubyArray < Kaitai::Struct::Struct
    def initialize(_io, _parent = nil, _root = self)
      super
      _read
    end

    # Reads the element count, then parses one `Record` per element.
    # Returns self.
    def _read
      @num_elements = PackedInt.new(@_io, self, @_root)
      @elements = []
      num_elements.value.times do
        @elements.push(Record.new(@_io, self, @_root))
      end
      self
    end

    ##
    # Number of elements, as a `PackedInt`.
    attr_reader :num_elements

    ##
    # Parsed element records, in stream order.
    attr_reader :elements
  end

  ##
  # Body of a bignum record: a sign byte, a packed (halved) length and the
  # raw bytes of the number.
  # @see https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-Bignum Source
  class Bignum < Kaitai::Struct::Struct
    def initialize(_io, _parent = nil, _root = self)
      super
      _read
    end

    # Reads the sign byte and the halved length, then `len_div_2.value * 2`
    # body bytes. Returns self.
    def _read
      @sign = @_io.read_u1
      @len_div_2 = PackedInt.new(@_io, self, @_root)
      body_size = len_div_2.value * 2
      @body = @_io.read_bytes(body_size)
      self
    end

    ##
    # A single byte containing `+` for a positive value or `-` for a negative value.
    attr_reader :sign

    ##
    # Length of the bignum body, divided by 2 (stored as a `PackedInt`).
    attr_reader :len_div_2

    ##
    # Bytes that represent the number; see the ruby-lang.org docs for the
    # reconstruction algorithm.
    attr_reader :body
  end

  ##
  # Body of a struct record: a name record, a packed member count and that
  # many key/value pairs.
  # @see https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-Struct Source
  class RubyStruct < Kaitai::Struct::Struct
    def initialize(_io, _parent = nil, _root = self)
      super
      _read
    end

    # Reads the name record and the member count, then parses one `Pair`
    # per member. Returns self.
    def _read
      @name = Record.new(@_io, self, @_root)
      @num_members = PackedInt.new(@_io, self, @_root)
      @members = []
      num_members.value.times do
        @members.push(Pair.new(@_io, self, @_root))
      end
      self
    end

    ##
    # Symbol containing the name of the struct.
    attr_reader :name

    ##
    # Number of members in the struct, as a `PackedInt`.
    attr_reader :num_members

    ##
    # Struct members, each parsed as a `Pair`, in stream order.
    attr_reader :members
  end

  ##
  # Body of a symbol record: a packed byte length followed by the symbol
  # name.
  # @see https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-Symbols+and+Byte+Sequence Source
  class RubySymbol < Kaitai::Struct::Struct
    def initialize(_io, _parent = nil, _root = self)
      super
      _read
    end

    # Reads the length, then that many bytes, and tags them as UTF-8.
    # Returns self.
    def _read
      @len = PackedInt.new(@_io, self, @_root)
      raw_name = @_io.read_bytes(len.value)
      @name = raw_name.force_encoding("UTF-8")
      self
    end

    ##
    # Length of the name in bytes, as a `PackedInt`.
    attr_reader :len

    ##
    # Symbol name, with its encoding forced to UTF-8.
    attr_reader :name
  end

  ##
  # Ruby uses a sophisticated system to pack integers: the first `code`
  # byte either determines the packing scheme or carries an encoded
  # immediate value (thus allowing smaller values from -123 to 122
  # (inclusive) to take only one byte). There are 11 encoding schemes
  # in total:
  #
  # * 0 is encoded specially (as 0)
  # * 1..122 are encoded as immediate value with a shift
  # * 123..255 are encoded with code of 0x01 and 1 extra byte
  # * 0x100..0xffff are encoded with code of 0x02 and 2 extra bytes
  # * 0x10000..0xffffff are encoded with code of 0x03 and 3 extra
  #   bytes
  # * 0x1000000..0xffffffff are encoded with code of 0x04 and 4
  #   extra bytes
  # * -123..-1 are encoded as immediate value with another shift
  # * -256..-124 are encoded with code of 0xff and 1 extra byte
  # * -0x10000..-257 are encoded with code of 0xfe and 2 extra bytes
  # * -0x1000000..-0x10001 are encoded with code of 0xfd and 3 extra
  #    bytes
  # * -0x40000000..-0x1000001 are encoded with code of 0xfc and 4
  #    extra bytes
  #
  # Values beyond that are serialized as bignum (even if they
  # technically might be not Bignum class in Ruby implementation,
  # i.e. if they fit into 64 bits on a 64-bit platform).
  # @see https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-Fixnum+and+long Source
  class PackedInt < Kaitai::Struct::Struct
    def initialize(_io, _parent = nil, _root = self)
      super(_io, _parent, _root)
      _read
    end

    # Reads the `code` byte and, for the non-immediate codes, the extra
    # bytes that follow it. Returns self.
    def _read
      @code = @_io.read_u1
      case code
      when 1, 255
        @encoded = @_io.read_u1
      when 2, 3, 253, 254
        @encoded = @_io.read_u2le
      when 4, 252
        @encoded = @_io.read_u4le
      end
      # 3-byte forms: the low 16 bits are in `encoded`, the top byte here.
      @encoded2 = @_io.read_u1 if code == 3 || code == 253
      self
    end

    # True when `code` itself carries the value, i.e. it is in 5..251 and no
    # extra bytes follow. Memoized.
    def is_immediate
      return @is_immediate unless @is_immediate.nil?
      @is_immediate = code > 4 && code < 252
      @is_immediate
    end

    # Decoded integer value. Memoized.
    #
    # The extra bytes are read as unsigned integers, so each negative form
    # (codes 0xff, 0xfe, 0xfd, 0xfc) subtracts 2**(8 * byte count) to
    # recover the sign.
    def value
      return @value unless @value.nil?
      @value =
        if is_immediate
          code < 128 ? code - 5 : 4 - (~code & 127)
        else
          case code
          when 0 then 0
          when 255 then encoded - 0x100
          when 254 then encoded - 0x10000
          when 253 then ((encoded2 << 16) | encoded) - 0x1000000
          # 0xfc: 4-byte negative value; previously returned unsigned.
          when 252 then encoded - 0x100000000
          when 3 then (encoded2 << 16) | encoded
          else encoded # codes 1, 2 and 4: non-negative, used as read
          end
        end
      @value
    end

    ##
    # First byte: either a packing-scheme selector or an immediate value.
    attr_reader :code

    ##
    # Extra bytes following `code`, read as an unsigned little-endian
    # integer. Not set for code 0 and for immediate codes.
    attr_reader :encoded

    ##
    # One extra byte for 3-byte integers (0x03 and 0xfd), as
    # there is no standard `u3` type in KS.
    attr_reader :encoded2
  end
  ##
  # Two consecutive records: a key followed by its value.
  class Pair < Kaitai::Struct::Struct
    def initialize(_io, _parent = nil, _root = self)
      super
      _read
    end

    # Parses the key record and then the value record. Returns self.
    def _read
      @key, @value = Array.new(2) { Record.new(@_io, self, @_root) }
      self
    end

    ##
    # First record of the pair.
    attr_reader :key

    ##
    # Second record of the pair.
    attr_reader :value
  end

  ##
  # Body of an instance-variables record: the wrapped object record, a
  # packed variable count and that many key/value pairs.
  # @see https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-Instance+Variables Source
  class InstanceVar < Kaitai::Struct::Struct
    def initialize(_io, _parent = nil, _root = self)
      super
      _read
    end

    # Reads the object record and the variable count, then parses one
    # `Pair` per variable. Returns self.
    def _read
      @obj = Record.new(@_io, self, @_root)
      @num_vars = PackedInt.new(@_io, self, @_root)
      @vars = []
      num_vars.value.times do
        @vars.push(Pair.new(@_io, self, @_root))
      end
      self
    end

    ##
    # Record of the object the variables belong to.
    attr_reader :obj

    ##
    # Number of variables, as a `PackedInt`.
    attr_reader :num_vars

    ##
    # Variables, each parsed as a `Pair`, in stream order.
    attr_reader :vars
  end

  ##
  # Each record starts with a single byte that determines its type
  # (`code`) and contents. If necessary, additional info is parsed
  # as `body`, whose type is determined by `code`.
  class Record < Kaitai::Struct::Struct
    def initialize(_io, _parent = nil, _root = self)
      super
      _read
    end

    # Reads the type byte, resolves it through `RubyMarshal::CODES`, and
    # parses the body type that belongs to that code. Codes without a
    # branch below (for example the nil/true/false constants) leave `body`
    # unset. Returns self.
    def _read
      @code = Kaitai::Struct::Stream::resolve_enum(RubyMarshal::CODES, @_io.read_u1)
      body_class =
        case code
        when :codes_packed_int,
             :codes_ruby_symbol_link,
             :codes_ruby_object_link
          PackedInt
        when :codes_bignum then Bignum
        when :codes_ruby_array then RubyArray
        when :codes_ruby_struct then RubyStruct
        when :codes_ruby_string then RubyString
        when :codes_instance_var then InstanceVar
        when :codes_ruby_hash then RubyHash
        when :codes_ruby_symbol then RubySymbol
        end
      @body = body_class.new(@_io, self, @_root) if body_class
      self
    end

    ##
    # Record type, as resolved from the leading byte.
    attr_reader :code

    ##
    # Parsed payload for this record type, or nil when the type has none.
    attr_reader :body
  end

  ##
  # Body of a hash record: a packed pair count followed by that many
  # key/value pairs.
  # @see https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-Hash+and+Hash+with+Default+Value Source
  class RubyHash < Kaitai::Struct::Struct
    def initialize(_io, _parent = nil, _root = self)
      super
      _read
    end

    # Reads the pair count, then parses one `Pair` per entry. Returns self.
    def _read
      @num_pairs = PackedInt.new(@_io, self, @_root)
      @pairs = []
      num_pairs.value.times do
        @pairs.push(Pair.new(@_io, self, @_root))
      end
      self
    end

    ##
    # Number of key/value pairs, as a `PackedInt`.
    attr_reader :num_pairs

    ##
    # Parsed key/value pairs, in stream order.
    attr_reader :pairs
  end

  ##
  # Body of a string record: a packed byte length followed by the raw
  # string bytes.
  # @see https://docs.ruby-lang.org/en/2.4.0/marshal_rdoc.html#label-String Source
  class RubyString < Kaitai::Struct::Struct
    def initialize(_io, _parent = nil, _root = self)
      super
      _read
    end

    # Reads the length, then that many bytes as the body. Returns self.
    def _read
      @len = PackedInt.new(@_io, self, @_root)
      byte_count = len.value
      @body = @_io.read_bytes(byte_count)
      self
    end

    ##
    # Length of the body in bytes, as a `PackedInt`.
    attr_reader :len

    ##
    # String bytes exactly as read from the stream.
    attr_reader :body
  end
  ##
  # `version`: the two header bytes read at the start of the stream.
  # `records`: the top-level `Record` parsed right after them.
  attr_reader :version, :records
end