Variable length quantity, unsigned integer, base128, little-endian: C++/STL parsing library

A variable-length unsigned integer using base128 encoding. Each 1-byte group consists of a 1-bit continuation flag and a 7-bit value, and groups are ordered "least significant group first", i.e. in a "little-endian" manner.

This particular encoding is specified and used in:

  • DWARF debug file format, where it's dubbed "unsigned LEB128" or "ULEB128". http://dwarfstd.org/doc/dwarf-2.0.0.pdf - page 139
  • Google Protocol Buffers, where it's called "Base 128 Varints". https://developers.google.com/protocol-buffers/docs/encoding?csw=1#varints
  • Apache Lucene, where it's called "VInt" http://lucene.apache.org/core/3_5_0/fileformats.html#VInt
  • Apache Avro uses this as a basis for integer encoding, adding ZigZag on top of it for signed ints http://avro.apache.org/docs/current/spec.html#binary_encode_primitive

More information on this encoding is available at https://en.wikipedia.org/wiki/LEB128

This particular implementation supports serialized values up to 8 bytes long.

KS implementation details

License: CC0-1.0
Minimal Kaitai Struct required: 0.7

This page hosts a formal specification of Variable length quantity, unsigned integer, base128, little-endian using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.

Usage

Using Kaitai Struct in C++/STL usually consists of 3 steps.

  1. We need to create an STL input stream (std::istream).
    • One can open a stream for reading from a local file:
      #include <fstream>
      
      std::ifstream is("path/to/local/file.vlq_base128_le", std::ifstream::binary);
    • Or one can prepare a stream for reading from existing std::string str:
      #include <sstream>
      
      std::istringstream is(str);
    • Or one can parse an arbitrary char* buffer in memory, given that we know its size:
      #include <sstream>
      
      const char buf[] = { ... };
      std::string str(buf, sizeof buf);
      std::istringstream is(str);
  2. We need to wrap our input stream into Kaitai stream:
    #include <kaitai/kaitaistream.h>
    
    kaitai::kstream ks(&is);
  3. And finally, we can invoke the parsing:
    vlq_base128_le_t data(&ks);

After that, one can get various attributes from the structure by invoking getter methods like:

data.value() // => Resulting value as normal integer

C++/STL source code to parse Variable length quantity, unsigned integer, base128, little-endian

vlq_base128_le.h

#ifndef VLQ_BASE128_LE_H_
#define VLQ_BASE128_LE_H_

// This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild

#include "kaitai/kaitaistruct.h"

#include <stdint.h>
#include <vector>

#if KAITAI_STRUCT_VERSION < 7000L
#error "Incompatible Kaitai Struct C++/STL API: version 0.7 or later is required"
#endif

/**
 * A variable-length unsigned integer using base128 encoding. Each 1-byte group
 * consists of a 1-bit continuation flag and a 7-bit value, and groups are ordered
 * "least significant group first", i.e. in a "little-endian" manner.
 * 
 * This particular encoding is specified and used in:
 * 
 * * DWARF debug file format, where it's dubbed "unsigned LEB128" or "ULEB128".
 *   http://dwarfstd.org/doc/dwarf-2.0.0.pdf - page 139
 * * Google Protocol Buffers, where it's called "Base 128 Varints".
 *   https://developers.google.com/protocol-buffers/docs/encoding?csw=1#varints
 * * Apache Lucene, where it's called "VInt"
 *   http://lucene.apache.org/core/3_5_0/fileformats.html#VInt
 * * Apache Avro uses this as a basis for integer encoding, adding ZigZag on
 *   top of it for signed ints
 *   http://avro.apache.org/docs/current/spec.html#binary_encode_primitive
 * 
 * More information on this encoding is available at https://en.wikipedia.org/wiki/LEB128
 * 
 * This particular implementation supports serialized values up to 8 bytes long.
 */

class vlq_base128_le_t : public kaitai::kstruct {

public:
    class group_t;

    /**
     * Constructs the object and parses it from the stream right away
     * (the constructor calls _read()).
     *
     * @param p__io     stream to read the encoded bytes from
     * @param p__parent parent structure, stored as-is (may be 0)
     * @param p__root   accepted for signature compatibility; the constructor
     *                  always records this object itself as the root
     */
    vlq_base128_le_t(kaitai::kstream* p__io, kaitai::kstruct* p__parent = 0, vlq_base128_le_t* p__root = 0);

private:
    // Reads groups from the stream until one without the continuation flag.
    void _read();

public:
    // Deletes every parsed group and the vector that holds them.
    ~vlq_base128_le_t();

    /**
     * One byte group, clearly divided into 7-bit "value" and 1-bit "has continuation
     * in the next byte" flag.
     */

    class group_t : public kaitai::kstruct {

    public:

        /**
         * Constructs the group and reads its single byte from the stream.
         *
         * @param p__io     stream to read the byte from
         * @param p__parent owning vlq_base128_le_t, stored as-is
         * @param p__root   root structure, stored as-is
         */
        group_t(kaitai::kstream* p__io, vlq_base128_le_t* p__parent = 0, vlq_base128_le_t* p__root = 0);

    private:
        // Reads one unsigned byte into m_b.
        void _read();

    public:
        ~group_t();

    private:
        // f_has_next: true once m_has_next has been computed (lazy cache flag).
        bool f_has_next;
        bool m_has_next;

    public:

        /**
         * If true, then we have more bytes to read
         * (the top bit, 0x80, of the raw byte is set). Computed on first
         * call and cached.
         */
        bool has_next();

    private:
        // f_value: true once m_value has been computed (lazy cache flag).
        bool f_value;
        int32_t m_value;

    public:

        /**
         * The 7-bit (base128) numeric value of this group
         * (the low seven bits of the raw byte). Computed on first call
         * and cached.
         */
        int32_t value();

    private:
        uint8_t m_b;                   // raw byte as read from the stream
        vlq_base128_le_t* m__root;     // root structure; not deleted by this class
        vlq_base128_le_t* m__parent;   // owning structure; not deleted by this class

    public:
        // Raw, undecoded byte of this group.
        uint8_t b() const { return m_b; }
        vlq_base128_le_t* _root() const { return m__root; }
        vlq_base128_le_t* _parent() const { return m__parent; }
    };

private:
    // f_len: true once m_len has been computed (lazy cache flag).
    bool f_len;
    int32_t m_len;

public:
    /**
     * Number of groups (i.e. encoded bytes) that were read. Computed on
     * first call and cached.
     */
    int32_t len();

private:
    // f_value: true once m_value has been computed (lazy cache flag).
    bool f_value;
    int32_t m_value;

public:

    /**
     * Resulting value as normal integer
     *
     * Only the first 8 groups contribute to the result. Computed on first
     * call and cached.
     * NOTE(review): the declared result type is int32_t, which cannot
     * represent every value an 8-byte encoding can carry; callers needing
     * the full range should combine groups() themselves.
     */
    int32_t value();

private:
    std::vector<group_t*>* m_groups;   // owned: elements and vector are deleted in the destructor
    vlq_base128_le_t* m__root;         // always this object (set in the constructor)
    kaitai::kstruct* m__parent;        // not deleted by this class

public:
    // Parsed groups in stream order (least significant group first).
    std::vector<group_t*>* groups() const { return m_groups; }
    vlq_base128_le_t* _root() const { return m__root; }
    kaitai::kstruct* _parent() const { return m__parent; }
};

#endif  // VLQ_BASE128_LE_H_

vlq_base128_le.cpp

// This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild

#include "vlq_base128_le.h"



/**
 * Builds the top-level VLQ object and immediately parses it from the stream.
 *
 * @param p__io     stream to read from (forwarded to the kaitai::kstruct base)
 * @param p__parent parent structure, stored as-is
 * @param p__root   unused: this type always records itself as the root
 *
 * If parsing throws (an allocation failure, or presumably a stream error
 * reported by the runtime as an exception), every group allocated so far and
 * the vector holding them are released before the exception is propagated.
 * The destructor does not run for an object whose constructor threw, so
 * without this cleanup those allocations would leak.
 */
vlq_base128_le_t::vlq_base128_le_t(kaitai::kstream* p__io, kaitai::kstruct* p__parent, vlq_base128_le_t* p__root) : kaitai::kstruct(p__io) {
    (void)p__root;  // intentionally ignored, see above
    m__parent = p__parent;
    m__root = this;
    m_groups = 0;   // lets the handler below tell "nothing allocated yet" apart
    f_len = false;
    f_value = false;
    try {
        _read();
    } catch (...) {
        if (m_groups != 0) {
            for (std::vector<group_t*>::iterator it = m_groups->begin(); it != m_groups->end(); ++it) {
                delete *it;
            }
            delete m_groups;
            m_groups = 0;
        }
        throw;
    }
}

void vlq_base128_le_t::_read() {
    m_groups = new std::vector<group_t*>();
    {
        int i = 0;
        group_t* _;
        do {
            _ = new group_t(m__io, this, m__root);
            m_groups->push_back(_);
            i++;
        } while (!(!(_->has_next())));
    }
}

/**
 * Releases every group object created while parsing, then the vector that
 * stored the pointers to them.
 */
vlq_base128_le_t::~vlq_base128_le_t() {
    typedef std::vector<group_t*>::size_type index_t;
    const index_t total = m_groups->size();
    for (index_t pos = 0; pos < total; ++pos) {
        delete (*m_groups)[pos];
    }
    delete m_groups;
}

/**
 * Creates one byte group and reads its byte from the stream straight away.
 *
 * @param p__io     stream to read from (forwarded to the kaitai::kstruct base)
 * @param p__parent owning VLQ object, stored as-is
 * @param p__root   root structure, stored as-is
 *
 * Both lazy-cache flags start out false, so has_next() and value() compute
 * their results on first use.
 */
vlq_base128_le_t::group_t::group_t(kaitai::kstream* p__io, vlq_base128_le_t* p__parent, vlq_base128_le_t* p__root)
    : kaitai::kstruct(p__io),
      f_has_next(false),
      f_value(false),
      m__root(p__root),
      m__parent(p__parent) {
    _read();
}

/**
 * Reads the single unsigned byte that makes up this group and keeps it
 * undecoded; has_next() and value() derive their results from it later.
 */
void vlq_base128_le_t::group_t::_read() {
    const uint8_t raw_byte = m__io->read_u1();
    m_b = raw_byte;
}

// Nothing to release here: the body is empty, so the root and parent
// pointers held by a group are left untouched.
vlq_base128_le_t::group_t::~group_t() {
}

/**
 * Tells whether another group follows this one.
 *
 * @return true when the top bit (0x80) of the raw byte is set.
 *
 * The answer is computed on the first call and served from the cache
 * afterwards.
 */
bool vlq_base128_le_t::group_t::has_next() {
    if (!f_has_next) {
        m_has_next = (b() & 0x80) != 0;
        f_has_next = true;
    }
    return m_has_next;
}

/**
 * Payload of this group.
 *
 * @return the low seven bits of the raw byte, i.e. a number in 0..127.
 *
 * The answer is computed on the first call and served from the cache
 * afterwards.
 */
int32_t vlq_base128_le_t::group_t::value() {
    if (!f_value) {
        m_value = b() & 0x7F;
        f_value = true;
    }
    return m_value;
}

/**
 * Number of groups that were parsed, which equals the number of encoded bytes.
 *
 * @return the size of the group vector, narrowed to int32_t.
 *
 * The answer is computed on the first call and served from the cache
 * afterwards.
 */
int32_t vlq_base128_le_t::len() {
    if (!f_len) {
        m_len = static_cast<int32_t>(groups()->size());
        f_len = true;
    }
    return m_len;
}

/**
 * Resulting value as a normal integer, computed on first call and cached.
 *
 * Group k (counting from 0) contributes its 7-bit payload shifted left by
 * 7 * k bits. Only the first 8 groups take part; any further groups are
 * ignored.
 *
 * The sum is built in a 64-bit unsigned accumulator. The previous form shifted
 * a 32-bit int by 35, 42 and 49 bits, which is undefined behaviour, and could
 * overflow a signed int at the 28-bit shift. Results for every value that
 * fits into int32_t are unchanged.
 *
 * @return the decoded value narrowed to int32_t. Values that need more than
 *         31 bits cannot be represented by the declared return type and are
 *         narrowed by the final conversion.
 */
int32_t vlq_base128_le_t::value() {
    if (f_value)
        return m_value;

    // At most 8 groups contribute to the result.
    int32_t used = len();
    if (used > 8)
        used = 8;

    uint64_t acc = 0;
    for (int32_t idx = 0; idx < used; ++idx) {
        // Each payload is 0..127, so the largest shift (49) stays below 2^56.
        const uint64_t chunk = static_cast<uint64_t>(groups()->at(idx)->value());
        acc += chunk << (7 * idx);
    }

    m_value = static_cast<int32_t>(acc);
    f_value = true;
    return m_value;
}