UTF-8-encoded string: C++/STL parsing library

UTF-8 is a popular character encoding scheme that allows strings to be represented as a sequence of code points defined in the Unicode standard. Its features are:

  • variable width (i.e. one code point might be represented by 1 to 4 bytes)
  • backward compatibility with ASCII
  • basic validity checking (and thus distinguishing from other legacy 8-bit encodings)
  • maintaining sort order of codepoints if sorted as a byte array

WARNING: For the vast majority of practical purposes of format definitions in Kaitai Struct, you'd likely NOT want to use this and rather just use type: str with encoding: utf8. That will use native string implementations, which are most likely more efficient and will give you native language strings, rather than an array of individual codepoints. This format definition is provided mostly for educational / research purposes.

KS implementation details

License: CC0-1.0

This page hosts a formal specification of UTF-8-encoded string using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.

Usage

Using Kaitai Struct in C++/STL usually consists of 3 steps.

  1. We need to create an STL input stream (std::istream). One can open a local file for that, or use an existing std::string or char* buffer.
    #include <fstream>
    
    std::ifstream is("path/to/local/file.utf8_string", std::ifstream::binary);
    #include <sstream>
    
    std::istringstream is(str);
    #include <sstream>
    
    const char buf[] = { ... };
    std::string str(buf, sizeof buf);
    std::istringstream is(str);
  2. We need to wrap our input stream into Kaitai stream:
    #include <kaitai/kaitaistream.h>
    
    kaitai::kstream ks(&is);
  3. And finally, we can invoke the parsing:
    utf8_string_t data(&ks);

After that, one can get various attributes from the structure by invoking getter methods like:

data.codepoints() // => get codepoints

C++/STL source code to parse UTF-8-encoded string

utf8_string.h

#ifndef UTF8_STRING_H_
#define UTF8_STRING_H_

// This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild

#include "kaitai/kaitaistruct.h"

#include <stdint.h>
#include <vector>

#if KAITAI_STRUCT_VERSION < 7000L
#error "Incompatible Kaitai Struct C++/STL API: version 0.7 or later is required"
#endif

/**
 * UTF-8 is a popular character encoding scheme that allows strings to
 * be represented as a sequence of code points defined in the Unicode
 * standard. Its features are:
 *
 * * variable width (i.e. one code point might be represented by 1 to 4
 *   bytes)
 * * backward compatibility with ASCII
 * * basic validity checking (and thus distinguishing from other legacy
 *   8-bit encodings)
 * * maintaining sort order of codepoints if sorted as a byte array
 *
 * WARNING: For the vast majority of practical purposes of format
 * definitions in Kaitai Struct, you'd likely NOT want to use this and
 * rather just use `type: str` with `encoding: utf8`. That will use
 * native string implementations, which are most likely more efficient
 * and will give you native language strings, rather than an array of
 * individual codepoints.  This format definition is provided mostly
 * for educational / research purposes.
 *
 * NOTE(review): no copy constructor or copy assignment is declared
 * here, while the destructor deletes m_codepoints and its elements.
 * Copying an instance would presumably lead to a double delete --
 * confirm whether kaitai::kstruct already forbids copying.
 */

class utf8_string_t : public kaitai::kstruct {

public:
    class utf8_codepoint_t;

    /**
     * Stores the parent, sets the root to this object (p__root is
     * accepted but not used), and immediately parses the stream by
     * calling _read().
     */
    utf8_string_t(kaitai::kstream* p__io, kaitai::kstruct* p__parent = 0, utf8_string_t* p__root = 0);

private:
    /** Reads codepoints from the stream until it reports end-of-file. */
    void _read();

public:
    /** Deletes every parsed codepoint and the vector that holds them. */
    ~utf8_string_t();

    /**
     * One UTF-8 sequence: a lead byte (byte1) plus up to three further
     * bytes, depending on the length derived from the lead byte.
     *
     * The raw1()..raw4(), len() and value_as_int() getters are computed
     * on first use and cached; each f_* flag records whether the
     * matching m_* cache has been filled. Each n_* flag records that
     * the corresponding optional value is absent for this sequence.
     */
    class utf8_codepoint_t : public kaitai::kstruct {

    public:

        /** Stores parent/root, clears the cache flags and parses via _read(). */
        utf8_codepoint_t(kaitai::kstream* p__io, utf8_string_t* p__parent = 0, utf8_string_t* p__root = 0);

    private:
        /** Reads byte1, then byte2..byte4 only when len() says they are present. */
        void _read();

    public:
        ~utf8_codepoint_t();

    private:
        bool f_raw1;
        int32_t m_raw1;

    public:
        /** Payload bits of byte1: byte1 masked with 127, 31, 15 or 7 for len() 1..4, otherwise 0. */
        int32_t raw1();

    private:
        bool f_raw4;
        int32_t m_raw4;
        bool n_raw4;

    public:
        /** True when raw4 is absent (len() < 4). Forces evaluation of raw4() first. */
        bool _is_null_raw4() { raw4(); return n_raw4; };

    private:

    public:
        /** byte4 & 63 when len() >= 4; check _is_null_raw4() before relying on the value. */
        int32_t raw4();

    private:
        bool f_raw3;
        int32_t m_raw3;
        bool n_raw3;

    public:
        /** True when raw3 is absent (len() < 3). Forces evaluation of raw3() first. */
        bool _is_null_raw3() { raw3(); return n_raw3; };

    private:

    public:
        /** byte3 & 63 when len() >= 3; check _is_null_raw3() before relying on the value. */
        int32_t raw3();

    private:
        bool f_value_as_int;
        int32_t m_value_as_int;

    public:
        /**
         * Code point value assembled from raw1()..raw4() in 6-bit steps
         * according to len(); -1 when len() is not in 1..4.
         */
        int32_t value_as_int();

    private:
        bool f_raw2;
        int32_t m_raw2;
        bool n_raw2;

    public:
        /** True when raw2 is absent (len() < 2). Forces evaluation of raw2() first. */
        bool _is_null_raw2() { raw2(); return n_raw2; };

    private:

    public:
        /** byte2 & 63 when len() >= 2; check _is_null_raw2() before relying on the value. */
        int32_t raw2();

    private:
        bool f_len;
        int32_t m_len;

    public:
        /** Sequence length in bytes (1..4) derived from the high bits of byte1, or -1 if none of the patterns match. */
        int32_t len();

    private:
        uint8_t m_byte1;
        uint8_t m_byte2;
        bool n_byte2;

    public:
        /** True when byte2 was not read because len() < 2. */
        bool _is_null_byte2() { byte2(); return n_byte2; };

    private:
        uint8_t m_byte3;
        bool n_byte3;

    public:
        /** True when byte3 was not read because len() < 3. */
        bool _is_null_byte3() { byte3(); return n_byte3; };

    private:
        uint8_t m_byte4;
        bool n_byte4;

    public:
        /** True when byte4 was not read because len() < 4. */
        bool _is_null_byte4() { byte4(); return n_byte4; };

    private:
        utf8_string_t* m__root;
        utf8_string_t* m__parent;

    public:
        // Plain accessors for the bytes as read from the stream. byte2()..byte4()
        // are only meaningful when the matching _is_null_byteN() returns false.
        uint8_t byte1() const { return m_byte1; }
        uint8_t byte2() const { return m_byte2; }
        uint8_t byte3() const { return m_byte3; }
        uint8_t byte4() const { return m_byte4; }
        utf8_string_t* _root() const { return m__root; }
        utf8_string_t* _parent() const { return m__parent; }
    };

private:
    // Heap-allocated vector of heap-allocated codepoints; both are released in the destructor.
    std::vector<utf8_codepoint_t*>* m_codepoints;
    utf8_string_t* m__root;
    kaitai::kstruct* m__parent;

public:
    /** Parsed codepoints in stream order; the returned vector remains owned by this object. */
    std::vector<utf8_codepoint_t*>* codepoints() const { return m_codepoints; }
    utf8_string_t* _root() const { return m__root; }
    kaitai::kstruct* _parent() const { return m__parent; }
};

#endif  // UTF8_STRING_H_

utf8_string.cpp

// This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild

#include "utf8_string.h"



/**
 * Builds the top-level UTF-8 string object and parses the whole stream.
 *
 * @param p__io     stream to read codepoints from
 * @param p__parent parent structure, stored as-is (may be null)
 * @param p__root   accepted for signature compatibility; this type is
 *                  the root of its own tree, so the root is always `this`
 *
 * If parsing throws (for example a multi-byte sequence truncated by
 * end-of-stream), everything allocated so far is released before the
 * exception is propagated: the destructor does not run for an object
 * whose constructor did not complete.
 */
utf8_string_t::utf8_string_t(kaitai::kstream* p__io, kaitai::kstruct* p__parent, utf8_string_t* p__root) : kaitai::kstruct(p__io) {
    (void)p__root;
    m__parent = p__parent;
    m__root = this;
    // Known state so the cleanup path can tell whether _read() got as far
    // as allocating the vector.
    m_codepoints = 0;
    try {
        _read();
    } catch (...) {
        if (m_codepoints) {
            for (std::vector<utf8_codepoint_t*>::iterator it = m_codepoints->begin(); it != m_codepoints->end(); ++it) {
                delete *it;
            }
            delete m_codepoints;
            m_codepoints = 0;
        }
        throw;
    }
}

void utf8_string_t::_read() {
    m_codepoints = new std::vector<utf8_codepoint_t*>();
    {
        int i = 0;
        while (!m__io->is_eof()) {
            m_codepoints->push_back(new utf8_codepoint_t(m__io, this, m__root));
            i++;
        }
    }
}

/**
 * Releases every parsed codepoint (front to back) and then the vector
 * that held them.
 */
utf8_string_t::~utf8_string_t() {
    typedef std::vector<utf8_codepoint_t*> codepoint_list;
    codepoint_list& items = *m_codepoints;
    const codepoint_list::size_type count = items.size();
    for (codepoint_list::size_type idx = 0; idx < count; ++idx) {
        delete items[idx];
    }
    delete m_codepoints;
}

/**
 * Parses a single UTF-8 sequence from the stream.
 *
 * @param p__io     stream positioned at the lead byte of the sequence
 * @param p__parent owning utf8_string_t (may be null)
 * @param p__root   root of the parsed tree (may be null)
 *
 * All value and null-flag members are given a defined state before
 * parsing. Optional bytes that are not present for a short sequence
 * therefore read back as 0 with their null flag set, instead of
 * holding indeterminate values.
 */
utf8_string_t::utf8_codepoint_t::utf8_codepoint_t(kaitai::kstream* p__io, utf8_string_t* p__parent, utf8_string_t* p__root) : kaitai::kstruct(p__io) {
    m__parent = p__parent;
    m__root = p__root;

    // Lazy-instance caches: not computed yet.
    f_raw1 = false;
    f_raw4 = false;
    f_raw3 = false;
    f_value_as_int = false;
    f_raw2 = false;
    f_len = false;

    // Defined defaults for every cached value and optional field.
    m_raw1 = 0;
    m_raw2 = 0;
    m_raw3 = 0;
    m_raw4 = 0;
    n_raw2 = true;
    n_raw3 = true;
    n_raw4 = true;
    m_value_as_int = 0;
    m_len = 0;
    m_byte1 = 0;
    m_byte2 = 0;
    m_byte3 = 0;
    m_byte4 = 0;
    n_byte2 = true;
    n_byte3 = true;
    n_byte4 = true;

    _read();
}

/**
 * Reads the lead byte, then as many further bytes as the sequence
 * length derived from it calls for. Each optional byte's null flag is
 * set according to whether that byte is part of the sequence.
 */
void utf8_string_t::utf8_codepoint_t::_read() {
    m_byte1 = m__io->read_u1();

    // len() depends only on byte1, which is now available.
    const int32_t seq_len = len();

    n_byte2 = (seq_len < 2);
    if (!n_byte2)
        m_byte2 = m__io->read_u1();

    n_byte3 = (seq_len < 3);
    if (!n_byte3)
        m_byte3 = m__io->read_u1();

    n_byte4 = (seq_len < 4);
    if (!n_byte4)
        m_byte4 = m__io->read_u1();
}

/**
 * Nothing to release: byte2..byte4 are plain values stored inside the
 * object, whether or not they were read.
 */
utf8_string_t::utf8_codepoint_t::~utf8_codepoint_t() {
}

/**
 * Payload bits carried by the lead byte, computed once and cached.
 * The mask depends on the sequence length: 127, 31, 15 or 7 for
 * lengths 1 to 4, and 0 for any other length.
 */
int32_t utf8_string_t::utf8_codepoint_t::raw1() {
    if (!f_raw1) {
        int32_t payload_mask;
        switch (len()) {
        case 1:
            payload_mask = 127;
            break;
        case 2:
            payload_mask = 31;
            break;
        case 3:
            payload_mask = 15;
            break;
        case 4:
            payload_mask = 7;
            break;
        default:
            payload_mask = 0;
            break;
        }
        m_raw1 = (byte1() & payload_mask);
        f_raw1 = true;
    }
    return m_raw1;
}

/**
 * Low six bits of the fourth byte, computed once and cached.
 *
 * @return byte4() & 63 when the sequence is at least 4 bytes long;
 *         otherwise 0, with the null flag (see _is_null_raw4()) set.
 */
int32_t utf8_string_t::utf8_codepoint_t::raw4() {
    if (f_raw4)
        return m_raw4;
    n_raw4 = true;
    // Defined value for the absent case, so that callers such as
    // _is_null_raw4() never read an indeterminate m_raw4.
    m_raw4 = 0;
    if (len() >= 4) {
        n_raw4 = false;
        m_raw4 = (byte4() & 63);
    }
    f_raw4 = true;
    return m_raw4;
}

/**
 * Low six bits of the third byte, computed once and cached.
 *
 * @return byte3() & 63 when the sequence is at least 3 bytes long;
 *         otherwise 0, with the null flag (see _is_null_raw3()) set.
 */
int32_t utf8_string_t::utf8_codepoint_t::raw3() {
    if (f_raw3)
        return m_raw3;
    n_raw3 = true;
    // Defined value for the absent case, so that callers such as
    // _is_null_raw3() never read an indeterminate m_raw3.
    m_raw3 = 0;
    if (len() >= 3) {
        n_raw3 = false;
        m_raw3 = (byte3() & 63);
    }
    f_raw3 = true;
    return m_raw3;
}

/**
 * Code point value of this sequence, computed once and cached.
 * The payload of each byte is shifted into place in 6-bit steps,
 * with the lead byte's payload in the most significant position.
 *
 * @return the assembled value for lengths 1 to 4, or -1 for any
 *         other length.
 */
int32_t utf8_string_t::utf8_codepoint_t::value_as_int() {
    if (!f_value_as_int) {
        int32_t decoded;
        switch (len()) {
        case 1:
            decoded = raw1();
            break;
        case 2:
            decoded = ((raw1() << 6) | raw2());
            break;
        case 3:
            decoded = (((raw1() << 12) | (raw2() << 6)) | raw3());
            break;
        case 4:
            decoded = ((((raw1() << 18) | (raw2() << 12)) | (raw3() << 6)) | raw4());
            break;
        default:
            decoded = -1;
            break;
        }
        m_value_as_int = decoded;
        f_value_as_int = true;
    }
    return m_value_as_int;
}

/**
 * Low six bits of the second byte, computed once and cached.
 *
 * @return byte2() & 63 when the sequence is at least 2 bytes long;
 *         otherwise 0, with the null flag (see _is_null_raw2()) set.
 */
int32_t utf8_string_t::utf8_codepoint_t::raw2() {
    if (f_raw2)
        return m_raw2;
    n_raw2 = true;
    // Defined value for the absent case, so that callers such as
    // _is_null_raw2() never read an indeterminate m_raw2.
    m_raw2 = 0;
    if (len() >= 2) {
        n_raw2 = false;
        m_raw2 = (byte2() & 63);
    }
    f_raw2 = true;
    return m_raw2;
}

/**
 * Sequence length in bytes, derived from the high bits of the lead
 * byte and cached after the first call.
 *
 * @return 1 for 0xxxxxxx, 2 for 110xxxxx, 3 for 1110xxxx,
 *         4 for 11110xxx, and -1 for any other lead byte.
 */
int32_t utf8_string_t::utf8_codepoint_t::len() {
    if (!f_len) {
        const int lead = byte1();
        if ((lead & 128) == 0) {
            m_len = 1;
        } else if ((lead & 224) == 192) {
            m_len = 2;
        } else if ((lead & 240) == 224) {
            m_len = 3;
        } else if ((lead & 248) == 240) {
            m_len = 4;
        } else {
            m_len = -1;
        }
        f_len = true;
    }
    return m_len;
}