UTF-8-encoded string: C++98/STL parsing library

UTF-8 is a popular character encoding scheme that allows strings to be represented as a sequence of code points defined in the Unicode standard. Its features are:

  • variable width (i.e. one code point might be represented by 1 to 4 bytes)
  • backward compatibility with ASCII
  • basic validity checking (and thus distinguishing from other legacy 8-bit encodings)
  • maintaining sort order of codepoints if sorted as a byte array

WARNING: For the vast majority of practical purposes of format definitions in Kaitai Struct, you'd likely NOT want to use this and rather just use type: str with encoding: utf-8. That will use native string implementations, which are most likely more efficient and will give you native language strings, rather than an array of individual codepoints. This format definition is provided mostly for educational / research purposes.

File extension

txt

KS implementation details

License: CC0-1.0

References

This page hosts a formal specification of UTF-8-encoded string using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.

Usage

Runtime library

All parsing code for C++98/STL generated by Kaitai Struct depends on the C++/STL runtime library. You have to install it before you can parse data.

For C++, the easiest way is to clone the runtime library sources and build them along with your project.

Code

Using Kaitai Struct in C++/STL usually consists of 3 steps.

  1. We need to create an STL input stream (std::istream). One can open a local file for that, or use an existing std::string or char* buffer.
    #include <fstream>
    
    std::ifstream is("path/to/local/file.txt", std::ifstream::binary);
    
    #include <sstream>
    
    std::istringstream is(str);
    
    #include <sstream>
    
    const char buf[] = { ... };
    std::string str(buf, sizeof buf);
    std::istringstream is(str);
    
  2. We need to wrap our input stream into Kaitai stream:
    #include "kaitai/kaitaistream.h"
    
    kaitai::kstream ks(&is);
    
  3. And finally, we can invoke the parsing:
    utf8_string_t data(&ks);
    

After that, one can get various attributes from the structure by invoking getter methods like:

data.codepoints() // => get codepoints

C++98/STL source code to parse UTF-8-encoded string

utf8_string.h

#ifndef UTF8_STRING_H_
#define UTF8_STRING_H_

// This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild

#include "kaitai/kaitaistruct.h"
#include <stdint.h>
#include <vector>

#if KAITAI_STRUCT_VERSION < 9000L
#error "Incompatible Kaitai Struct C++/STL API: version 0.9 or later is required"
#endif

/**
 * UTF-8 is a popular character encoding scheme that allows strings to
 * be represented as a sequence of code points defined in the Unicode
 * standard. Its features are:
 * 
 * * variable width (i.e. one code point might be represented by 1 to 4
 *   bytes)
 * * backward compatibility with ASCII
 * * basic validity checking (and thus distinguishing from other legacy
 *   8-bit encodings)
 * * maintaining sort order of codepoints if sorted as a byte array
 * 
 * WARNING: For the vast majority of practical purposes of format
 * definitions in Kaitai Struct, you'd likely NOT want to use this and
 * rather just use `type: str` with `encoding: utf-8`. That will use
 * native string implementations, which are most likely more efficient
 * and will give you native language strings, rather than an array of
 * individual codepoints.  This format definition is provided mostly
 * for educational / research purposes.
 */

class utf8_string_t : public kaitai::kstruct {

public:
    class utf8_codepoint_t;

    /**
     * Constructs the object and parses the stream right away.
     *
     * @param p__io stream to parse from
     * @param p__parent parent structure, stored as-is (defaults to 0)
     * @param p__root accepted for signature uniformity; the implementation
     *        always records this object itself as the root
     */
    utf8_string_t(kaitai::kstream* p__io, kaitai::kstruct* p__parent = 0, utf8_string_t* p__root = 0);

private:
    /** Reads codepoints one after another until the stream reports EOF. */
    void _read();
    /** Deletes every parsed codepoint and the vector that holds them. */
    void _clean_up();

public:
    ~utf8_string_t();

    /**
     * One UTF-8 encoded code point located at stream offset ofs().
     *
     * Apart from bytes(), all attributes are computed lazily on first
     * access and cached: for an attribute X, `f_X` records whether the
     * value has been computed, `m_X` holds the cached value and, for
     * conditional attributes, `n_X` records whether the value is absent.
     */
    class utf8_codepoint_t : public kaitai::kstruct {

    public:

        /**
         * Constructs the codepoint and parses it right away.
         *
         * @param p_ofs stream offset of the first byte of this codepoint
         * @param p__io stream to parse from
         * @param p__parent owning utf8_string_t (defaults to 0)
         * @param p__root root of the structure tree (defaults to 0)
         */
        utf8_codepoint_t(uint64_t p_ofs, kaitai::kstream* p__io, utf8_string_t* p__parent = 0, utf8_string_t* p__root = 0);

    private:
        /** Reads len_bytes() bytes from the stream into bytes(). */
        void _read();
        void _clean_up();

    public:
        ~utf8_codepoint_t();

    private:
        bool f_raw1;
        int32_t m_raw1;
        bool n_raw1;

    public:
        /** True when raw1() has no value, i.e. len_bytes() is below 2. */
        bool _is_null_raw1() { raw1(); return n_raw1; };

    private:

    public:
        /**
         * Low 6 bits of the second byte. Only meaningful when
         * _is_null_raw1() is false.
         */
        int32_t raw1();

    private:
        bool f_len_bytes;
        int32_t m_len_bytes;

    public:
        /**
         * Encoded length in bytes (1 to 4) derived from the bit pattern
         * of byte0(), or -1 when byte0() matches none of the patterns.
         */
        int32_t len_bytes();

    private:
        bool f_raw3;
        int32_t m_raw3;
        bool n_raw3;

    public:
        /** True when raw3() has no value, i.e. len_bytes() is below 4. */
        bool _is_null_raw3() { raw3(); return n_raw3; };

    private:

    public:
        /**
         * Low 6 bits of the fourth byte. Only meaningful when
         * _is_null_raw3() is false.
         */
        int32_t raw3();

    private:
        bool f_value_as_int;
        int32_t m_value_as_int;

    public:
        /**
         * Code point value assembled from raw0() .. raw3() according to
         * len_bytes(); -1 when len_bytes() is not in the range 1 to 4.
         */
        int32_t value_as_int();

    private:
        bool f_raw0;
        int32_t m_raw0;

    public:
        /**
         * Payload bits of the first byte: the byte masked with 127, 31,
         * 15 or 7 for a length of 1, 2, 3 or 4 bytes respectively
         * (mask 0 for any other length).
         */
        int32_t raw0();

    private:
        bool f_byte0;
        uint8_t m_byte0;

    public:
        /**
         * Byte found at stream offset ofs(). It is read by seeking to
         * ofs() and then seeking back to the position the stream had
         * before the call.
         */
        uint8_t byte0();

    private:
        bool f_raw2;
        int32_t m_raw2;
        bool n_raw2;

    public:
        /** True when raw2() has no value, i.e. len_bytes() is below 3. */
        bool _is_null_raw2() { raw2(); return n_raw2; };

    private:

    public:
        /**
         * Low 6 bits of the third byte. Only meaningful when
         * _is_null_raw2() is false.
         */
        int32_t raw2();

    private:
        std::string m_bytes;
        uint64_t m_ofs;
        utf8_string_t* m__root;
        utf8_string_t* m__parent;

    public:
        /** Raw encoded bytes of this codepoint (returned as a copy). */
        std::string bytes() const { return m_bytes; }
        /** Stream offset passed to the constructor. */
        uint64_t ofs() const { return m_ofs; }
        utf8_string_t* _root() const { return m__root; }
        utf8_string_t* _parent() const { return m__parent; }
    };

private:
    // Allocated in _read(); both the vector and the codepoints it points
    // to are released in _clean_up().
    std::vector<utf8_codepoint_t*>* m_codepoints;
    utf8_string_t* m__root;
    kaitai::kstruct* m__parent;

public:
    /**
     * Parsed codepoints in stream order. The returned vector and its
     * elements remain owned by this object.
     */
    std::vector<utf8_codepoint_t*>* codepoints() const { return m_codepoints; }
    utf8_string_t* _root() const { return m__root; }
    kaitai::kstruct* _parent() const { return m__parent; }
};

#endif  // UTF8_STRING_H_

utf8_string.cpp

// This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild

#include "utf8_string.h"

#include <stdexcept>

/**
 * Builds the top-level object and parses the whole stream immediately.
 *
 * The root pointer is always this object; p__root is accepted but not used.
 * If parsing throws, everything allocated so far is released before the
 * exception is propagated to the caller.
 */
utf8_string_t::utf8_string_t(kaitai::kstream* p__io, kaitai::kstruct* p__parent, utf8_string_t* p__root)
    : kaitai::kstruct(p__io),
      m_codepoints(0),
      m__root(this),
      m__parent(p__parent) {
    try {
        _read();
    } catch(...) {
        // Release partially built state, then let the caller see the error.
        _clean_up();
        throw;
    }
}

void utf8_string_t::_read() {
    m_codepoints = new std::vector<utf8_codepoint_t*>();
    {
        int i = 0;
        while (!m__io->is_eof()) {
            m_codepoints->push_back(new utf8_codepoint_t(_io()->pos(), m__io, this, m__root));
            i++;
        }
    }
}

/** Releases all parsed codepoints via _clean_up(). */
utf8_string_t::~utf8_string_t() {
    this->_clean_up();
}

void utf8_string_t::_clean_up() {
    if (m_codepoints) {
        for (std::vector<utf8_codepoint_t*>::iterator it = m_codepoints->begin(); it != m_codepoints->end(); ++it) {
            delete *it;
        }
        delete m_codepoints; m_codepoints = 0;
    }
}

/**
 * Builds a codepoint that starts at stream offset p_ofs and parses it
 * immediately.
 *
 * All lazily computed attributes start out "not computed" with zeroed
 * cached values and, for the conditional ones, flagged as null, so that no
 * getter can ever hand out an indeterminate value. If parsing throws, the
 * exception is propagated after _clean_up() has run.
 */
utf8_string_t::utf8_codepoint_t::utf8_codepoint_t(uint64_t p_ofs, kaitai::kstream* p__io, utf8_string_t* p__parent, utf8_string_t* p__root) : kaitai::kstruct(p__io) {
    m__parent = p__parent;
    m__root = p__root;
    m_ofs = p_ofs;

    // "computed" flags of the lazy attributes
    f_raw1 = false;
    f_len_bytes = false;
    f_raw3 = false;
    f_value_as_int = false;
    f_raw0 = false;
    f_byte0 = false;
    f_raw2 = false;

    // cached values: deterministic defaults instead of indeterminate memory
    m_raw0 = 0;
    m_raw1 = 0;
    m_raw2 = 0;
    m_raw3 = 0;
    m_len_bytes = 0;
    m_value_as_int = 0;
    m_byte0 = 0;

    // conditional attributes are null until computed otherwise
    n_raw1 = true;
    n_raw2 = true;
    n_raw3 = true;

    try {
        _read();
    } catch(...) {
        _clean_up();
        throw;
    }
}

/**
 * Reads the encoded bytes of this codepoint from the stream.
 *
 * The number of bytes is len_bytes(). That value is -1 when the lead byte
 * matches none of the recognised bit patterns; such input is rejected here
 * instead of passing a negative length on to the stream.
 *
 * @throws std::runtime_error if the lead byte is not a valid UTF-8 lead byte
 */
void utf8_string_t::utf8_codepoint_t::_read() {
    const int32_t len = len_bytes();
    if (len < 0) {
        throw std::runtime_error("utf8_codepoint: invalid UTF-8 lead byte");
    }
    m_bytes = m__io->read_bytes(len);
}

/** Delegates to _clean_up(). */
utf8_string_t::utf8_codepoint_t::~utf8_codepoint_t() {
    this->_clean_up();
}

/**
 * Nothing to release: every member of a codepoint is a plain value or a
 * non-owning pointer.
 */
void utf8_string_t::utf8_codepoint_t::_clean_up() {
}

int32_t utf8_string_t::utf8_codepoint_t::raw1() {
    if (f_raw1)
        return m_raw1;
    n_raw1 = true;
    if (len_bytes() >= 2) {
        n_raw1 = false;
        m_raw1 = (bytes()[1] & 63);
    }
    f_raw1 = true;
    return m_raw1;
}

/**
 * Encoded length of this codepoint, derived from its lead byte and cached.
 *
 *   0xxxxxxx -> 1,  110xxxxx -> 2,  1110xxxx -> 3,  11110xxx -> 4
 *
 * Any other lead byte yields -1.
 */
int32_t utf8_string_t::utf8_codepoint_t::len_bytes() {
    if (!f_len_bytes) {
        const uint8_t lead = byte0();
        if ((lead & 128) == 0) {
            m_len_bytes = 1;
        } else if ((lead & 224) == 192) {
            m_len_bytes = 2;
        } else if ((lead & 240) == 224) {
            m_len_bytes = 3;
        } else if ((lead & 248) == 240) {
            m_len_bytes = 4;
        } else {
            m_len_bytes = -1;
        }
        f_len_bytes = true;
    }
    return m_len_bytes;
}

int32_t utf8_string_t::utf8_codepoint_t::raw3() {
    if (f_raw3)
        return m_raw3;
    n_raw3 = true;
    if (len_bytes() >= 4) {
        n_raw3 = false;
        m_raw3 = (bytes()[3] & 63);
    }
    f_raw3 = true;
    return m_raw3;
}

/**
 * Code point value assembled from the payload bits of each encoded byte,
 * computed once and cached.
 *
 * The lead byte contributes the most significant bits and every following
 * byte contributes 6 more. Yields -1 when len_bytes() is not 1 to 4.
 */
int32_t utf8_string_t::utf8_codepoint_t::value_as_int() {
    if (!f_value_as_int) {
        switch (len_bytes()) {
        case 1:
            m_value_as_int = raw0();
            break;
        case 2:
            m_value_as_int = (raw0() << 6) | raw1();
            break;
        case 3:
            m_value_as_int = ((raw0() << 12) | (raw1() << 6)) | raw2();
            break;
        case 4:
            m_value_as_int = (((raw0() << 18) | (raw1() << 12)) | (raw2() << 6)) | raw3();
            break;
        default:
            m_value_as_int = -1;
            break;
        }
        f_value_as_int = true;
    }
    return m_value_as_int;
}

/**
 * Payload bits of the first encoded byte, computed once and cached.
 *
 * The mask depends on the encoded length: 127, 31, 15 or 7 for 1, 2, 3 or
 * 4 bytes, and 0 for any other length.
 */
int32_t utf8_string_t::utf8_codepoint_t::raw0() {
    if (!f_raw0) {
        int32_t payload_mask;
        switch (len_bytes()) {
        case 1:
            payload_mask = 127;
            break;
        case 2:
            payload_mask = 31;
            break;
        case 3:
            payload_mask = 15;
            break;
        case 4:
            payload_mask = 7;
            break;
        default:
            payload_mask = 0;
            break;
        }
        m_raw0 = (bytes()[0] & payload_mask);
        f_raw0 = true;
    }
    return m_raw0;
}

/**
 * Byte located at stream offset ofs(), read once and cached.
 *
 * The stream is moved to ofs() for the read and afterwards put back to the
 * position it had on entry.
 */
uint8_t utf8_string_t::utf8_codepoint_t::byte0() {
    if (!f_byte0) {
        std::streampos saved_pos = m__io->pos();
        m__io->seek(ofs());
        m_byte0 = m__io->read_u1();
        m__io->seek(saved_pos);
        f_byte0 = true;
    }
    return m_byte0;
}

int32_t utf8_string_t::utf8_codepoint_t::raw2() {
    if (f_raw2)
        return m_raw2;
    n_raw2 = true;
    if (len_bytes() >= 3) {
        n_raw2 = false;
        m_raw2 = (bytes()[2] & 63);
    }
    f_raw2 = true;
    return m_raw2;
}