UTF-16 string with BOM: C++98/STL parsing library

A simple wrapper which allows reading a UTF-16 encoded string that starts with a byte order mark (BOM). The BOM indicates the endianness of the UTF-16 encoding, which can be either big-endian (BE) or little-endian (LE).

Use:

  • value to get the string value with BOM stripped, regardless of endianness.
  • is_be and is_le to check the endianness indicated by the BOM.
  • bom to check the raw byte order mark.

KS implementation details

License: CC0-1.0
Minimal Kaitai Struct required: 0.9

References

This page hosts a formal specification of UTF-16 string with BOM using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.

Usage

Runtime library

All C++98/STL code generated by Kaitai Struct depends on the Kaitai Struct runtime library for C++/STL. You must add this dependency to your project before you can parse or serialize any data.

For C++, the easiest way is to clone the runtime library sources and build them along with your project.

Code

Using Kaitai Struct in C++/STL usually consists of 3 steps.

  1. We need to create an STL input stream (std::istream). One can open a local file for that, or use an existing std::string or char* buffer.
    #include <fstream>
    
    std::ifstream is("path/to/local/file.bin", std::ifstream::binary);
    
    #include <sstream>
    
    std::istringstream is(str);
    
    #include <sstream>
    
    const char buf[] = { ... };
    std::string str(buf, sizeof buf);
    std::istringstream is(str);
    
  2. We need to wrap our input stream into a Kaitai stream:
    #include "kaitai/kaitaistream.h"
    
    kaitai::kstream ks(&is);
    
  3. And finally, we can invoke the parsing:
    utf16_with_bom_t data(&ks);
    

After that, one can get various attributes from the structure by invoking getter methods like:

data.bom() // => The byte order mark (BOM) is a special marker at the beginning of the
string that indicates the endianness of the UTF-16 encoding. The
character U+FEFF is used as the BOM, and its byte representation differs
based on endianness:

* For big-endian (BE) UTF-16, it's `[0xFE, 0xFF]`
* For little-endian (LE) UTF-16, it's `[0xFF, 0xFE]`

This implementation checks for the presence of a valid BOM and strips it
from the resulting string value.

data.is_be() // => True if the byte order mark indicates big-endian UTF-16 encoding.

C++98/STL source code to parse UTF-16 string with BOM

utf16_with_bom.h

#ifndef UTF16_WITH_BOM_H_
#define UTF16_WITH_BOM_H_

// This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild

class utf16_with_bom_t;

#include "kaitai/kaitaistruct.h"
#include <stdint.h>

#if KAITAI_STRUCT_VERSION < 11000L
#error "Incompatible Kaitai Struct C++/STL API: version 0.11 or later is required"
#endif

/**
 * A simple wrapper which allows reading a UTF-16 encoded string that starts
 * with a byte order mark (BOM). The BOM indicates the endianness of the UTF-16
 * encoding, which can be either big-endian (BE) or little-endian (LE).
 * 
 * Use:
 * 
 * * `value` to get the string value with BOM stripped, regardless of endianness.
 * * `is_be` and `is_le` to check the endianness indicated by the BOM.
 * * `bom` to check the raw byte order mark.
 * \sa - https://en.wikipedia.org/wiki/Byte_order_mark
 */

class utf16_with_bom_t : public kaitai::kstruct {

public:

    /**
     * Stores the parent/root links and parses the structure from `p__io`
     * right away (the constructor calls `_read()`).
     * \param p__io stream to read from
     * \param p__parent parent structure (defaults to 0)
     * \param p__root root structure; when 0, this object is used as its own root
     */
    utf16_with_bom_t(kaitai::kstream* p__io, kaitai::kstruct* p__parent = 0, utf16_with_bom_t* p__root = 0);

private:
    // Reads and validates the BOM, then reads whichever of str_be / str_le applies.
    void _read();
    // Called from the destructor and from the constructor when _read() throws.
    void _clean_up();

public:
    ~utf16_with_bom_t();

private:
    // f_is_be: set once is_be() has been evaluated; m_is_be: the cached result.
    bool f_is_be;
    bool m_is_be;

public:

    /**
     * True if the byte order mark indicates big-endian UTF-16 encoding.
     */
    bool is_be();

private:
    // f_is_le: set once is_le() has been evaluated; m_is_le: the cached result.
    bool f_is_le;
    bool m_is_le;

public:

    /**
     * True if the byte order mark indicates little-endian UTF-16 encoding.
     */
    bool is_le();

private:
    // f_value: set once value() has been evaluated; m_value: the cached result.
    bool f_value;
    std::string m_value;

public:

    /**
     * The string value with BOM stripped, regardless of endianness.
     */
    std::string value();

private:
    // Raw two-byte BOM as read from the stream.
    std::string m_bom;
    // Result of converting the bytes after the BOM with the "UTF-16BE" encoding;
    // only assigned when is_be() is true.
    std::string m_str_be;
    // True when str_be was not read (is_be() was false during _read()).
    bool n_str_be;

public:
    /**
     * True if `str_be` was not read, i.e. the BOM did not indicate big-endian.
     */
    bool _is_null_str_be() { str_be(); return n_str_be; };

private:
    // Result of converting the bytes after the BOM with the "UTF-16LE" encoding;
    // only assigned when is_le() is true.
    std::string m_str_le;
    // True when str_le was not read (is_le() was false during _read()).
    bool n_str_le;

public:
    /**
     * True if `str_le` was not read, i.e. the BOM did not indicate little-endian.
     */
    bool _is_null_str_le() { str_le(); return n_str_le; };

private:
    // Root of the structure tree this object belongs to (this object itself
    // when no root was passed to the constructor).
    utf16_with_bom_t* m__root;
    // Parent structure as passed to the constructor.
    kaitai::kstruct* m__parent;

public:

    /**
     * The byte order mark (BOM) is a special marker at the beginning of the
     * string that indicates the endianness of the UTF-16 encoding. The
     * character U+FEFF is used as the BOM, and its byte representation differs
     * based on endianness:
     * 
     * * For big-endian (BE) UTF-16, it's `[0xFE, 0xFF]`
     * * For little-endian (LE) UTF-16, it's `[0xFF, 0xFE]`
     * 
     * This implementation checks for the presence of a valid BOM and strips it
     * from the resulting string value.
     */
    std::string bom() const { return m_bom; }

    /**
     * String body read with the "UTF-16BE" encoding. Left empty when the BOM
     * is not big-endian; check `_is_null_str_be()` to tell the cases apart.
     */
    std::string str_be() const { return m_str_be; }

    /**
     * String body read with the "UTF-16LE" encoding. Left empty when the BOM
     * is not little-endian; check `_is_null_str_le()` to tell the cases apart.
     */
    std::string str_le() const { return m_str_le; }

    /** Root of the structure tree. */
    utf16_with_bom_t* _root() const { return m__root; }

    /** Parent structure, as passed to the constructor. */
    kaitai::kstruct* _parent() const { return m__parent; }
};

#endif  // UTF16_WITH_BOM_H_

utf16_with_bom.cpp

// This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild

#include "utf16_with_bom.h"
#include "kaitai/exceptions.h"

/**
 * Builds the object and parses it from `p__io` immediately.
 *
 * \param p__io stream to read from
 * \param p__parent parent structure, stored as-is
 * \param p__root root structure; when null, this object becomes its own root
 *
 * Any exception raised by `_read()` is rethrown after `_clean_up()` has run.
 */
utf16_with_bom_t::utf16_with_bom_t(kaitai::kstream* p__io, kaitai::kstruct* p__parent, utf16_with_bom_t* p__root) : kaitai::kstruct(p__io) {
    m__parent = p__parent;
    m__root = p__root ? p__root : this;

    // Lazy-instance caches start out "not computed yet".
    f_is_be = false;
    f_is_le = false;
    f_value = false;

    // _read() can throw before it assigns the null flags (e.g. when the BOM
    // validation fails), and _clean_up() inspects them afterwards. Give them a
    // defined "not read" value up front so they are never read uninitialised.
    n_str_be = true;
    n_str_le = true;

    try {
        _read();
    } catch(...) {
        _clean_up();
        throw;
    }
}

/**
 * Parses the structure from the stream: a two-byte BOM, followed by the
 * string body in whichever byte order the BOM announces.
 *
 * Throws kaitai::validation_not_any_of_error when the first two bytes are
 * neither FE FF nor FF FE.
 */
void utf16_with_bom_t::_read() {
    m_bom = m__io->read_bytes(2);
    const bool bom_is_known =
        (m_bom == std::string("\xFE\xFF", 2)) || (m_bom == std::string("\xFF\xFE", 2));
    if (!bom_is_known) {
        throw kaitai::validation_not_any_of_error<std::string>(m_bom, m__io, std::string("/seq/0"));
    }

    // Big-endian body: only read when the BOM says so; otherwise flagged as null.
    n_str_be = true;
    const bool big_endian = is_be();
    if (big_endian) {
        n_str_be = false;
        const std::string raw_be = m__io->read_bytes_full();
        m_str_be = kaitai::kstream::bytes_to_str(raw_be, "UTF-16BE");
    }

    // Little-endian body: same handling, mirrored.
    n_str_le = true;
    const bool little_endian = is_le();
    if (little_endian) {
        n_str_le = false;
        const std::string raw_le = m__io->read_bytes_full();
        m_str_le = kaitai::kstream::bytes_to_str(raw_le, "UTF-16LE");
    }
}

// Destructor: all teardown is delegated to _clean_up().
utf16_with_bom_t::~utf16_with_bom_t()
{
    this->_clean_up();
}

/**
 * Teardown hook, invoked by the destructor and by the constructor when
 * `_read()` throws.
 *
 * The conditional fields `str_be` and `str_le` are `std::string` members held
 * by value, so there is nothing to release here. The previous empty
 * `if (!n_str_be)` / `if (!n_str_le)` branches did no work, yet they read
 * flags that `_read()` may not have assigned when it throws early; they are
 * dropped so this function never touches possibly-unset state.
 */
void utf16_with_bom_t::_clean_up() {
    // Intentionally empty: no manually managed resources.
}

/**
 * Reports whether the BOM equals FE FF (big-endian). The comparison is done
 * on the first call and the result is cached for later calls.
 */
bool utf16_with_bom_t::is_be() {
    if (!f_is_be) {
        // Mark as evaluated first, then compute, matching the cache protocol
        // used by the other lazy accessors.
        f_is_be = true;
        m_is_be = (bom() == std::string("\xFE\xFF", 2));
    }
    return m_is_be;
}

/**
 * Reports whether the BOM equals FF FE (little-endian). The comparison is
 * done on the first call and the result is cached for later calls.
 */
bool utf16_with_bom_t::is_le() {
    if (!f_is_le) {
        // Mark as evaluated first, then compute, matching the cache protocol
        // used by the other lazy accessors.
        f_is_le = true;
        m_is_le = (bom() == std::string("\xFF\xFE", 2));
    }
    return m_is_le;
}

/**
 * Returns the string body without the BOM: `str_be()` when the BOM is
 * big-endian, `str_le()` otherwise. The choice is made on the first call and
 * the result is cached.
 */
std::string utf16_with_bom_t::value() {
    if (!f_value) {
        f_value = true;
        if (is_be()) {
            m_value = str_be();
        } else {
            m_value = str_le();
        }
    }
    return m_value;
}