A simple wrapper which allows reading a UTF-16 encoded string that starts with a byte order mark (BOM). The BOM indicates the endianness of the UTF-16 encoding, which can be either big-endian (BE) or little-endian (LE).
Use:
- `value` to get the string value with BOM stripped, regardless of endianness.
- `is_be` and `is_le` to check the endianness indicated by the BOM.
- `bom` to check the raw byte order mark.

This page hosts a formal specification of a UTF-16 string with BOM using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.
All C++11/STL code generated by Kaitai Struct depends on the Kaitai Struct runtime library for C++/STL. You must add this dependency to your project before you can parse or serialize any data.
For C++, the easiest way is to clone the runtime library sources and build them along with your project.
Using Kaitai Struct in C++/STL usually consists of 3 steps.
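First, create an STL input stream (`std::istream`). One can open a local file for that, or use an existing `std::string` or `char*` buffer. Then wrap that stream into a Kaitai stream, and finally parse the data by constructing the generated class: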
// Step 1, option A: create the input stream from a local file opened in binary mode.
#include <fstream>
std::ifstream is("path/to/local/file.bin", std::ifstream::binary);
// Step 1, option B: create the input stream from an existing std::string named `str`.
#include <sstream>
std::istringstream is(str);
// Step 1, option C: create the input stream from a raw char buffer by first
// copying the buffer (with its explicit size) into a std::string.
#include <sstream>
const char buf[] = { ... };
std::string str(buf, sizeof buf);
std::istringstream is(str);
// Step 2: wrap the STL input stream into a Kaitai stream.
#include "kaitai/kaitaistream.h"
kaitai::kstream ks(&is);
// Step 3: construct the generated class on top of the Kaitai stream.
utf16_with_bom_t data(&ks);
After that, one can get various attributes from the structure by invoking getter methods like:
data.bom() // => The byte order mark (BOM) is a special marker at the beginning of the
string that indicates the endianness of the UTF-16 encoding. The
character U+FEFF is used as the BOM, and its byte representation differs
based on endianness:
* For big-endian (BE) UTF-16, it's `[0xFE, 0xFF]`
* For little-endian (LE) UTF-16, it's `[0xFF, 0xFE]`
This implementation checks for the presence of a valid BOM and strips it
from the resulting string value.
data.is_be() // => True if the byte order mark indicates big-endian UTF-16 encoding.
#pragma once
// This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild
class utf16_with_bom_t;
#include "kaitai/kaitaistruct.h"
#include <stdint.h>
#include <memory>
#if KAITAI_STRUCT_VERSION < 11000L
#error "Incompatible Kaitai Struct C++/STL API: version 0.11 or later is required"
#endif
/**
 * A simple wrapper which allows reading a UTF-16 encoded string that starts
 * with a byte order mark (BOM). The BOM indicates the endianness of the UTF-16
 * encoding, which can be either big-endian (BE) or little-endian (LE).
 *
 * Use:
 *
 * * `value` to get the string value with BOM stripped, regardless of endianness.
 * * `is_be` and `is_le` to check the endianness indicated by the BOM.
 * * `bom` to check the raw byte order mark.
 * \sa - https://en.wikipedia.org/wiki/Byte_order_mark
 */
class utf16_with_bom_t : public kaitai::kstruct {

public:
    /**
     * Stores the parent/root links and parses the structure right away
     * (the constructor calls `_read()`).
     *
     * \param p__io     stream to parse from
     * \param p__parent parent structure, may be null
     * \param p__root   root structure; when null, this object is its own root
     */
    utf16_with_bom_t(kaitai::kstream* p__io, kaitai::kstruct* p__parent = nullptr, utf16_with_bom_t* p__root = nullptr);

private:
    // Reads the 2-byte BOM, validates it and reads the rest of the stream as
    // the string body in the encoding the BOM selects.
    void _read();
    // Called from the destructor.
    void _clean_up();

public:
    ~utf16_with_bom_t();

private:
    // f_* flags record whether the corresponding lazily computed m_* value
    // has already been calculated; the constructor resets them to false.
    bool f_is_be;
    bool m_is_be = false;

public:

    /**
     * True if the byte order mark indicates big-endian UTF-16 encoding.
     */
    bool is_be();

private:
    bool f_is_le;
    bool m_is_le = false;

public:

    /**
     * True if the byte order mark indicates little-endian UTF-16 encoding.
     */
    bool is_le();

private:
    bool f_value;
    std::string m_value;

public:

    /**
     * The string value with BOM stripped, regardless of endianness.
     */
    std::string value();

private:
    std::string m_bom;
    std::string m_str_be;
    // n_* flags are true while the matching conditional field is absent
    // ("null"); `_read()` clears the one whose branch was actually parsed.
    bool n_str_be = true;

public:
    /**
     * True if `str_be` was not parsed (the BOM did not indicate big-endian).
     * Returns the flag directly; no string copy is made.
     */
    bool _is_null_str_be() const { return n_str_be; }

private:
    std::string m_str_le;
    bool n_str_le = true;

public:
    /**
     * True if `str_le` was not parsed (the BOM did not indicate little-endian).
     * Returns the flag directly; no string copy is made.
     */
    bool _is_null_str_le() const { return n_str_le; }

private:
    utf16_with_bom_t* m__root;
    kaitai::kstruct* m__parent;

public:

    /**
     * The byte order mark (BOM) is a special marker at the beginning of the
     * string that indicates the endianness of the UTF-16 encoding. The
     * character U+FEFF is used as the BOM, and its byte representation differs
     * based on endianness:
     *
     * * For big-endian (BE) UTF-16, it's `[0xFE, 0xFF]`
     * * For little-endian (LE) UTF-16, it's `[0xFF, 0xFE]`
     *
     * This implementation checks for the presence of a valid BOM and strips it
     * from the resulting string value.
     */
    std::string bom() const { return m_bom; }

    /** String body decoded as UTF-16BE; only parsed when `is_be()` is true. */
    std::string str_be() const { return m_str_be; }

    /** String body decoded as UTF-16LE; only parsed when `is_le()` is true. */
    std::string str_le() const { return m_str_le; }

    /** Root structure of the parsed tree (this object if none was supplied). */
    utf16_with_bom_t* _root() const { return m__root; }

    /** Parent structure as passed to the constructor (may be null). */
    kaitai::kstruct* _parent() const { return m__parent; }
};
// This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild
#include "utf16_with_bom.h"
#include "kaitai/exceptions.h"
/**
 * Builds the structure on top of `p__io` and parses it immediately.
 *
 * The lazy-instance flags start out cleared, the root falls back to this
 * object when no explicit root is supplied, and `_read()` is invoked once
 * all links are in place.
 */
utf16_with_bom_t::utf16_with_bom_t(kaitai::kstream* p__io, kaitai::kstruct* p__parent, utf16_with_bom_t* p__root)
    : kaitai::kstruct(p__io),
      f_is_be(false),
      f_is_le(false),
      f_value(false),
      m__root(p__root ? p__root : this),
      m__parent(p__parent) {
    _read();
}
/**
 * Parses the structure from the underlying stream.
 *
 * Reads the 2-byte BOM and rejects anything other than FE FF or FF FE with
 * kaitai::validation_not_any_of_error. The remainder of the stream is then
 * decoded as UTF-16BE or UTF-16LE depending on the BOM; the field belonging
 * to the other endianness stays flagged as null.
 */
void utf16_with_bom_t::_read() {
    m_bom = m__io->read_bytes(2);

    const std::string big_endian_mark("\xFE\xFF", 2);
    const std::string little_endian_mark("\xFF\xFE", 2);
    if (m_bom != big_endian_mark && m_bom != little_endian_mark) {
        throw kaitai::validation_not_any_of_error<std::string>(m_bom, m__io, std::string("/seq/0"));
    }

    // Big-endian body: present only when the BOM says so.
    const bool has_be_body = is_be();
    n_str_be = !has_be_body;
    if (has_be_body) {
        m_str_be = kaitai::kstream::bytes_to_str(m__io->read_bytes_full(), "UTF-16BE");
    }

    // Little-endian body: present only when the BOM says so.
    const bool has_le_body = is_le();
    n_str_le = !has_le_body;
    if (has_le_body) {
        m_str_le = kaitai::kstream::bytes_to_str(m__io->read_bytes_full(), "UTF-16LE");
    }
}
// Destructor: all teardown work is delegated to _clean_up().
utf16_with_bom_t::~utf16_with_bom_t() {
    this->_clean_up();
}
// Teardown hook invoked by the destructor. Neither conditional field
// (str_be / str_le) needs manual release here, so there is nothing to do
// regardless of which of them was parsed.
void utf16_with_bom_t::_clean_up() {
}
// Lazily computed: true when the BOM bytes are FE FF (big-endian).
// The result is cached in m_is_be after the first call.
bool utf16_with_bom_t::is_be() {
    if (!f_is_be) {
        f_is_be = true;
        const std::string big_endian_mark("\xFE\xFF", 2);
        m_is_be = (bom() == big_endian_mark);
    }
    return m_is_be;
}
// Lazily computed: true when the BOM bytes are FF FE (little-endian).
// The result is cached in m_is_le after the first call.
bool utf16_with_bom_t::is_le() {
    if (!f_is_le) {
        f_is_le = true;
        const std::string little_endian_mark("\xFF\xFE", 2);
        m_is_le = (bom() == little_endian_mark);
    }
    return m_is_le;
}
// Lazily computed: the decoded string without its BOM. Picks the big-endian
// body when is_be() holds and the little-endian body otherwise; the result
// is cached in m_value after the first call.
std::string utf16_with_bom_t::value() {
    if (!f_value) {
        f_value = true;
        if (is_be()) {
            m_value = str_be();
        } else {
            m_value = str_le();
        }
    }
    return m_value;
}