UTF-16 string with BOM: Go parsing library

A simple wrapper that reads a UTF-16 encoded string starting with a byte order mark (BOM). The BOM indicates the endianness of the UTF-16 encoding, which can be either big-endian (BE) or little-endian (LE).

Use:

  • value to get the string value with BOM stripped, regardless of endianness.
  • is_be and is_le to check the endianness indicated by the BOM.
  • bom to check the raw byte order mark.

KS implementation details

License: CC0-1.0
Minimal Kaitai Struct required: 0.9

References

This page hosts a formal specification of UTF-16 string with BOM using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.

Go source code to parse UTF-16 string with BOM

utf16_with_bom.go

// Code generated by kaitai-struct-compiler from a .ksy source file. DO NOT EDIT.

import (
	"github.com/kaitai-io/kaitai_struct_go_runtime/kaitai"
	"bytes"
	"golang.org/x/text/encoding/unicode"
)


// Utf16WithBom is a simple wrapper for reading a UTF-16 encoded string that
// starts with a byte order mark (BOM). The BOM indicates the endianness of
// the UTF-16 encoding, which can be either big-endian (BE) or little-endian
// (LE).
//
// Use:
//
//   - Value to get the string value with BOM stripped, regardless of endianness.
//   - IsBe and IsLe to check the endianness indicated by the BOM.
//   - Bom to check the raw byte order mark.
//
// See https://en.wikipedia.org/wiki/Byte_order_mark
type Utf16WithBom struct {
	// Bom holds the two bytes Read takes from the start of the stream. Read
	// rejects anything other than FE FF or FF FE.
	Bom []byte
	// StrBe is the rest of the stream decoded as big-endian UTF-16. Read
	// assigns it only when IsBe reports true.
	StrBe string
	// StrLe is the rest of the stream decoded as little-endian UTF-16. Read
	// assigns it only when IsLe reports true.
	StrLe string
	// _io, _root and _parent are the stream, root object and parent object
	// passed to Read.
	_io *kaitai.Stream
	_root *Utf16WithBom
	_parent kaitai.Struct
	// _f_isBe records that isBe has been computed; isBe caches the result
	// returned by IsBe.
	_f_isBe bool
	isBe bool
	// _f_isLe records that isLe has been computed; isLe caches the result
	// returned by IsLe.
	_f_isLe bool
	isLe bool
	// _f_value records that value has been computed; value caches the result
	// returned by Value.
	_f_value bool
	value string
}
// NewUtf16WithBom allocates a zero-valued Utf16WithBom. Nothing is parsed
// here; call Read on the result to populate it from a stream.
func NewUtf16WithBom() *Utf16WithBom {
	return new(Utf16WithBom)
}

// IO_ returns the stream stored on this object by Read. It uses a value
// receiver, so it can be called on both Utf16WithBom and *Utf16WithBom.
func (this Utf16WithBom) IO_() *kaitai.Stream {
	stream := this._io
	return stream
}

// Read parses a BOM-prefixed UTF-16 string from io into this object.
//
// It records io, parent and root, reads the two-byte BOM into Bom, and then
// decodes the remainder of the stream into StrBe (BOM FE FF) or StrLe
// (BOM FF FE). Anything memoized or decoded by an earlier Read on the same
// object is cleared first, so IsBe, IsLe and Value always describe the
// stream being parsed rather than a previous one.
//
// It returns the stream's read error, the validation error built by
// kaitai.NewValidationNotAnyOfError when the first two bytes are not a
// UTF-16 BOM, or the error from kaitai.BytesToStr when the payload cannot
// be converted.
func (this *Utf16WithBom) Read(io *kaitai.Stream, parent kaitai.Struct, root *Utf16WithBom) (err error) {
	this._io = io
	this._parent = parent
	this._root = root

	// IsBe/IsLe/Value cache their first answer. Without this reset a second
	// Read would keep using the endianness of the first stream.
	this._f_isBe, this._f_isLe, this._f_value = false, false, false
	this.StrBe, this.StrLe = "", ""

	bom, err := this._io.ReadBytes(2)
	if err != nil {
		return err
	}
	this.Bom = bom
	if !bytes.Equal(this.Bom, []byte{0xFE, 0xFF}) && !bytes.Equal(this.Bom, []byte{0xFF, 0xFE}) {
		return kaitai.NewValidationNotAnyOfError(this.Bom, this._io, "/seq/0")
	}

	bigEndian, err := this.IsBe()
	if err != nil {
		return err
	}
	if bigEndian {
		raw, err := this._io.ReadBytesFull()
		if err != nil {
			return err
		}
		// The BOM has already been consumed above, so the decoder is told
		// not to look for one.
		decoded, err := kaitai.BytesToStr(raw, unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder())
		if err != nil {
			return err
		}
		this.StrBe = decoded
	}

	littleEndian, err := this.IsLe()
	if err != nil {
		return err
	}
	if littleEndian {
		raw, err := this._io.ReadBytesFull()
		if err != nil {
			return err
		}
		decoded, err := kaitai.BytesToStr(raw, unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewDecoder())
		if err != nil {
			return err
		}
		this.StrLe = decoded
	}
	return nil
}

// IsBe reports whether the byte order mark indicates big-endian UTF-16
// encoding, i.e. whether Bom is FE FF. The answer is computed on the first
// call and cached; the returned error is always nil.
func (this *Utf16WithBom) IsBe() (v bool, err error) {
	if !this._f_isBe {
		this.isBe = bytes.Equal(this.Bom, []byte{0xFE, 0xFF})
		this._f_isBe = true
	}
	return this.isBe, nil
}

// IsLe reports whether the byte order mark indicates little-endian UTF-16
// encoding, i.e. whether Bom is FF FE. The answer is computed on the first
// call and cached; the returned error is always nil.
func (this *Utf16WithBom) IsLe() (v bool, err error) {
	if !this._f_isLe {
		this.isLe = bytes.Equal(this.Bom, []byte{0xFF, 0xFE})
		this._f_isLe = true
	}
	return this.isLe, nil
}

// Value returns the string value with the BOM stripped, regardless of
// endianness: StrBe when IsBe reports true, StrLe otherwise. The result is
// cached after the first call. An error from IsBe is passed through with an
// empty string.
func (this *Utf16WithBom) Value() (v string, err error) {
	if this._f_value {
		return this.value, nil
	}
	// The flag is raised before IsBe is consulted, matching the order the
	// generated code has always used.
	this._f_value = true
	bigEndian, err := this.IsBe()
	if err != nil {
		return "", err
	}
	decoded := this.StrLe
	if bigEndian {
		decoded = this.StrBe
	}
	this.value = decoded
	return this.value, nil
}

/**
 * Documentation for the `bom` attribute (the `Bom` field of `Utf16WithBom`):
 *
 * The byte order mark (BOM) is a special marker at the beginning of the
 * string that indicates the endianness of the UTF-16 encoding. The
 * character U+FEFF is used as the BOM, and its byte representation differs
 * based on endianness:
 *
 * * For big-endian (BE) UTF-16, it's `[0xFE, 0xFF]`
 * * For little-endian (LE) UTF-16, it's `[0xFF, 0xFE]`
 *
 * This implementation checks for the presence of a valid BOM and strips it
 * from the resulting string value.
 */