UTF-16 string with BOM: PHP parsing library

A simple wrapper for reading a UTF-16 encoded string that starts with a byte order mark (BOM). The BOM indicates the endianness of the UTF-16 encoding, which can be either big-endian (BE) or little-endian (LE).

Use:

  • value to get the string value with BOM stripped, regardless of endianness.
  • is_be and is_le to check the endianness indicated by the BOM.
  • bom to check the raw byte order mark.

KS implementation details

License: CC0-1.0
Minimal Kaitai Struct required: 0.9

References

This page hosts a formal specification of UTF-16 string with BOM using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.

PHP source code to parse UTF-16 string with BOM

Utf16WithBom.php

<?php
// This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild

/**
 * A simple wrapper for reading a UTF-16 encoded string that starts
 * with a byte order mark (BOM). The BOM indicates the endianness of the UTF-16
 * encoding, which can be either big-endian (BE) or little-endian (LE).
 * 
 * Use:
 * 
 * * `value` to get the string value with BOM stripped, regardless of endianness.
 * * `is_be` and `is_le` to check the endianness indicated by the BOM.
 * * `bom` to check the raw byte order mark.
 */

namespace {
    /**
     * Parser for a UTF-16 string that is prefixed with a two-byte byte order
     * mark (BOM).
     *
     * Parsing happens in the constructor: the BOM is read and validated, then
     * the bytes returned by `readBytesFull()` are decoded as UTF-16BE or
     * UTF-16LE depending on which BOM was found.
     */
    class Utf16WithBom extends \Kaitai\Struct\Struct {
        /**
         * Creates the object and immediately parses it from the given stream.
         *
         * @param \Kaitai\Struct\Stream      $_io     Stream to parse from.
         * @param \Kaitai\Struct\Struct|null $_parent Parent structure, if any.
         * @param \Utf16WithBom|null         $_root   Root structure; when null,
         *                                            this object is used as root.
         *
         * @throws \Kaitai\Struct\Error\ValidationNotAnyOfError When the first
         *         two bytes are neither `FE FF` nor `FF FE`.
         */
        public function __construct(\Kaitai\Struct\Stream $_io, ?\Kaitai\Struct\Struct $_parent = null, ?\Utf16WithBom $_root = null) {
            parent::__construct($_io, $_parent, $_root === null ? $this : $_root);
            $this->_read();
        }

        /**
         * Reads the BOM, validates it, and decodes the string body.
         *
         * Only one of `_m_strBe` / `_m_strLe` is populated; the other stays
         * null.
         *
         * @throws \Kaitai\Struct\Error\ValidationNotAnyOfError When the BOM is
         *         not one of the two accepted byte sequences.
         */
        private function _read() {
            $this->_m_bom = $this->_io->readBytes(2);

            // Raw bytes are compared with strict operators so the check never
            // depends on PHP's loose-comparison type juggling.
            if ($this->_m_bom !== "\xFE\xFF" && $this->_m_bom !== "\xFF\xFE") {
                throw new \Kaitai\Struct\Error\ValidationNotAnyOfError($this->_m_bom, $this->_io, "/seq/0");
            }

            if ($this->isBe()) {
                $this->_m_strBe = \Kaitai\Struct\Stream::bytesToStr($this->_io->readBytesFull(), "UTF-16BE");
            }
            if ($this->isLe()) {
                $this->_m_strLe = \Kaitai\Struct\Stream::bytesToStr($this->_io->readBytesFull(), "UTF-16LE");
            }
        }

        /** @var bool|null Memoised result of isBe(); null until first computed. */
        protected $_m_isBe;

        /**
         * True if the byte order mark indicates big-endian UTF-16 encoding.
         *
         * The result is computed once and cached.
         *
         * @return bool
         */
        public function isBe() {
            if ($this->_m_isBe === null) {
                $this->_m_isBe = $this->bom() === "\xFE\xFF";
            }
            return $this->_m_isBe;
        }

        /** @var bool|null Memoised result of isLe(); null until first computed. */
        protected $_m_isLe;

        /**
         * True if the byte order mark indicates little-endian UTF-16 encoding.
         *
         * The result is computed once and cached.
         *
         * @return bool
         */
        public function isLe() {
            if ($this->_m_isLe === null) {
                $this->_m_isLe = $this->bom() === "\xFF\xFE";
            }
            return $this->_m_isLe;
        }

        /** @var mixed Memoised result of value(); null until first computed. */
        protected $_m_value;

        /**
         * The string value with BOM stripped, regardless of endianness.
         *
         * Returns whichever of strBe() / strLe() matches the BOM; the result
         * is cached after the first call.
         *
         * @return mixed The value produced by Stream::bytesToStr() in _read().
         */
        public function value() {
            if ($this->_m_value === null) {
                $this->_m_value = $this->isBe() ? $this->strBe() : $this->strLe();
            }
            return $this->_m_value;
        }

        /** @var string|null Raw two-byte BOM as read from the stream. */
        protected $_m_bom;
        /** @var mixed Body decoded as UTF-16BE; set only when the BOM is big-endian. */
        protected $_m_strBe;
        /** @var mixed Body decoded as UTF-16LE; set only when the BOM is little-endian. */
        protected $_m_strLe;

        /**
         * The byte order mark (BOM) is a special marker at the beginning of the
         * string that indicates the endianness of the UTF-16 encoding. The
         * character U+FEFF is used as the BOM, and its byte representation differs
         * based on endianness:
         * 
         * * For big-endian (BE) UTF-16, it's `[0xFE, 0xFF]`
         * * For little-endian (LE) UTF-16, it's `[0xFF, 0xFE]`
         * 
         * This implementation checks for the presence of a valid BOM and strips it
         * from the resulting string value.
         */
        public function bom() { return $this->_m_bom; }

        /**
         * The body decoded as UTF-16BE, or null when the BOM is little-endian.
         */
        public function strBe() { return $this->_m_strBe; }

        /**
         * The body decoded as UTF-16LE, or null when the BOM is big-endian.
         */
        public function strLe() { return $this->_m_strLe; }
    }
}