UTF-16 string with BOM: Rust parsing library

A simple wrapper that allows reading a UTF-16 encoded string that starts with a byte order mark (BOM). The BOM indicates the endianness of the UTF-16 encoding, which can be either big-endian (BE) or little-endian (LE).

Use:

  • value to get the string value with BOM stripped, regardless of endianness.
  • is_be and is_le to check the endianness indicated by the BOM.
  • bom to check the raw byte order mark.

KS implementation details

License: CC0-1.0
Minimal Kaitai Struct required: 0.9

References

This page hosts a formal specification of UTF-16 string with BOM using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.

Rust source code to parse UTF-16 string with BOM

utf16_with_bom.rs

// This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild

#![allow(unused_imports)]
#![allow(non_snake_case)]
#![allow(non_camel_case_types)]
#![allow(irrefutable_let_patterns)]
#![allow(unused_comparisons)]

extern crate kaitai;
use kaitai::*;
use std::convert::{TryFrom, TryInto};
use std::cell::{Ref, Cell, RefCell};
use std::rc::{Rc, Weak};

/**
 * A simple wrapper that allows reading a UTF-16 encoded string that starts
 * with a byte order mark (BOM). The BOM indicates the endianness of the UTF-16
 * encoding, which can be either big-endian (BE) or little-endian (LE).
 *
 * Use:
 *
 * * `value` to get the string value with BOM stripped, regardless of endianness.
 * * `is_be` and `is_le` to check the endianness indicated by the BOM.
 * * `bom` to check the raw byte order mark.
 *
 * See also: <https://en.wikipedia.org/wiki/Byte_order_mark>
 */

#[derive(Default, Debug, Clone)]
pub struct Utf16WithBom {
    /// Link to the root object; assigned by `read` from its `_root` argument.
    pub _root: SharedType<Utf16WithBom>,
    /// Link to the parent object; assigned by `read` from its `_parent` argument.
    pub _parent: SharedType<Utf16WithBom>,
    /// Link back to this object's own shared handle; assigned by `read`.
    pub _self: SharedType<Self>,
    /// The two bytes read from the start of the stream (the byte order mark).
    bom: RefCell<Vec<u8>>,
    /// Remaining stream bytes decoded as UTF-16BE; only filled when `is_be` is true.
    str_be: RefCell<String>,
    /// Remaining stream bytes decoded as UTF-16LE; only filled when `is_le` is true.
    str_le: RefCell<String>,
    /// Clone of the stream handed to `read`.
    _io: RefCell<BytesReader>,
    /// Set once `is_be` has been computed, so later calls reuse the cached flag.
    f_is_be: Cell<bool>,
    /// Cached result of `is_be()`; meaningful only after `f_is_be` is set.
    is_be: RefCell<bool>,
    /// Set once `is_le` has been computed, so later calls reuse the cached flag.
    f_is_le: Cell<bool>,
    /// Cached result of `is_le()`; meaningful only after `f_is_le` is set.
    is_le: RefCell<bool>,
    /// Set once `value` has been computed, so later calls reuse the cached string.
    f_value: Cell<bool>,
    /// Cached result of `value()`; meaningful only after `f_value` is set.
    value: RefCell<String>,
}
impl KStruct for Utf16WithBom {
    type Root = Utf16WithBom;
    type Parent = Utf16WithBom;

    /// Parses a BOM-prefixed UTF-16 string from `_io` into `self_rc`.
    ///
    /// The first two bytes are stored as the byte order mark and must be
    /// either `FE FF` or `FF FE`; the rest of the stream is then decoded with
    /// the matching endianness into `str_be` or `str_le`.
    ///
    /// # Errors
    ///
    /// Returns `KError::ValidationFailed` (kind `NotAnyOf`, path `/seq/0`)
    /// when the leading two bytes are not one of the two accepted marks, and
    /// propagates any error from reading the stream or decoding the text.
    ///
    /// # Panics
    ///
    /// Panics if the root link cannot be upgraded after it has been assigned.
    fn read<S: KStream>(
        self_rc: &OptRc<Self>,
        _io: &S,
        _root: SharedType<Self::Root>,
        _parent: SharedType<Self::Parent>,
    ) -> KResult<()> {
        // Record the stream and the object-graph links on the target first.
        *self_rc._io.borrow_mut() = _io.clone();
        self_rc._root.set(_root.get());
        self_rc._parent.set(_parent.get());
        self_rc._self.set(Ok(self_rc.clone()));

        // Resolve the freshly stored links; the root is required to be alive.
        let root_strong = self_rc._root.get_value().borrow().upgrade();
        let _parent_strong = self_rc._parent.get_value().borrow().upgrade();
        let _root_ref = root_strong.as_ref().unwrap();

        // Byte order mark: exactly two bytes, one of the two known values.
        *self_rc.bom.borrow_mut() = _io.read_bytes(2usize)?.into();
        let bom_is_known = matches!(
            self_rc.bom().as_slice(),
            [0xfe, 0xff] | [0xff, 0xfe]
        );
        if !bom_is_known {
            return Err(KError::ValidationFailed(ValidationFailedError {
                kind: ValidationKind::NotAnyOf,
                src_path: "/seq/0".to_string(),
            }));
        }

        // Decode everything after the mark using the endianness it announced.
        if *self_rc.is_be()? {
            let payload: Vec<u8> = _io.read_bytes_full()?.into();
            *self_rc.str_be.borrow_mut() = bytes_to_str(&payload, "UTF-16BE")?;
        }
        if *self_rc.is_le()? {
            let payload: Vec<u8> = _io.read_bytes_full()?.into();
            *self_rc.str_le.borrow_mut() = bytes_to_str(&payload, "UTF-16LE")?;
        }
        Ok(())
    }
}
impl Utf16WithBom {
    /// True if the byte order mark indicates big-endian UTF-16 encoding.
    ///
    /// The flag is derived from the stored BOM on the first call and cached;
    /// later calls return the cached flag. It is meant to be queried after
    /// `read` has stored the BOM: a result computed earlier (from the initial,
    /// empty BOM) stays cached.
    ///
    /// # Errors
    ///
    /// This accessor has no failing path; the `KResult` wrapper is kept so
    /// that existing callers using `?` keep compiling.
    pub fn is_be(&self) -> KResult<Ref<'_, bool>> {
        if !self.f_is_be.get() {
            // Comparing against an array avoids allocating a temporary `Vec`.
            let big_endian = *self.bom() == [0xfeu8, 0xffu8];
            *self.is_be.borrow_mut() = big_endian;
            self.f_is_be.set(true);
        }
        Ok(self.is_be.borrow())
    }

    /// True if the byte order mark indicates little-endian UTF-16 encoding.
    ///
    /// The flag is derived from the stored BOM on the first call and cached;
    /// later calls return the cached flag. It is meant to be queried after
    /// `read` has stored the BOM: a result computed earlier (from the initial,
    /// empty BOM) stays cached.
    ///
    /// # Errors
    ///
    /// This accessor has no failing path; the `KResult` wrapper is kept so
    /// that existing callers using `?` keep compiling.
    pub fn is_le(&self) -> KResult<Ref<'_, bool>> {
        if !self.f_is_le.get() {
            // Comparing against an array avoids allocating a temporary `Vec`.
            let little_endian = *self.bom() == [0xffu8, 0xfeu8];
            *self.is_le.borrow_mut() = little_endian;
            self.f_is_le.set(true);
        }
        Ok(self.is_le.borrow())
    }

    /// The string value with BOM stripped, regardless of endianness.
    ///
    /// Picks the big-endian text when `is_be` is true and the little-endian
    /// text otherwise, copies it once into the cache, and returns a borrow of
    /// the cached copy on this and every later call.
    ///
    /// # Errors
    ///
    /// Propagates an error from `is_be`, which currently never fails.
    pub fn value(&self) -> KResult<Ref<'_, String>> {
        if !self.f_value.get() {
            // `.clone()` on the `Ref` resolves to `String::clone`, giving a
            // single copy of the selected text.
            let decoded = if *self.is_be()? {
                self.str_be().clone()
            } else {
                self.str_le().clone()
            };
            *self.value.borrow_mut() = decoded;
            // Mark as cached only after the value has actually been stored.
            self.f_value.set(true);
        }
        Ok(self.value.borrow())
    }
}

impl Utf16WithBom {
    /**
     * The byte order mark (BOM) is a special marker at the beginning of the
     * string that indicates the endianness of the UTF-16 encoding. The
     * character U+FEFF is used as the BOM, and its byte representation differs
     * based on endianness:
     *
     * * For big-endian (BE) UTF-16, it's `[0xFE, 0xFF]`
     * * For little-endian (LE) UTF-16, it's `[0xFF, 0xFE]`
     *
     * This implementation checks for the presence of a valid BOM and strips it
     * from the resulting string value.
     *
     * Returns a shared borrow of the two raw bytes that `read` stored.
     */
    pub fn bom(&self) -> Ref<'_, Vec<u8>> {
        self.bom.borrow()
    }
}
impl Utf16WithBom {
    /**
     * Text decoded as UTF-16BE. `read` fills this in only when `is_be` is
     * true; otherwise it keeps its initial value. Use `value` to get the
     * string without caring about endianness.
     */
    pub fn str_be(&self) -> Ref<'_, String> {
        self.str_be.borrow()
    }
}
impl Utf16WithBom {
    /**
     * Text decoded as UTF-16LE. `read` fills this in only when `is_le` is
     * true; otherwise it keeps its initial value. Use `value` to get the
     * string without caring about endianness.
     */
    pub fn str_le(&self) -> Ref<'_, String> {
        self.str_le.borrow()
    }
}
impl Utf16WithBom {
    /**
     * Shared borrow of the stream reader that `read` stored for this object.
     */
    pub fn _io(&self) -> Ref<'_, BytesReader> {
        self._io.borrow()
    }
}