.gz file format: Nim parsing library

Gzip is a popular and standard single-file archiving format. It essentially provides a container that stores the original file name, a timestamp and a few other things (such as an optional comment), basic CRCs, etc., together with the file itself, compressed by a chosen compression algorithm.

As of 2019, only one compression algorithm is actually used in practice, so the body of a gzipped file is typically a raw DEFLATE stream (without a zlib header).

This page hosts a formal specification of the .gz file format written in Kaitai Struct. The specification can be automatically translated into a variety of programming languages to produce a parsing library.

Nim source code to parse .gz file format

gzip.nim

import kaitai_struct_nim_runtime
import options

type
  Gzip* = ref object of KaitaiStruct
    ## Top-level parsed .gz container. `Gzip.read` fills the fields in
    ## the order listed here, from `magic` through `lenUncompressed`.
    `magic`*: seq[byte] ## First 2 bytes of the stream, stored as read.
    `compressionMethod`*: Gzip_CompressionMethods ## Compression method used to compress the file body.
    `flags`*: Gzip_Flags ## Header flag bits; they gate the optional fields below.
    `modTime`*: uint32 ## Last modification time of the archived file (UNIX timestamp).
    `extraFlags`*: Gzip_ExtraFlagsDeflate ## Method-specific extra flags; only set when the method is `deflate`.
    `os`*: Gzip_Oses ## OS used to compress this file.
    `extras`*: Gzip_Extras ## Only set when `flags.hasExtra` is true.
    `name`*: seq[byte] ## Only read when `flags.hasName` is true; raw bytes read with terminator byte 0.
    `comment`*: seq[byte] ## Only read when `flags.hasComment` is true; raw bytes read with terminator byte 0.
    `headerCrc16`*: uint16 ## Only read when `flags.hasHeaderCrc` is true.
    `body`*: seq[byte] ## Compressed body of the archived file; not decompressed here.
    `bodyCrc32`*: uint32 ## CRC32 checksum of the uncompressed file body.
    `lenUncompressed`*: uint32 ## Size of the original uncompressed data in bytes (truncated to 32 bits).
    `parent`*: KaitaiStruct ## Parent object passed to `read`.
  Gzip_CompressionMethods* = enum ## Values of `Gzip.compressionMethod`.
    deflate = 8
  Gzip_Oses* = enum ## Values of `Gzip.os`.
    fat = 0
    amiga = 1
    vms = 2
    unix = 3
    vm_cms = 4
    atari_tos = 5
    hpfs = 6
    macintosh = 7
    z_system = 8
    cp_m = 9
    tops_20 = 10
    ntfs = 11
    qdos = 12
    acorn_riscos = 13
    unknown = 255
  Gzip_Flags* = ref object of KaitaiStruct
    ## Header flags, read as 3 + 1 + 1 + 1 + 1 + 1 bits in field order.
    `reserved1`*: uint64 ## The first 3 bits read.
    `hasComment`*: bool ## If true, `Gzip.comment` is read.
    `hasName`*: bool ## If true, `Gzip.name` is read.
    `hasExtra`*: bool ## If true, optional extra fields are present in the archive.
    `hasHeaderCrc`*: bool ## If true, the archive includes a CRC16 checksum for the header.
    `isText`*: bool ## If true, the archived file is a text file from the compressor's point of view.
    `parent`*: Gzip ## Owning `Gzip` object.
  Gzip_ExtraFlagsDeflate* = ref object of KaitaiStruct
    ## Extra flags parsed when the compression method is `deflate`.
    `compressionStrength`*: Gzip_ExtraFlagsDeflate_CompressionStrengths ## Read from a single byte.
    `parent`*: Gzip ## Owning `Gzip` object.
  Gzip_ExtraFlagsDeflate_CompressionStrengths* = enum ## Values of `Gzip_ExtraFlagsDeflate.compressionStrength`.
    best = 2
    fast = 4
  Gzip_Subfields* = ref object of KaitaiStruct
    ## Container for many subfields, constrained by size of stream.
    `entries`*: seq[Gzip_Subfield] ## Subfields in the order they were parsed.
    `parent`*: Gzip_Extras ## Owning `Gzip_Extras` object.
  Gzip_Subfield* = ref object of KaitaiStruct
    ## One subfield: an ID, a length, and that many bytes of data.
    `id`*: uint16 ## Subfield ID, typically two ASCII letters.
    `lenData`*: uint16 ## Number of bytes read into `data`.
    `data`*: seq[byte] ## Subfield payload, `lenData` bytes long.
    `parent`*: Gzip_Subfields ## Owning `Gzip_Subfields` object.
  Gzip_Extras* = ref object of KaitaiStruct
    ## Optional extra-fields area, present when `Gzip.flags.hasExtra`.
    `lenSubfields`*: uint16 ## Number of bytes read into `rawSubfields`.
    `subfields`*: Gzip_Subfields ## Subfields parsed from `rawSubfields`.
    `parent`*: Gzip ## Owning `Gzip` object.
    `rawSubfields`*: seq[byte] ## Raw bytes of the subfields area, before parsing.

# Forward declarations: the `read` procs call one another (for example,
# `Gzip.read` calls `Gzip_Flags.read`, which is defined further down), so
# every signature is declared up front, ahead of the implementations.
proc read*(_: typedesc[Gzip], io: KaitaiStream, root: KaitaiStruct, parent: KaitaiStruct): Gzip
proc read*(_: typedesc[Gzip_Flags], io: KaitaiStream, root: KaitaiStruct, parent: Gzip): Gzip_Flags
proc read*(_: typedesc[Gzip_ExtraFlagsDeflate], io: KaitaiStream, root: KaitaiStruct, parent: Gzip): Gzip_ExtraFlagsDeflate
proc read*(_: typedesc[Gzip_Subfields], io: KaitaiStream, root: KaitaiStruct, parent: Gzip_Extras): Gzip_Subfields
proc read*(_: typedesc[Gzip_Subfield], io: KaitaiStream, root: KaitaiStruct, parent: Gzip_Subfields): Gzip_Subfield
proc read*(_: typedesc[Gzip_Extras], io: KaitaiStream, root: KaitaiStruct, parent: Gzip): Gzip_Extras



##[
Parses a gzip (.gz) container from `io` into a `Gzip` object.

Gzip is a popular and standard single-file archiving format. The
container stores the original file name, a timestamp and a few other
things (such as an optional comment), basic CRCs, and the file itself,
compressed by a chosen compression algorithm.

As of 2019 only one compression method is in practical use, so the
body of a gzipped file is typically a raw DEFLATE stream (without a
zlib header).

@see <a href="https://www.rfc-editor.org/rfc/rfc1952">Source</a>
]##
proc read*(_: typedesc[Gzip], io: KaitaiStream, root: KaitaiStruct, parent: KaitaiStruct): Gzip =
  result = new(Gzip)
  # Without an explicit root, the new object is used as its own root.
  let rootObj =
    if root.isNil: cast[Gzip](result)
    else: cast[Gzip](root)
  result.io = io
  result.root = rootObj
  result.parent = parent

  # Two leading bytes, stored exactly as read.
  result.magic = io.readBytes(2)

  # Compression method used to compress the file body. In practice only
  # one method is widely used: 8 = deflate.
  result.compressionMethod = Gzip_CompressionMethods(io.readU1())
  result.flags = Gzip_Flags.read(io, result.root, result)

  # Last modification time of the archived file, as a UNIX timestamp.
  result.modTime = io.readU4le()

  # Extra flags are specific to the compression method chosen; they are
  # parsed only when that method is deflate.
  if result.compressionMethod == gzip.deflate:
    result.extraFlags = Gzip_ExtraFlagsDeflate.read(io, result.root, result)

  # OS used to compress this file.
  result.os = Gzip_Oses(io.readU1())

  # Optional header parts, each one gated by its own flag bit.
  if result.flags.hasExtra:
    result.extras = Gzip_Extras.read(io, result.root, result)
  if result.flags.hasName:
    result.name = io.readBytesTerm(0, false, true, true)
  if result.flags.hasComment:
    result.comment = io.readBytesTerm(0, false, true, true)
  if result.flags.hasHeaderCrc:
    result.headerCrc16 = io.readU2le()

  # Compressed body of the archived file: everything left in the stream
  # except the last 8 bytes, which hold the two 4-byte fields read below.
  # No attempt is made to decompress it here.
  let bodyLen = io.size - io.pos - 8
  result.body = io.readBytes(int(bodyLen))

  # CRC32 checksum of the uncompressed file body.
  result.bodyCrc32 = io.readU4le()

  # Size of the original uncompressed data in bytes (truncated to 32
  # bits).
  result.lenUncompressed = io.readU4le()

proc fromFile*(_: typedesc[Gzip], filename: string): Gzip =
  ## Opens `filename` with `newKaitaiFileStream` and parses it as a
  ## `Gzip`, passing `nil` for both root and parent.
  let stream = newKaitaiFileStream(filename)
  result = Gzip.read(stream, nil, nil)

proc read*(_: typedesc[Gzip_Flags], io: KaitaiStream, root: KaitaiStruct, parent: Gzip): Gzip_Flags =
  ## Parses the header flags from `io`: a 3-bit reserved value followed
  ## by five single-bit booleans, all read with `readBitsIntBe`.
  result = new(Gzip_Flags)
  # Without an explicit root, the new object is cast and used as root.
  let rootObj =
    if root.isNil: cast[Gzip](result)
    else: cast[Gzip](root)
  result.io = io
  result.root = rootObj
  result.parent = parent

  result.reserved1 = io.readBitsIntBe(3)
  result.hasComment = io.readBitsIntBe(1) != 0
  result.hasName = io.readBitsIntBe(1) != 0

  # If true, optional extra fields are present in the archive.
  result.hasExtra = io.readBitsIntBe(1) != 0

  # If true, this archive includes a CRC16 checksum for the header.
  result.hasHeaderCrc = io.readBitsIntBe(1) != 0

  # If true, the file inside this archive is a text file from the
  # compressor's point of view.
  result.isText = io.readBitsIntBe(1) != 0

proc fromFile*(_: typedesc[Gzip_Flags], filename: string): Gzip_Flags =
  ## Opens `filename` with `newKaitaiFileStream` and parses it as a
  ## `Gzip_Flags`, passing `nil` for both root and parent.
  let stream = newKaitaiFileStream(filename)
  result = Gzip_Flags.read(stream, nil, nil)

proc read*(_: typedesc[Gzip_ExtraFlagsDeflate], io: KaitaiStream, root: KaitaiStruct, parent: Gzip): Gzip_ExtraFlagsDeflate =
  ## Parses the deflate-specific extra flags from `io`: a single byte,
  ## converted to `Gzip_ExtraFlagsDeflate_CompressionStrengths`.
  result = new(Gzip_ExtraFlagsDeflate)
  # Without an explicit root, the new object is cast and used as root.
  let rootObj =
    if root.isNil: cast[Gzip](result)
    else: cast[Gzip](root)
  result.io = io
  result.root = rootObj
  result.parent = parent

  let strengthByte = io.readU1()
  result.compressionStrength =
    Gzip_ExtraFlagsDeflate_CompressionStrengths(strengthByte)

proc fromFile*(_: typedesc[Gzip_ExtraFlagsDeflate], filename: string): Gzip_ExtraFlagsDeflate =
  ## Opens `filename` with `newKaitaiFileStream` and parses it as a
  ## `Gzip_ExtraFlagsDeflate`, passing `nil` for both root and parent.
  let stream = newKaitaiFileStream(filename)
  result = Gzip_ExtraFlagsDeflate.read(stream, nil, nil)


##[
Container for many subfields, constrained by size of stream.

Subfields are parsed from `io` one after another until the stream
reports end-of-stream; each parsed subfield is appended to `entries`
in the order it was read.
]##
proc read*(_: typedesc[Gzip_Subfields], io: KaitaiStream, root: KaitaiStruct, parent: Gzip_Extras): Gzip_Subfields =
  template this: untyped = result
  this = new(Gzip_Subfields)
  # Without an explicit root, the new object is cast and used as root.
  let root = if root == nil: cast[Gzip](this) else: cast[Gzip](root)
  this.io = io
  this.root = root
  this.parent = parent

  # Keep parsing subfields until the stream is exhausted.
  while not this.io.isEof:
    this.entries.add(Gzip_Subfield.read(this.io, this.root, this))

proc fromFile*(_: typedesc[Gzip_Subfields], filename: string): Gzip_Subfields =
  ## Opens `filename` with `newKaitaiFileStream` and parses it as a
  ## `Gzip_Subfields`, passing `nil` for both root and parent.
  let stream = newKaitaiFileStream(filename)
  result = Gzip_Subfields.read(stream, nil, nil)


##[
Every subfield follows the typical
[TLV scheme](https://en.wikipedia.org/wiki/Type-length-value):

* `id` serves the role of "T"ype
* `len_data` serves the role of "L"ength
* `data` serves the role of "V"alue

This way it is possible for an arbitrary parser to skip over
subfields it does not support.
]##
proc read*(_: typedesc[Gzip_Subfield], io: KaitaiStream, root: KaitaiStruct, parent: Gzip_Subfields): Gzip_Subfield =
  result = new(Gzip_Subfield)
  # Without an explicit root, the new object is cast and used as root.
  let rootObj =
    if root.isNil: cast[Gzip](result)
    else: cast[Gzip](root)
  result.io = io
  result.root = rootObj
  result.parent = parent

  # Subfield ID, typically two ASCII letters.
  result.id = io.readU2le()

  # The length field says how many payload bytes follow it.
  let payloadLen = io.readU2le()
  result.lenData = payloadLen
  result.data = io.readBytes(int(payloadLen))

proc fromFile*(_: typedesc[Gzip_Subfield], filename: string): Gzip_Subfield =
  ## Opens `filename` with `newKaitaiFileStream` and parses it as a
  ## `Gzip_Subfield`, passing `nil` for both root and parent.
  let stream = newKaitaiFileStream(filename)
  result = Gzip_Subfield.read(stream, nil, nil)

proc read*(_: typedesc[Gzip_Extras], io: KaitaiStream, root: KaitaiStruct, parent: Gzip): Gzip_Extras =
  ## Parses the optional extra-fields area from `io`: a 2-byte length,
  ## then that many raw bytes, which are in turn parsed as
  ## `Gzip_Subfields` through a separate stream over just those bytes.
  result = new(Gzip_Extras)
  # Without an explicit root, the new object is cast and used as root.
  let rootObj =
    if root.isNil: cast[Gzip](result)
    else: cast[Gzip](root)
  result.io = io
  result.root = rootObj
  result.parent = parent

  result.lenSubfields = io.readU2le()

  # Keep the raw bytes, then parse them via their own stream so the
  # subfield parser works only on this area's bytes.
  let rawArea = io.readBytes(int(result.lenSubfields))
  result.rawSubfields = rawArea
  let areaStream = newKaitaiStream(rawArea)
  result.subfields = Gzip_Subfields.read(areaStream, result.root, result)

proc fromFile*(_: typedesc[Gzip_Extras], filename: string): Gzip_Extras =
  ## Opens `filename` with `newKaitaiFileStream` and parses it as a
  ## `Gzip_Extras`, passing `nil` for both root and parent.
  let stream = newKaitaiFileStream(filename)
  result = Gzip_Extras.read(stream, nil, nil)