.gz file format: Go parsing library

Gzip is a popular and standard single-file archiving format. It essentially provides a container that stores original file name, timestamp and a few other things (like optional comment), basic CRCs, etc, and a file compressed by a chosen compression algorithm.

As of 2019, there is actually only one working solution for compression algorithms, so it's typically raw DEFLATE stream (without zlib header) in all gzipped files.

This page hosts a formal specification of .gz file format using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.

Go source code to parse .gz file format

gzip.go

// Code generated by kaitai-struct-compiler from a .ksy source file. DO NOT EDIT.

import (
	"github.com/kaitai-io/kaitai_struct_go_runtime/kaitai"
	"bytes"
)


/**
 * Gzip is a popular and standard single-file archiving format. It
 * essentially provides a container that stores original file name,
 * timestamp and a few other things (like optional comment), basic
 * CRCs, etc, and a file compressed by a chosen compression algorithm.
 * 
 * As of 2019, there is actually only one working solution for
 * compression algorithms, so it's typically raw DEFLATE stream
 * (without zlib header) in all gzipped files.
 * @see <a href="https://www.rfc-editor.org/rfc/rfc1952">Source</a>
 */

type Gzip_CompressionMethods int
const (
	Gzip_CompressionMethods__Deflate Gzip_CompressionMethods = 8
)

type Gzip_Oses int
const (
	Gzip_Oses__Fat Gzip_Oses = 0
	Gzip_Oses__Amiga Gzip_Oses = 1
	Gzip_Oses__Vms Gzip_Oses = 2
	Gzip_Oses__Unix Gzip_Oses = 3
	Gzip_Oses__VmCms Gzip_Oses = 4
	Gzip_Oses__AtariTos Gzip_Oses = 5
	Gzip_Oses__Hpfs Gzip_Oses = 6
	Gzip_Oses__Macintosh Gzip_Oses = 7
	Gzip_Oses__ZSystem Gzip_Oses = 8
	Gzip_Oses__CpM Gzip_Oses = 9
	Gzip_Oses__Tops20 Gzip_Oses = 10
	Gzip_Oses__Ntfs Gzip_Oses = 11
	Gzip_Oses__Qdos Gzip_Oses = 12
	Gzip_Oses__AcornRiscos Gzip_Oses = 13
	Gzip_Oses__Unknown Gzip_Oses = 255
)
type Gzip struct {
	Magic []byte
	CompressionMethod Gzip_CompressionMethods
	Flags *Gzip_Flags
	ModTime uint32
	ExtraFlags *Gzip_ExtraFlagsDeflate
	Os Gzip_Oses
	Extras *Gzip_Extras
	Name []byte
	Comment []byte
	HeaderCrc16 uint16
	Body []byte
	BodyCrc32 uint32
	LenUncompressed uint32
	_io *kaitai.Stream
	_root *Gzip
	_parent interface{}
}
func NewGzip() *Gzip {
	return &Gzip{
	}
}

func (this *Gzip) Read(io *kaitai.Stream, parent interface{}, root *Gzip) (err error) {
	this._io = io
	this._parent = parent
	this._root = root

	tmp1, err := this._io.ReadBytes(int(2))
	if err != nil {
		return err
	}
	tmp1 = tmp1
	this.Magic = tmp1
	if !(bytes.Equal(this.Magic, []uint8{31, 139})) {
		return kaitai.NewValidationNotEqualError([]uint8{31, 139}, this.Magic, this._io, "/seq/0")
	}
	tmp2, err := this._io.ReadU1()
	if err != nil {
		return err
	}
	this.CompressionMethod = Gzip_CompressionMethods(tmp2)
	tmp3 := NewGzip_Flags()
	err = tmp3.Read(this._io, this, this._root)
	if err != nil {
		return err
	}
	this.Flags = tmp3
	tmp4, err := this._io.ReadU4le()
	if err != nil {
		return err
	}
	this.ModTime = uint32(tmp4)
	switch (this.CompressionMethod) {
	case Gzip_CompressionMethods__Deflate:
		tmp5 := NewGzip_ExtraFlagsDeflate()
		err = tmp5.Read(this._io, this, this._root)
		if err != nil {
			return err
		}
		this.ExtraFlags = tmp5
	}
	tmp6, err := this._io.ReadU1()
	if err != nil {
		return err
	}
	this.Os = Gzip_Oses(tmp6)
	if (this.Flags.HasExtra) {
		tmp7 := NewGzip_Extras()
		err = tmp7.Read(this._io, this, this._root)
		if err != nil {
			return err
		}
		this.Extras = tmp7
	}
	if (this.Flags.HasName) {
		tmp8, err := this._io.ReadBytesTerm(0, false, true, true)
		if err != nil {
			return err
		}
		this.Name = tmp8
	}
	if (this.Flags.HasComment) {
		tmp9, err := this._io.ReadBytesTerm(0, false, true, true)
		if err != nil {
			return err
		}
		this.Comment = tmp9
	}
	if (this.Flags.HasHeaderCrc) {
		tmp10, err := this._io.ReadU2le()
		if err != nil {
			return err
		}
		this.HeaderCrc16 = uint16(tmp10)
	}
	tmp11, err := this._io.Size()
	if err != nil {
		return err
	}
	tmp12, err := this._io.Pos()
	if err != nil {
		return err
	}
	tmp13, err := this._io.ReadBytes(int(((tmp11 - tmp12) - 8)))
	if err != nil {
		return err
	}
	tmp13 = tmp13
	this.Body = tmp13
	tmp14, err := this._io.ReadU4le()
	if err != nil {
		return err
	}
	this.BodyCrc32 = uint32(tmp14)
	tmp15, err := this._io.ReadU4le()
	if err != nil {
		return err
	}
	this.LenUncompressed = uint32(tmp15)
	return err
}

/**
 * Compression method used to compress file body. In practice, only
 * one method is widely used: 8 = deflate.
 */

/**
 * Last modification time of a file archived in UNIX timestamp format.
 */

/**
 * Extra flags, specific to compression method chosen.
 */

/**
 * OS used to compress this file.
 */

/**
 * Compressed body of a file archived. Note that we don't make an
 * attempt to decompress it here.
 */

/**
 * CRC32 checksum of an uncompressed file body
 */

/**
 * Size of original uncompressed data in bytes (truncated to 32
 * bits).
 */
type Gzip_Flags struct {
	Reserved1 uint64
	HasComment bool
	HasName bool
	HasExtra bool
	HasHeaderCrc bool
	IsText bool
	_io *kaitai.Stream
	_root *Gzip
	_parent *Gzip
}
func NewGzip_Flags() *Gzip_Flags {
	return &Gzip_Flags{
	}
}

func (this *Gzip_Flags) Read(io *kaitai.Stream, parent *Gzip, root *Gzip) (err error) {
	this._io = io
	this._parent = parent
	this._root = root

	tmp16, err := this._io.ReadBitsIntBe(3)
	if err != nil {
		return err
	}
	this.Reserved1 = tmp16
	tmp17, err := this._io.ReadBitsIntBe(1)
	if err != nil {
		return err
	}
	this.HasComment = tmp17 != 0
	tmp18, err := this._io.ReadBitsIntBe(1)
	if err != nil {
		return err
	}
	this.HasName = tmp18 != 0
	tmp19, err := this._io.ReadBitsIntBe(1)
	if err != nil {
		return err
	}
	this.HasExtra = tmp19 != 0
	tmp20, err := this._io.ReadBitsIntBe(1)
	if err != nil {
		return err
	}
	this.HasHeaderCrc = tmp20 != 0
	tmp21, err := this._io.ReadBitsIntBe(1)
	if err != nil {
		return err
	}
	this.IsText = tmp21 != 0
	return err
}

/**
 * If true, optional extra fields are present in the archive.
 */

/**
 * If true, this archive includes a CRC16 checksum for the header.
 */

/**
 * If true, file inside this archive is a text file from
 * compressor's point of view.
 */

type Gzip_ExtraFlagsDeflate_CompressionStrengths int
const (
	Gzip_ExtraFlagsDeflate_CompressionStrengths__Best Gzip_ExtraFlagsDeflate_CompressionStrengths = 2
	Gzip_ExtraFlagsDeflate_CompressionStrengths__Fast Gzip_ExtraFlagsDeflate_CompressionStrengths = 4
)
type Gzip_ExtraFlagsDeflate struct {
	CompressionStrength Gzip_ExtraFlagsDeflate_CompressionStrengths
	_io *kaitai.Stream
	_root *Gzip
	_parent *Gzip
}
func NewGzip_ExtraFlagsDeflate() *Gzip_ExtraFlagsDeflate {
	return &Gzip_ExtraFlagsDeflate{
	}
}

func (this *Gzip_ExtraFlagsDeflate) Read(io *kaitai.Stream, parent *Gzip, root *Gzip) (err error) {
	this._io = io
	this._parent = parent
	this._root = root

	tmp22, err := this._io.ReadU1()
	if err != nil {
		return err
	}
	this.CompressionStrength = Gzip_ExtraFlagsDeflate_CompressionStrengths(tmp22)
	return err
}

/**
 * Container for many subfields, constrained by size of stream.
 */
type Gzip_Subfields struct {
	Entries []*Gzip_Subfield
	_io *kaitai.Stream
	_root *Gzip
	_parent *Gzip_Extras
}
func NewGzip_Subfields() *Gzip_Subfields {
	return &Gzip_Subfields{
	}
}

func (this *Gzip_Subfields) Read(io *kaitai.Stream, parent *Gzip_Extras, root *Gzip) (err error) {
	this._io = io
	this._parent = parent
	this._root = root

	for i := 1;; i++ {
		tmp23, err := this._io.EOF()
		if err != nil {
			return err
		}
		if tmp23 {
			break
		}
		tmp24 := NewGzip_Subfield()
		err = tmp24.Read(this._io, this, this._root)
		if err != nil {
			return err
		}
		this.Entries = append(this.Entries, tmp24)
	}
	return err
}

/**
 * Every subfield follows typical [TLV scheme](https://en.wikipedia.org/wiki/Type-length-value):
 * 
 * * `id` serves role of "T"ype
 * * `len_data` serves role of "L"ength
 * * `data` serves role of "V"alue
 * 
 * This way it's possible to for arbitrary parser to skip over
 * subfields it does not support.
 */
type Gzip_Subfield struct {
	Id uint16
	LenData uint16
	Data []byte
	_io *kaitai.Stream
	_root *Gzip
	_parent *Gzip_Subfields
}
func NewGzip_Subfield() *Gzip_Subfield {
	return &Gzip_Subfield{
	}
}

func (this *Gzip_Subfield) Read(io *kaitai.Stream, parent *Gzip_Subfields, root *Gzip) (err error) {
	this._io = io
	this._parent = parent
	this._root = root

	tmp25, err := this._io.ReadU2le()
	if err != nil {
		return err
	}
	this.Id = uint16(tmp25)
	tmp26, err := this._io.ReadU2le()
	if err != nil {
		return err
	}
	this.LenData = uint16(tmp26)
	tmp27, err := this._io.ReadBytes(int(this.LenData))
	if err != nil {
		return err
	}
	tmp27 = tmp27
	this.Data = tmp27
	return err
}

/**
 * Subfield ID, typically two ASCII letters.
 */
type Gzip_Extras struct {
	LenSubfields uint16
	Subfields *Gzip_Subfields
	_io *kaitai.Stream
	_root *Gzip
	_parent *Gzip
	_raw_Subfields []byte
}
func NewGzip_Extras() *Gzip_Extras {
	return &Gzip_Extras{
	}
}

func (this *Gzip_Extras) Read(io *kaitai.Stream, parent *Gzip, root *Gzip) (err error) {
	this._io = io
	this._parent = parent
	this._root = root

	tmp28, err := this._io.ReadU2le()
	if err != nil {
		return err
	}
	this.LenSubfields = uint16(tmp28)
	tmp29, err := this._io.ReadBytes(int(this.LenSubfields))
	if err != nil {
		return err
	}
	tmp29 = tmp29
	this._raw_Subfields = tmp29
	_io__raw_Subfields := kaitai.NewStream(bytes.NewReader(this._raw_Subfields))
	tmp30 := NewGzip_Subfields()
	err = tmp30.Read(_io__raw_Subfields, this, this._root)
	if err != nil {
		return err
	}
	this.Subfields = tmp30
	return err
}