.gz file format: Go parsing library

Gzip is a popular and standard single-file archiving format. It essentially provides a container that stores the original file name, a timestamp and a few other things (such as an optional comment), basic CRCs, etc., together with the file itself, compressed by a chosen compression algorithm.

As of 2019, only one compression algorithm is actually in use, so the compressed data in gzipped files is typically a raw DEFLATE stream (without a zlib header).

This page hosts a formal specification of the .gz file format written in Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.

Go source code to parse .gz file format

gzip.go

// Code generated by kaitai-struct-compiler from a .ksy source file. DO NOT EDIT.

import (
	"github.com/kaitai-io/kaitai_struct_go_runtime/kaitai"
	"bytes"
)


/**
 * Gzip is a popular and standard single-file archiving format. It
 * essentially provides a container that stores original file name,
 * timestamp and a few other things (like optional comment), basic
 * CRCs, etc, and a file compressed by a chosen compression algorithm.
 * 
 * As of 2019, only one compression algorithm is actually in use, so
 * the compressed data in gzipped files is typically a raw DEFLATE
 * stream (without a zlib header).
 * @see <a href="https://www.rfc-editor.org/rfc/rfc1952">Source</a>
 */

// Gzip_CompressionMethods is the enum type for the compression-method byte
// of a gzip header.
type Gzip_CompressionMethods int

const (
	// Gzip_CompressionMethods__Deflate is the only method this enum declares.
	Gzip_CompressionMethods__Deflate Gzip_CompressionMethods = 8
)

// values_Gzip_CompressionMethods is the set of declared enum values.
var values_Gzip_CompressionMethods = map[Gzip_CompressionMethods]struct{}{
	Gzip_CompressionMethods__Deflate: {},
}

// isDefined reports whether v is one of the declared enum values.
func (v Gzip_CompressionMethods) isDefined() bool {
	_, known := values_Gzip_CompressionMethods[v]
	return known
}

// Gzip_Oses is the enum type for the OS byte of a gzip header.
type Gzip_Oses int

// The values 0 through 13 are consecutive and therefore assigned with iota;
// Gzip_Oses__Unknown is the separate sentinel value 255.
const (
	Gzip_Oses__Fat Gzip_Oses = iota
	Gzip_Oses__Amiga
	Gzip_Oses__Vms
	Gzip_Oses__Unix
	Gzip_Oses__VmCms
	Gzip_Oses__AtariTos
	Gzip_Oses__Hpfs
	Gzip_Oses__Macintosh
	Gzip_Oses__ZSystem
	Gzip_Oses__CpM
	Gzip_Oses__Tops20
	Gzip_Oses__Ntfs
	Gzip_Oses__Qdos
	Gzip_Oses__AcornRiscos

	Gzip_Oses__Unknown Gzip_Oses = 255
)

// values_Gzip_Oses is the set of declared enum values.
var values_Gzip_Oses = map[Gzip_Oses]struct{}{
	Gzip_Oses__Fat:         {},
	Gzip_Oses__Amiga:       {},
	Gzip_Oses__Vms:         {},
	Gzip_Oses__Unix:        {},
	Gzip_Oses__VmCms:       {},
	Gzip_Oses__AtariTos:    {},
	Gzip_Oses__Hpfs:        {},
	Gzip_Oses__Macintosh:   {},
	Gzip_Oses__ZSystem:     {},
	Gzip_Oses__CpM:         {},
	Gzip_Oses__Tops20:      {},
	Gzip_Oses__Ntfs:        {},
	Gzip_Oses__Qdos:        {},
	Gzip_Oses__AcornRiscos: {},
	Gzip_Oses__Unknown:     {},
}

// isDefined reports whether v is one of the declared enum values.
func (v Gzip_Oses) isDefined() bool {
	_, known := values_Gzip_Oses[v]
	return known
}
// Gzip is the parsed form of a .gz file: the header fields, the still
// compressed body and the trailer. It is populated by Read.
type Gzip struct {
	// Magic holds the two signature bytes; Read rejects anything other
	// than {31, 139}.
	Magic []byte

	// CompressionMethod is the compression method used to compress the
	// file body. In practice, only one method is widely used: 8 = deflate.
	CompressionMethod Gzip_CompressionMethods

	// Flags holds the header flag bits that decide which optional fields
	// follow.
	Flags *Gzip_Flags

	// ModTime is the last modification time of the archived file in UNIX
	// timestamp format.
	ModTime uint32

	// ExtraFlags holds extra flags specific to the chosen compression
	// method. Read only fills it in for the deflate method; otherwise it
	// stays nil.
	ExtraFlags *Gzip_ExtraFlagsDeflate

	// Os is the OS used to compress this file.
	Os Gzip_Oses

	// Extras is only set when Flags.HasExtra is true.
	Extras *Gzip_Extras

	// Name is only set when Flags.HasName is true.
	Name []byte

	// Comment is only set when Flags.HasComment is true.
	Comment []byte

	// HeaderCrc16 is only set when Flags.HasHeaderCrc is true.
	HeaderCrc16 uint16

	// Body is the compressed body of the archived file. No attempt is made
	// to decompress it.
	Body []byte

	// BodyCrc32 is the CRC32 checksum of the uncompressed file body.
	BodyCrc32 uint32

	// LenUncompressed is the size of the original uncompressed data in
	// bytes (truncated to 32 bits).
	LenUncompressed uint32

	_io     *kaitai.Stream
	_root   *Gzip
	_parent kaitai.Struct
}
// NewGzip returns a pointer to a zero-valued Gzip, ready to be filled in by Read.
func NewGzip() *Gzip {
	return new(Gzip)
}

// IO_ returns the stream recorded by the most recent Read call.
func (g Gzip) IO_() *kaitai.Stream {
	return g._io
}

// Read parses a complete .gz file from io into g.
//
// io, parent and root are recorded on g before parsing starts. Fields are
// read in file order: signature, compression method, flags, modification
// time, extra flags, OS, the optional fields selected by Flags, the
// compressed body and finally the 8-byte trailer.
//
// It returns the first error reported by the stream or by a nested Read, or
// a validation error when the signature is not {31, 139}. On error, fields
// parsed before the failure keep their values.
func (g *Gzip) Read(io *kaitai.Stream, parent kaitai.Struct, root *Gzip) (err error) {
	g._io = io
	g._parent = parent
	g._root = root

	// Signature bytes.
	magic, err := g._io.ReadBytes(int(2))
	if err != nil {
		return err
	}
	g.Magic = magic
	if want := []uint8{31, 139}; !bytes.Equal(g.Magic, want) {
		return kaitai.NewValidationNotEqualError(want, g.Magic, g._io, "/seq/0")
	}

	method, err := g._io.ReadU1()
	if err != nil {
		return err
	}
	g.CompressionMethod = Gzip_CompressionMethods(method)

	flags := NewGzip_Flags()
	if err = flags.Read(g._io, g, g._root); err != nil {
		return err
	}
	g.Flags = flags

	modTime, err := g._io.ReadU4le()
	if err != nil {
		return err
	}
	g.ModTime = uint32(modTime)

	// The extra-flags layout is only known for the deflate method.
	// NOTE(review): for any other method nothing is consumed here, so Os is
	// read from the byte where the extra flags would be. RFC 1952 appears to
	// define that byte as always present; confirm before relying on Os for
	// non-deflate files.
	if g.CompressionMethod == Gzip_CompressionMethods__Deflate {
		extraFlags := NewGzip_ExtraFlagsDeflate()
		if err = extraFlags.Read(g._io, g, g._root); err != nil {
			return err
		}
		g.ExtraFlags = extraFlags
	}

	osID, err := g._io.ReadU1()
	if err != nil {
		return err
	}
	g.Os = Gzip_Oses(osID)

	// Optional header fields, each present only when its flag bit is set.
	if g.Flags.HasExtra {
		extras := NewGzip_Extras()
		if err = extras.Read(g._io, g, g._root); err != nil {
			return err
		}
		g.Extras = extras
	}
	if g.Flags.HasName {
		var name []byte
		if name, err = g._io.ReadBytesTerm(0, false, true, true); err != nil {
			return err
		}
		g.Name = name
	}
	if g.Flags.HasComment {
		var comment []byte
		if comment, err = g._io.ReadBytesTerm(0, false, true, true); err != nil {
			return err
		}
		g.Comment = comment
	}
	if g.Flags.HasHeaderCrc {
		headerCrc, crcErr := g._io.ReadU2le()
		if crcErr != nil {
			return crcErr
		}
		g.HeaderCrc16 = uint16(headerCrc)
	}

	// The body is everything between the current position and the final
	// 8 bytes of the stream, which hold the CRC32 and the uncompressed
	// length. This treats the whole remaining stream as a single member.
	// NOTE(review): if fewer than 8 bytes remain, the computed length is
	// negative; how ReadBytes handles that is not visible here.
	size, err := g._io.Size()
	if err != nil {
		return err
	}
	pos, err := g._io.Pos()
	if err != nil {
		return err
	}
	body, err := g._io.ReadBytes(int((size - pos) - 8))
	if err != nil {
		return err
	}
	g.Body = body

	bodyCrc, err := g._io.ReadU4le()
	if err != nil {
		return err
	}
	g.BodyCrc32 = uint32(bodyCrc)

	lenUncompressed, err := g._io.ReadU4le()
	if err != nil {
		return err
	}
	g.LenUncompressed = uint32(lenUncompressed)
	return nil
}

/**
 * Gzip.CompressionMethod: compression method used to compress the file
 * body. In practice, only one method is widely used: 8 = deflate.
 */

/**
 * Gzip.ModTime: last modification time of the archived file, in UNIX
 * timestamp format.
 */

/**
 * Gzip.ExtraFlags: extra flags, specific to the compression method chosen.
 */

/**
 * Gzip.Os: OS used to compress this file.
 */

/**
 * Gzip.Body: compressed body of the archived file. Note that we don't
 * make an attempt to decompress it here.
 */

/**
 * Gzip.BodyCrc32: CRC32 checksum of the uncompressed file body.
 */

/**
 * Gzip.LenUncompressed: size of the original uncompressed data in bytes
 * (truncated to 32 bits).
 */

// Gzip_ExtraFlagsDeflate_CompressionStrengths is the enum type for the
// extra-flags byte when the compression method is deflate.
type Gzip_ExtraFlagsDeflate_CompressionStrengths int

const (
	Gzip_ExtraFlagsDeflate_CompressionStrengths__Best Gzip_ExtraFlagsDeflate_CompressionStrengths = 2
	Gzip_ExtraFlagsDeflate_CompressionStrengths__Fast Gzip_ExtraFlagsDeflate_CompressionStrengths = 4
)

// values_Gzip_ExtraFlagsDeflate_CompressionStrengths is the set of declared
// enum values.
var values_Gzip_ExtraFlagsDeflate_CompressionStrengths = map[Gzip_ExtraFlagsDeflate_CompressionStrengths]struct{}{
	Gzip_ExtraFlagsDeflate_CompressionStrengths__Best: {},
	Gzip_ExtraFlagsDeflate_CompressionStrengths__Fast: {},
}

// isDefined reports whether v is one of the declared enum values.
func (v Gzip_ExtraFlagsDeflate_CompressionStrengths) isDefined() bool {
	_, known := values_Gzip_ExtraFlagsDeflate_CompressionStrengths[v]
	return known
}
// Gzip_ExtraFlagsDeflate is the extra-flags header field as parsed for the
// deflate compression method.
type Gzip_ExtraFlagsDeflate struct {
	// CompressionStrength is the single byte read by Read, converted to
	// the enum type without validation.
	CompressionStrength Gzip_ExtraFlagsDeflate_CompressionStrengths

	_io     *kaitai.Stream
	_root   *Gzip
	_parent *Gzip
}
// NewGzip_ExtraFlagsDeflate returns a pointer to a zero-valued
// Gzip_ExtraFlagsDeflate, ready to be filled in by Read.
func NewGzip_ExtraFlagsDeflate() *Gzip_ExtraFlagsDeflate {
	return new(Gzip_ExtraFlagsDeflate)
}

// IO_ returns the stream recorded by the most recent Read call.
func (x Gzip_ExtraFlagsDeflate) IO_() *kaitai.Stream {
	return x._io
}

// Read records io, parent and root on x, then reads one byte from io and
// stores it as CompressionStrength. It returns the stream error, if any.
func (x *Gzip_ExtraFlagsDeflate) Read(io *kaitai.Stream, parent *Gzip, root *Gzip) (err error) {
	x._io, x._parent, x._root = io, parent, root

	strength, err := x._io.ReadU1()
	if err != nil {
		return err
	}
	x.CompressionStrength = Gzip_ExtraFlagsDeflate_CompressionStrengths(strength)
	return nil
}
// Gzip_Extras is the optional "extra" header field: a length followed by
// that many bytes of subfields.
type Gzip_Extras struct {
	// LenSubfields is the byte length of the subfield area that follows.
	LenSubfields uint16

	// Subfields is the result of parsing the LenSubfields raw bytes.
	Subfields *Gzip_Subfields

	_io            *kaitai.Stream
	_root          *Gzip
	_parent        *Gzip
	_raw_Subfields []byte
}
// NewGzip_Extras returns a pointer to a zero-valued Gzip_Extras, ready to
// be filled in by Read.
func NewGzip_Extras() *Gzip_Extras {
	return new(Gzip_Extras)
}

// IO_ returns the stream recorded by the most recent Read call.
func (e Gzip_Extras) IO_() *kaitai.Stream {
	return e._io
}

// Read parses the "extra" header field from io into e.
//
// io, parent and root are recorded on e. A 16-bit little-endian length is
// read first, then that many raw bytes, and those bytes are parsed as a
// sequence of subfields. It returns the first error reported by the stream
// or by the nested subfield parser.
func (e *Gzip_Extras) Read(io *kaitai.Stream, parent *Gzip, root *Gzip) (err error) {
	e._io = io
	e._parent = parent
	e._root = root

	lenSubfields, err := e._io.ReadU2le()
	if err != nil {
		return err
	}
	e.LenSubfields = uint16(lenSubfields)

	raw, err := e._io.ReadBytes(int(e.LenSubfields))
	if err != nil {
		return err
	}
	e._raw_Subfields = raw

	// Subfields are parsed from a stream over the raw bytes only, so the
	// subfield parser runs to the end of this area rather than to the end
	// of the enclosing stream.
	rawStream := kaitai.NewStream(bytes.NewReader(e._raw_Subfields))
	subfields := NewGzip_Subfields()
	if err = subfields.Read(rawStream, e, e._root); err != nil {
		return err
	}
	e.Subfields = subfields
	return nil
}
// Gzip_Flags is the header flag byte, split into its individual bits in the
// order Read consumes them.
type Gzip_Flags struct {
	// Reserved1 holds the first three bits read.
	Reserved1 uint64

	// HasComment makes Gzip.Read parse the Comment field.
	HasComment bool

	// HasName makes Gzip.Read parse the Name field.
	HasName bool

	// HasExtra is true if optional extra fields are present in the archive.
	HasExtra bool

	// HasHeaderCrc is true if this archive includes a CRC16 checksum for
	// the header.
	HasHeaderCrc bool

	// IsText is true if the file inside this archive is a text file from
	// the compressor's point of view.
	IsText bool

	_io     *kaitai.Stream
	_root   *Gzip
	_parent *Gzip
}
// NewGzip_Flags returns a pointer to a zero-valued Gzip_Flags, ready to be
// filled in by Read.
func NewGzip_Flags() *Gzip_Flags {
	return new(Gzip_Flags)
}

// IO_ returns the stream recorded by the most recent Read call.
func (f Gzip_Flags) IO_() *kaitai.Stream {
	return f._io
}

// Read records io, parent and root on f, then reads eight bits from io:
// three into Reserved1, followed by one bit each for HasComment, HasName,
// HasExtra, HasHeaderCrc and IsText, in that order. Each field is stored as
// soon as its bits are read. It returns the first stream error, if any.
func (f *Gzip_Flags) Read(io *kaitai.Stream, parent *Gzip, root *Gzip) (err error) {
	f._io, f._parent, f._root = io, parent, root

	reserved, err := f._io.ReadBitsIntBe(3)
	if err != nil {
		return err
	}
	f.Reserved1 = reserved

	// The remaining five flags are single bits, listed in read order.
	singleBits := [...]*bool{
		&f.HasComment,
		&f.HasName,
		&f.HasExtra,
		&f.HasHeaderCrc,
		&f.IsText,
	}
	for _, flag := range singleBits {
		var bit uint64
		if bit, err = f._io.ReadBitsIntBe(1); err != nil {
			return err
		}
		*flag = bit != 0
	}
	return nil
}

/**
 * Gzip_Flags.HasExtra: if true, optional extra fields are present in the
 * archive.
 */

/**
 * Gzip_Flags.HasHeaderCrc: if true, this archive includes a CRC16
 * checksum for the header.
 */

/**
 * Gzip_Flags.IsText: if true, the file inside this archive is a text
 * file from the compressor's point of view.
 */

// Gzip_Subfield is one entry of the "extra" header field. Every subfield
// follows the typical TLV scheme
// (https://en.wikipedia.org/wiki/Type-length-value):
//
//   - Id serves the role of "T"ype
//   - LenData serves the role of "L"ength
//   - Data serves the role of "V"alue
//
// This makes it possible for an arbitrary parser to skip over subfields it
// does not support.
type Gzip_Subfield struct {
	// Id is the subfield ID, typically two ASCII letters.
	Id uint16

	// LenData is the byte length of Data.
	LenData uint16

	// Data is the subfield payload, LenData bytes long.
	Data []byte

	_io     *kaitai.Stream
	_root   *Gzip
	_parent *Gzip_Subfields
}
// NewGzip_Subfield returns a pointer to a zero-valued Gzip_Subfield, ready
// to be filled in by Read.
func NewGzip_Subfield() *Gzip_Subfield {
	return new(Gzip_Subfield)
}

// IO_ returns the stream recorded by the most recent Read call.
func (s Gzip_Subfield) IO_() *kaitai.Stream {
	return s._io
}

// Read parses one subfield from io into s.
//
// io, parent and root are recorded on s. The subfield consists of a 16-bit
// little-endian ID, a 16-bit little-endian length and then that many bytes
// of data. It returns the first stream error, if any.
func (s *Gzip_Subfield) Read(io *kaitai.Stream, parent *Gzip_Subfields, root *Gzip) (err error) {
	s._io = io
	s._parent = parent
	s._root = root

	id, err := s._io.ReadU2le()
	if err != nil {
		return err
	}
	s.Id = uint16(id)

	lenData, err := s._io.ReadU2le()
	if err != nil {
		return err
	}
	s.LenData = uint16(lenData)

	data, err := s._io.ReadBytes(int(s.LenData))
	if err != nil {
		return err
	}
	s.Data = data
	return nil
}

/**
 * Gzip_Subfield.Id: subfield ID, typically two ASCII letters.
 */

// Gzip_Subfields is a container for many subfields, constrained by the size
// of the stream it is read from.
type Gzip_Subfields struct {
	// Entries lists the subfields in the order they were read.
	Entries []*Gzip_Subfield

	_io     *kaitai.Stream
	_root   *Gzip
	_parent *Gzip_Extras
}
// NewGzip_Subfields returns a pointer to a zero-valued Gzip_Subfields,
// ready to be filled in by Read.
func NewGzip_Subfields() *Gzip_Subfields {
	return new(Gzip_Subfields)
}

// IO_ returns the stream recorded by the most recent Read call.
func (s Gzip_Subfields) IO_() *kaitai.Stream {
	return s._io
}

// Read records io, parent and root on s, then parses subfields from io
// until the stream reports EOF, appending each one to Entries. Subfields
// parsed before a failure stay in Entries. It returns the first error from
// the EOF check or from a subfield's Read.
func (s *Gzip_Subfields) Read(io *kaitai.Stream, parent *Gzip_Extras, root *Gzip) (err error) {
	s._io, s._parent, s._root = io, parent, root

	for {
		var atEnd bool
		if atEnd, err = s._io.EOF(); err != nil {
			return err
		}
		if atEnd {
			return nil
		}

		entry := NewGzip_Subfield()
		if err = entry.Read(s._io, s, s._root); err != nil {
			return err
		}
		s.Entries = append(s.Entries, entry)
	}
}