ZIP is a popular archive file format, introduced in 1989 by Phil Katz and originally implemented in the PKZIP utility by PKWARE.
Thanks to solid support for it in most desktop environments and operating systems, and the availability of its algorithms and specifications in the public domain, it quickly became the tool of choice for implementing file containers.
For example, Java .jar files, OpenDocument, Office Open XML, EPUB files are actually ZIP archives.
This page hosts a formal specification of the ZIP archive file format using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.
All parsing code for Ruby generated by Kaitai Struct depends on the Ruby runtime library. You have to install it before you can parse data.
The Ruby runtime library can be installed from RubyGems:
gem install kaitai-struct
Parse a local file and get structure in memory:
data = Zip.from_file("path/to/local/file.zip")
Or parse structure from a string of bytes:
bytes = "\x00\x01\x02..."
data = Zip.new(Kaitai::Struct::Stream.new(bytes))
After that, one can get various attributes from the structure by invoking getter methods like:
data.sections # => get sections
# This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild
require 'kaitai/struct/struct'
unless Gem::Version.new(Kaitai::Struct::VERSION) >= Gem::Version.new('0.9')
raise "Incompatible Kaitai Struct Ruby API: 0.9 or later is required, but you have #{Kaitai::Struct::VERSION}"
end
##
# ZIP is a popular archive file format, introduced in 1989 by Phil Katz
# and originally implemented in the PKZIP utility by PKWARE.
#
# Thanks to solid support for it in most desktop environments and
# operating systems, and the availability of its algorithms and
# specifications in the public domain, it quickly became the tool of
# choice for implementing file containers.
#
# For example, Java .jar files, OpenDocument, Office Open XML, EPUB files
# are actually ZIP archives.
# @see https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT Source
# @see https://users.cs.jmu.edu/buchhofp/forensics/formats/pkzip.html Source
class Zip < Kaitai::Struct::Struct
##
# Compression method lookup table: numeric code => symbolic name.
#
# Used with Kaitai::Struct::Stream::resolve_enum when decoding the
# +compression_method+ field. The table is shared by every parsed object,
# so it is frozen to rule out accidental mutation at runtime.
COMPRESSION = {
  0 => :compression_none,
  1 => :compression_shrunk,
  2 => :compression_reduced_1,
  3 => :compression_reduced_2,
  4 => :compression_reduced_3,
  5 => :compression_reduced_4,
  6 => :compression_imploded,
  8 => :compression_deflated,
  9 => :compression_enhanced_deflated,
  10 => :compression_pkware_dcl_imploded,
  12 => :compression_bzip2,
  14 => :compression_lzma,
  18 => :compression_ibm_terse,
  19 => :compression_ibm_lz77_z,
  93 => :compression_zstandard,
  94 => :compression_mp3,
  95 => :compression_xz,
  96 => :compression_jpeg,
  97 => :compression_wavpack,
  98 => :compression_ppmd,
  99 => :compression_aex_encryption_marker,
}.freeze
##
# Reverse lookup table: symbolic name => numeric code.
I__COMPRESSION = COMPRESSION.invert.freeze
##
# Extra field code lookup table: numeric code => symbolic name.
#
# Used with Kaitai::Struct::Stream::resolve_enum when decoding the 16-bit
# code that starts every extra field record. The table is shared by every
# parsed object, so it is frozen to rule out accidental mutation at runtime.
EXTRA_CODES = {
  1 => :extra_codes_zip64,
  7 => :extra_codes_av_info,
  9 => :extra_codes_os2,
  10 => :extra_codes_ntfs,
  12 => :extra_codes_openvms,
  13 => :extra_codes_pkware_unix,
  14 => :extra_codes_file_stream_and_fork_descriptors,
  15 => :extra_codes_patch_descriptor,
  20 => :extra_codes_pkcs7,
  21 => :extra_codes_x509_cert_id_and_signature_for_file,
  22 => :extra_codes_x509_cert_id_for_central_dir,
  23 => :extra_codes_strong_encryption_header,
  24 => :extra_codes_record_management_controls,
  25 => :extra_codes_pkcs7_enc_recip_cert_list,
  101 => :extra_codes_ibm_s390_uncomp,
  102 => :extra_codes_ibm_s390_comp,
  18064 => :extra_codes_poszip_4690, # 0x4690
  21589 => :extra_codes_extended_timestamp, # 0x5455
  25922 => :extra_codes_beos, # 0x6542
  30062 => :extra_codes_asi_unix, # 0x756e
  30805 => :extra_codes_infozip_unix, # 0x7855
  30837 => :extra_codes_infozip_unix_var_size, # 0x7875
  39169 => :extra_codes_aex_encryption, # 0x9901
  41246 => :extra_codes_apache_commons_compress, # 0xa11e
  41504 => :extra_codes_microsoft_open_packaging_growth_hint, # 0xa220
  64842 => :extra_codes_sms_qdos, # 0xfd4a
}.freeze
##
# Reverse lookup table: symbolic name => numeric code.
I__EXTRA_CODES = EXTRA_CODES.invert.freeze
##
# Hands the stream, parent and root to the base class unchanged, then
# parses the archive right away by calling +_read+.
def initialize(_io, _parent = nil, _root = self)
  super # bare super forwards _io, _parent and _root as received
  _read
end
##
# Parses PkSection records one after another until the stream reports
# end-of-file, collecting them in +sections+ in the order they were read.
#
# Returns +self+.
def _read
  @sections = []
  @sections << PkSection.new(@_io, self, @_root) until @_io.eof?
  self
end
##
# A local file record: a LocalFileHeader followed directly by the file's
# stored bytes.
class LocalFile < Kaitai::Struct::Struct
  attr_reader :header, :body

  def initialize(_io, _parent = nil, _root = self)
    super
    _read
  end

  ##
  # Parses the header, then reads as many body bytes as the header's
  # +len_body_compressed+ field announces. Returns +self+.
  def _read
    @header = LocalFileHeader.new(@_io, self, @_root)
    body_size = @header.len_body_compressed
    @body = @_io.read_bytes(body_size)
    self
  end
end
##
# Data descriptor record: a CRC-32 field and the compressed and
# uncompressed body lengths, each read with +read_u4le+.
class DataDescriptor < Kaitai::Struct::Struct
  attr_reader :crc32, :len_body_compressed, :len_body_uncompressed

  def initialize(_io, _parent = nil, _root = self)
    super
    _read
  end

  ##
  # Reads the three fields in on-disk order. Returns +self+.
  def _read
    io = @_io
    @crc32 = io.read_u4le
    @len_body_compressed = io.read_u4le
    @len_body_uncompressed = io.read_u4le
    self
  end
end
##
# One record of an "extra field" area: a 16-bit code, a 16-bit body length
# and a body of exactly that many bytes.
#
# Bodies whose code is NTFS, extended timestamp or Info-ZIP Unix
# (variable size) are parsed into dedicated structures from their own
# substream; every other body is kept as raw bytes.
class ExtraField < Kaitai::Struct::Struct
  attr_reader :code
  attr_reader :len_body
  attr_reader :body
  attr_reader :_raw_body

  def initialize(_io, _parent = nil, _root = self)
    super
    _read
  end

  ##
  # Reads the code and length, then the body. Returns +self+.
  def _read
    @code = Kaitai::Struct::Stream::resolve_enum(Zip::EXTRA_CODES, @_io.read_u2le)
    @len_body = @_io.read_u2le
    body_class =
      case code
      when :extra_codes_ntfs then Ntfs
      when :extra_codes_extended_timestamp then ExtendedTimestamp
      when :extra_codes_infozip_unix_var_size then InfozipUnixVarSize
      end
    if body_class
      # Structured bodies are parsed from a substream limited to len_body
      # bytes, so a nested parser cannot read past this record.
      @_raw_body = @_io.read_bytes(len_body)
      @body = body_class.new(Kaitai::Struct::Stream.new(@_raw_body), self, @_root)
    else
      @body = @_io.read_bytes(len_body)
    end
    self
  end

  ##
  # @see https://github.com/LuaDist/zip/blob/b710806/proginfo/extrafld.txt#L191 Source
  class Ntfs < Kaitai::Struct::Struct
    attr_reader :reserved
    attr_reader :attributes

    def initialize(_io, _parent = nil, _root = self)
      super
      _read
    end

    ##
    # Reads the reserved field, then attributes until the substream is
    # exhausted. Returns +self+.
    def _read
      @reserved = @_io.read_u4le
      @attributes = []
      @attributes << Attribute.new(@_io, self, @_root) until @_io.eof?
      self
    end

    ##
    # A tagged attribute: 16-bit tag, 16-bit body length, body. Only tag 1
    # is parsed into a structure (Attribute1); other tags stay raw bytes.
    class Attribute < Kaitai::Struct::Struct
      attr_reader :tag
      attr_reader :len_body
      attr_reader :body
      attr_reader :_raw_body

      def initialize(_io, _parent = nil, _root = self)
        super
        _read
      end

      def _read
        @tag = @_io.read_u2le
        @len_body = @_io.read_u2le
        if tag == 1
          @_raw_body = @_io.read_bytes(len_body)
          @body = Attribute1.new(Kaitai::Struct::Stream.new(@_raw_body), self, @_root)
        else
          @body = @_io.read_bytes(len_body)
        end
        self
      end
    end

    ##
    # Body of attribute tag 1: modification, access and creation times,
    # each read with +read_u8le+.
    class Attribute1 < Kaitai::Struct::Struct
      attr_reader :last_mod_time
      attr_reader :last_access_time
      attr_reader :creation_time

      def initialize(_io, _parent = nil, _root = self)
        super
        _read
      end

      def _read
        @last_mod_time = @_io.read_u8le
        @last_access_time = @_io.read_u8le
        @creation_time = @_io.read_u8le
        self
      end
    end
  end

  ##
  # @see https://github.com/LuaDist/zip/blob/b710806/proginfo/extrafld.txt#L817 Source
  class ExtendedTimestamp < Kaitai::Struct::Struct
    attr_reader :flags
    ##
    # Unix timestamp
    attr_reader :mod_time
    ##
    # Unix timestamp; +nil+ when the flag is clear or the field body ends
    # before it
    attr_reader :access_time
    ##
    # Unix timestamp; +nil+ when the flag is clear or the field body ends
    # before it
    attr_reader :create_time
    attr_reader :_raw_flags

    def initialize(_io, _parent = nil, _root = self)
      super
      _read
    end

    ##
    # Reads the flags byte and then whichever timestamps are present.
    # Returns +self+.
    def _read
      @_raw_flags = @_io.read_bytes(1)
      @flags = InfoFlags.new(Kaitai::Struct::Stream.new(@_raw_flags), self, @_root)
      @mod_time = @_io.read_u4le if flags.has_mod_time
      # The central-directory copy of this field repeats the local header's
      # flags byte but carries at most the modification time. The access
      # and creation bits can therefore be set with nothing left to read,
      # so those fields are read only while the field body has bytes left.
      # A partially present value still raises, as before.
      @access_time = @_io.read_u4le if flags.has_access_time && !@_io.eof?
      @create_time = @_io.read_u4le if flags.has_create_time && !@_io.eof?
      self
    end

    ##
    # Flags byte: three 1-bit presence flags followed by 5 reserved bits,
    # read with +read_bits_int_le+.
    class InfoFlags < Kaitai::Struct::Struct
      attr_reader :has_mod_time
      attr_reader :has_access_time
      attr_reader :has_create_time
      attr_reader :reserved

      def initialize(_io, _parent = nil, _root = self)
        super
        _read
      end

      def _read
        @has_mod_time = @_io.read_bits_int_le(1) != 0
        @has_access_time = @_io.read_bits_int_le(1) != 0
        @has_create_time = @_io.read_bits_int_le(1) != 0
        @reserved = @_io.read_bits_int_le(5)
        self
      end
    end
  end

  ##
  # @see https://github.com/LuaDist/zip/blob/b710806/proginfo/extrafld.txt#L1339 Source
  class InfozipUnixVarSize < Kaitai::Struct::Struct
    ##
    # Version of this extra field, currently 1
    attr_reader :version
    ##
    # Size of UID field
    attr_reader :len_uid
    ##
    # UID (User ID) for a file
    attr_reader :uid
    ##
    # Size of GID field
    attr_reader :len_gid
    ##
    # GID (Group ID) for a file
    attr_reader :gid

    def initialize(_io, _parent = nil, _root = self)
      super
      _read
    end

    ##
    # Reads the version byte, then the length-prefixed UID and GID.
    # Returns +self+.
    def _read
      @version = @_io.read_u1
      @len_uid = @_io.read_u1
      @uid = @_io.read_bytes(len_uid)
      @len_gid = @_io.read_u1
      @gid = @_io.read_bytes(len_gid)
      self
    end
  end
end
##
# Central directory entry: fixed-size fields followed by the file name,
# an extra field area and a comment whose lengths are given by the
# +len_file_name+, +len_extra+ and +len_comment+ fields.
# @see https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT - 4.3.12
class CentralDirEntry < Kaitai::Struct::Struct
  attr_reader :version_made_by, :version_needed_to_extract, :flags
  attr_reader :compression_method, :file_mod_time, :crc32
  attr_reader :len_body_compressed, :len_body_uncompressed
  attr_reader :len_file_name, :len_extra, :len_comment
  attr_reader :disk_number_start, :int_file_attr, :ext_file_attr
  attr_reader :ofs_local_header
  attr_reader :file_name, :extra, :comment
  attr_reader :_raw_file_mod_time, :_raw_extra

  def initialize(_io, _parent = nil, _root = self)
    super
    _read
  end

  ##
  # Reads every field in on-disk order. Returns +self+.
  def _read
    io = @_io
    @version_made_by = io.read_u2le
    @version_needed_to_extract = io.read_u2le
    @flags = io.read_u2le
    @compression_method = Kaitai::Struct::Stream::resolve_enum(Zip::COMPRESSION, io.read_u2le)
    # NOTE(review): DosDatetime is not defined in this file; presumably it
    # comes from a separately loaded generated file -- verify it is required.
    @_raw_file_mod_time = io.read_bytes(4)
    @file_mod_time = DosDatetime.new(Kaitai::Struct::Stream.new(@_raw_file_mod_time))
    @crc32 = io.read_u4le
    @len_body_compressed = io.read_u4le
    @len_body_uncompressed = io.read_u4le
    @len_file_name = io.read_u2le
    @len_extra = io.read_u2le
    @len_comment = io.read_u2le
    @disk_number_start = io.read_u2le
    @int_file_attr = io.read_u2le
    @ext_file_attr = io.read_u4le
    @ofs_local_header = io.read_s4le
    @file_name = io.read_bytes(len_file_name).force_encoding("UTF-8")
    # The extra area is parsed from its own substream of len_extra bytes.
    @_raw_extra = io.read_bytes(len_extra)
    @extra = Extras.new(Kaitai::Struct::Stream.new(@_raw_extra), self, @_root)
    @comment = io.read_bytes(len_comment).force_encoding("UTF-8")
    self
  end

  ##
  # The PkSection found at +ofs_local_header+ in this entry's stream.
  # Parsed on first access and cached; the stream position is put back
  # to where it was once parsing finishes.
  def local_header
    if @local_header.nil?
      saved_pos = @_io.pos
      @_io.seek(ofs_local_header)
      @local_header = PkSection.new(@_io, self, @_root)
      @_io.seek(saved_pos)
    end
    @local_header
  end
end
##
# One top-level section of the archive: the two magic bytes "PK", a 16-bit
# section type and, for the four known types, a typed body.
class PkSection < Kaitai::Struct::Struct
  attr_reader :magic, :section_type, :body

  def initialize(_io, _parent = nil, _root = self)
    super
    _read
  end

  ##
  # Validates the magic, reads the section type and parses the matching
  # body. For an unrecognised type no body is read and +body+ stays +nil+.
  #
  # Raises Kaitai::Struct::ValidationNotEqualError when the magic bytes
  # are not "PK". Returns +self+.
  def _read
    @magic = @_io.read_bytes(2)
    expected_magic = [80, 75].pack('C*')
    unless magic == expected_magic
      raise Kaitai::Struct::ValidationNotEqualError.new(expected_magic, magic, _io, "/types/pk_section/seq/0")
    end
    @section_type = @_io.read_u2le
    body_class =
      case section_type
      when 0x0201 then CentralDirEntry # 513
      when 0x0403 then LocalFile       # 1027
      when 0x0605 then EndOfCentralDir # 1541
      when 0x0807 then DataDescriptor  # 2055
      end
    @body = body_class.new(@_io, self, @_root) if body_class
    self
  end
end
##
# An extra field area: ExtraField records read back to back until the
# stream is exhausted.
class Extras < Kaitai::Struct::Struct
  attr_reader :entries

  def initialize(_io, _parent = nil, _root = self)
    super
    _read
  end

  ##
  # Collects ExtraField records into +entries+ until end-of-stream.
  # Returns +self+.
  def _read
    @entries = []
    @entries << ExtraField.new(@_io, self, @_root) until @_io.eof?
    self
  end
end
##
# Local file header: fixed-size fields followed by the file name and an
# extra field area whose lengths are given by +len_file_name+ and
# +len_extra+.
class LocalFileHeader < Kaitai::Struct::Struct
  attr_reader :version, :flags, :compression_method, :file_mod_time
  attr_reader :crc32, :len_body_compressed, :len_body_uncompressed
  attr_reader :len_file_name, :len_extra
  attr_reader :file_name, :extra
  attr_reader :_raw_flags, :_raw_file_mod_time, :_raw_extra

  def initialize(_io, _parent = nil, _root = self)
    super
    _read
  end

  ##
  # Reads every field in on-disk order. Returns +self+.
  def _read
    io = @_io
    @version = io.read_u2le
    # The flags are parsed from their own 2-byte substream.
    @_raw_flags = io.read_bytes(2)
    @flags = GpFlags.new(Kaitai::Struct::Stream.new(@_raw_flags), self, @_root)
    @compression_method = Kaitai::Struct::Stream::resolve_enum(Zip::COMPRESSION, io.read_u2le)
    # NOTE(review): DosDatetime is not defined in this file; presumably it
    # comes from a separately loaded generated file -- verify it is required.
    @_raw_file_mod_time = io.read_bytes(4)
    @file_mod_time = DosDatetime.new(Kaitai::Struct::Stream.new(@_raw_file_mod_time))
    @crc32 = io.read_u4le
    @len_body_compressed = io.read_u4le
    @len_body_uncompressed = io.read_u4le
    @len_file_name = io.read_u2le
    @len_extra = io.read_u2le
    @file_name = io.read_bytes(len_file_name).force_encoding("UTF-8")
    @_raw_extra = io.read_bytes(len_extra)
    @extra = Extras.new(Kaitai::Struct::Stream.new(@_raw_extra), self, @_root)
    self
  end

  ##
  # General purpose bit flags, read as 16 bits with +read_bits_int_le+.
  #
  # The derived readers (+deflated_mode+, +imploded_dict_byte_size+,
  # +imploded_num_sf_trees+, +lzma_has_eos_marker+) interpret
  # +comp_options_raw+ according to the parent's +compression_method+.
  # They are evaluated on first access and return +nil+ when the parent's
  # compression method does not match.
  # @see https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT - 4.4.4
  # @see https://users.cs.jmu.edu/buchhofp/forensics/formats/pkzip.html Local file headers
  class GpFlags < Kaitai::Struct::Struct
    DEFLATE_MODE = {
      0 => :deflate_mode_normal,
      1 => :deflate_mode_maximum,
      2 => :deflate_mode_fast,
      3 => :deflate_mode_super_fast,
    }
    I__DEFLATE_MODE = DEFLATE_MODE.invert

    attr_reader :file_encrypted
    ##
    # internal; access derived value instances instead
    attr_reader :comp_options_raw
    attr_reader :has_data_descriptor
    attr_reader :reserved_1
    attr_reader :comp_patched_data
    attr_reader :strong_encrypt
    attr_reader :reserved_2
    attr_reader :lang_encoding
    attr_reader :reserved_3
    attr_reader :mask_header_values
    attr_reader :reserved_4

    def initialize(_io, _parent = nil, _root = self)
      super
      _read
    end

    ##
    # Reads the bit fields in order (1+2+1+1+1+1+4+1+1+1+2 = 16 bits).
    # Returns +self+.
    def _read
      io = @_io
      @file_encrypted = io.read_bits_int_le(1) != 0
      @comp_options_raw = io.read_bits_int_le(2)
      @has_data_descriptor = io.read_bits_int_le(1) != 0
      @reserved_1 = io.read_bits_int_le(1) != 0
      @comp_patched_data = io.read_bits_int_le(1) != 0
      @strong_encrypt = io.read_bits_int_le(1) != 0
      @reserved_2 = io.read_bits_int_le(4)
      @lang_encoding = io.read_bits_int_le(1) != 0
      @reserved_3 = io.read_bits_int_le(1) != 0
      @mask_header_values = io.read_bits_int_le(1) != 0
      @reserved_4 = io.read_bits_int_le(2)
      self
    end

    ##
    # DEFLATE_MODE value for deflated / enhanced-deflated entries.
    def deflated_mode
      if @deflated_mode.nil? &&
         %i[compression_deflated compression_enhanced_deflated].include?(_parent.compression_method)
        @deflated_mode = Kaitai::Struct::Stream::resolve_enum(DEFLATE_MODE, comp_options_raw)
      end
      @deflated_mode
    end

    ##
    # 8KiB or 4KiB in bytes
    def imploded_dict_byte_size
      if @imploded_dict_byte_size.nil? && _parent.compression_method == :compression_imploded
        kib = (comp_options_raw & 1).zero? ? 4 : 8
        @imploded_dict_byte_size = kib * 1024
      end
      @imploded_dict_byte_size
    end

    ##
    # 3 when bit 1 of +comp_options_raw+ is set, otherwise 2 (imploded
    # entries only).
    def imploded_num_sf_trees
      if @imploded_num_sf_trees.nil? && _parent.compression_method == :compression_imploded
        @imploded_num_sf_trees = (comp_options_raw & 2).zero? ? 2 : 3
      end
      @imploded_num_sf_trees
    end

    ##
    # Whether bit 0 of +comp_options_raw+ is set (LZMA entries only).
    def lzma_has_eos_marker
      if @lzma_has_eos_marker.nil? && _parent.compression_method == :compression_lzma
        @lzma_has_eos_marker = (comp_options_raw & 1) == 1
      end
      @lzma_has_eos_marker
    end
  end
end
##
# End of central directory record: disk numbers, entry counts, the length
# and offset of the central directory, and a length-prefixed comment.
class EndOfCentralDir < Kaitai::Struct::Struct
  attr_reader :disk_of_end_of_central_dir, :disk_of_central_dir
  attr_reader :num_central_dir_entries_on_disk, :num_central_dir_entries_total
  attr_reader :len_central_dir, :ofs_central_dir
  attr_reader :len_comment, :comment

  def initialize(_io, _parent = nil, _root = self)
    super
    _read
  end

  ##
  # Reads every field in on-disk order. Returns +self+.
  def _read
    io = @_io
    @disk_of_end_of_central_dir = io.read_u2le
    @disk_of_central_dir = io.read_u2le
    @num_central_dir_entries_on_disk = io.read_u2le
    @num_central_dir_entries_total = io.read_u2le
    @len_central_dir = io.read_u4le
    @ofs_central_dir = io.read_u4le
    @len_comment = io.read_u2le
    @comment = io.read_bytes(len_comment).force_encoding("UTF-8")
    self
  end
end
attr_reader :sections
end