ZIP is a popular archive file format, introduced in 1989 by Phil Katz and originally implemented in the PKZIP utility by PKWARE.
Thanks to solid support for it in most desktop environments and operating systems, and the availability of its algorithms and specs in the public domain, it quickly became the tool of choice for implementing file containers.
For example, Java .jar files, OpenDocument, Office Open XML, EPUB files are actually ZIP archives.
This page hosts a formal specification of the ZIP archive file format using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.
# Format metadata: identification, registry cross-references and the
# parsing defaults that apply to every type in this file.
meta:
  id: zip
  title: ZIP archive file
  file-extension: zip
  # Identifiers of this format in external format registries.
  xref:
    forensicswiki: zip
    iso: 21320-1
    justsolve: ZIP
    loc:
      - fdd000354
      - fdd000355
      - fdd000362
      - fdd000361
    pronom: x-fmt/263
    wikidata: Q136218
  license: CC0-1.0
  ks-version: 0.9
  # External type used by the `file_mod_time` fields.
  imports:
    - /common/dos_datetime
  # Defaults for multi-byte integers (u2, u4, ...) and bit fields (b1, b2, ...).
  endian: le
  bit-endian: le
# Human-readable description of the format. The body of the literal block
# scalar must be indented deeper than the `doc` key to belong to it.
doc: |
  ZIP is a popular archive file format, introduced in 1989 by Phil Katz
  and originally implemented in PKZIP utility by PKWARE.
  Thanks to solid support of it in most desktop environments and
  operating systems, and algorithms / specs availability in public
  domain, it quickly became tool of choice for implementing file
  containers.
  For example, Java .jar files, OpenDocument, Office Open XML, EPUB files
  are actually ZIP archives.
# Reference documentation for the format.
doc-ref:
  - 'https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT'
  - 'https://users.cs.jmu.edu/buchhofp/forensics/formats/pkzip.html'
# Top-level layout: `pk_section` records repeated until the end of the stream.
seq:
  - id: sections
    type: pk_section
    repeat: eos
# Type definitions used by the top-level `seq` and by each other.
types:
  # A single record: the 'PK' magic, a 2-byte record type, and a body whose
  # layout is selected by that record type.
  pk_section:
    seq:
      - id: magic
        contents: 'PK'
      - id: section_type
        type: u2
      - id: body
        type:
          switch-on: section_type
          cases:
            0x0201: central_dir_entry
            0x0403: local_file
            0x0605: end_of_central_dir
            0x0807: data_descriptor
  # CRC-32 and compressed / uncompressed sizes (section type 0x0807).
  data_descriptor:
    seq:
      - id: crc32
        type: u4
      - id: len_body_compressed
        type: u4
      - id: len_body_uncompressed
        type: u4
  # A local file header immediately followed by the file data.
  local_file:
    seq:
      - id: header
        type: local_file_header
      - id: body
        # The data length is taken from the header that was just parsed.
        size: header.len_body_compressed
  local_file_header:
    seq:
      - id: version
        type: u2
      - id: flags
        type: gp_flags
        # Exactly two bytes are handed to `gp_flags` (16 bits of bit fields).
        size: 2
      - id: compression_method
        type: u2
        enum: compression
      - id: file_mod_time
        size: 4
        type: dos_datetime
      - id: crc32
        type: u4
      - id: len_body_compressed
        type: u4
      - id: len_body_uncompressed
        type: u4
      - id: len_file_name
        type: u2
      - id: len_extra
        type: u2
      - id: file_name
        type: str
        size: len_file_name
        encoding: UTF-8
      - id: extra
        size: len_extra
        type: extras
    types:
      gp_flags:
        -orig-id: general purpose bit flag
        doc-ref:
          - https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT - 4.4.4
          - https://users.cs.jmu.edu/buchhofp/forensics/formats/pkzip.html Local file headers
        seq:
          - id: file_encrypted
            type: b1
          - id: comp_options_raw
            type: b2
            doc: internal; access derived value instances instead
          - id: has_data_descriptor
            type: b1
          - id: reserved_1
            type: b1
          - id: comp_patched_data
            type: b1
          - id: strong_encrypt
            type: b1
          - id: reserved_2
            type: b4
          - id: lang_encoding
            type: b1
          - id: reserved_3
            type: b1
          - id: mask_header_values
            type: b1
          - id: reserved_4
            type: b2
        # Interpretations of `comp_options_raw`; each one is only defined for
        # the compression method(s) named in its `if` condition.
        instances:
          deflated_mode:
            value: comp_options_raw
            enum: deflate_mode
            if: |
              _parent.compression_method == compression::deflated
              or _parent.compression_method == compression::enhanced_deflated
          imploded_dict_byte_size:
            value: '((comp_options_raw & 0b01) != 0 ? 8 : 4) * 1024'
            if: '_parent.compression_method == compression::imploded'
            doc: 8KiB or 4KiB in bytes
          imploded_num_sf_trees:
            value: '(comp_options_raw & 0b10) != 0 ? 3 : 2'
            if: '_parent.compression_method == compression::imploded'
          lzma_has_eos_marker:
            value: '(comp_options_raw & 0b01) != 0'
            if: '_parent.compression_method == compression::lzma'
        enums:
          deflate_mode:
            0: normal
            1: maximum
            2: fast
            3: super_fast
  central_dir_entry:
    doc-ref: https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT - 4.3.12
    seq:
      - id: version_made_by
        type: u2
      - id: version_needed_to_extract
        type: u2
      - id: flags
        type: u2
      - id: compression_method
        type: u2
        enum: compression
      - id: file_mod_time
        size: 4
        type: dos_datetime
      - id: crc32
        type: u4
      - id: len_body_compressed
        type: u4
      - id: len_body_uncompressed
        type: u4
      - id: len_file_name
        type: u2
      - id: len_extra
        type: u2
      - id: len_comment
        type: u2
      - id: disk_number_start
        type: u2
      - id: int_file_attr
        type: u2
      - id: ext_file_attr
        type: u4
      - id: ofs_local_header
        type: s4
      - id: file_name
        type: str
        size: len_file_name
        encoding: UTF-8
      - id: extra
        size: len_extra
        type: extras
      - id: comment
        type: str
        size: len_comment
        encoding: UTF-8
    instances:
      # The section located at offset `ofs_local_header`.
      local_header:
        pos: ofs_local_header
        type: pk_section
  # https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT - 4.3.16
  end_of_central_dir:
    seq:
      - id: disk_of_end_of_central_dir
        type: u2
      - id: disk_of_central_dir
        type: u2
      - id: num_central_dir_entries_on_disk
        type: u2
      - id: num_central_dir_entries_total
        type: u2
      - id: len_central_dir
        type: u4
      - id: ofs_central_dir
        type: u4
      - id: len_comment
        type: u2
      - id: comment
        type: str
        size: len_comment
        encoding: UTF-8
  # Contents of an `extra` area: extra fields back to back until the area ends.
  extras:
    seq:
      - id: entries
        type: extra_field
        repeat: eos
  # One extra field: a 2-byte code, a 2-byte body length, and the body.
  extra_field:
    seq:
      - id: code
        type: u2
        enum: extra_codes
      - id: len_body
        type: u2
      - id: body
        size: len_body
        # Codes without a case below are left as raw bytes of `len_body` size.
        type:
          switch-on: code
          cases:
            'extra_codes::ntfs': ntfs
            'extra_codes::extended_timestamp': extended_timestamp
            'extra_codes::infozip_unix_var_size': infozip_unix_var_size
    types:
      ntfs:
        doc-ref: 'https://github.com/LuaDist/zip/blob/b710806/proginfo/extrafld.txt#L191'
        seq:
          - id: reserved
            type: u4
          - id: attributes
            type: attribute
            repeat: eos
        types:
          attribute:
            seq:
              - id: tag
                type: u2
              - id: len_body
                type: u2
              - id: body
                size: len_body
                type:
                  switch-on: tag
                  cases:
                    1: attribute_1
          attribute_1:
            seq:
              - id: last_mod_time
                type: u8
              - id: last_access_time
                type: u8
              - id: creation_time
                type: u8
      extended_timestamp:
        doc-ref: 'https://github.com/LuaDist/zip/blob/b710806/proginfo/extrafld.txt#L817'
        seq:
          - id: flags
            size: 1
            type: info_flags
          - id: mod_time
            type: u4
            if: flags.has_mod_time
            doc: Unix timestamp
          # `not _io.eof` guards on the next two fields: the body may be
          # shorter than the flags suggest. Per the Info-ZIP extra field notes
          # referenced in `doc-ref`, the central-directory copy of this field
          # repeats the flags of the local header but stores at most the
          # modification time, so an unguarded read would run past the end of
          # the `len_body`-sized substream.
          - id: access_time
            type: u4
            if: flags.has_access_time and not _io.eof
            doc: Unix timestamp
          - id: create_time
            type: u4
            if: flags.has_create_time and not _io.eof
            doc: Unix timestamp
        types:
          info_flags:
            seq:
              - id: has_mod_time
                type: b1
              - id: has_access_time
                type: b1
              - id: has_create_time
                type: b1
              - id: reserved
                type: b5
      infozip_unix_var_size:
        doc-ref: 'https://github.com/LuaDist/zip/blob/b710806/proginfo/extrafld.txt#L1339'
        seq:
          - id: version
            type: u1
            doc: Version of this extra field, currently 1
          - id: len_uid
            type: u1
            doc: Size of UID field
          - id: uid
            size: len_uid
            doc: UID (User ID) for a file
          - id: len_gid
            type: u1
            doc: Size of GID field
          - id: gid
            size: len_gid
            doc: GID (Group ID) for a file
# Enumerations referenced via `enum:` from the types of this file.
enums:
  # Values of the `compression_method` fields.
  compression:
    0: none
    1: shrunk
    2: reduced_1
    3: reduced_2
    4: reduced_3
    5: reduced_4
    6: imploded
    8: deflated
    9: enhanced_deflated
    10: pkware_dcl_imploded
    12: bzip2
    14: lzma
    18: ibm_terse
    19: ibm_lz77_z
    93: zstandard
    94: mp3
    95: xz
    96: jpeg
    97: wavpack
    98: ppmd
    99: aex_encryption_marker
  # Values of `extra_field.code`, kept in ascending numeric order.
  extra_codes:
    # https://github.com/LuaDist/zip/blob/b710806/proginfo/extrafld.txt
    0x0001: zip64
    0x0007: av_info
    # 0x0008: reserved for extended language encoding data (PFS) (see APPENDIX D)
    0x0009: os2
    0x000a: ntfs
    0x000c: openvms
    0x000d: pkware_unix
    0x000e: file_stream_and_fork_descriptors
    0x000f: patch_descriptor
    0x0014: pkcs7
    0x0015: x509_cert_id_and_signature_for_file
    0x0016: x509_cert_id_for_central_dir
    0x0017: strong_encryption_header
    0x0018: record_management_controls
    0x0019: pkcs7_enc_recip_cert_list
    0x0065: ibm_s390_uncomp
    0x0066: ibm_s390_comp
    0x4690: poszip_4690
    0x5455: extended_timestamp
    0x6542: beos
    0x756e: asi_unix
    0x7855: infozip_unix
    0x7875: infozip_unix_var_size
    0x9901: aex_encryption
    0xa11e: apache_commons_compress
    0xa220: microsoft_open_packaging_growth_hint
    0xfd4a: sms_qdos