Google Protocol Buffers (AKA protobuf) is a popular data serialization scheme used for communication protocols, data storage, etc. Implementations are available for almost every popular language. The focus points of this scheme are brevity (data is encoded in a very size-efficient manner) and extensibility (one can add keys to the structure while keeping it readable by previous versions of the software).
Protobuf uses a semi-self-describing encoding scheme for its messages. This means that it is possible to parse the overall structure of a message (skipping over fields one can't understand), but to fully understand the message, one needs a protocol definition file (`.proto`). To be specific:

* "Keys" in the key-value pairs of a message are identified only by an integer "field tag". The `.proto` file provides info on which symbolic field names these field tags map to.
* "Keys" also carry a "wire type". It is not a data type in the common sense (i.e. you can't, for example, distinguish `sint32` vs `uint32` vs some enum, or `string` from `bytes`), but it's enough information to determine how many bytes to parse. Interpretation of the value should be done according to the type specified in the `.proto` file.

This page hosts a formal specification of Google Protocol Buffers (protobuf) using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.
// Structure diagram of the GoogleProtobuf type and its nested types
// (Pair, DelimitedBytes), laid out left-to-right.
//
// Each "__seq" node is a table of sequential fields (pos / size / type / id),
// each "__inst__" node is a table holding one computed value, and
// "pair__seq_value_switch" lists the type chosen for `value` per wire_type.
//
// Special characters inside the HTML-like labels are written as XML entities
// (&amp; for "&", &gt; for ">"), because those labels are parsed as XML.
digraph {
	rankdir=LR;
	node [shape=plaintext];

	subgraph cluster__google_protobuf {
		label="GoogleProtobuf";
		graph[style=dotted];

		// Top level: `pairs`, a Pair repeated until the end of the stream.
		google_protobuf__seq [label=<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
			<TR><TD BGCOLOR="#E0FFE0">pos</TD><TD BGCOLOR="#E0FFE0">size</TD><TD BGCOLOR="#E0FFE0">type</TD><TD BGCOLOR="#E0FFE0">id</TD></TR>
			<TR><TD PORT="pairs_pos">0</TD><TD PORT="pairs_size">...</TD><TD>Pair</TD><TD PORT="pairs_type">pairs</TD></TR>
			<TR><TD COLSPAN="4" PORT="pairs__repeat">repeat to end of stream</TD></TR>
		</TABLE>>];

		subgraph cluster__pair {
			label="GoogleProtobuf::Pair";
			graph[style=dotted];

			// A pair is a VlqBase128Le `key` followed by a `value` whose
			// type is selected by wire_type.
			pair__seq [label=<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
				<TR><TD BGCOLOR="#E0FFE0">pos</TD><TD BGCOLOR="#E0FFE0">size</TD><TD BGCOLOR="#E0FFE0">type</TD><TD BGCOLOR="#E0FFE0">id</TD></TR>
				<TR><TD PORT="key_pos">0</TD><TD PORT="key_size">...</TD><TD>VlqBase128Le</TD><TD PORT="key_type">key</TD></TR>
				<TR><TD PORT="value_pos">...</TD><TD PORT="value_size">...</TD><TD>switch (wire_type)</TD><TD PORT="value_type">value</TD></TR>
			</TABLE>>];

			// wire_type: the low three bits of key.value, resolved against
			// WIRE_TYPES. The id cell carries PORT="wire_type_type" so that
			// the edge leaving "pair__inst__wire_type:wire_type_type" below
			// has a port to attach to.
			pair__inst__wire_type [label=<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
				<TR><TD BGCOLOR="#E0FFE0">id</TD><TD BGCOLOR="#E0FFE0">value</TD></TR>
				<TR><TD PORT="wire_type_type">wire_type</TD><TD>Kaitai::Struct::Stream::resolve_enum(WIRE_TYPES, (key.value &amp; 7))</TD></TR>
			</TABLE>>];

			// field_tag: key.value shifted right by three bits.
			pair__inst__field_tag [label=<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
				<TR><TD BGCOLOR="#E0FFE0">id</TD><TD BGCOLOR="#E0FFE0">value</TD></TR>
				<TR><TD>field_tag</TD><TD>(key.value &gt;&gt; 3)</TD></TR>
			</TABLE>>];

			// Type of `value` for each wire_type case listed in this diagram.
			pair__seq_value_switch [label=<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
				<TR><TD BGCOLOR="#F0F2E4">case</TD><TD BGCOLOR="#F0F2E4">type</TD></TR>
				<TR><TD>:wire_types_varint</TD><TD PORT="case0">VlqBase128Le</TD></TR>
				<TR><TD>:wire_types_len_delimited</TD><TD PORT="case1">DelimitedBytes</TD></TR>
			</TABLE>>];
		}

		subgraph cluster__delimited_bytes {
			label="GoogleProtobuf::DelimitedBytes";
			graph[style=dotted];

			// A VlqBase128Le length `len`, then `body` of len.value bytes.
			delimited_bytes__seq [label=<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
				<TR><TD BGCOLOR="#E0FFE0">pos</TD><TD BGCOLOR="#E0FFE0">size</TD><TD BGCOLOR="#E0FFE0">type</TD><TD BGCOLOR="#E0FFE0">id</TD></TR>
				<TR><TD PORT="len_pos">0</TD><TD PORT="len_size">...</TD><TD>VlqBase128Le</TD><TD PORT="len_type">len</TD></TR>
				<TR><TD PORT="body_pos">...</TD><TD PORT="body_size">len.value</TD><TD></TD><TD PORT="body_type">body</TD></TR>
			</TABLE>>];
		}
	}

	// NOTE(review): vlq_base128_le__seq and vlq_base128_le__inst__value are not
	// declared anywhere in this graph, so Graphviz creates them implicitly as
	// bare nodes without ports. Presumably they belong to the external
	// VlqBase128Le type; confirm whether its cluster should be included here.

	// Bold edges: from a field to the node describing that field's type.
	google_protobuf__seq:pairs_type -> pair__seq [style=bold];
	pair__seq:key_type -> vlq_base128_le__seq [style=bold];
	pair__seq:value_type -> pair__seq_value_switch [style=bold];
	pair__seq_value_switch:case0 -> vlq_base128_le__seq [style=bold];
	pair__seq_value_switch:case1 -> delimited_bytes__seq [style=bold];
	delimited_bytes__seq:len_type -> vlq_base128_le__seq [style=bold];

	// Gray edges: from a value to the field or expression that uses it.
	pair__inst__wire_type:wire_type_type -> pair__seq:value_type [color="#404040"];
	vlq_base128_le__inst__value:value_type -> pair__inst__wire_type [color="#404040"];
	vlq_base128_le__inst__value:value_type -> pair__inst__field_tag [color="#404040"];
	vlq_base128_le__inst__value:value_type -> delimited_bytes__seq:body_size [color="#404040"];
}