SQLite3 database file: Python parsing library

SQLite3 is a popular serverless SQL engine, implemented as a library to be used within other applications. It keeps its databases as regular disk files.

Every database file is segmented into pages. First page (starting at the very beginning) is special: it contains a file-global header which specifies some data relevant to proper parsing (i.e. format versions, size of page, etc). After the header, normal contents of the first page follow.

Each page would be of some type, and generally, they would be reached via the links starting from the first page. First page type (root_page) is always "btree_page".

File extension

["sqlite", "db", "db3", "sqlite3"]

KS implementation details

License: CC0-1.0
Minimal Kaitai Struct required: 0.9

References

This page hosts a formal specification of SQLite3 database file using Kaitai Struct. This specification can be automatically translated into a variety of programming languages to get a parsing library.

Usage

Runtime library

All parsing code for Python generated by Kaitai Struct depends on the Python runtime library. You have to install it before you can parse data.

The Python runtime library can be installed from PyPI:

python3 -m pip install kaitaistruct

Code

Parse a local file and get structure in memory:

data = Sqlite3.from_file("path/to/local/file.sqlite")

Or parse structure from a bytes:

from kaitaistruct import KaitaiStream, BytesIO

raw = b"\x00\x01\x02..."
data = Sqlite3(KaitaiStream(BytesIO(raw)))

After that, one can get various attributes from the structure by invoking getter methods like:

data.len_page_mod # => The database page size in bytes. Must be a power of two between
512 and 32768 inclusive, or the value 1 representing a page size
of 65536.

Python source code to parse SQLite3 database file

sqlite3.py

# This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild
# type: ignore

import kaitaistruct
from kaitaistruct import KaitaiStruct, KaitaiStream, BytesIO
import vlq_base128_be
from enum import IntEnum


if getattr(kaitaistruct, 'API_VERSION', (0, 9)) < (0, 11):
    raise Exception("Incompatible Kaitai Struct Python API: 0.11 or later is required, but you have %s" % (kaitaistruct.__version__))

class Sqlite3(KaitaiStruct):
    """SQLite3 is a popular serverless SQL engine, implemented as a library
    to be used within other applications. It keeps its databases as
    regular disk files.
    
    Every database file is segmented into pages. First page (starting at
    the very beginning) is special: it contains a file-global header
    which specifies some data relevant to proper parsing (i.e. format
    versions, size of page, etc). After the header, normal contents of
    the first page follow.
    
    Each page would be of some type, and generally, they would be
    reached via the links starting from the first page. First page type
    (`root_page`) is always "btree_page".
    
    .. seealso::
       Source - https://www.sqlite.org/fileformat.html
    """

    class Encodings(IntEnum):
        utf_8 = 1
        utf_16le = 2
        utf_16be = 3

    class Versions(IntEnum):
        legacy = 1
        wal = 2
    def __init__(self, _io, _parent=None, _root=None):
        super(Sqlite3, self).__init__(_io)
        self._parent = _parent
        self._root = _root or self
        self._read()

    def _read(self):
        self.magic = self._io.read_bytes(16)
        if not self.magic == b"\x53\x51\x4C\x69\x74\x65\x20\x66\x6F\x72\x6D\x61\x74\x20\x33\x00":
            raise kaitaistruct.ValidationNotEqualError(b"\x53\x51\x4C\x69\x74\x65\x20\x66\x6F\x72\x6D\x61\x74\x20\x33\x00", self.magic, self._io, u"/seq/0")
        self.len_page_mod = self._io.read_u2be()
        self.write_version = KaitaiStream.resolve_enum(Sqlite3.Versions, self._io.read_u1())
        self.read_version = KaitaiStream.resolve_enum(Sqlite3.Versions, self._io.read_u1())
        self.reserved_space = self._io.read_u1()
        self.max_payload_frac = self._io.read_u1()
        self.min_payload_frac = self._io.read_u1()
        self.leaf_payload_frac = self._io.read_u1()
        self.file_change_counter = self._io.read_u4be()
        self.num_pages = self._io.read_u4be()
        self.first_freelist_trunk_page = self._io.read_u4be()
        self.num_freelist_pages = self._io.read_u4be()
        self.schema_cookie = self._io.read_u4be()
        self.schema_format = self._io.read_u4be()
        self.def_page_cache_size = self._io.read_u4be()
        self.largest_root_page = self._io.read_u4be()
        self.text_encoding = KaitaiStream.resolve_enum(Sqlite3.Encodings, self._io.read_u4be())
        self.user_version = self._io.read_u4be()
        self.is_incremental_vacuum = self._io.read_u4be()
        self.application_id = self._io.read_u4be()
        self.reserved = self._io.read_bytes(20)
        self.version_valid_for = self._io.read_u4be()
        self.sqlite_version_number = self._io.read_u4be()
        self.root_page = Sqlite3.BtreePage(self._io, self, self._root)


    def _fetch_instances(self):
        pass
        self.root_page._fetch_instances()

    class BtreePage(KaitaiStruct):
        def __init__(self, _io, _parent=None, _root=None):
            super(Sqlite3.BtreePage, self).__init__(_io)
            self._parent = _parent
            self._root = _root
            self._read()

        def _read(self):
            self.page_type = self._io.read_u1()
            self.first_freeblock = self._io.read_u2be()
            self.num_cells = self._io.read_u2be()
            self.ofs_cells = self._io.read_u2be()
            self.num_frag_free_bytes = self._io.read_u1()
            if  ((self.page_type == 2) or (self.page_type == 5)) :
                pass
                self.right_ptr = self._io.read_u4be()

            self.cells = []
            for i in range(self.num_cells):
                self.cells.append(Sqlite3.RefCell(self._io, self, self._root))



        def _fetch_instances(self):
            pass
            if  ((self.page_type == 2) or (self.page_type == 5)) :
                pass

            for i in range(len(self.cells)):
                pass
                self.cells[i]._fetch_instances()



    class CellIndexInterior(KaitaiStruct):
        """
        .. seealso::
           Source - https://www.sqlite.org/fileformat.html#b_tree_pages
        """
        def __init__(self, _io, _parent=None, _root=None):
            super(Sqlite3.CellIndexInterior, self).__init__(_io)
            self._parent = _parent
            self._root = _root
            self._read()

        def _read(self):
            self.left_child_page = self._io.read_u4be()
            self.len_payload = vlq_base128_be.VlqBase128Be(self._io)
            self._raw_payload = self._io.read_bytes(self.len_payload.value)
            _io__raw_payload = KaitaiStream(BytesIO(self._raw_payload))
            self.payload = Sqlite3.CellPayload(_io__raw_payload, self, self._root)


        def _fetch_instances(self):
            pass
            self.len_payload._fetch_instances()
            self.payload._fetch_instances()


    class CellIndexLeaf(KaitaiStruct):
        """
        .. seealso::
           Source - https://www.sqlite.org/fileformat.html#b_tree_pages
        """
        def __init__(self, _io, _parent=None, _root=None):
            super(Sqlite3.CellIndexLeaf, self).__init__(_io)
            self._parent = _parent
            self._root = _root
            self._read()

        def _read(self):
            self.len_payload = vlq_base128_be.VlqBase128Be(self._io)
            self._raw_payload = self._io.read_bytes(self.len_payload.value)
            _io__raw_payload = KaitaiStream(BytesIO(self._raw_payload))
            self.payload = Sqlite3.CellPayload(_io__raw_payload, self, self._root)


        def _fetch_instances(self):
            pass
            self.len_payload._fetch_instances()
            self.payload._fetch_instances()


    class CellPayload(KaitaiStruct):
        """
        .. seealso::
           Source - https://sqlite.org/fileformat2.html#record_format
        """
        def __init__(self, _io, _parent=None, _root=None):
            super(Sqlite3.CellPayload, self).__init__(_io)
            self._parent = _parent
            self._root = _root
            self._read()

        def _read(self):
            self.len_header_and_len = vlq_base128_be.VlqBase128Be(self._io)
            self._raw_column_serials = self._io.read_bytes(self.len_header_and_len.value - 1)
            _io__raw_column_serials = KaitaiStream(BytesIO(self._raw_column_serials))
            self.column_serials = Sqlite3.Serials(_io__raw_column_serials, self, self._root)
            self.column_contents = []
            for i in range(len(self.column_serials.entries)):
                self.column_contents.append(Sqlite3.ColumnContent(self.column_serials.entries[i], self._io, self, self._root))



        def _fetch_instances(self):
            pass
            self.len_header_and_len._fetch_instances()
            self.column_serials._fetch_instances()
            for i in range(len(self.column_contents)):
                pass
                self.column_contents[i]._fetch_instances()



    class CellTableInterior(KaitaiStruct):
        """
        .. seealso::
           Source - https://www.sqlite.org/fileformat.html#b_tree_pages
        """
        def __init__(self, _io, _parent=None, _root=None):
            super(Sqlite3.CellTableInterior, self).__init__(_io)
            self._parent = _parent
            self._root = _root
            self._read()

        def _read(self):
            self.left_child_page = self._io.read_u4be()
            self.row_id = vlq_base128_be.VlqBase128Be(self._io)


        def _fetch_instances(self):
            pass
            self.row_id._fetch_instances()


    class CellTableLeaf(KaitaiStruct):
        """
        .. seealso::
           Source - https://www.sqlite.org/fileformat.html#b_tree_pages
        """
        def __init__(self, _io, _parent=None, _root=None):
            super(Sqlite3.CellTableLeaf, self).__init__(_io)
            self._parent = _parent
            self._root = _root
            self._read()

        def _read(self):
            self.len_payload = vlq_base128_be.VlqBase128Be(self._io)
            self.row_id = vlq_base128_be.VlqBase128Be(self._io)
            self._raw_payload = self._io.read_bytes(self.len_payload.value)
            _io__raw_payload = KaitaiStream(BytesIO(self._raw_payload))
            self.payload = Sqlite3.CellPayload(_io__raw_payload, self, self._root)


        def _fetch_instances(self):
            pass
            self.len_payload._fetch_instances()
            self.row_id._fetch_instances()
            self.payload._fetch_instances()


    class ColumnContent(KaitaiStruct):
        def __init__(self, serial_type, _io, _parent=None, _root=None):
            super(Sqlite3.ColumnContent, self).__init__(_io)
            self._parent = _parent
            self._root = _root
            self.serial_type = serial_type
            self._read()

        def _read(self):
            if  ((self.serial_type.code.value >= 1) and (self.serial_type.code.value <= 6)) :
                pass
                _on = self.serial_type.code.value
                if _on == 1:
                    pass
                    self.as_int = self._io.read_u1()
                elif _on == 2:
                    pass
                    self.as_int = self._io.read_u2be()
                elif _on == 3:
                    pass
                    self.as_int = self._io.read_bits_int_be(24)
                elif _on == 4:
                    pass
                    self.as_int = self._io.read_u4be()
                elif _on == 5:
                    pass
                    self.as_int = self._io.read_bits_int_be(48)
                elif _on == 6:
                    pass
                    self.as_int = self._io.read_u8be()

            if self.serial_type.code.value == 7:
                pass
                self.as_float = self._io.read_f8be()

            if self.serial_type.is_blob:
                pass
                self.as_blob = self._io.read_bytes(self.serial_type.len_content)

            self.as_str = (self._io.read_bytes(self.serial_type.len_content)).decode(u"UTF-8")


        def _fetch_instances(self):
            pass
            if  ((self.serial_type.code.value >= 1) and (self.serial_type.code.value <= 6)) :
                pass
                _on = self.serial_type.code.value
                if _on == 1:
                    pass
                elif _on == 2:
                    pass
                elif _on == 3:
                    pass
                elif _on == 4:
                    pass
                elif _on == 5:
                    pass
                elif _on == 6:
                    pass

            if self.serial_type.code.value == 7:
                pass

            if self.serial_type.is_blob:
                pass



    class RefCell(KaitaiStruct):
        def __init__(self, _io, _parent=None, _root=None):
            super(Sqlite3.RefCell, self).__init__(_io)
            self._parent = _parent
            self._root = _root
            self._read()

        def _read(self):
            self.ofs_body = self._io.read_u2be()


        def _fetch_instances(self):
            pass
            _ = self.body
            if hasattr(self, '_m_body'):
                pass
                _on = self._parent.page_type
                if _on == 10:
                    pass
                    self._m_body._fetch_instances()
                elif _on == 13:
                    pass
                    self._m_body._fetch_instances()
                elif _on == 2:
                    pass
                    self._m_body._fetch_instances()
                elif _on == 5:
                    pass
                    self._m_body._fetch_instances()


        @property
        def body(self):
            if hasattr(self, '_m_body'):
                return self._m_body

            _pos = self._io.pos()
            self._io.seek(self.ofs_body)
            _on = self._parent.page_type
            if _on == 10:
                pass
                self._m_body = Sqlite3.CellIndexLeaf(self._io, self, self._root)
            elif _on == 13:
                pass
                self._m_body = Sqlite3.CellTableLeaf(self._io, self, self._root)
            elif _on == 2:
                pass
                self._m_body = Sqlite3.CellIndexInterior(self._io, self, self._root)
            elif _on == 5:
                pass
                self._m_body = Sqlite3.CellTableInterior(self._io, self, self._root)
            self._io.seek(_pos)
            return getattr(self, '_m_body', None)


    class Serial(KaitaiStruct):
        def __init__(self, _io, _parent=None, _root=None):
            super(Sqlite3.Serial, self).__init__(_io)
            self._parent = _parent
            self._root = _root
            self._read()

        def _read(self):
            self.code = vlq_base128_be.VlqBase128Be(self._io)


        def _fetch_instances(self):
            pass
            self.code._fetch_instances()

        @property
        def is_blob(self):
            if hasattr(self, '_m_is_blob'):
                return self._m_is_blob

            self._m_is_blob =  ((self.code.value >= 12) and (self.code.value % 2 == 0)) 
            return getattr(self, '_m_is_blob', None)

        @property
        def is_string(self):
            if hasattr(self, '_m_is_string'):
                return self._m_is_string

            self._m_is_string =  ((self.code.value >= 13) and (self.code.value % 2 == 1)) 
            return getattr(self, '_m_is_string', None)

        @property
        def len_content(self):
            if hasattr(self, '_m_len_content'):
                return self._m_len_content

            if self.code.value >= 12:
                pass
                self._m_len_content = (self.code.value - 12) // 2

            return getattr(self, '_m_len_content', None)


    class Serials(KaitaiStruct):
        def __init__(self, _io, _parent=None, _root=None):
            super(Sqlite3.Serials, self).__init__(_io)
            self._parent = _parent
            self._root = _root
            self._read()

        def _read(self):
            self.entries = []
            i = 0
            while not self._io.is_eof():
                self.entries.append(Sqlite3.Serial(self._io, self, self._root))
                i += 1



        def _fetch_instances(self):
            pass
            for i in range(len(self.entries)):
                pass
                self.entries[i]._fetch_instances()



    @property
    def len_page(self):
        if hasattr(self, '_m_len_page'):
            return self._m_len_page

        self._m_len_page = (65536 if self.len_page_mod == 1 else self.len_page_mod)
        return getattr(self, '_m_len_page', None)