From 8937b51a45917c28a46bb222f16b630f459acf4d Mon Sep 17 00:00:00 2001 From: Draklaw Date: Fri, 10 Nov 2023 18:22:26 +0100 Subject: [PATCH] SimpleCas basic implementation. --- pyproject.toml | 9 ++- src/bsv/command/init.py | 6 ++ src/bsv/simple_cas/__init__.py | 3 + src/bsv/simple_cas/cas.py | 134 +++++++++++++++++++++++++++++++++ src/bsv/simple_cas/util.py | 31 ++++++++ tests/test_simple_cas.py | 76 +++++++++++++++++++ 6 files changed, 257 insertions(+), 2 deletions(-) create mode 100644 src/bsv/simple_cas/cas.py create mode 100644 src/bsv/simple_cas/util.py create mode 100644 tests/test_simple_cas.py diff --git a/pyproject.toml b/pyproject.toml index bf0236d..fe48ab2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,10 +10,15 @@ requires-python = ">=3.11" classifiers = [ # TODO ] +dynamic = ["version"] dependencies = [ - "tomlkit" + "tomlkit", +] + +[project.optional-dependencies] +test = [ + "pytest", ] -dynamic = ["version"] [project.urls] # TODO diff --git a/src/bsv/command/init.py b/src/bsv/command/init.py index 1d14d45..765f1cc 100644 --- a/src/bsv/command/init.py +++ b/src/bsv/command/init.py @@ -87,11 +87,17 @@ def init( bsv_table.add(tomlkit.comment("Mapping between bsv tree and the actual filesystem.")) bsv_table.add("path_map", tomlkit.array()) + cas_table = tomlkit.table() + cas_table.add("type", "simple") + cas_table.add("hash", "sha256") + doc = tomlkit.document() doc.add(tomlkit.comment("bsv repository configuration")) doc.add(tomlkit.comment(f"Created by {getlogin()} on {DateTime.now().isoformat()}.")) doc.add(tomlkit.nl()) doc.add("bsv", bsv_table) + doc.add(tomlkit.nl()) + doc.add("cas", cas_table) config_path = destination / "bsv_config.toml" try: diff --git a/src/bsv/simple_cas/__init__.py b/src/bsv/simple_cas/__init__.py index 3e9d59f..a3e483a 100644 --- a/src/bsv/simple_cas/__init__.py +++ b/src/bsv/simple_cas/__init__.py @@ -14,3 +14,6 @@ # You should have received a copy of the GNU Affero General Public License # along with this 
program. If not, see <https://www.gnu.org/licenses/>. from __future__ import annotations + + +from bsv.simple_cas.cas import SimpleCas as Cas diff --git a/src/bsv/simple_cas/cas.py b/src/bsv/simple_cas/cas.py new file mode 100644 index 0000000..504fe43 --- /dev/null +++ b/src/bsv/simple_cas/cas.py @@ -0,0 +1,134 @@ +# bsv - Backup, Synchronization, Versioning +# Copyright (C) 2023 Simon Boyé +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
class SimpleCas:
    """Content-addressable store kept in two append-only files.

    ``cas.dat`` stores one record per object: the digest, the 4-byte object
    type, the payload size as a 4-byte big-endian integer, then the payload.

    ``cas.idx`` stores one fixed-size entry per object: the digest, the
    4-byte object type, then the record offset in ``cas.dat`` and the payload
    size, both as 4-byte big-endian integers.

    The whole index is loaded in memory when the store is created.
    """

    _root_dir: Path
    _hash_factory: Callable[[], Hash]
    _digest_size: int

    _index: dict[bytes, IndexItem]

    _is_inside_context: bool = False

    # On-disk layout constants. Changing any of them changes the file format.
    _DATA_FILE = "cas.dat"
    _INDEX_FILE = "cas.idx"
    _TYPE_SIZE = 4
    _INT_SIZE = 4
    _INT_LIMIT = 2**32

    def __init__(self, root_dir: Path, hash_factory: Callable[[], Hash]):
        """Open the store located in ``root_dir``.

        Args:
            root_dir: Directory holding ``cas.dat`` and ``cas.idx``.
            hash_factory: Zero-argument callable returning a fresh hash
                object (for instance ``hashlib.sha256``).

        Raises:
            ValueError: If ``cas.idx`` ends with an incomplete entry.
        """
        self._root_dir = root_dir
        self._hash_factory = hash_factory
        self._digest_size = self._hash_factory().digest_size
        self._index = self._load_index()

    @classmethod
    def from_obj(cls, root_dir: Path, obj: dict[str, Any]) -> SimpleCas:
        """Build a store from a configuration mapping.

        Args:
            root_dir: Directory holding the store files.
            obj: Mapping whose ``"hash"`` entry names a ``hashlib`` algorithm.

        Raises:
            KeyError: If ``obj`` has no ``"hash"`` entry.
            ValueError: If ``hashlib`` does not know the algorithm.
        """
        algorithm = obj["hash"]
        # The constructor needs a *factory*: every write starts from a fresh
        # hash object, so wrap the lookup instead of passing one instance.
        return cls(
            root_dir=root_dir,
            hash_factory=lambda: hashlib.new(algorithm),  # type: ignore
        )

    def __enter__(self) -> SimpleCas:
        """Enter the store context and return the store itself.

        Raises:
            RuntimeError: If the context is already entered.
        """
        if self._is_inside_context:
            raise RuntimeError("SimpleCas context is not reentrant")
        self._is_inside_context = True
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Leave the store context; exceptions are not suppressed.

        Raises:
            RuntimeError: If the context was not entered.
        """
        if not self._is_inside_context:
            raise RuntimeError("SimpleCas context was not entered")
        self._is_inside_context = False

    def __len__(self) -> int:
        """Return the number of objects in the index."""
        return len(self._index)

    def __contains__(self, digest: bytes) -> bool:
        """Tell whether an object with this digest is stored.

        Raises:
            ValueError: If ``digest`` does not have the hash's digest size.
        """
        if len(digest) != self._digest_size:
            raise ValueError(
                f"Expected a {self._digest_size}-byte digest, got {len(digest)} bytes"
            )
        return digest in self._index

    def read(self, digest: bytes) -> Optional[Object]:
        """Return the object stored under ``digest``, or ``None`` if unknown.

        Raises:
            ValueError: If the record in ``cas.dat`` does not match the index
                entry or is truncated.
        """
        item = self._index.get(digest)
        if item is None:
            return None

        header_size = self._digest_size + self._TYPE_SIZE + self._INT_SIZE
        with (self._root_dir / self._DATA_FILE).open("rb") as stream:
            stream.seek(item.offset)
            header = stream.read(header_size)
            data = stream.read(item.size)

        # The record header repeats what the index says; any difference means
        # the two files are out of sync.
        expected_header = (
            digest + item.object_type + item.size.to_bytes(self._INT_SIZE, "big")
        )
        if header != expected_header or len(data) != item.size:
            raise ValueError(f"Corrupted record for object {digest.hex()}")

        return Object(item.object_type, data)

    def write(self, object_type: bytes, data: bytes) -> bytes:
        """Store ``data`` under its digest and return that digest.

        The digest covers the object type, a NUL byte, the 4-byte big-endian
        size, a NUL byte and the payload. Writing an object that is already
        stored does not touch the files.

        Args:
            object_type: Exactly 4 bytes identifying the kind of object.
            data: Payload, strictly smaller than 4 GiB.

        Raises:
            ValueError: If ``object_type`` is not 4 bytes long or ``data`` is
                too large for the 4-byte size field.
            OverflowError: If ``cas.dat`` is too large for a 4-byte offset.
        """
        if len(object_type) != self._TYPE_SIZE:
            raise ValueError(
                f"object_type must be {self._TYPE_SIZE} bytes, got {len(object_type)}"
            )
        if len(data) >= self._INT_LIMIT:
            raise ValueError("data is too large for a 4-byte size field")

        hasher = self._hash_factory()
        for part in (
            object_type,
            b"\0",
            len(data).to_bytes(self._INT_SIZE, "big"),
            b"\0",
            data,
        ):
            hasher.update(part)
        digest = hasher.digest()

        if digest not in self._index:
            self._append_record(digest, object_type, data)

        return digest

    def _load_index(self) -> dict[bytes, IndexItem]:
        """Read ``cas.idx`` into a dict; a missing file gives an empty index."""
        entry_size = self._digest_size + self._TYPE_SIZE + 2 * self._INT_SIZE
        type_end = self._digest_size + self._TYPE_SIZE
        offset_end = type_end + self._INT_SIZE

        index: dict[bytes, IndexItem] = {}
        try:
            stream = (self._root_dir / self._INDEX_FILE).open("rb")
        except FileNotFoundError:
            return index

        with stream:
            while entry := stream.read(entry_size):
                if len(entry) != entry_size:
                    raise ValueError(
                        f"Truncated entry at the end of {self._INDEX_FILE}"
                    )
                index[entry[: self._digest_size]] = IndexItem(
                    entry[self._digest_size : type_end],
                    int.from_bytes(entry[type_end:offset_end], "big"),
                    int.from_bytes(entry[offset_end:], "big"),
                )
        return index

    def _append_record(self, digest: bytes, object_type: bytes, data: bytes) -> None:
        """Append one record to ``cas.dat``, then register it in the index.

        The payload is written first, so a failed write never leaves an index
        entry pointing at missing data.
        """
        size_field = len(data).to_bytes(self._INT_SIZE, "big")

        with (self._root_dir / self._DATA_FILE).open("ab") as dat_file:
            offset = dat_file.tell()
            if offset >= self._INT_LIMIT:
                raise OverflowError(
                    f"{self._DATA_FILE} is too large for a 4-byte offset"
                )
            dat_file.write(digest + object_type + size_field)
            dat_file.write(data)

        with (self._root_dir / self._INDEX_FILE).open("ab") as idx_file:
            # Single write so an entry is less likely to be left half-written.
            idx_file.write(
                digest
                + object_type
                + offset.to_bytes(self._INT_SIZE, "big")
                + size_field
            )

        self._index[digest] = IndexItem(object_type, offset, len(data))


@dataclass
class Object:
    """An object read from the store: its 4-byte type and its payload."""

    object_type: bytes
    data: bytes


@dataclass
class IndexItem:
    """Index entry: object type, record offset in ``cas.dat``, payload size."""

    object_type: bytes
    offset: int
    size: int
# --- src/bsv/simple_cas/util.py ---


class Hash(ABC):
    """Abstract interface of the hash objects handed to the CAS.

    Declares the ``name`` and ``digest_size`` attributes plus the two
    methods below. Presumably modelled on ``hashlib`` hash objects, which
    the tests pass in its place.
    """

    name: str
    digest_size: int

    @abstractmethod
    def update(self, *data: bytes | bytearray | memoryview):
        """Feed ``data`` to the hash."""

    @abstractmethod
    def digest(self) -> bytes:
        """Return the digest as bytes."""


# --- tests/test_simple_cas.py ---


@pytest.fixture
def tmp_dir():
    """Yield the path of a temporary directory that lives for one test."""
    with TemporaryDirectory(prefix="simple_cas_") as dir_name:
        yield Path(dir_name)


@pytest.fixture
def cas(tmp_dir):
    """Yield a SHA-256 ``SimpleCas`` rooted in ``tmp_dir``, inside its context."""
    store = SimpleCas(tmp_dir, sha256)  # type: ignore
    with store:
        yield store


def test_simple_cas(tmp_dir: Path):
    """Write an object, read it back, then reopen the store and do it again."""
    payload = b"Hello World!"

    # First session: the store starts empty and accepts one object.
    first = SimpleCas(tmp_dir, sha256)  # type: ignore
    with first:
        assert len(first) == 0

        digest = first.write(b"blob", payload)

        assert len(first) == 1
        assert digest in first

        stored = first.read(digest)
        assert stored is not None
        assert stored.object_type == b"blob"
        assert stored.data == payload

    # Second session: a new instance on the same directory sees the object,
    # and writing the same content again adds nothing.
    second = SimpleCas(tmp_dir, sha256)  # type: ignore
    with second:
        assert len(second) == 1
        assert digest in second

        stored = second.read(digest)
        assert stored is not None
        assert stored.object_type == b"blob"
        assert stored.data == payload

        same_digest = second.write(b"blob", payload)
        assert same_digest == digest
        assert len(second) == 1