# bsv - Backup, Synchronization, Versioning
# Copyright (C) 2023 Simon Boyé
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime as DateTime
import hashlib
from io import BytesIO
from pathlib import Path, PurePosixPath
import platform
import tomllib
from typing import TYPE_CHECKING, BinaryIO, Callable, Self, Type

from fastcdc import fastcdc
import tomlkit

from bsv import __version__
from bsv.exception import ConfigError
from bsv.object import ObjectInfo
from bsv.path_map import PathMap
from bsv.simple_cas.cas import Digest, SimpleCas
from bsv.util import Hash, read_exact, read_exact_or_eof, time_from_timestamp_us, timestamp_us_from_time

if TYPE_CHECKING:
    from bsv.tree_walker import TreeWalker


DEFAULT_MIN_CHUNK_SIZE = 1 << 12
DEFAULT_AVG_CHUNK_SIZE = 1 << 16
DEFAULT_MAX_CHUNK_SIZE = 1 << 20


class Repository:
    _path: Path
    _name: str
    _cas: SimpleCas
    _min_chunk_size: int = DEFAULT_MIN_CHUNK_SIZE
    _avg_chunk_size: int = DEFAULT_AVG_CHUNK_SIZE
    _max_chunk_size: int = DEFAULT_MAX_CHUNK_SIZE
    _path_map: PathMap
    # _remotes: list[object]
    _context_depth: int = 0

    def __init__(self, path: Path):
        self._path = path

        with self.config_file.open("rb") as stream:
            config = tomllib.load(stream)

        bsv = config.get("bsv", {})
        self._name = bsv.get("name") or platform.node()
        self._cas = make_cas(
            bsv.get("cas"),
            self._path,
            lambda: hashlib.new(bsv.get("hash")),  # type: ignore
        )
        # Fall back to the module defaults when the chunk sizes are missing
        # from the configuration (a plain .get() would store None).
        self._min_chunk_size = bsv.get("min_chunk_size", DEFAULT_MIN_CHUNK_SIZE)
        self._avg_chunk_size = bsv.get("avg_chunk_size", DEFAULT_AVG_CHUNK_SIZE)
        self._max_chunk_size = bsv.get("max_chunk_size", DEFAULT_MAX_CHUNK_SIZE)
        self._path_map = PathMap.from_obj(bsv.get("path_map", []))

    @property
    def path(self) -> Path:
        return self._path

    @property
    def config_file(self) -> Path:
        return self.path / "bsv_config.toml"

    @property
    def name(self) -> str:
        return self._name

    @property
    def path_map(self) -> PathMap:
        return self._path_map.clone()

    def get_blob(self, digest: Digest) -> BlobObject:
        with self:
            obj, blob = self._read(digest, object_type=b"blob")
            return BlobObject(
                digest = obj.digest,
                object_type = obj.object_type,
                size = obj.size,
                blob = blob,
            )

    def add_blob(self, stream: BinaryIO, *, dry_run: bool=False) -> BlobObject:
        with self:
            return self._write(b"blob", stream, dry_run=dry_run)

    def get_symlink(self, digest: Digest) -> SymlinkObject:
        with self:
            obj = self._cas.read(digest, object_type=b"slnk")
            return SymlinkObject(
                digest = obj.digest,
                object_type = obj.object_type,
                size = obj.size,
                symlink = Symlink.from_bytes(self, obj.data),
            )

    def add_symlink(self, symlink: Symlink, *, dry_run: bool=False) -> SymlinkObject:
        with self:
            data = symlink.to_bytes()
            return SymlinkObject(
                digest = self._cas.write(b"slnk", data, dry_run=dry_run),
                object_type = b"slnk",
                size = len(data),
                symlink = symlink,
            )

    def add_symlink_from_fs_target(self, fs_symlink: Path, fs_target: Path, *, dry_run: bool=False) -> SymlinkObject:
        assert fs_symlink.is_absolute()
        return self.add_symlink(
            Symlink(
                repo = self,
                is_absolute = fs_target.is_absolute(),
                target = self._path_map.relative_bsv_path(fs_target, relative_to=fs_symlink),
            ),
            dry_run = dry_run,
        )
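    # For example (a sketch; both paths are hypothetical and, per the
    # assertion above, fs_symlink must be absolute):
    #
    #     repo.add_symlink_from_fs_target(
    #         Path("/home/user/link"), Path("/home/user/target")
    #     )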
    def get_tree(self, digest: Digest) -> TreeObject:
        with self:
            obj = self._cas.read(digest, object_type=b"tree")
            return TreeObject(
                digest = obj.digest,
                object_type = obj.object_type,
                size = obj.size,
                tree = Tree.from_bytes(self, obj.data),
            )

    def add_tree(self, tree: Tree, *, dry_run: bool=False) -> TreeObject:
        with self:
            data = tree.to_bytes()
            return TreeObject(
                digest = self._cas.write(b"tree", data, dry_run=dry_run),
                object_type = b"tree",
                size = len(data),
                tree = tree,
            )

    def add_tree_from_path(self, path: Path, *, dry_run: bool=False) -> TreeObject:
        from bsv.tree_walker import TreeWalker
        walker = TreeWalker(self, dry_run=dry_run)
        return walker.add_tree(path)

    def get_snapshot(self, digest: Digest) -> SnapshotObject:
        with self:
            obj = self._cas.read(digest, object_type=b"snap")
            return SnapshotObject(
                digest = obj.digest,
                object_type = obj.object_type,
                size = obj.size,
                snapshot = Snapshot.from_bytes(self, obj.data),
            )

    def add_snapshot(self, snapshot: Snapshot, *, dry_run: bool=False) -> SnapshotObject:
        with self:
            data = snapshot.to_bytes()
            return SnapshotObject(
                digest = self._cas.write(b"snap", data, dry_run=dry_run),
                object_type = b"snap",
                size = len(data),
                snapshot = snapshot,
            )

    # def take_snapshot(
    #     self,
    #     parent_digests: list[Digest] = [],
    #     *,
    #     walker_type: Type[TreeWalker] | None = None,
    #     dry_run: bool = False,
    # ):
    #     from bsv.tree_walker import TreeWalker
    #     walker = (walker_type or TreeWalker)(self, dry_run=dry_run)
    #     # parents = [
    #     #     self.get_snapshot(digest)
    #     #     for digest in parent_digests
    #     # ]
    #     parent = self.get_snapshot(parent_digests[0]) if parent_digests else None
    #     snapshot = Snapshot(
    #         repo = self,
    #         tree_digest = walker.add_virtual_tree(self._path_map, parent=),
    #         parents = parent_digests,
    #         repo_name = self._name,
    #         timestamp = timestamp_us_from_time(DateTime.now()),
    #     )
    #     return self.add_snapshot(snapshot, dry_run=dry_run)

    def _read(self, digest: Digest, object_type: bytes) -> tuple[ObjectInfo, Blob]:
        obj = self._cas.read(digest, object_type=object_type)
        stream = BytesIO(obj.data)
        return obj, Blob.from_stream(self, stream, digest_size=self._cas._digest_size)

    def _write(self, object_type: bytes, stream: BinaryIO, *, dry_run: bool=False) -> BlobObject:
        out = BytesIO()
        size = 0
        chunks = []
        # Split the stream into content-defined chunks, store each chunk in
        # the CAS and record its digest and length in the blob index.
        for chunk in fastcdc(
            stream,
            min_size = self._min_chunk_size,
            avg_size = self._avg_chunk_size,
            max_size = self._max_chunk_size,
            fat = True,
        ):
            size += chunk.length
            digest = self._cas.write(b"chnk", chunk.data, dry_run=dry_run)
            chunks.append(Chunk(digest, chunk.length))
            out.write(digest.digest)
            out.write(chunk.length.to_bytes(4))
        return BlobObject(
            # Forward dry_run here too, otherwise a dry run would still write
            # the blob index object.
            digest = self._cas.write(object_type, size.to_bytes(8) + out.getvalue(), dry_run=dry_run),
            object_type = object_type,
            size = 8 + len(out.getvalue()),
            blob = Blob(
                repo = self,
                size = size,
                chunks = chunks,
            )
        )

    def __enter__(self):
        if self._context_depth == 0:
            self._cas.__enter__()
        self._context_depth += 1
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._context_depth -= 1
        if self._context_depth == 0:
            return self._cas.__exit__(exc_type, exc_value, traceback)
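# Example usage (a sketch; the path is hypothetical and assumes the directory
# already holds a repository created by create_repository() below):
#
#     repo = Repository(Path("/backups/laptop"))
#     with repo, Path("/home/user/notes.txt").open("rb") as stream:
#         blob = repo.add_blob(stream)
#     data = repo.get_blob(blob.digest).blob.reader().read()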
def create_repository(
    destination: Path,
    name: str,
    cas: str = "simple",
    hash: str = "sha256",
    min_chunk_size: int = DEFAULT_MIN_CHUNK_SIZE,
    avg_chunk_size: int = DEFAULT_AVG_CHUNK_SIZE,
    max_chunk_size: int = DEFAULT_MAX_CHUNK_SIZE,
):
    from os import getlogin

    if not name:
        raise RuntimeError("repository name cannot be empty")
    if not destination.parent.exists():
        raise RuntimeError(f"destination directory {destination.parent} does not exist")
    if destination.exists() and not destination.is_dir():
        raise RuntimeError(f"destination {destination} exists but is not a directory")
    if destination.exists() and len(list(destination.iterdir())):
        raise RuntimeError(f"destination directory {destination} is not empty")

    try:
        destination.mkdir(exist_ok=True)
    except OSError as error:
        raise RuntimeError(f"failed to create destination directory {destination}") from error

    bsv_table = tomlkit.table()
    bsv_table.add(tomlkit.comment("Name of the repository."))
    bsv_table.add(tomlkit.comment("Ideally, this should be unique among all connected repositories."))
    bsv_table.add("name", name)
    bsv_table.add(tomlkit.nl())
    bsv_table.add(tomlkit.comment("Mapping between bsv tree and the actual filesystem."))
    bsv_table.add("path_map", tomlkit.array())
    bsv_table.add("cas", cas)
    bsv_table.add("hash", hash)
    bsv_table.add("min_chunk_size", min_chunk_size)
    bsv_table.add("avg_chunk_size", avg_chunk_size)
    bsv_table.add("max_chunk_size", max_chunk_size)

    doc = tomlkit.document()
    doc.add(tomlkit.comment("bsv repository configuration"))
    doc.add(tomlkit.comment(f"Created by {getlogin()} on {DateTime.now().isoformat()}."))
    doc.add(tomlkit.nl())
    doc.add("bsv", bsv_table)

    config_path = destination / "bsv_config.toml"
    try:
        stream = config_path.open("w", encoding="utf-8")
    except OSError as error:
        raise RuntimeError(f"failed to open configuration file {config_path}") from error
    with stream:
        tomlkit.dump(doc, stream)

    return Repository(destination)


def make_cas(cas_name: str, path: Path, hash_factory: Callable[[], Hash]) -> SimpleCas:
    if cas_name == "simple":
        return SimpleCas(path, hash_factory)
    raise ConfigError(f"unknown cas name {cas_name}")


@dataclass(slots=True)
class ChunkedObject:
    repo: Repository
    size: int
    chunks: list[Chunk]

    @classmethod
    def from_stream(cls, repo: Repository, stream: BinaryIO, digest_size: int) -> Self:
        self = cls(
            repo = repo,
            size = int.from_bytes(read_exact(stream, 8)),
            chunks = [],
        )
        while (chunk := Chunk.from_stream(stream, digest_size)) is not None:
            self.chunks.append(chunk)
        return self

    def reader(self) -> ChunkedObjectReader:
        return ChunkedObjectReader(self)


@dataclass(frozen=True, slots=True)
class Chunk:
    digest: Digest
    size: int

    @classmethod
    def from_stream(cls, stream: BinaryIO, digest_size: int) -> Self | None:
        digest_bytes = read_exact_or_eof(stream, digest_size)
        if digest_bytes is None:
            return None
        digest = Digest(digest_bytes)
        return cls(
            digest = digest,
            size = int.from_bytes(read_exact(stream, 4)),
        )


class ChunkedObjectReader:
    _chunked_object: ChunkedObject
    _chunk_index: int = 0
    _chunk_data: bytes = b""

    def __init__(self, chunked_object: ChunkedObject):
        self._chunked_object = chunked_object

    def read(self, num_bytes: int = -1) -> bytes:
        chunks = self._chunked_object.chunks
        parts = [self._chunk_data]
        size = len(parts[-1])
        while (num_bytes < 0 or size < num_bytes) and self._chunk_index < len(chunks):
            parts.append(self.read1())
            size += len(parts[-1])
        data = b"".join(parts)
        if num_bytes < 0:
            self._chunk_data = b""
            return data
        # Keep any bytes read past the requested amount for the next call and
        # return exactly num_bytes (less only at end of stream).
        self._chunk_data = data[num_bytes:]
        return data[:num_bytes]

    def read1(self) -> bytes:
        cas = self._chunked_object.repo._cas
        chunks = self._chunked_object.chunks
        if self._chunk_index == len(chunks):
            return b""
        obj = cas.read(chunks[self._chunk_index].digest, object_type=b"chnk")
        self._chunk_index += 1
        return obj.data


@dataclass(slots=True)
class Blob(ChunkedObject):
    pass
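# On-disk blob layout (as produced by Repository._write and parsed by
# ChunkedObject.from_stream): an 8-byte big-endian total size, followed by one
# (digest, 4-byte big-endian chunk length) pair per chunk.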
@dataclass(frozen=True, order=True, slots=True)
class BlobObject(ObjectInfo):
    blob: Blob


@dataclass(slots=True)
class Symlink:
    repo: Repository
    is_absolute: bool
    target: PurePosixPath

    @classmethod
    def from_stream(cls, repo: Repository, stream: BinaryIO) -> Self:
        return cls(
            repo = repo,
            # Test the value of the flag byte; bool() of a non-empty bytes
            # object is always True.
            is_absolute = read_exact(stream, 1) != b"\x00",
            target = PurePosixPath(stream.read().decode("utf-8")),
        )

    @classmethod
    def from_bytes(cls, repo: Repository, data: bytes) -> Self:
        stream = BytesIO(data)
        return cls.from_stream(repo, stream)

    def write(self, stream: BinaryIO):
        stream.write(self.is_absolute.to_bytes(1))
        stream.write(self.target.as_posix().encode("utf-8"))

    def to_bytes(self) -> bytes:
        stream = BytesIO()
        self.write(stream)
        return stream.getvalue()


@dataclass(frozen=True, order=True, slots=True)
class SymlinkObject(ObjectInfo):
    symlink: Symlink


@dataclass
class Tree:
    repo: Repository
    items: list[TreeItem]

    @property
    def total_size(self) -> int:
        return sum(
            item.size
            for item in self.items
        )

    @classmethod
    def from_stream(cls, repo: Repository, stream: BinaryIO) -> Self:
        tree = Tree(repo, [])
        while (item := TreeItem.from_stream(stream, repo._cas._digest_size)) is not None:
            tree.items.append(item)
        return tree

    @classmethod
    def from_bytes(cls, repo: Repository, data: bytes) -> Self:
        stream = BytesIO(data)
        return cls.from_stream(repo, stream)

    def write(self, stream: BinaryIO):
        # Items are written sorted by name so the encoding is canonical.
        self.items.sort(key=lambda i: i.name)
        for item in self.items:
            item.write(stream)

    def to_bytes(self) -> bytes:
        stream = BytesIO()
        self.write(stream)
        return stream.getvalue()


@dataclass(frozen=True, order=True, slots=True)
class TreeObject(ObjectInfo):
    tree: Tree

    @property
    def total_size(self) -> int:
        return self.size + self.tree.total_size


@dataclass
class TreeItem:
    digest: Digest
    object_type: bytes
    size: int
    permissions: int
    modification_timestamp_us: int
    name: str

    def __init__(
        self,
        digest: Digest,
        object_type: bytes,
        size: int,
        permissions: int,
        modification_timestamp_us: int,
        name: str,
    ):
        # Reject names containing a slash or a backslash (checking for the
        # two-character substring "/\\" would miss both).
        if "/" in name or "\\" in name:
            raise ValueError(f"invalid tree item name {name}")
        self.digest = digest
        self.object_type = object_type
        self.size = size
        self.permissions = permissions
        self.modification_timestamp_us = modification_timestamp_us
        self.name = name

    @property
    def modification_time(self) -> DateTime:
        return time_from_timestamp_us(self.modification_timestamp_us)

    @modification_time.setter
    def modification_time(self, time: DateTime):
        self.modification_timestamp_us = timestamp_us_from_time(time)

    @classmethod
    def from_stream(cls, stream: BinaryIO, digest_size: int) -> Self | None:
        digest_bytes = read_exact_or_eof(stream, digest_size)
        if digest_bytes is None:
            return None
        return TreeItem(
            digest = Digest(digest_bytes),
            object_type = read_exact(stream, 4),
            size = int.from_bytes(read_exact(stream, 8)),
            permissions = int.from_bytes(read_exact(stream, 2)),
            modification_timestamp_us = int.from_bytes(read_exact(stream, 8), signed=True),
            name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"),
        )

    def write(self, stream: BinaryIO):
        stream.write(self.digest.digest)
        stream.write(self.object_type)
        stream.write(self.size.to_bytes(8))
        stream.write(self.permissions.to_bytes(2))
        stream.write(self.modification_timestamp_us.to_bytes(8, signed=True))
        name_bytes = self.name.encode("utf-8")
        stream.write(len(name_bytes).to_bytes(2))
        stream.write(name_bytes)
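# TreeItem wire format (see TreeItem.write above): digest bytes, a 4-byte
# object type tag, an 8-byte big-endian size, 2-byte permissions, an 8-byte
# signed modification timestamp in microseconds, then a 2-byte name length
# followed by the UTF-8 encoded name.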
@dataclass
class Snapshot:
    repo: Repository
    tree_digest: Digest
    parents: list[Digest]
    repo_name: str
    timestamp_us: int

    def __post_init__(self):
        # The parent count is serialized as a single byte.
        assert len(self.parents) < 256

    @property
    def time(self) -> DateTime:
        return time_from_timestamp_us(self.timestamp_us)

    @time.setter
    def time(self, time: DateTime):
        self.timestamp_us = timestamp_us_from_time(time)

    @classmethod
    def from_stream(cls, repo: Repository, stream: BinaryIO) -> Self:
        return Snapshot(
            repo = repo,
            tree_digest = Digest(read_exact(stream, repo._cas._digest_size)),
            parents = [
                Digest(read_exact(stream, repo._cas._digest_size))
                for _ in range(int.from_bytes(read_exact(stream, 1)))
            ],
            repo_name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"),
            timestamp_us = int.from_bytes(read_exact(stream, 8), signed=True),
        )

    @classmethod
    def from_bytes(cls, repo: Repository, data: bytes) -> Self:
        stream = BytesIO(data)
        return cls.from_stream(repo, stream)

    def write(self, stream: BinaryIO):
        assert len(self.parents) < 256
        stream.write(self.tree_digest.digest)
        stream.write(len(self.parents).to_bytes(1))
        for parent in self.parents:
            stream.write(parent.digest)
        repo_name_bytes = self.repo_name.encode("utf-8")
        stream.write(len(repo_name_bytes).to_bytes(2))
        stream.write(repo_name_bytes)
        stream.write(self.timestamp_us.to_bytes(8, signed=True))

    def to_bytes(self) -> bytes:
        stream = BytesIO()
        self.write(stream)
        return stream.getvalue()


@dataclass(frozen=True, order=True, slots=True)
class SnapshotObject(ObjectInfo):
    snapshot: Snapshot
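# Snapshot wire format (see Snapshot.write above): the tree digest, a 1-byte
# parent count followed by that many parent digests, a 2-byte repository name
# length and the UTF-8 name, then an 8-byte signed timestamp in microseconds.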