# bsv - Backup, Synchronization, Versioning
# Copyright (C) 2023 Simon Boyé
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime as DateTime
import hashlib
from io import BytesIO
from pathlib import Path, PurePosixPath
import platform
import tomllib
from typing import TYPE_CHECKING, BinaryIO, Callable, Self, Type
from fastcdc import fastcdc
import tomlkit
from bsv import __version__
from bsv.exception import ConfigError
from bsv.object import ObjectInfo
from bsv.path_map import PathMap
from bsv.simple_cas import SimpleCas
from bsv.simple_cas.cas import Digest
from bsv.util import Hash, read_exact, read_exact_or_eof, time_from_timestamp_us, timestamp_us_from_time
if TYPE_CHECKING:
    from bsv.tree_walker import TreeWalker

# Default FastCDC chunking bounds: 4 KiB minimum, 64 KiB average, 1 MiB maximum.
DEFAULT_MIN_CHUNK_SIZE = 1 << 12
DEFAULT_AVG_CHUNK_SIZE = 1 << 16
DEFAULT_MAX_CHUNK_SIZE = 1 << 20


class Repository:
    _path: Path
    _name: str
    _cas: SimpleCas
    _min_chunk_size: int = DEFAULT_MIN_CHUNK_SIZE
    _avg_chunk_size: int = DEFAULT_AVG_CHUNK_SIZE
    _max_chunk_size: int = DEFAULT_MAX_CHUNK_SIZE
    _path_map: PathMap
    # _remotes: list[object]
    _context_depth: int = 0

    def __init__(self, path: Path):
        self._path = path
        with self.config_file.open("rb") as stream:
            config = tomllib.load(stream)
        bsv = config.get("bsv", {})
        self._name = bsv.get("name") or platform.node()
        self._cas = make_cas(
            bsv.get("cas"),
            self._path,
            lambda: hashlib.new(bsv.get("hash")),  # type: ignore
        )
        # Fall back to the module defaults when the config omits a chunk size,
        # instead of silently storing None.
        self._min_chunk_size = bsv.get("min_chunk_size", DEFAULT_MIN_CHUNK_SIZE)
        self._avg_chunk_size = bsv.get("avg_chunk_size", DEFAULT_AVG_CHUNK_SIZE)
        self._max_chunk_size = bsv.get("max_chunk_size", DEFAULT_MAX_CHUNK_SIZE)
        self._path_map = PathMap.from_obj(bsv.get("path_map", []))
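
    # For reference, a minimal sketch of the config this constructor parses,
    # matching what create_repository() below writes (values illustrative):
    #
    #   [bsv]
    #   name = "my-laptop"
    #   path_map = []
    #   cas = "simple"
    #   hash = "sha256"
    #   min_chunk_size = 4096
    #   avg_chunk_size = 65536
    #   max_chunk_size = 1048576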

    @property
    def path(self) -> Path:
        return self._path

    @property
    def config_file(self) -> Path:
        return self.path / "bsv_config.toml"

    @property
    def name(self) -> str:
        return self._name

    @property
    def path_map(self) -> PathMap:
        return self._path_map.clone()

    def get_blob(self, digest: Digest) -> BlobObject:
        with self:
            obj, blob = self._read(digest, object_type=b"blob")
            return BlobObject(
                digest = obj.digest,
                object_type = obj.object_type,
                size = obj.size,
                blob = blob,
            )

    def add_blob(self, stream: BinaryIO, *, dry_run: bool=False) -> BlobObject:
        with self:
            return self._write(b"blob", stream, dry_run=dry_run)

    def get_symlink(self, digest: Digest) -> SymlinkObject:
        with self:
            obj = self._cas.read(digest, object_type=b"slnk")
            return SymlinkObject(
                digest = obj.digest,
                object_type = obj.object_type,
                size = obj.size,
                symlink = Symlink.from_bytes(self, obj.data),
            )

    def add_symlink(self, symlink: Symlink, *, dry_run: bool=False) -> SymlinkObject:
        with self:
            data = symlink.to_bytes()
            return SymlinkObject(
                digest = self._cas.write(b"slnk", data, dry_run=dry_run),
                object_type = b"slnk",
                size = len(data),
                symlink = symlink,
            )

    def add_symlink_from_fs_target(self, fs_symlink: Path, fs_target: Path, *, dry_run: bool=False) -> SymlinkObject:
        assert fs_symlink.is_absolute()
        return self.add_symlink(
            Symlink(
                repo = self,
                is_absolute = fs_target.is_absolute(),
                target = self._path_map.relative_bsv_path(fs_target, relative_to=fs_symlink),
            ),
            dry_run = dry_run,
        )

    def get_tree(self, digest: Digest) -> TreeObject:
        with self:
            obj = self._cas.read(digest, object_type=b"tree")
            return TreeObject(
                digest = obj.digest,
                object_type = obj.object_type,
                size = obj.size,
                tree = Tree.from_bytes(self, obj.data),
            )

    def add_tree(self, tree: Tree, *, dry_run: bool=False) -> TreeObject:
        with self:
            data = tree.to_bytes()
            return TreeObject(
                digest = self._cas.write(b"tree", data, dry_run=dry_run),
                object_type = b"tree",
                size = len(data),
                tree = tree,
            )

    def add_tree_from_path(self, path: Path, *, dry_run: bool=False) -> TreeObject:
        from bsv.tree_walker import TreeWalker

        walker = TreeWalker(self, dry_run=dry_run)
        return walker.add_tree(path)
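
    # A minimal usage sketch, assuming an existing Repository `repo`; the
    # path is illustrative:
    #
    #   with repo:
    #       tree_object = repo.add_tree_from_path(Path("/home/user/documents"))
    #       print(tree_object.digest, tree_object.total_size)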

    def get_snapshot(self, digest: Digest) -> SnapshotObject:
        with self:
            obj = self._cas.read(digest, object_type=b"snap")
            return SnapshotObject(
                digest = obj.digest,
                object_type = obj.object_type,
                size = obj.size,
                snapshot = Snapshot.from_bytes(self, obj.data),
            )

    def add_snapshot(self, snapshot: Snapshot, *, dry_run: bool=False) -> SnapshotObject:
        with self:
            data = snapshot.to_bytes()
            return SnapshotObject(
                digest = self._cas.write(b"snap", data, dry_run=dry_run),
                object_type = b"snap",
                size = len(data),
                snapshot = snapshot,
            )

    # def take_snapshot(
    #     self,
    #     parent_digests: list[Digest] = [],
    #     *,
    #     walker_type: Type[TreeWalker] | None = None,
    #     dry_run: bool = False,
    # ):
    #     from bsv.tree_walker import TreeWalker
    #     walker = (walker_type or TreeWalker)(self, dry_run=dry_run)
    #     # parents = [
    #     #     self.get_snapshot(digest)
    #     #     for digest in parent_digests
    #     # ]
    #     parent = self.get_snapshot(parent_digests[0]) if parent_digests else None
    #     snapshot = Snapshot(
    #         repo = self,
    #         tree_digest = walker.add_virtual_tree(self._path_map, parent=),
    #         parents = parent_digests,
    #         repo_name = self._name,
    #         timestamp = timestamp_us_from_time(DateTime.now()),
    #     )
    #     return self.add_snapshot(snapshot, dry_run=dry_run)

    def _read(self, digest: Digest, object_type: bytes) -> tuple[ObjectInfo, Blob]:
        obj = self._cas.read(digest, object_type=object_type)
        stream = BytesIO(obj.data)
        return obj, Blob.from_stream(self, stream, digest_size=self._cas._digest_size)

    def _write(self, object_type: bytes, stream: BinaryIO, *, dry_run: bool=False) -> BlobObject:
        out = BytesIO()
        size = 0
        chunks = []
        # Split the stream into content-defined chunks; each chunk is stored
        # as its own CAS object, so identical data is deduplicated across blobs.
        for chunk in fastcdc(
            stream,
            min_size = self._min_chunk_size,
            avg_size = self._avg_chunk_size,
            max_size = self._max_chunk_size,
            fat = True,
        ):
            size += chunk.length
            digest = self._cas.write(b"chnk", chunk.data, dry_run=dry_run)
            chunks.append(Chunk(digest, chunk.length))
            out.write(digest.digest)
            out.write(chunk.length.to_bytes(4))
        # Pass dry_run through to the final write as well, so a dry run never
        # touches the store.
        return BlobObject(
            digest = self._cas.write(object_type, size.to_bytes(8) + out.getvalue(), dry_run=dry_run),
            object_type = object_type,
            size = 8 + len(out.getvalue()),
            blob = Blob(
                repo = self,
                size = size,
                chunks = chunks,
            )
        )
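
    # Layout of the blob object serialized above (big-endian integers):
    #
    #   [total size: 8 bytes]
    #   then, per chunk: [digest: digest_size bytes][length: 4 bytes]
    #
    # ChunkedObject.from_stream() below parses this same layout back.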

    def __enter__(self):
        # The repository context is re-entrant: the underlying CAS is only
        # opened at the outermost level and closed when the last exit runs.
        if self._context_depth == 0:
            self._cas.__enter__()
        self._context_depth += 1
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._context_depth -= 1
        if self._context_depth == 0:
            return self._cas.__exit__(exc_type, exc_value, traceback)


def create_repository(
    destination: Path,
    name: str,
    cas: str = "simple",
    hash: str = "sha256",
    min_chunk_size: int = DEFAULT_MIN_CHUNK_SIZE,
    avg_chunk_size: int = DEFAULT_AVG_CHUNK_SIZE,
    max_chunk_size: int = DEFAULT_MAX_CHUNK_SIZE,
):
    from datetime import datetime as DateTime
    from os import getlogin

    if not name:
        raise RuntimeError("repository name cannot be empty")
    if not destination.parent.exists():
        raise RuntimeError(f"destination directory {destination.parent} does not exist")
    if destination.exists() and not destination.is_dir():
        raise RuntimeError(f"destination {destination} exists but is not a directory")
    if destination.exists() and any(destination.iterdir()):
        raise RuntimeError(f"destination directory {destination} is not empty")
    try:
        destination.mkdir(exist_ok=True)
    except OSError as error:
        raise RuntimeError(f"failed to create destination directory {destination}") from error

    bsv_table = tomlkit.table()
    bsv_table.add(tomlkit.comment("Name of the repository."))
    bsv_table.add(tomlkit.comment("Ideally, this should be unique among all connected repositories."))
    bsv_table.add("name", name)
    bsv_table.add(tomlkit.nl())
    bsv_table.add(tomlkit.comment("Mapping between the bsv tree and the actual filesystem."))
    bsv_table.add("path_map", tomlkit.array())
    bsv_table.add("cas", cas)
    bsv_table.add("hash", hash)
    bsv_table.add("min_chunk_size", min_chunk_size)
    bsv_table.add("avg_chunk_size", avg_chunk_size)
    bsv_table.add("max_chunk_size", max_chunk_size)

    doc = tomlkit.document()
    doc.add(tomlkit.comment("bsv repository configuration"))
    doc.add(tomlkit.comment(f"Created by {getlogin()} on {DateTime.now().isoformat()}."))
    doc.add(tomlkit.nl())
    doc.add("bsv", bsv_table)

    config_path = destination / "bsv_config.toml"
    try:
        stream = config_path.open("w", encoding="utf-8")
    except OSError as error:
        raise RuntimeError(f"failed to open configuration file {config_path}") from error
    with stream:
        tomlkit.dump(doc, stream)
    return Repository(destination)
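
# A minimal end-to-end sketch (paths illustrative): create a repository, then
# store a file's contents as a chunked, deduplicated blob.
#
#   repo = create_repository(Path("/tmp/bsv-repo"), name="example")
#   with repo, Path("/etc/hostname").open("rb") as stream:
#       blob_object = repo.add_blob(stream)
#   print(blob_object.digest)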


def make_cas(cas_name: str, path: Path, hash_factory: Callable[[], Hash]) -> SimpleCas:
    if cas_name == "simple":
        return SimpleCas(path, hash_factory)
    raise ConfigError(f"unknown cas name {cas_name}")


@dataclass(slots=True)
class ChunkedObject:
    repo: Repository
    size: int
    chunks: list[Chunk]

    @classmethod
    def from_stream(cls, repo: Repository, stream: BinaryIO, digest_size: int) -> Self:
        self = cls(
            repo = repo,
            size = int.from_bytes(read_exact(stream, 8)),
            chunks = [],
        )
        while (chunk := Chunk.from_stream(stream, digest_size)) is not None:
            self.chunks.append(chunk)
        return self

    def reader(self) -> ChunkedObjectReader:
        return ChunkedObjectReader(self)


@dataclass(frozen=True, slots=True)
class Chunk:
    digest: Digest
    size: int

    @classmethod
    def from_stream(cls, stream: BinaryIO, digest_size: int) -> Self | None:
        digest_bytes = read_exact_or_eof(stream, digest_size)
        if digest_bytes is None:
            return None
        return cls(
            digest = Digest(digest_bytes),
            size = int.from_bytes(read_exact(stream, 4)),
        )


class ChunkedObjectReader:
    _chunked_object: ChunkedObject
    _chunk_index: int = 0
    _chunk_data: bytes = b""

    def __init__(self, chunked_object: ChunkedObject):
        self._chunked_object = chunked_object

    def read(self, num_bytes: int = -1) -> bytes:
        chunks = self._chunked_object.chunks
        parts = [self._chunk_data]
        size = len(parts[-1])
        while (num_bytes < 0 or size < num_bytes) and self._chunk_index < len(chunks):
            parts.append(self.read1())
            size += len(parts[-1])
        data = b"".join(parts)
        if num_bytes < 0:
            self._chunk_data = b""
            return data
        # Return exactly num_bytes (or less at end of stream) and keep any
        # surplus buffered for the next read, so no bytes are returned twice.
        self._chunk_data = data[num_bytes:]
        return data[:num_bytes]

    def read1(self) -> bytes:
        cas = self._chunked_object.repo._cas
        chunks = self._chunked_object.chunks
        if self._chunk_index == len(chunks):
            return b""
        obj = cas.read(chunks[self._chunk_index].digest, object_type=b"chnk")
        self._chunk_index += 1
        return obj.data
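
# Reading a blob back, sketched; `digest` comes from an earlier add_blob()
# call and `handle_data` is a hypothetical callback:
#
#   with repo:
#       reader = repo.get_blob(digest).blob.reader()
#       while data := reader.read(1 << 16):
#           handle_data(data)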


@dataclass(slots=True)
class Blob(ChunkedObject):
    pass


@dataclass(frozen=True, order=True, slots=True)
class BlobObject(ObjectInfo):
    blob: Blob


@dataclass(slots=True)
class Symlink:
    repo: Repository
    is_absolute: bool
    target: PurePosixPath

    @classmethod
    def from_stream(cls, repo: Repository, stream: BinaryIO) -> Self:
        return cls(
            repo = repo,
            # Test the value of the flag byte, not the bytes object itself:
            # a non-empty bytes object is always truthy.
            is_absolute = bool(read_exact(stream, 1)[0]),
            target = PurePosixPath(stream.read().decode("utf-8")),
        )

    @classmethod
    def from_bytes(cls, repo: Repository, data: bytes) -> Self:
        stream = BytesIO(data)
        return cls.from_stream(repo, stream)

    def write(self, stream: BinaryIO):
        stream.write(self.is_absolute.to_bytes(1))
        stream.write(self.target.as_posix().encode("utf-8"))

    def to_bytes(self) -> bytes:
        stream = BytesIO()
        self.write(stream)
        return stream.getvalue()
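
# Serialized symlink layout, as written above: [is_absolute flag: 1 byte]
# followed by the target as a UTF-8, POSIX-style path running to the end of
# the object.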


@dataclass(frozen=True, order=True, slots=True)
class SymlinkObject(ObjectInfo):
    symlink: Symlink


@dataclass
class Tree:
    repo: Repository
    items: list[TreeItem]

    @property
    def total_size(self) -> int:
        return sum(
            item.size
            for item in self.items
        )

    @classmethod
    def from_stream(cls, repo: Repository, stream: BinaryIO) -> Self:
        tree = cls(repo, [])
        while (item := TreeItem.from_stream(stream, repo._cas._digest_size)) is not None:
            tree.items.append(item)
        return tree

    @classmethod
    def from_bytes(cls, repo: Repository, data: bytes) -> Self:
        stream = BytesIO(data)
        return cls.from_stream(repo, stream)

    def write(self, stream: BinaryIO):
        # Sort by name so the serialized form, and therefore the digest, is
        # deterministic regardless of insertion order.
        self.items.sort(key=lambda i: i.name)
        for item in self.items:
            item.write(stream)

    def to_bytes(self) -> bytes:
        stream = BytesIO()
        self.write(stream)
        return stream.getvalue()


@dataclass(frozen=True, order=True, slots=True)
class TreeObject(ObjectInfo):
    tree: Tree

    @property
    def total_size(self) -> int:
        return self.size + self.tree.total_size


@dataclass
class TreeItem:
    digest: Digest
    object_type: bytes
    size: int
    permissions: int
    modification_timestamp_us: int
    name: str

    def __init__(
        self,
        digest: Digest,
        object_type: bytes,
        size: int,
        permissions: int,
        modification_timestamp_us: int,
        name: str,
    ):
        # A tree item name is a single path component, so it must not contain
        # either kind of path separator.
        if "/" in name or "\\" in name:
            raise ValueError(f"invalid tree item name {name}")
        self.digest = digest
        self.object_type = object_type
        self.size = size
        self.permissions = permissions
        self.modification_timestamp_us = modification_timestamp_us
        self.name = name

    @property
    def modification_time(self) -> DateTime:
        return time_from_timestamp_us(self.modification_timestamp_us)

    @modification_time.setter
    def modification_time(self, time: DateTime):
        self.modification_timestamp_us = timestamp_us_from_time(time)

    @classmethod
    def from_stream(cls, stream: BinaryIO, digest_size: int) -> Self | None:
        digest_bytes = read_exact_or_eof(stream, digest_size)
        if digest_bytes is None:
            return None
        return cls(
            digest = Digest(digest_bytes),
            object_type = read_exact(stream, 4),
            size = int.from_bytes(read_exact(stream, 8)),
            permissions = int.from_bytes(read_exact(stream, 2)),
            modification_timestamp_us = int.from_bytes(read_exact(stream, 8), signed=True),
            name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"),
        )

    def write(self, stream: BinaryIO):
        stream.write(self.digest.digest)
        stream.write(self.object_type)
        stream.write(self.size.to_bytes(8))
        stream.write(self.permissions.to_bytes(2))
        stream.write(self.modification_timestamp_us.to_bytes(8, signed=True))
        name_bytes = self.name.encode("utf-8")
        stream.write(len(name_bytes).to_bytes(2))
        stream.write(name_bytes)
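
# Serialized tree layout, one record per item, as written above (big-endian
# integers, the int.to_bytes default):
#
#   [digest: digest_size bytes][object type: 4 bytes][size: 8 bytes]
#   [permissions: 2 bytes][mtime: 8 bytes, signed, microseconds]
#   [name length: 2 bytes][name: UTF-8]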


@dataclass
class Snapshot:
    repo: Repository
    tree_digest: Digest
    parents: list[Digest]
    repo_name: str
    timestamp_us: int

    def __post_init__(self):
        # The parent count is serialized as a single byte (see write()).
        assert len(self.parents) < 256

    @property
    def time(self) -> DateTime:
        return time_from_timestamp_us(self.timestamp_us)

    @time.setter
    def time(self, time: DateTime):
        self.timestamp_us = timestamp_us_from_time(time)

    @classmethod
    def from_stream(cls, repo: Repository, stream: BinaryIO) -> Self:
        return cls(
            repo = repo,
            tree_digest = Digest(read_exact(stream, repo._cas._digest_size)),
            parents = [
                Digest(read_exact(stream, repo._cas._digest_size))
                for _ in range(int.from_bytes(read_exact(stream, 1)))
            ],
            repo_name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"),
            timestamp_us = int.from_bytes(read_exact(stream, 8), signed=True),
        )

    @classmethod
    def from_bytes(cls, repo: Repository, data: bytes) -> Self:
        stream = BytesIO(data)
        return cls.from_stream(repo, stream)

    def write(self, stream: BinaryIO):
        assert len(self.parents) < 256
        stream.write(self.tree_digest.digest)
        stream.write(len(self.parents).to_bytes(1))
        for parent in self.parents:
            stream.write(parent.digest)
        repo_name_bytes = self.repo_name.encode("utf-8")
        stream.write(len(repo_name_bytes).to_bytes(2))
        stream.write(repo_name_bytes)
        stream.write(self.timestamp_us.to_bytes(8, signed=True))

    def to_bytes(self) -> bytes:
        stream = BytesIO()
        self.write(stream)
        return stream.getvalue()
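
# Serialized snapshot layout, as written above (big-endian integers):
#
#   [tree digest][parent count: 1 byte][parent digests, in order]
#   [repo name length: 2 bytes][repo name: UTF-8]
#   [timestamp: 8 bytes, signed, microseconds]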


@dataclass(frozen=True, order=True, slots=True)
class SnapshotObject(ObjectInfo):
    snapshot: Snapshot