From 17bef2e63a971e79cc7f8d4692dbe46abed7986b Mon Sep 17 00:00:00 2001 From: Draklaw Date: Sun, 19 Nov 2023 18:40:30 +0100 Subject: [PATCH] TreeWalker. --- src/bsv/object.py | 5 +- src/bsv/repository.py | 49 ++++++------ src/bsv/simple_cas/cas.py | 4 +- src/bsv/tree_walker.py | 157 ++++++++++++++++++++++++++++++++++++++ src/bsv/util.py | 19 +++++ tests/test_repository.py | 90 +++++++++++++++++++--- 6 files changed, 289 insertions(+), 35 deletions(-) create mode 100644 src/bsv/tree_walker.py diff --git a/src/bsv/object.py b/src/bsv/object.py index 4b817fd..cccf2aa 100644 --- a/src/bsv/object.py +++ b/src/bsv/object.py @@ -20,7 +20,10 @@ from dataclasses import dataclass @dataclass(frozen=True, order=True, slots=True) class Digest: - digest: bytes + digest: bytes = b"" + + def __bool__(self) -> bool: + return bool(self.digest) def __repr__(self) -> str: return self.digest.hex() diff --git a/src/bsv/repository.py b/src/bsv/repository.py index ec52f25..065b942 100644 --- a/src/bsv/repository.py +++ b/src/bsv/repository.py @@ -97,32 +97,37 @@ class Repository: with self: return self._read(digest, object_type=b"blob", cls=Blob) # type: ignore - def add_blob(self, stream: BinaryIO) -> Digest: + def add_blob(self, stream: BinaryIO, dry_run: bool=False) -> Digest: with self: - return self._write(b"blob", stream) + return self._write(b"blob", stream, dry_run=dry_run) def get_tree(self, digest: Digest) -> Tree: with self: return Tree.from_bytes(self, self._cas.read(digest, object_type=b"tree").data) - def add_tree(self, tree: Tree) -> Digest: + def add_tree(self, tree: Tree, dry_run: bool=False) -> Digest: with self: - return self._cas.write(b"tree", tree.to_bytes()) + return self._cas.write(b"tree", tree.to_bytes(), dry_run=dry_run) + + def add_tree_from_path(self, path: Path, dry_run: bool=False) -> Digest: + from bsv.tree_walker import TreeWalker + walker = TreeWalker(self, dry_run=dry_run) + return walker.add_tree(path) def get_snapshot(self, digest: Digest) -> Snapshot: with self: return Snapshot.from_bytes(self, self._cas.read(digest, object_type=b"snap").data) - def add_snapshot(self, snapshot: Snapshot) -> Digest: + def add_snapshot(self, snapshot: Snapshot, dry_run: bool=False) -> Digest: with self: - return self._cas.write(b"snap", snapshot.to_bytes()) + return self._cas.write(b"snap", snapshot.to_bytes(), dry_run=dry_run) def _read(self, digest: Digest, object_type: bytes, cls: Type[ChunkedObject]) -> ChunkedObject: obj = self._cas.read(digest, object_type=object_type) stream = BytesIO(obj.data) return cls.from_stream(self, stream, digest_size=self._cas._digest_size) - def _write(self, object_type: bytes, stream: BinaryIO) -> Digest: + def _write(self, object_type: bytes, stream: BinaryIO, dry_run: bool=False) -> Digest: out = BytesIO() size = 0 for chunk in fastcdc( @@ -133,7 +138,7 @@ class Repository: fat = True, ): size += chunk.length - digest = self._cas.write(b"chnk", chunk.data) + digest = self._cas.write(b"chnk", chunk.data, dry_run=dry_run) out.write(digest.digest) out.write(chunk.length.to_bytes(4)) return self._cas.write(object_type, size.to_bytes(8) + out.getvalue()) @@ -323,34 +328,30 @@ class Tree: @dataclass class TreeItem: - name: str digest: Digest + object_type: bytes + size: int permissions: int - creation_timestamp: int modification_timestamp: int + name: str def __init__( self, - name: str, digest: Digest, + object_type: bytes, + size: int, permissions: int, - creation_timestamp: int, modification_timestamp: int, + name: str, ): if "/\\" in name: raise ValueError(f"invalid tree item name {name}") - self.name = name self.digest = digest + self.object_type = object_type + self.size = size self.permissions = permissions - self.creation_timestamp = creation_timestamp self.modification_timestamp = modification_timestamp - - @property - def creation_time(self) -> DateTime: - return time_from_timestamp(self.creation_timestamp) - @creation_time.setter - def creation_time(self, time: DateTime): - self.creation_timestamp = timestamp_from_time(time) + self.name = name @property def modification_time(self) -> DateTime: @@ -366,16 +367,18 @@ class TreeItem: return None return TreeItem( digest = Digest(digest_bytes), + object_type = read_exact(stream, 4), + size = int.from_bytes(read_exact(stream, 8)), permissions = int.from_bytes(read_exact(stream, 2)), - creation_timestamp = int.from_bytes(read_exact(stream, 8), signed=True), modification_timestamp = int.from_bytes(read_exact(stream, 8), signed=True), name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"), ) def write(self, stream: BinaryIO): stream.write(self.digest.digest) + stream.write(self.object_type) + stream.write(self.size.to_bytes(8)) stream.write(self.permissions.to_bytes(2)) - stream.write(self.creation_timestamp.to_bytes(8, signed=True)) stream.write(self.modification_timestamp.to_bytes(8, signed=True)) name_bytes = self.name.encode("utf-8") stream.write(len(name_bytes).to_bytes(2)) diff --git a/src/bsv/simple_cas/cas.py b/src/bsv/simple_cas/cas.py index f0b3577..3581a48 100644 --- a/src/bsv/simple_cas/cas.py +++ b/src/bsv/simple_cas/cas.py @@ -96,7 +96,7 @@ class SimpleCas: return Object(digest, object_type, size, data) - def write(self, object_type: bytes, data: bytes) -> Digest: + def write(self, object_type: bytes, data: bytes, dry_run: bool=False) -> Digest: assert len(object_type) == 4 assert len(data) < 2**32 @@ -108,7 +108,7 @@ class SimpleCas: hash.update(data) digest = Digest(hash.digest()) - if digest not in self: + if not dry_run and digest not in self: with self._open_writer(digest, object_type, len(data)) as out: out.write(digest.digest) out.write(object_type) diff --git a/src/bsv/tree_walker.py b/src/bsv/tree_walker.py new file mode 100644 index 0000000..228303d --- /dev/null +++ b/src/bsv/tree_walker.py @@ -0,0 +1,157 @@ +# bsv - Backup, Synchronization, Versioning +# Copyright (C) 2023 Simon Boyé +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +from __future__ import annotations + +from enum import Enum +from os import stat_result +from pathlib import Path +import stat + +from bsv.object import Digest +from bsv.repository import Repository, Tree, TreeItem +from bsv.util import is_bsv_repository, object_type_from_mode + + +class Action(Enum): + ADD = "add" + UPDATE = "update" + IGNORE = "ignore" + ERROR = "error" + +class IgnoreCause(Enum): + IGNORE_RULE = "ignore_rule" + UNCHANGED = "unchanged" + UNSUPPORTED_TYPE = "unsupported_type" + + +class TreeWalker: + _repo: Repository + _dry_run: bool = False + + def __init__(self, repo: Repository, dry_run: bool=False): + self._repo = repo + self._dry_run = dry_run + + def add_tree(self, path: Path) -> Digest: + pstat = path.stat(follow_symlinks=False) + if self.ignore(path, pstat): + self.report(Action.IGNORE, path, pstat, IgnoreCause.IGNORE_RULE) + return Digest() + return self._add_tree(path, pstat) + + def _add_tree(self, path: Path, pstat: stat_result) -> Digest: + tree = Tree(self._repo, []) + for item in sorted(path.iterdir()): + digest = Digest() + try: + istat = item.lstat() + if self.ignore(item, istat): + self.report(Action.IGNORE, item, istat, IgnoreCause.IGNORE_RULE) + continue + object_type = object_type_from_mode(istat.st_mode) + if object_type == b"slnk": + digest = self._add_symlink(item, istat) + elif object_type == b"tree": + digest = self._add_tree(item, istat) + elif object_type == b"blob": + digest = self._add_blob(item, istat) + else: + self.report(Action.IGNORE, item, istat, IgnoreCause.UNSUPPORTED_TYPE) + continue + except Exception as err: + self.report(Action.ERROR, item, None, err) + continue + + if digest: + self.report(Action.ADD, path, pstat) + tree.items.append(TreeItem( + digest = digest, + object_type = object_type, + size = istat.st_size, + permissions = stat.S_IMODE(istat.st_mode), + modification_timestamp = istat.st_mtime_ns, + name = item.name, + )) + + return self._repo.add_tree(tree, dry_run=self._dry_run) + + + def _add_symlink(self, path: Path, pstat: stat_result) -> Digest: + # TODO: Store symlink relative to current dir ? + # * What about symlink that points outside of the backup dirs + # * Should symlinks that points inside the backup dirs but in another + # mount-point adjusted ? + # * Should absolute symlink be restored as absolute ? + self.report(Action.ADD, path, pstat) + return self._repo._cas.write( + b"slnk", + path.readlink().as_posix().encode("utf-8"), + dry_run = self._dry_run, + ) + + def _add_blob(self, path: Path, pstat: stat_result) -> Digest: + self.report(Action.ADD, path, pstat) + with path.open("rb") as stream: + return self._repo.add_blob(stream, dry_run=self._dry_run) + + + def ignore(self, path: Path, pstat: stat_result) -> bool: + return is_bsv_repository(path) + + def report(self, action: Action, path: Path, pstat: stat_result | None, info: IgnoreCause | Exception | None=None): + match action, info: + case (Action.ADD, None): + print(f"Add: {path}") + case (Action.IGNORE, IgnoreCause.IGNORE_RULE): + print(f"Ignore (rule): {path}") + case (Action.IGNORE, IgnoreCause.UNCHANGED): + print(f"Ignore (unchanged): {path}") + case (Action.IGNORE, IgnoreCause.UNSUPPORTED_TYPE) if pstat is not None: + assert pstat is not None + print(f"Ignore (unsupported type {path_type_name(pstat)}): {path}") + case (Action.ERROR, _) if isinstance(info, Exception): + print(f"Error {info}: {path}") + case _: + raise ValueError("TreeWalker.report(): unsupported parameter combination") + + +def path_type_name(pstat: stat_result) -> str: + parts = [] + + if stat.S_ISBLK(pstat.st_mode): + parts.append("block_device") + if stat.S_ISCHR(pstat.st_mode): + parts.append("char_device") + if stat.S_ISDIR(pstat.st_mode): + parts.append("dir") + if stat.S_ISDOOR(pstat.st_mode): + parts.append("door") + if stat.S_ISFIFO(pstat.st_mode): + parts.append("fifo") + if stat.S_ISLNK(pstat.st_mode): + parts.append("symlink") + if stat.S_ISPORT(pstat.st_mode): + parts.append("port") + if stat.S_ISREG(pstat.st_mode): + parts.append("file") + if stat.S_ISSOCK(pstat.st_mode): + parts.append("socket") + if stat.S_ISWHT(pstat.st_mode): + parts.append("whiteout") + + if not parts: + return "unknown" + return ", ".join(parts) diff --git a/src/bsv/util.py b/src/bsv/util.py index ed2dac2..2c10877 100644 --- a/src/bsv/util.py +++ b/src/bsv/util.py @@ -17,6 +17,8 @@ from __future__ import annotations from abc import ABC, abstractmethod from datetime import UTC, datetime as DateTime, timedelta as TimeDelta +from pathlib import Path +import stat from typing import BinaryIO @@ -45,6 +47,23 @@ def read_exact_or_eof(stream: BinaryIO, num_bytes: int) -> bytes | None: return data +def is_bsv_repository(path: Path) -> bool: + return (path / "bsv_config.toml").is_file() + + +def object_type_from_path(path: Path) -> bytes: + return object_type_from_mode(path.stat(follow_symlinks=False).st_mode) + +def object_type_from_mode(mode: int) -> bytes: + if stat.S_ISLNK(mode): + return b"slnk" + elif stat.S_ISDIR(mode): + return b"tree" + elif stat.S_ISREG(mode): + return b"blob" + return b"" + + class Hash(ABC): name: str digest_size: int diff --git a/tests/test_repository.py b/tests/test_repository.py index fe6bdea..a46e143 100644 --- a/tests/test_repository.py +++ b/tests/test_repository.py @@ -15,7 +15,7 @@ # along with this program. If not, see . from __future__ import annotations from datetime import UTC, datetime -from io import BytesIO +from os import stat_result from pathlib import Path from random import randbytes from typing import Iterator @@ -25,6 +25,7 @@ from tempfile import TemporaryDirectory from bsv.repository import Repository, Snapshot, Tree, TreeItem, create_repository, timestamp_from_time from bsv.simple_cas.cas import Digest +from bsv.tree_walker import Action, IgnoreCause, TreeWalker @pytest.fixture @@ -61,18 +62,20 @@ def test_read_write_tree(repo: Repository): repo, [ TreeItem( - "xyz", - Digest(bytes([42]) * repo._cas._digest_size), - 0o744, - creation_timestamp = timestamp_from_time(now), + digest = Digest(bytes([42]) * repo._cas._digest_size), + object_type = b"blob", + size = 123, + permissions = 0o744, modification_timestamp = timestamp_from_time(now), + name = "xyz", ), TreeItem( - "foobar", - Digest(bytes([123]) * repo._cas._digest_size), - 0o777, - creation_timestamp = timestamp_from_time(now), + digest = Digest(bytes([123]) * repo._cas._digest_size), + object_type = b"slnk", + size = 42, + permissions = 0o777, modification_timestamp = timestamp_from_time(now), + name = "foobar", ), ] ) @@ -97,6 +100,75 @@ def test_read_write_snapshot(repo: Repository): assert repo.get_snapshot(digest) == snapshot +class TestTreeWalker(TreeWalker): + reports: list + + def __init__(self, repo: Repository, dry_run: bool=False): + super().__init__(repo, dry_run) + self.reports = [] + + def report( + self, + action: Action, + path: Path, + pstat: stat_result | None, + info: IgnoreCause | Exception | None = None + ): + super().report(action, path, pstat, info) + self.reports.append((action, path, pstat, info)) + + +def test_add_tree(tmp_dir: Path, repo: Repository): + dir = tmp_dir / "test" + structure = { + "folder": { + "sub_folder": { + "empty_folder": {}, + "foo.txt": b"Hello World!\n", + }, + "test.py": b"print(\"Hello World!\")\n", + "bar.dat": bytes(range(256)), + }, + "Another test with long name and spaces and a bang !": b"Should works.\n", + "bsv_repo": { + "bsv_config.toml": b"[bsv]\n", + }, + } + + create_file_structure(dir, structure) + + walker = TestTreeWalker(repo) + dir_digest = walker.add_tree(dir) + + def check(digest: Digest, value: dict | bytes): + if isinstance(value, dict): + tree = repo.get_tree(digest) + assert tree + assert list(map(lambda i: i.name, tree.items)) == sorted(value.keys()) + for item in tree.items: + check(item.digest, value[item.name]) + elif isinstance(value, bytes): + blob = repo.get_blob(digest) + data = blob.reader().read() + assert data == value + + expected = dict(structure) + del expected["bsv_repo"] + check(dir_digest, expected) + + +def create_file_structure(dst: Path, value: dict | bytes): + assert not dst.exists() + if isinstance(value, dict): + dst.mkdir() + for name, item in value.items(): + create_file_structure(dst / name, item) + elif isinstance(value, bytes): + dst.write_bytes(value) + else: + raise TypeError(f"invalid type {type(value).__name__} for parameter value") + + def make_random_file(path: Path, size: int): with path.open("wb") as stream: for chunk_size in iter_chunks(size):