diff --git a/src/bsv/object.py b/src/bsv/object.py
index 4b817fd..cccf2aa 100644
--- a/src/bsv/object.py
+++ b/src/bsv/object.py
@@ -20,7 +20,10 @@ from dataclasses import dataclass
@dataclass(frozen=True, order=True, slots=True)
class Digest:
- digest: bytes
+ digest: bytes = b""
+
+ def __bool__(self) -> bool:
+ return bool(self.digest)
def __repr__(self) -> str:
return self.digest.hex()
diff --git a/src/bsv/repository.py b/src/bsv/repository.py
index ec52f25..065b942 100644
--- a/src/bsv/repository.py
+++ b/src/bsv/repository.py
@@ -97,32 +97,37 @@ class Repository:
with self:
return self._read(digest, object_type=b"blob", cls=Blob) # type: ignore
-    def add_blob(self, stream: BinaryIO) -> Digest:
+    def add_blob(self, stream: BinaryIO, dry_run: bool=False) -> Digest:
+        """Store the content of ``stream`` as a ``blob`` object and return its digest.
+
+        The work is delegated to ``_write`` while the repository context
+        (``with self``) is held; ``dry_run`` is forwarded unchanged.
+        """
         with self:
-            return self._write(b"blob", stream)
+            return self._write(b"blob", stream, dry_run=dry_run)
     def get_tree(self, digest: Digest) -> Tree:
+        """Read the ``tree`` object stored under ``digest`` and decode it with ``Tree.from_bytes``."""
         with self:
             return Tree.from_bytes(self, self._cas.read(digest, object_type=b"tree").data)
-    def add_tree(self, tree: Tree) -> Digest:
+    def add_tree(self, tree: Tree, dry_run: bool=False) -> Digest:
+        """Serialize ``tree`` with ``to_bytes`` and pass it to the CAS as a ``tree`` object.
+
+        Returns the digest returned by ``self._cas.write``; ``dry_run`` is
+        forwarded to that call.
+        """
         with self:
-            return self._cas.write(b"tree", tree.to_bytes())
+            return self._cas.write(b"tree", tree.to_bytes(), dry_run=dry_run)
+
+    def add_tree_from_path(self, path: Path, dry_run: bool=False) -> Digest:
+        """Add the content found at ``path`` through a ``TreeWalker``.
+
+        Returns whatever ``TreeWalker.add_tree`` returns for ``path``.
+        """
+        # Imported locally: bsv.tree_walker itself imports this module, so a
+        # module-level import would be circular.
+        from bsv.tree_walker import TreeWalker
+        # NOTE(review): unlike the sibling add_* methods, this one does not
+        # enter `with self:` -- confirm that every CAS access made by the
+        # walker happens inside the repository context.
+        walker = TreeWalker(self, dry_run=dry_run)
+        return walker.add_tree(path)
     def get_snapshot(self, digest: Digest) -> Snapshot:
+        """Read the ``snap`` object stored under ``digest`` and decode it with ``Snapshot.from_bytes``."""
         with self:
             return Snapshot.from_bytes(self, self._cas.read(digest, object_type=b"snap").data)
-    def add_snapshot(self, snapshot: Snapshot) -> Digest:
+    def add_snapshot(self, snapshot: Snapshot, dry_run: bool=False) -> Digest:
+        """Serialize ``snapshot`` with ``to_bytes`` and pass it to the CAS as a ``snap`` object.
+
+        Returns the digest returned by ``self._cas.write``; ``dry_run`` is
+        forwarded to that call.
+        """
         with self:
-            return self._cas.write(b"snap", snapshot.to_bytes())
+            return self._cas.write(b"snap", snapshot.to_bytes(), dry_run=dry_run)
     def _read(self, digest: Digest, object_type: bytes, cls: Type[ChunkedObject]) -> ChunkedObject:
+        """Read object ``digest`` of type ``object_type`` and build a ``cls`` from its payload.
+
+        This helper does not enter ``with self`` itself.
+        """
         obj = self._cas.read(digest, object_type=object_type)
+        # Wrap the raw payload so that `from_stream` can consume it as a stream.
         stream = BytesIO(obj.data)
         return cls.from_stream(self, stream, digest_size=self._cas._digest_size)
-    def _write(self, object_type: bytes, stream: BinaryIO) -> Digest:
+    def _write(self, object_type: bytes, stream: BinaryIO, dry_run: bool=False) -> Digest:
+        """Split ``stream`` into chunks, store them, then store the index object.
+
+        Every chunk produced by ``fastcdc`` is written as a ``chnk`` object.
+        The index object, of type ``object_type``, holds the total size on
+        8 bytes followed, for each chunk, by its digest and its length on
+        4 bytes.
+
+        Args:
+            object_type: 4-byte type tag of the index object.
+            stream: binary stream to read the content from.
+            dry_run: forwarded to every CAS write, chunks and index alike.
+
+        Returns:
+            The digest of the index object.
+        """
         out = BytesIO()
         size = 0
         for chunk in fastcdc(
@@ -133,7 +138,7 @@ class Repository:
             fat = True,
         ):
             size += chunk.length
-            digest = self._cas.write(b"chnk", chunk.data)
+            digest = self._cas.write(b"chnk", chunk.data, dry_run=dry_run)
             out.write(digest.digest)
             out.write(chunk.length.to_bytes(4))
-        return self._cas.write(object_type, size.to_bytes(8) + out.getvalue())
+        # The index must honour dry_run too: otherwise a dry run stores an
+        # index object that references chunks which were never written.
+        return self._cas.write(object_type, size.to_bytes(8) + out.getvalue(), dry_run=dry_run)
@@ -323,34 +328,30 @@ class Tree:
@dataclass
class TreeItem:
- name: str
digest: Digest
+ object_type: bytes
+ size: int
permissions: int
- creation_timestamp: int
modification_timestamp: int
+ name: str
def __init__(
self,
- name: str,
digest: Digest,
+ object_type: bytes,
+ size: int,
permissions: int,
- creation_timestamp: int,
modification_timestamp: int,
+ name: str,
):
if "/\\" in name:
raise ValueError(f"invalid tree item name {name}")
- self.name = name
self.digest = digest
+ self.object_type = object_type
+ self.size = size
self.permissions = permissions
- self.creation_timestamp = creation_timestamp
self.modification_timestamp = modification_timestamp
-
- @property
- def creation_time(self) -> DateTime:
- return time_from_timestamp(self.creation_timestamp)
- @creation_time.setter
- def creation_time(self, time: DateTime):
- self.creation_timestamp = timestamp_from_time(time)
+ self.name = name
@property
def modification_time(self) -> DateTime:
@@ -366,16 +367,18 @@ class TreeItem:
return None
return TreeItem(
digest = Digest(digest_bytes),
+ object_type = read_exact(stream, 4),
+ size = int.from_bytes(read_exact(stream, 8)),
permissions = int.from_bytes(read_exact(stream, 2)),
- creation_timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
modification_timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"),
)
def write(self, stream: BinaryIO):
stream.write(self.digest.digest)
+ stream.write(self.object_type)
+ stream.write(self.size.to_bytes(8))
stream.write(self.permissions.to_bytes(2))
- stream.write(self.creation_timestamp.to_bytes(8, signed=True))
stream.write(self.modification_timestamp.to_bytes(8, signed=True))
name_bytes = self.name.encode("utf-8")
stream.write(len(name_bytes).to_bytes(2))
diff --git a/src/bsv/simple_cas/cas.py b/src/bsv/simple_cas/cas.py
index f0b3577..3581a48 100644
--- a/src/bsv/simple_cas/cas.py
+++ b/src/bsv/simple_cas/cas.py
@@ -96,7 +96,7 @@ class SimpleCas:
return Object(digest, object_type, size, data)
- def write(self, object_type: bytes, data: bytes) -> Digest:
+ def write(self, object_type: bytes, data: bytes, dry_run: bool=False) -> Digest:
assert len(object_type) == 4
assert len(data) < 2**32
@@ -108,7 +108,7 @@ class SimpleCas:
hash.update(data)
digest = Digest(hash.digest())
- if digest not in self:
+ if not dry_run and digest not in self:
with self._open_writer(digest, object_type, len(data)) as out:
out.write(digest.digest)
out.write(object_type)
diff --git a/src/bsv/tree_walker.py b/src/bsv/tree_walker.py
new file mode 100644
index 0000000..228303d
--- /dev/null
+++ b/src/bsv/tree_walker.py
@@ -0,0 +1,157 @@
+# bsv - Backup, Synchronization, Versioning
+# Copyright (C) 2023 Simon Boyé
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+from __future__ import annotations
+
+from enum import Enum
+from os import stat_result
+from pathlib import Path
+import stat
+
+from bsv.object import Digest
+from bsv.repository import Repository, Tree, TreeItem
+from bsv.util import is_bsv_repository, object_type_from_mode
+
+
+class Action(Enum):
+    """Kind of event passed as first argument to ``TreeWalker.report``."""
+    ADD = "add"
+    # UPDATE is declared but nothing in this module emits it, and the
+    # default ``TreeWalker.report`` rejects it with ValueError.
+    UPDATE = "update"
+    IGNORE = "ignore"
+    ERROR = "error"
+
+class IgnoreCause(Enum):
+    """Reason attached to an ``Action.IGNORE`` report."""
+    # The path was rejected by ``TreeWalker.ignore``.
+    IGNORE_RULE = "ignore_rule"
+    # Handled by ``TreeWalker.report`` but not emitted by this module.
+    UNCHANGED = "unchanged"
+    # ``object_type_from_mode`` returned no known type for the path.
+    UNSUPPORTED_TYPE = "unsupported_type"
+
+
+class TreeWalker:
+    """Walk a directory and store its content in a repository.
+
+    Directories become ``tree`` objects, regular files ``blob`` objects and
+    symbolic links ``slnk`` objects. Subclasses can override ``ignore`` to
+    filter paths and ``report`` to observe what happens during the walk.
+    """
+
+    _repo: Repository
+    _dry_run: bool = False
+
+    def __init__(self, repo: Repository, dry_run: bool=False):
+        """Create a walker writing to ``repo``.
+
+        Args:
+            repo: repository receiving the objects.
+            dry_run: forwarded to every repository / CAS write made by
+                the walker.
+        """
+        self._repo = repo
+        self._dry_run = dry_run
+
+    def add_tree(self, path: Path) -> Digest:
+        """Add the directory ``path`` and return the digest of its tree.
+
+        Returns an empty (falsy) ``Digest`` when ``path`` itself is
+        rejected by ``ignore``. Failures on entries below ``path`` are
+        reported through ``report`` and the entry is skipped.
+        """
+        pstat = path.stat(follow_symlinks=False)
+        if self.ignore(path, pstat):
+            self.report(Action.IGNORE, path, pstat, IgnoreCause.IGNORE_RULE)
+            return Digest()
+        return self._add_tree(path, pstat)
+
+    def _add_tree(self, path: Path, pstat: stat_result) -> Digest:
+        """Store directory ``path`` as a tree object and return its digest."""
+        # Report the directory exactly once, before its content, the same
+        # way _add_blob and _add_symlink report themselves. This used to
+        # be reported once per child, and never for an empty directory.
+        self.report(Action.ADD, path, pstat)
+        tree = Tree(self._repo, [])
+        for item in sorted(path.iterdir()):
+            try:
+                tree_item = self._add_item(item)
+            except Exception as err:
+                # Best effort: a failing entry (unreadable file, invalid
+                # name, ...) is reported and left out of the tree instead
+                # of aborting the whole walk.
+                self.report(Action.ERROR, item, None, err)
+                continue
+            if tree_item is not None:
+                tree.items.append(tree_item)
+
+        return self._repo.add_tree(tree, dry_run=self._dry_run)
+
+    def _add_item(self, item: Path) -> TreeItem | None:
+        """Store one directory entry; return its TreeItem, or None if skipped."""
+        istat = item.lstat()
+        if self.ignore(item, istat):
+            self.report(Action.IGNORE, item, istat, IgnoreCause.IGNORE_RULE)
+            return None
+
+        object_type = object_type_from_mode(istat.st_mode)
+        if object_type == b"slnk":
+            digest = self._add_symlink(item, istat)
+        elif object_type == b"tree":
+            digest = self._add_tree(item, istat)
+        elif object_type == b"blob":
+            digest = self._add_blob(item, istat)
+        else:
+            self.report(Action.IGNORE, item, istat, IgnoreCause.UNSUPPORTED_TYPE)
+            return None
+
+        # An empty digest means nothing was stored for this entry.
+        if not digest:
+            return None
+        return TreeItem(
+            digest = digest,
+            object_type = object_type,
+            size = istat.st_size,
+            permissions = stat.S_IMODE(istat.st_mode),
+            modification_timestamp = istat.st_mtime_ns,
+            name = item.name,
+        )
+
+    def _add_symlink(self, path: Path, pstat: stat_result) -> Digest:
+        """Store the target of symlink ``path`` as a ``slnk`` object."""
+        # TODO: Store symlinks relative to the current dir?
+        #   * What about symlinks that point outside of the backup dirs?
+        #   * Should symlinks that point inside the backup dirs, but on
+        #     another mount point, be adjusted?
+        #   * Should absolute symlinks be restored as absolute?
+        self.report(Action.ADD, path, pstat)
+        target = path.readlink().as_posix().encode("utf-8")
+        # Every Repository method enters the repository context before
+        # touching `_cas`; do the same for this direct CAS access.
+        with self._repo:
+            return self._repo._cas.write(b"slnk", target, dry_run=self._dry_run)
+
+    def _add_blob(self, path: Path, pstat: stat_result) -> Digest:
+        """Store the content of regular file ``path`` as a blob."""
+        self.report(Action.ADD, path, pstat)
+        with path.open("rb") as stream:
+            return self._repo.add_blob(stream, dry_run=self._dry_run)
+
+    def ignore(self, path: Path, pstat: stat_result) -> bool:
+        """Tell whether ``path`` must be skipped.
+
+        The default rule skips any path accepted by ``is_bsv_repository``.
+        """
+        return is_bsv_repository(path)
+
+    def report(self, action: Action, path: Path, pstat: stat_result | None, info: IgnoreCause | Exception | None=None):
+        """Print one line describing an event of the walk.
+
+        Args:
+            action: what happened to ``path``.
+            path: the path concerned.
+            pstat: stat result of ``path`` when available; required for
+                ``IgnoreCause.UNSUPPORTED_TYPE``.
+            info: the ``IgnoreCause`` for ``Action.IGNORE``, the exception
+                for ``Action.ERROR``, ``None`` for ``Action.ADD``.
+
+        Raises:
+            ValueError: for any other combination of parameters.
+        """
+        match action, info:
+            case (Action.ADD, None):
+                print(f"Add: {path}")
+            case (Action.IGNORE, IgnoreCause.IGNORE_RULE):
+                print(f"Ignore (rule): {path}")
+            case (Action.IGNORE, IgnoreCause.UNCHANGED):
+                print(f"Ignore (unchanged): {path}")
+            case (Action.IGNORE, IgnoreCause.UNSUPPORTED_TYPE) if pstat is not None:
+                # Already guaranteed by the guard above; presumably kept
+                # for static type narrowing.
+                assert pstat is not None
+                print(f"Ignore (unsupported type {path_type_name(pstat)}): {path}")
+            case (Action.ERROR, _) if isinstance(info, Exception):
+                print(f"Error {info}: {path}")
+            case _:
+                raise ValueError("TreeWalker.report(): unsupported parameter combination")
+
+
+def path_type_name(pstat: stat_result) -> str:
+    """Describe the file type encoded in ``pstat.st_mode``.
+
+    Returns the names of all matching ``stat.S_IS*`` predicates joined by
+    ``", "``, or ``"unknown"`` when none matches.
+    """
+    mode = pstat.st_mode
+    # Same predicates, in the same order, as the textual output expects.
+    predicates = (
+        (stat.S_ISBLK, "block_device"),
+        (stat.S_ISCHR, "char_device"),
+        (stat.S_ISDIR, "dir"),
+        (stat.S_ISDOOR, "door"),
+        (stat.S_ISFIFO, "fifo"),
+        (stat.S_ISLNK, "symlink"),
+        (stat.S_ISPORT, "port"),
+        (stat.S_ISREG, "file"),
+        (stat.S_ISSOCK, "socket"),
+        (stat.S_ISWHT, "whiteout"),
+    )
+    names = [label for matches, label in predicates if matches(mode)]
+    return ", ".join(names) if names else "unknown"
diff --git a/src/bsv/util.py b/src/bsv/util.py
index ed2dac2..2c10877 100644
--- a/src/bsv/util.py
+++ b/src/bsv/util.py
@@ -17,6 +17,8 @@ from __future__ import annotations
from abc import ABC, abstractmethod
from datetime import UTC, datetime as DateTime, timedelta as TimeDelta
+from pathlib import Path
+import stat
from typing import BinaryIO
@@ -45,6 +47,23 @@ def read_exact_or_eof(stream: BinaryIO, num_bytes: int) -> bytes | None:
return data
+def is_bsv_repository(path: Path) -> bool:
+    """Tell whether ``path`` contains a regular file named ``bsv_config.toml``."""
+    config_file = path.joinpath("bsv_config.toml")
+    return config_file.is_file()
+
+
+def object_type_from_path(path: Path) -> bytes:
+    """Return the object type tag of ``path`` itself, without following symlinks."""
+    mode = path.lstat().st_mode
+    return object_type_from_mode(mode)
+
+def object_type_from_mode(mode: int) -> bytes:
+    """Map an ``st_mode`` value to a 4-byte object type tag.
+
+    Symlinks give ``b"slnk"``, directories ``b"tree"``, regular files
+    ``b"blob"``; any other file type gives ``b""``.
+    """
+    type_tags = (
+        (stat.S_ISLNK, b"slnk"),
+        (stat.S_ISDIR, b"tree"),
+        (stat.S_ISREG, b"blob"),
+    )
+    for matches, tag in type_tags:
+        if matches(mode):
+            return tag
+    return b""
+
+
class Hash(ABC):
name: str
digest_size: int
diff --git a/tests/test_repository.py b/tests/test_repository.py
index fe6bdea..a46e143 100644
--- a/tests/test_repository.py
+++ b/tests/test_repository.py
@@ -15,7 +15,7 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
from __future__ import annotations
from datetime import UTC, datetime
-from io import BytesIO
+from os import stat_result
from pathlib import Path
from random import randbytes
from typing import Iterator
@@ -25,6 +25,7 @@ from tempfile import TemporaryDirectory
from bsv.repository import Repository, Snapshot, Tree, TreeItem, create_repository, timestamp_from_time
from bsv.simple_cas.cas import Digest
+from bsv.tree_walker import Action, IgnoreCause, TreeWalker
@pytest.fixture
@@ -61,18 +62,20 @@ def test_read_write_tree(repo: Repository):
repo,
[
TreeItem(
- "xyz",
- Digest(bytes([42]) * repo._cas._digest_size),
- 0o744,
- creation_timestamp = timestamp_from_time(now),
+ digest = Digest(bytes([42]) * repo._cas._digest_size),
+ object_type = b"blob",
+ size = 123,
+ permissions = 0o744,
modification_timestamp = timestamp_from_time(now),
+ name = "xyz",
),
TreeItem(
- "foobar",
- Digest(bytes([123]) * repo._cas._digest_size),
- 0o777,
- creation_timestamp = timestamp_from_time(now),
+ digest = Digest(bytes([123]) * repo._cas._digest_size),
+ object_type = b"slnk",
+ size = 42,
+ permissions = 0o777,
modification_timestamp = timestamp_from_time(now),
+ name = "foobar",
),
]
)
@@ -97,6 +100,75 @@ def test_read_write_snapshot(repo: Repository):
assert repo.get_snapshot(digest) == snapshot
+class TestTreeWalker(TreeWalker):
+    """TreeWalker that records every ``report`` call so tests can inspect them."""
+
+    # The name matches pytest's default `Test*` class pattern. This is a
+    # helper, not a test case: opt out of collection so pytest does not
+    # warn about a test class with an __init__ constructor.
+    __test__ = False
+
+    # One (action, path, pstat, info) tuple per report() call, in call order.
+    reports: list[tuple[Action, Path, stat_result | None, IgnoreCause | Exception | None]]
+
+    def __init__(self, repo: Repository, dry_run: bool=False):
+        super().__init__(repo, dry_run)
+        self.reports = []
+
+    def report(
+        self,
+        action: Action,
+        path: Path,
+        pstat: stat_result | None,
+        info: IgnoreCause | Exception | None = None
+    ):
+        """Forward to ``TreeWalker.report``, then record the arguments."""
+        super().report(action, path, pstat, info)
+        self.reports.append((action, path, pstat, info))
+
+
+def test_add_tree(tmp_dir: Path, repo: Repository):
+    """Walk a small directory and check that the stored trees and blobs match it.
+
+    The ``bsv_repo`` folder holds a ``bsv_config.toml`` file, so it is
+    expected to be left out of the stored tree.
+    """
+    root = tmp_dir / "test"
+    structure = {
+        "folder": {
+            "sub_folder": {
+                "empty_folder": {},
+                "foo.txt": b"Hello World!\n",
+            },
+            "test.py": b"print(\"Hello World!\")\n",
+            "bar.dat": bytes(range(256)),
+        },
+        "Another test with long name and spaces and a bang !": b"Should works.\n",
+        "bsv_repo": {
+            "bsv_config.toml": b"[bsv]\n",
+        },
+    }
+
+    create_file_structure(root, structure)
+
+    walker = TestTreeWalker(repo)
+    root_digest = walker.add_tree(root)
+
+    def check(digest: Digest, value: dict | bytes):
+        # Compare the object stored under `digest` with the expected value.
+        if isinstance(value, bytes):
+            stored = repo.get_blob(digest).reader().read()
+            assert stored == value
+            return
+        if not isinstance(value, dict):
+            return
+        tree = repo.get_tree(digest)
+        assert tree
+        assert [entry.name for entry in tree.items] == sorted(value)
+        for entry in tree.items:
+            check(entry.digest, value[entry.name])
+
+    expected = {name: content for name, content in structure.items() if name != "bsv_repo"}
+    check(root_digest, expected)
+
+
+def create_file_structure(dst: Path, value: dict | bytes):
+    """Materialize ``value`` on disk at ``dst``.
+
+    ``bytes`` become a file with that content; a ``dict`` becomes a
+    directory whose entries are created recursively from its items.
+    ``dst`` must not exist yet.
+
+    Raises:
+        TypeError: if ``value`` is neither a ``dict`` nor ``bytes``.
+    """
+    assert not dst.exists()
+    if isinstance(value, bytes):
+        dst.write_bytes(value)
+        return
+    if not isinstance(value, dict):
+        raise TypeError(f"invalid type {type(value).__name__} for parameter value")
+    dst.mkdir()
+    for child_name, child_value in value.items():
+        create_file_structure(dst.joinpath(child_name), child_value)
+
+
def make_random_file(path: Path, size: int):
with path.open("wb") as stream:
for chunk_size in iter_chunks(size):