diff --git a/src/bsv/path_map.py b/src/bsv/path_map.py new file mode 100644 index 0000000..fd72487 --- /dev/null +++ b/src/bsv/path_map.py @@ -0,0 +1,58 @@ +# bsv - Backup, Synchronization, Versioning +# Copyright (C) 2023 Simon Boyé +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +from __future__ import annotations +from dataclasses import dataclass +from itertools import pairwise + +from pathlib import Path, PurePosixPath +from typing import Any + + +@dataclass(order=True, frozen=True, slots=True) +class PathPair: + bsv: PurePosixPath + fs: Path + + def __post_init__(self): + if not self.bsv.is_absolute() or not self.fs.is_absolute(): + raise ValueError("paths in path_map must be absolute") + + @classmethod + def from_obj(cls, obj: dict[str, str]) -> PathPair: + return cls( + bsv = PurePosixPath(obj["bsv"]), + fs = Path(obj["fs"]), + ) + + +class PathMap: + paths: list[PathPair] + + def __init__(self, paths: list[PathPair]=[]): + self.paths = sorted(paths) + for path0, path1 in pairwise(self.paths): + if path0 == path1 or path1.bsv.relative_to(path0.bsv): + raise ValueError("bsv paths must be unique and independent") + + @classmethod + def from_obj(cls, obj: list[dict[str, str]]) -> PathMap: + return cls([ + PathPair.from_obj(item) + for item in obj + ]) + + def clone(self) -> PathMap: + return PathMap(self.paths) diff --git a/src/bsv/repository.py b/src/bsv/repository.py index 065b942..3332ba2 100644 --- a/src/bsv/repository.py +++ b/src/bsv/repository.py @@ -19,19 +19,23 @@ from dataclasses import dataclass from datetime import datetime as DateTime import hashlib from io import BytesIO -from pathlib import Path, PurePosixPath +from pathlib import Path import platform import tomllib -from typing import Any, BinaryIO, Callable, Type +from typing import TYPE_CHECKING, BinaryIO, Callable, Type from fastcdc import fastcdc import tomlkit from bsv import __version__ from bsv.exception import ConfigError +from bsv.path_map import PathMap from bsv.simple_cas import SimpleCas from bsv.simple_cas.cas import Digest, SimpleCas -from bsv.util import Hash, read_exact, read_exact_or_eof, time_from_timestamp, timestamp_from_time +from bsv.util import Hash, read_exact, read_exact_or_eof, time_from_timestamp_us, timestamp_us_from_time + +if TYPE_CHECKING: + from bsv.tree_walker import TreeWalker DEFAULT_MIN_CHUNK_SIZE = 1 << 12 @@ -48,7 +52,7 @@ class Repository: _avg_chunk_size: int = DEFAULT_AVG_CHUNK_SIZE _max_chunk_size: int = DEFAULT_MAX_CHUNK_SIZE - _path_map: list[PathPair] + _path_map: PathMap # _remotes: list[object] _context_depth: int = 0 @@ -72,10 +76,7 @@ class Repository: self._avg_chunk_size = bsv.get("avg_chunk_size") self._max_chunk_size = bsv.get("max_chunk_size") - self._path_map = [ - PathPair.from_obj(pair) - for pair in bsv.get("path_map", []) - ] + self._path_map = PathMap.from_obj(bsv.get("path_map", [])) @property def path(self) -> Path: @@ -90,14 +91,14 @@ class Repository: return self._name @property - def path_map(self) -> list[PathPair]: - return list(self._path_map) + def path_map(self) -> PathMap: + return self._path_map.clone() def get_blob(self, digest: Digest) -> Blob: with self: return self._read(digest, object_type=b"blob", cls=Blob) # type: ignore - def add_blob(self, stream: BinaryIO, dry_run: bool=False) -> Digest: + def add_blob(self, stream: BinaryIO, *, dry_run: bool=False) -> Digest: with self: return self._write(b"blob", stream, dry_run=dry_run) @@ -105,11 +106,11 @@ class Repository: with self: return Tree.from_bytes(self, self._cas.read(digest, object_type=b"tree").data) - def add_tree(self, tree: Tree, dry_run: bool=False) -> Digest: + def add_tree(self, tree: Tree, *, dry_run: bool=False) -> Digest: with self: return self._cas.write(b"tree", tree.to_bytes(), dry_run=dry_run) - def add_tree_from_path(self, path: Path, dry_run: bool=False) -> Digest: + def add_tree_from_path(self, path: Path, *, dry_run: bool=False) -> Digest: from bsv.tree_walker import TreeWalker walker = TreeWalker(self, dry_run=dry_run) return walker.add_tree(path) @@ -118,16 +119,44 @@ class Repository: with self: return Snapshot.from_bytes(self, self._cas.read(digest, object_type=b"snap").data) - def add_snapshot(self, snapshot: Snapshot, dry_run: bool=False) -> Digest: + def add_snapshot(self, snapshot: Snapshot, *, dry_run: bool=False) -> Digest: with self: return self._cas.write(b"snap", snapshot.to_bytes(), dry_run=dry_run) + # def take_snapshot( + # self, + # parent_digests: list[Digest] = [], + # *, + # walker_type: Type[TreeWalker] | None = None, + # dry_run: bool = False, + # ): + # from bsv.tree_walker import TreeWalker + + # walker = (walker_type or TreeWalker)(self, dry_run=dry_run) + + # # parents = [ + # # self.get_snapshot(digest) + # # for digest in parent_digests + # # ] + # parent = self.get_snapshot(parent_digests[0]) if parent_digests else None + + # snapshot = Snapshot( + # repo = self, + # tree_digest = walker.add_virtual_tree(self._path_map, parent=), + # parents = parent_digests, + # repo_name = self._name, + # timestamp = timestamp_us_from_time(DateTime.now()), + # ) + # return self.add_snapshot(snapshot, dry_run=dry_run) + + + def _read(self, digest: Digest, object_type: bytes, cls: Type[ChunkedObject]) -> ChunkedObject: obj = self._cas.read(digest, object_type=object_type) stream = BytesIO(obj.data) return cls.from_stream(self, stream, digest_size=self._cas._digest_size) - def _write(self, object_type: bytes, stream: BinaryIO, dry_run: bool=False) -> Digest: + def _write(self, object_type: bytes, stream: BinaryIO, *, dry_run: bool=False) -> Digest: out = BytesIO() size = 0 for chunk in fastcdc( @@ -332,7 +361,7 @@ class TreeItem: object_type: bytes size: int permissions: int - modification_timestamp: int + modification_timestamp_us: int name: str def __init__( @@ -341,7 +370,7 @@ class TreeItem: object_type: bytes, size: int, permissions: int, - modification_timestamp: int, + modification_timestamp_us: int, name: str, ): if "/\\" in name: @@ -350,15 +379,15 @@ class TreeItem: self.object_type = object_type self.size = size self.permissions = permissions - self.modification_timestamp = modification_timestamp + self.modification_timestamp_us = modification_timestamp_us self.name = name @property def modification_time(self) -> DateTime: - return time_from_timestamp(self.modification_timestamp) + return time_from_timestamp_us(self.modification_timestamp_us) @modification_time.setter def modification_time(self, time: DateTime): - self.modification_timestamp = timestamp_from_time(time) + self.modification_timestamp_us = timestamp_us_from_time(time) @classmethod def from_stream(cls, stream: BinaryIO, digest_size: int) -> TreeItem | None: @@ -370,7 +399,7 @@ class TreeItem: object_type = read_exact(stream, 4), size = int.from_bytes(read_exact(stream, 8)), permissions = int.from_bytes(read_exact(stream, 2)), - modification_timestamp = int.from_bytes(read_exact(stream, 8), signed=True), + modification_timestamp_us = int.from_bytes(read_exact(stream, 8), signed=True), name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"), ) @@ -379,7 +408,7 @@ class TreeItem: stream.write(self.object_type) stream.write(self.size.to_bytes(8)) stream.write(self.permissions.to_bytes(2)) - stream.write(self.modification_timestamp.to_bytes(8, signed=True)) + stream.write(self.modification_timestamp_us.to_bytes(8, signed=True)) name_bytes = self.name.encode("utf-8") stream.write(len(name_bytes).to_bytes(2)) stream.write(name_bytes) @@ -391,23 +420,31 @@ class TreeItem: class Snapshot: repo: Repository tree_digest: Digest + parents: list[Digest] repo_name: str - timestamp: int + timestamp_us: int + + def __post_init__(self): + assert len(self.parents) < 256 @property def time(self) -> DateTime: - return time_from_timestamp(self.timestamp) + return time_from_timestamp_us(self.timestamp_us) @time.setter def time(self, time: DateTime): - self.timestamp = timestamp_from_time(time) + self.timestamp_us = timestamp_us_from_time(time) @classmethod def from_stream(cls, repo: Repository, stream: BinaryIO) -> Snapshot: return Snapshot( repo = repo, tree_digest = Digest(read_exact(stream, repo._cas._digest_size)), + parents = [ + Digest(read_exact(stream, repo._cas._digest_size)) + for _ in range(int.from_bytes(read_exact(stream, 1))) + ], repo_name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"), - timestamp = int.from_bytes(read_exact(stream, 8), signed=True), + timestamp_us = int.from_bytes(read_exact(stream, 8), signed=True), ) @classmethod @@ -416,40 +453,17 @@ class Snapshot: return cls.from_stream(repo, stream) def write(self, stream: BinaryIO): + assert len(self.parents) < 256 stream.write(self.tree_digest.digest) + stream.write(len(self.parents).to_bytes(1)) + for parent in self.parents: + stream.write(parent.digest) repo_name_bytes = self.repo_name.encode("utf-8") stream.write(len(repo_name_bytes).to_bytes(2)) stream.write(repo_name_bytes) - stream.write(self.timestamp.to_bytes(8, signed=True)) + stream.write(self.timestamp_us.to_bytes(8, signed=True)) def to_bytes(self) -> bytes: stream = BytesIO() self.write(stream) return stream.getvalue() - - - - -class PathPair: - bsv: PurePosixPath - fs: Path - - def __init__(self, bsv: PurePosixPath, fs: Path): - self.bsv = bsv - self.fs = fs - - @classmethod - def from_obj(cls, obj: dict[str, Any]) -> PathPair: - bsv = PurePosixPath(obj["bsv"]) - fs = Path(obj["fs"]) - - if not bsv.is_absolute() or not fs.is_absolute(): - raise ValueError("paths in path_map must be absolute") - - return cls( - bsv = obj["bsv"], - fs = obj["fs"], - ) - - def __lt__(self, rhs: PathPair) -> bool: - return self.bsv < rhs.bsv diff --git a/src/bsv/tree_walker.py b/src/bsv/tree_walker.py index 228303d..df25cb4 100644 --- a/src/bsv/tree_walker.py +++ b/src/bsv/tree_walker.py @@ -15,12 +15,14 @@ # along with this program. If not, see . from __future__ import annotations +from datetime import datetime as DateTime, timedelta as TimeDelta from enum import Enum from os import stat_result from pathlib import Path import stat from bsv.object import Digest +from bsv.path_map import PathMap from bsv.repository import Repository, Tree, TreeItem from bsv.util import is_bsv_repository, object_type_from_mode @@ -28,9 +30,20 @@ from bsv.util import is_bsv_repository, object_type_from_mode class Action(Enum): ADD = "add" UPDATE = "update" + REMOVE = "remove" IGNORE = "ignore" ERROR = "error" + @classmethod + def from_digests(cls, digest: Digest, source_digest: Digest | None) -> tuple[Action, IgnoreCause | None]: + assert digest + if not source_digest: + return Action.ADD, None + elif source_digest == digest: + return Action.IGNORE, IgnoreCause.UNCHANGED + else: + return Action.UPDATE, None + class IgnoreCause(Enum): IGNORE_RULE = "ignore_rule" UNCHANGED = "unchanged" @@ -39,88 +52,177 @@ class IgnoreCause(Enum): class TreeWalker: _repo: Repository + _time_rounding_us: int = 2000000 + _force_hash: bool = False _dry_run: bool = False - def __init__(self, repo: Repository, dry_run: bool=False): + def __init__( + self, + repo: Repository, + *, + time_rounding_us: int = 2000000, + force_hash: bool = False, + dry_run: bool = False, + ): self._repo = repo + self._time_rounding_us = time_rounding_us + self._force_hash = force_hash self._dry_run = dry_run - def add_tree(self, path: Path) -> Digest: + # def add_virtual_tree(self, paths: PathMap) -> Digest: + # root = {} + # for pair in paths.paths: + # vdir = root + # for part in pair.bsv.parts[:-1]: + # vdir = vdir.setdefault(part, {}) + # vdir[pair.bsv.parts[-1]] = pair.fs + + # return self._add_virtual_tree(root) + + # def _add_virtual_tree(self, vtree: dict[str, dict | Path]) -> Digest: + # tree = Tree(self._repo, []) + # for name, value in vtree.items(): + # if isinstance(value, dict): + # digest = self._add_virtual_tree(value) + # elif isinstance(value, Path): + # digest = self.add_tree(value) + # else: + # raise TypeError(f"unexpected type {type(vtree).__name__} for vtree") + # tree.items.append(TreeItem( + # digest = digest, + # object_type = b"tree", + # size = 0, + # permissions = 0o766, + # modification_timestamp = timestamp_us_from_time(DateTime.now()), + # name = name, + # )) + # return self._repo.add_tree(tree, dry_run=self._dry_run) + + def add_tree(self, path: Path, *, source_digest: Digest | None=None) -> Digest: pstat = path.stat(follow_symlinks=False) if self.ignore(path, pstat): self.report(Action.IGNORE, path, pstat, IgnoreCause.IGNORE_RULE) return Digest() - return self._add_tree(path, pstat) + return self._add_tree(path, pstat, source_digest=source_digest) + + def _add_tree(self, path: Path, pstat: stat_result, *, source_digest: Digest | None) -> Digest: + source = self._repo.get_tree(source_digest) if source_digest else None - def _add_tree(self, path: Path, pstat: stat_result) -> Digest: tree = Tree(self._repo, []) - for item in sorted(path.iterdir()): - digest = Digest() - try: - istat = item.lstat() - if self.ignore(item, istat): - self.report(Action.IGNORE, item, istat, IgnoreCause.IGNORE_RULE) - continue - object_type = object_type_from_mode(istat.st_mode) - if object_type == b"slnk": - digest = self._add_symlink(item, istat) - elif object_type == b"tree": - digest = self._add_tree(item, istat) - elif object_type == b"blob": - digest = self._add_blob(item, istat) - else: - self.report(Action.IGNORE, item, istat, IgnoreCause.UNSUPPORTED_TYPE) + subpaths = sorted(path.iterdir()) + + subpath_index = 0 + source_item_index = 0 + + while subpath_index < len(subpaths) or (source and source_item_index < len(source.items)): + subpath = subpaths[subpath_index] if subpath_index < len(subpaths) else None + source_item = source.items[source_item_index] if source and source_item_index < len(source.items) else None + + if subpath and source_item: + if subpath.name < source_item.name: + source_item = None + elif subpath.name > source_item.name: + subpath = None + + if subpath is not None: + subpath_index += 1 + if source_item is not None: + source_item_index += 1 + + if subpath is not None: + digest = Digest() + try: + istat = subpath.lstat() + + if self.ignore(subpath, istat, source=source_item): + self.report(Action.IGNORE, subpath, istat, IgnoreCause.IGNORE_RULE) + continue + + if (source_item is not None and + not self._force_hash and + not stat.S_ISDIR(istat.st_mode) and + pstat.st_size == source_item.size and + pstat.st_mtime_ns // (1000 * self._time_rounding_us) == source_item.modification_timestamp_us // self._time_rounding_us + ): + self.report(Action.IGNORE, subpath, istat, IgnoreCause.UNCHANGED) + tree.items.append(source_item) + continue + + sub_source_digest = source_item and source_item.digest + object_type = object_type_from_mode(istat.st_mode) + if object_type == b"slnk": + digest = self._add_symlink(subpath, istat, source_digest=sub_source_digest) + elif object_type == b"tree": + digest = self._add_tree(subpath, istat, source_digest=sub_source_digest) + elif object_type == b"blob": + digest = self._add_blob(subpath, istat, source_digest=sub_source_digest) + else: + self.report(Action.IGNORE, subpath, istat, IgnoreCause.UNSUPPORTED_TYPE) + continue + except Exception as err: + self.report(Action.ERROR, subpath, None, err) continue - except Exception as err: - self.report(Action.ERROR, item, None, err) - continue - if digest: - self.report(Action.ADD, path, pstat) - tree.items.append(TreeItem( - digest = digest, - object_type = object_type, - size = istat.st_size, - permissions = stat.S_IMODE(istat.st_mode), - modification_timestamp = istat.st_mtime_ns, - name = item.name, - )) + if digest: + tree.items.append(TreeItem( + digest = digest, + object_type = object_type, + size = istat.st_size, + permissions = stat.S_IMODE(istat.st_mode), + modification_timestamp_us = istat.st_mtime_ns // 1000, + name = subpath.name, + )) + elif source_item: + self.report(Action.REMOVE, path / source_item.name, None, source_item) - return self._repo.add_tree(tree, dry_run=self._dry_run) + digest = self._repo.add_tree(tree, dry_run=self._dry_run) + action, info = Action.from_digests(digest, source_digest) + self.report(action, path, pstat, info) + return digest - def _add_symlink(self, path: Path, pstat: stat_result) -> Digest: + + def _add_symlink(self, path: Path, pstat: stat_result, *, source_digest: Digest | None) -> Digest: # TODO: Store symlink relative to current dir ? # * What about symlink that points outside of the backup dirs # * Should symlinks that points inside the backup dirs but in another # mount-point adjusted ? # * Should absolute symlink be restored as absolute ? - self.report(Action.ADD, path, pstat) - return self._repo._cas.write( + digest = self._repo._cas.write( b"slnk", path.readlink().as_posix().encode("utf-8"), dry_run = self._dry_run, ) - def _add_blob(self, path: Path, pstat: stat_result) -> Digest: - self.report(Action.ADD, path, pstat) + action, info = Action.from_digests(digest, source_digest) + self.report(action, path, pstat, info) + return digest + + def _add_blob(self, path: Path, pstat: stat_result, *, source_digest: Digest | None) -> Digest: with path.open("rb") as stream: - return self._repo.add_blob(stream, dry_run=self._dry_run) + digest = self._repo.add_blob(stream, dry_run=self._dry_run) + action, info = Action.from_digests(digest, source_digest) + self.report(action, path, pstat, info) + return digest - def ignore(self, path: Path, pstat: stat_result) -> bool: + + def ignore(self, path: Path, pstat: stat_result, *, source: TreeItem | None=None) -> bool: return is_bsv_repository(path) - def report(self, action: Action, path: Path, pstat: stat_result | None, info: IgnoreCause | Exception | None=None): + def report(self, action: Action, path: Path, pstat: stat_result | None, info: IgnoreCause | Exception | TreeItem | None=None): match action, info: case (Action.ADD, None): print(f"Add: {path}") + case (Action.UPDATE, None): + print(f"Add: {path}") + case (Action.REMOVE, item) if isinstance(item, TreeItem): + print(f"Remove: {path / item.name}") case (Action.IGNORE, IgnoreCause.IGNORE_RULE): print(f"Ignore (rule): {path}") case (Action.IGNORE, IgnoreCause.UNCHANGED): print(f"Ignore (unchanged): {path}") case (Action.IGNORE, IgnoreCause.UNSUPPORTED_TYPE) if pstat is not None: - assert pstat is not None print(f"Ignore (unsupported type {path_type_name(pstat)}): {path}") case (Action.ERROR, _) if isinstance(info, Exception): print(f"Error {info}: {path}") diff --git a/src/bsv/util.py b/src/bsv/util.py index 2c10877..60372d8 100644 --- a/src/bsv/util.py +++ b/src/bsv/util.py @@ -25,10 +25,10 @@ from typing import BinaryIO EPOCH = DateTime(1970, 1, 1, tzinfo=UTC) -def time_from_timestamp(timestamp: int) -> DateTime: +def time_from_timestamp_us(timestamp: int) -> DateTime: return EPOCH + TimeDelta(microseconds=timestamp) -def timestamp_from_time(time: DateTime) -> int: +def timestamp_us_from_time(time: DateTime) -> int: return (time.astimezone(UTC) - EPOCH) // TimeDelta(microseconds=1) diff --git a/tests/test_repository.py b/tests/test_repository.py index a46e143..4ba4e09 100644 --- a/tests/test_repository.py +++ b/tests/test_repository.py @@ -18,12 +18,13 @@ from datetime import UTC, datetime from os import stat_result from pathlib import Path from random import randbytes +from shutil import rmtree from typing import Iterator import pytest from tempfile import TemporaryDirectory -from bsv.repository import Repository, Snapshot, Tree, TreeItem, create_repository, timestamp_from_time +from bsv.repository import Repository, Snapshot, Tree, TreeItem, create_repository, timestamp_us_from_time from bsv.simple_cas.cas import Digest from bsv.tree_walker import Action, IgnoreCause, TreeWalker @@ -66,7 +67,7 @@ def test_read_write_tree(repo: Repository): object_type = b"blob", size = 123, permissions = 0o744, - modification_timestamp = timestamp_from_time(now), + modification_timestamp_us = timestamp_us_from_time(now), name = "xyz", ), TreeItem( @@ -74,7 +75,7 @@ def test_read_write_tree(repo: Repository): object_type = b"slnk", size = 42, permissions = 0o777, - modification_timestamp = timestamp_from_time(now), + modification_timestamp_us = timestamp_us_from_time(now), name = "foobar", ), ] @@ -90,8 +91,12 @@ def test_read_write_snapshot(repo: Repository): snapshot = Snapshot( repo = repo, tree_digest = Digest(bytes([42]) * repo._cas._digest_size), + parents = [ + Digest(bytes([123]) * repo._cas._digest_size), + Digest(bytes([124]) * repo._cas._digest_size), + ], repo_name = "test_repo", - timestamp = timestamp_from_time(datetime.now()), + timestamp_us = timestamp_us_from_time(datetime.now()), ) assert Snapshot.from_bytes(repo, snapshot.to_bytes()) == snapshot @@ -104,7 +109,7 @@ class TestTreeWalker(TreeWalker): reports: list def __init__(self, repo: Repository, dry_run: bool=False): - super().__init__(repo, dry_run) + super().__init__(repo, dry_run=dry_run) self.reports = [] def report( @@ -115,12 +120,12 @@ class TestTreeWalker(TreeWalker): info: IgnoreCause | Exception | None = None ): super().report(action, path, pstat, info) - self.reports.append((action, path, pstat, info)) + self.reports.append((action, path, info if action != Action.REMOVE else None)) def test_add_tree(tmp_dir: Path, repo: Repository): - dir = tmp_dir / "test" - structure = { + dir = tmp_dir / "test0" + structure0 = { "folder": { "sub_folder": { "empty_folder": {}, @@ -134,11 +139,28 @@ def test_add_tree(tmp_dir: Path, repo: Repository): "bsv_config.toml": b"[bsv]\n", }, } + structure1 = { + "folder": { + "sub_folder": { + "empty_folder": {}, + "foo.txt": b"Hello World!\n", + }, + "bar.dat": bytes(range(256)) * 2, + }, + "new_file": b"whatever", + "Another test with long name and spaces and a bang !": b"Should works.\n", + "bsv_repo": { + "bsv_config.toml": b"[bsv]\n", + }, + } - create_file_structure(dir, structure) + expected0 = dict(structure0) + del expected0["bsv_repo"] - walker = TestTreeWalker(repo) - dir_digest = walker.add_tree(dir) + expected1 = dict(structure1) + del expected1["bsv_repo"] + + create_file_structure(dir, structure0) def check(digest: Digest, value: dict | bytes): if isinstance(value, dict): @@ -152,19 +174,79 @@ def test_add_tree(tmp_dir: Path, repo: Repository): data = blob.reader().read() assert data == value - expected = dict(structure) - del expected["bsv_repo"] - check(dir_digest, expected) + walker = TestTreeWalker(repo) + dir_digest0 = walker.add_tree(dir) + assert walker.reports == [ + (Action.ADD, dir / "Another test with long name and spaces and a bang !", None), + (Action.IGNORE, dir / "bsv_repo", IgnoreCause.IGNORE_RULE), + (Action.ADD, dir / "folder/bar.dat", None), + (Action.ADD, dir / "folder/sub_folder/empty_folder", None), + (Action.ADD, dir / "folder/sub_folder/foo.txt", None), + (Action.ADD, dir / "folder/sub_folder", None), + (Action.ADD, dir / "folder/test.py", None), + (Action.ADD, dir / "folder", None), + (Action.ADD, dir, None), + ] + check(dir_digest0, expected0) + + create_file_structure(dir, structure1) + + walker.reports.clear() + dir_digest1 = walker.add_tree(dir, source_digest=dir_digest0) + assert walker.reports == [ + (Action.IGNORE, dir / "Another test with long name and spaces and a bang !", IgnoreCause.UNCHANGED), + (Action.IGNORE, dir / "bsv_repo", IgnoreCause.IGNORE_RULE), + (Action.UPDATE, dir / "folder/bar.dat", None), + (Action.IGNORE, dir / "folder/sub_folder/empty_folder", IgnoreCause.UNCHANGED), + (Action.IGNORE, dir / "folder/sub_folder/foo.txt", IgnoreCause.UNCHANGED), + (Action.IGNORE, dir / "folder/sub_folder", IgnoreCause.UNCHANGED), + (Action.REMOVE, dir / "folder/test.py", None), + (Action.UPDATE, dir / "folder", None), + (Action.ADD, dir / "new_file", None), + (Action.UPDATE, dir, None), + ] + check(dir_digest1, expected1) def create_file_structure(dst: Path, value: dict | bytes): - assert not dst.exists() - if isinstance(value, dict): - dst.mkdir() - for name, item in value.items(): - create_file_structure(dst / name, item) - elif isinstance(value, bytes): - dst.write_bytes(value) + if isinstance(value, bytes): + if dst.is_dir(): + rmtree(str(dst)) + if not dst.is_file() or dst.read_bytes() != value: + dst.write_bytes(value) + elif isinstance(value, dict): + if dst.is_file(): + dst.unlink() + if not dst.is_dir(): + dst.mkdir() + + items = sorted(value.items()) + fs_paths = sorted(dst.iterdir()) + + item_index = 0 + fs_path_index = 0 + + while item_index < len(value) or fs_path_index < len(fs_paths): + name, subitem = items[item_index] if item_index < len(items) else (None, None) + fs_path = fs_paths[fs_path_index] if fs_path_index < len(fs_paths) else None + + if name and fs_path: + if name < fs_path.name: + fs_path = None + elif name > fs_path.name: + name = None + + if name: + item_index += 1 + if fs_path: + fs_path_index += 1 + + if name: + create_file_structure(dst / name, subitem) # type: ignore + elif fs_path and fs_path.is_dir(): + rmtree(fs_path) + elif fs_path: + fs_path.unlink() else: raise TypeError(f"invalid type {type(value).__name__} for parameter value")