Browse Source

Improve TreeWalker algorithm.

master
Draklaw 2 years ago
parent
commit
073fd5e567
  1. 58
      src/bsv/path_map.py
  2. 122
      src/bsv/repository.py
  3. 154
      src/bsv/tree_walker.py
  4. 4
      src/bsv/util.py
  5. 122
      tests/test_repository.py

58
src/bsv/path_map.py

@ -0,0 +1,58 @@
# bsv - Backup, Synchronization, Versioning
# Copyright (C) 2023 Simon Boyé
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from __future__ import annotations
from dataclasses import dataclass
from itertools import pairwise
from pathlib import Path, PurePosixPath
from typing import Any
@dataclass(order=True, frozen=True, slots=True)
class PathPair:
bsv: PurePosixPath
fs: Path
def __post_init__(self):
if not self.bsv.is_absolute() or not self.fs.is_absolute():
raise ValueError("paths in path_map must be absolute")
@classmethod
def from_obj(cls, obj: dict[str, str]) -> PathPair:
return cls(
bsv = PurePosixPath(obj["bsv"]),
fs = Path(obj["fs"]),
)
class PathMap:
paths: list[PathPair]
def __init__(self, paths: list[PathPair]=[]):
self.paths = sorted(paths)
for path0, path1 in pairwise(self.paths):
if path0 == path1 or path1.bsv.relative_to(path0.bsv):
raise ValueError("bsv paths must be unique and independent")
@classmethod
def from_obj(cls, obj: list[dict[str, str]]) -> PathMap:
return cls([
PathPair.from_obj(item)
for item in obj
])
def clone(self) -> PathMap:
return PathMap(self.paths)

122
src/bsv/repository.py

@ -19,19 +19,23 @@ from dataclasses import dataclass
from datetime import datetime as DateTime from datetime import datetime as DateTime
import hashlib import hashlib
from io import BytesIO from io import BytesIO
from pathlib import Path, PurePosixPath from pathlib import Path
import platform import platform
import tomllib import tomllib
from typing import Any, BinaryIO, Callable, Type from typing import TYPE_CHECKING, BinaryIO, Callable, Type
from fastcdc import fastcdc from fastcdc import fastcdc
import tomlkit import tomlkit
from bsv import __version__ from bsv import __version__
from bsv.exception import ConfigError from bsv.exception import ConfigError
from bsv.path_map import PathMap
from bsv.simple_cas import SimpleCas from bsv.simple_cas import SimpleCas
from bsv.simple_cas.cas import Digest, SimpleCas from bsv.simple_cas.cas import Digest, SimpleCas
from bsv.util import Hash, read_exact, read_exact_or_eof, time_from_timestamp, timestamp_from_time from bsv.util import Hash, read_exact, read_exact_or_eof, time_from_timestamp_us, timestamp_us_from_time
if TYPE_CHECKING:
from bsv.tree_walker import TreeWalker
DEFAULT_MIN_CHUNK_SIZE = 1 << 12 DEFAULT_MIN_CHUNK_SIZE = 1 << 12
@ -48,7 +52,7 @@ class Repository:
_avg_chunk_size: int = DEFAULT_AVG_CHUNK_SIZE _avg_chunk_size: int = DEFAULT_AVG_CHUNK_SIZE
_max_chunk_size: int = DEFAULT_MAX_CHUNK_SIZE _max_chunk_size: int = DEFAULT_MAX_CHUNK_SIZE
_path_map: list[PathPair] _path_map: PathMap
# _remotes: list[object] # _remotes: list[object]
_context_depth: int = 0 _context_depth: int = 0
@ -72,10 +76,7 @@ class Repository:
self._avg_chunk_size = bsv.get("avg_chunk_size") self._avg_chunk_size = bsv.get("avg_chunk_size")
self._max_chunk_size = bsv.get("max_chunk_size") self._max_chunk_size = bsv.get("max_chunk_size")
self._path_map = [ self._path_map = PathMap.from_obj(bsv.get("path_map", []))
PathPair.from_obj(pair)
for pair in bsv.get("path_map", [])
]
@property @property
def path(self) -> Path: def path(self) -> Path:
@ -90,14 +91,14 @@ class Repository:
return self._name return self._name
@property @property
def path_map(self) -> list[PathPair]: def path_map(self) -> PathMap:
return list(self._path_map) return self._path_map.clone()
def get_blob(self, digest: Digest) -> Blob: def get_blob(self, digest: Digest) -> Blob:
with self: with self:
return self._read(digest, object_type=b"blob", cls=Blob) # type: ignore return self._read(digest, object_type=b"blob", cls=Blob) # type: ignore
def add_blob(self, stream: BinaryIO, dry_run: bool=False) -> Digest: def add_blob(self, stream: BinaryIO, *, dry_run: bool=False) -> Digest:
with self: with self:
return self._write(b"blob", stream, dry_run=dry_run) return self._write(b"blob", stream, dry_run=dry_run)
@ -105,11 +106,11 @@ class Repository:
with self: with self:
return Tree.from_bytes(self, self._cas.read(digest, object_type=b"tree").data) return Tree.from_bytes(self, self._cas.read(digest, object_type=b"tree").data)
def add_tree(self, tree: Tree, dry_run: bool=False) -> Digest: def add_tree(self, tree: Tree, *, dry_run: bool=False) -> Digest:
with self: with self:
return self._cas.write(b"tree", tree.to_bytes(), dry_run=dry_run) return self._cas.write(b"tree", tree.to_bytes(), dry_run=dry_run)
def add_tree_from_path(self, path: Path, dry_run: bool=False) -> Digest: def add_tree_from_path(self, path: Path, *, dry_run: bool=False) -> Digest:
from bsv.tree_walker import TreeWalker from bsv.tree_walker import TreeWalker
walker = TreeWalker(self, dry_run=dry_run) walker = TreeWalker(self, dry_run=dry_run)
return walker.add_tree(path) return walker.add_tree(path)
@ -118,16 +119,44 @@ class Repository:
with self: with self:
return Snapshot.from_bytes(self, self._cas.read(digest, object_type=b"snap").data) return Snapshot.from_bytes(self, self._cas.read(digest, object_type=b"snap").data)
def add_snapshot(self, snapshot: Snapshot, dry_run: bool=False) -> Digest: def add_snapshot(self, snapshot: Snapshot, *, dry_run: bool=False) -> Digest:
with self: with self:
return self._cas.write(b"snap", snapshot.to_bytes(), dry_run=dry_run) return self._cas.write(b"snap", snapshot.to_bytes(), dry_run=dry_run)
# def take_snapshot(
# self,
# parent_digests: list[Digest] = [],
# *,
# walker_type: Type[TreeWalker] | None = None,
# dry_run: bool = False,
# ):
# from bsv.tree_walker import TreeWalker
# walker = (walker_type or TreeWalker)(self, dry_run=dry_run)
# # parents = [
# # self.get_snapshot(digest)
# # for digest in parent_digests
# # ]
# parent = self.get_snapshot(parent_digests[0]) if parent_digests else None
# snapshot = Snapshot(
# repo = self,
# tree_digest = walker.add_virtual_tree(self._path_map, parent=),
# parents = parent_digests,
# repo_name = self._name,
# timestamp = timestamp_us_from_time(DateTime.now()),
# )
# return self.add_snapshot(snapshot, dry_run=dry_run)
def _read(self, digest: Digest, object_type: bytes, cls: Type[ChunkedObject]) -> ChunkedObject: def _read(self, digest: Digest, object_type: bytes, cls: Type[ChunkedObject]) -> ChunkedObject:
obj = self._cas.read(digest, object_type=object_type) obj = self._cas.read(digest, object_type=object_type)
stream = BytesIO(obj.data) stream = BytesIO(obj.data)
return cls.from_stream(self, stream, digest_size=self._cas._digest_size) return cls.from_stream(self, stream, digest_size=self._cas._digest_size)
def _write(self, object_type: bytes, stream: BinaryIO, dry_run: bool=False) -> Digest: def _write(self, object_type: bytes, stream: BinaryIO, *, dry_run: bool=False) -> Digest:
out = BytesIO() out = BytesIO()
size = 0 size = 0
for chunk in fastcdc( for chunk in fastcdc(
@ -332,7 +361,7 @@ class TreeItem:
object_type: bytes object_type: bytes
size: int size: int
permissions: int permissions: int
modification_timestamp: int modification_timestamp_us: int
name: str name: str
def __init__( def __init__(
@ -341,7 +370,7 @@ class TreeItem:
object_type: bytes, object_type: bytes,
size: int, size: int,
permissions: int, permissions: int,
modification_timestamp: int, modification_timestamp_us: int,
name: str, name: str,
): ):
if "/\\" in name: if "/\\" in name:
@ -350,15 +379,15 @@ class TreeItem:
self.object_type = object_type self.object_type = object_type
self.size = size self.size = size
self.permissions = permissions self.permissions = permissions
self.modification_timestamp = modification_timestamp self.modification_timestamp_us = modification_timestamp_us
self.name = name self.name = name
@property @property
def modification_time(self) -> DateTime: def modification_time(self) -> DateTime:
return time_from_timestamp(self.modification_timestamp) return time_from_timestamp_us(self.modification_timestamp_us)
@modification_time.setter @modification_time.setter
def modification_time(self, time: DateTime): def modification_time(self, time: DateTime):
self.modification_timestamp = timestamp_from_time(time) self.modification_timestamp_us = timestamp_us_from_time(time)
@classmethod @classmethod
def from_stream(cls, stream: BinaryIO, digest_size: int) -> TreeItem | None: def from_stream(cls, stream: BinaryIO, digest_size: int) -> TreeItem | None:
@ -370,7 +399,7 @@ class TreeItem:
object_type = read_exact(stream, 4), object_type = read_exact(stream, 4),
size = int.from_bytes(read_exact(stream, 8)), size = int.from_bytes(read_exact(stream, 8)),
permissions = int.from_bytes(read_exact(stream, 2)), permissions = int.from_bytes(read_exact(stream, 2)),
modification_timestamp = int.from_bytes(read_exact(stream, 8), signed=True), modification_timestamp_us = int.from_bytes(read_exact(stream, 8), signed=True),
name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"), name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"),
) )
@ -379,7 +408,7 @@ class TreeItem:
stream.write(self.object_type) stream.write(self.object_type)
stream.write(self.size.to_bytes(8)) stream.write(self.size.to_bytes(8))
stream.write(self.permissions.to_bytes(2)) stream.write(self.permissions.to_bytes(2))
stream.write(self.modification_timestamp.to_bytes(8, signed=True)) stream.write(self.modification_timestamp_us.to_bytes(8, signed=True))
name_bytes = self.name.encode("utf-8") name_bytes = self.name.encode("utf-8")
stream.write(len(name_bytes).to_bytes(2)) stream.write(len(name_bytes).to_bytes(2))
stream.write(name_bytes) stream.write(name_bytes)
@ -391,23 +420,31 @@ class TreeItem:
class Snapshot: class Snapshot:
repo: Repository repo: Repository
tree_digest: Digest tree_digest: Digest
parents: list[Digest]
repo_name: str repo_name: str
timestamp: int timestamp_us: int
def __post_init__(self):
assert len(self.parents) < 256
@property @property
def time(self) -> DateTime: def time(self) -> DateTime:
return time_from_timestamp(self.timestamp) return time_from_timestamp_us(self.timestamp_us)
@time.setter @time.setter
def time(self, time: DateTime): def time(self, time: DateTime):
self.timestamp = timestamp_from_time(time) self.timestamp_us = timestamp_us_from_time(time)
@classmethod @classmethod
def from_stream(cls, repo: Repository, stream: BinaryIO) -> Snapshot: def from_stream(cls, repo: Repository, stream: BinaryIO) -> Snapshot:
return Snapshot( return Snapshot(
repo = repo, repo = repo,
tree_digest = Digest(read_exact(stream, repo._cas._digest_size)), tree_digest = Digest(read_exact(stream, repo._cas._digest_size)),
parents = [
Digest(read_exact(stream, repo._cas._digest_size))
for _ in range(int.from_bytes(read_exact(stream, 1)))
],
repo_name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"), repo_name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"),
timestamp = int.from_bytes(read_exact(stream, 8), signed=True), timestamp_us = int.from_bytes(read_exact(stream, 8), signed=True),
) )
@classmethod @classmethod
@ -416,40 +453,17 @@ class Snapshot:
return cls.from_stream(repo, stream) return cls.from_stream(repo, stream)
def write(self, stream: BinaryIO): def write(self, stream: BinaryIO):
assert len(self.parents) < 256
stream.write(self.tree_digest.digest) stream.write(self.tree_digest.digest)
stream.write(len(self.parents).to_bytes(1))
for parent in self.parents:
stream.write(parent.digest)
repo_name_bytes = self.repo_name.encode("utf-8") repo_name_bytes = self.repo_name.encode("utf-8")
stream.write(len(repo_name_bytes).to_bytes(2)) stream.write(len(repo_name_bytes).to_bytes(2))
stream.write(repo_name_bytes) stream.write(repo_name_bytes)
stream.write(self.timestamp.to_bytes(8, signed=True)) stream.write(self.timestamp_us.to_bytes(8, signed=True))
def to_bytes(self) -> bytes: def to_bytes(self) -> bytes:
stream = BytesIO() stream = BytesIO()
self.write(stream) self.write(stream)
return stream.getvalue() return stream.getvalue()
class PathPair:
bsv: PurePosixPath
fs: Path
def __init__(self, bsv: PurePosixPath, fs: Path):
self.bsv = bsv
self.fs = fs
@classmethod
def from_obj(cls, obj: dict[str, Any]) -> PathPair:
bsv = PurePosixPath(obj["bsv"])
fs = Path(obj["fs"])
if not bsv.is_absolute() or not fs.is_absolute():
raise ValueError("paths in path_map must be absolute")
return cls(
bsv = obj["bsv"],
fs = obj["fs"],
)
def __lt__(self, rhs: PathPair) -> bool:
return self.bsv < rhs.bsv

154
src/bsv/tree_walker.py

@ -15,12 +15,14 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>. # along with this program. If not, see <https://www.gnu.org/licenses/>.
from __future__ import annotations from __future__ import annotations
from datetime import datetime as DateTime, timedelta as TimeDelta
from enum import Enum from enum import Enum
from os import stat_result from os import stat_result
from pathlib import Path from pathlib import Path
import stat import stat
from bsv.object import Digest from bsv.object import Digest
from bsv.path_map import PathMap
from bsv.repository import Repository, Tree, TreeItem from bsv.repository import Repository, Tree, TreeItem
from bsv.util import is_bsv_repository, object_type_from_mode from bsv.util import is_bsv_repository, object_type_from_mode
@ -28,9 +30,20 @@ from bsv.util import is_bsv_repository, object_type_from_mode
class Action(Enum): class Action(Enum):
ADD = "add" ADD = "add"
UPDATE = "update" UPDATE = "update"
REMOVE = "remove"
IGNORE = "ignore" IGNORE = "ignore"
ERROR = "error" ERROR = "error"
@classmethod
def from_digests(cls, digest: Digest, source_digest: Digest | None) -> tuple[Action, IgnoreCause | None]:
assert digest
if not source_digest:
return Action.ADD, None
elif source_digest == digest:
return Action.IGNORE, IgnoreCause.UNCHANGED
else:
return Action.UPDATE, None
class IgnoreCause(Enum): class IgnoreCause(Enum):
IGNORE_RULE = "ignore_rule" IGNORE_RULE = "ignore_rule"
UNCHANGED = "unchanged" UNCHANGED = "unchanged"
@ -39,88 +52,177 @@ class IgnoreCause(Enum):
class TreeWalker: class TreeWalker:
_repo: Repository _repo: Repository
_time_rounding_us: int = 2000000
_force_hash: bool = False
_dry_run: bool = False _dry_run: bool = False
def __init__(self, repo: Repository, dry_run: bool=False): def __init__(
self,
repo: Repository,
*,
time_rounding_us: int = 2000000,
force_hash: bool = False,
dry_run: bool = False,
):
self._repo = repo self._repo = repo
self._time_rounding_us = time_rounding_us
self._force_hash = force_hash
self._dry_run = dry_run self._dry_run = dry_run
def add_tree(self, path: Path) -> Digest: # def add_virtual_tree(self, paths: PathMap) -> Digest:
# root = {}
# for pair in paths.paths:
# vdir = root
# for part in pair.bsv.parts[:-1]:
# vdir = vdir.setdefault(part, {})
# vdir[pair.bsv.parts[-1]] = pair.fs
# return self._add_virtual_tree(root)
# def _add_virtual_tree(self, vtree: dict[str, dict | Path]) -> Digest:
# tree = Tree(self._repo, [])
# for name, value in vtree.items():
# if isinstance(value, dict):
# digest = self._add_virtual_tree(value)
# elif isinstance(value, Path):
# digest = self.add_tree(value)
# else:
# raise TypeError(f"unexpected type {type(vtree).__name__} for vtree")
# tree.items.append(TreeItem(
# digest = digest,
# object_type = b"tree",
# size = 0,
# permissions = 0o766,
# modification_timestamp = timestamp_us_from_time(DateTime.now()),
# name = name,
# ))
# return self._repo.add_tree(tree, dry_run=self._dry_run)
def add_tree(self, path: Path, *, source_digest: Digest | None=None) -> Digest:
pstat = path.stat(follow_symlinks=False) pstat = path.stat(follow_symlinks=False)
if self.ignore(path, pstat): if self.ignore(path, pstat):
self.report(Action.IGNORE, path, pstat, IgnoreCause.IGNORE_RULE) self.report(Action.IGNORE, path, pstat, IgnoreCause.IGNORE_RULE)
return Digest() return Digest()
return self._add_tree(path, pstat) return self._add_tree(path, pstat, source_digest=source_digest)
def _add_tree(self, path: Path, pstat: stat_result, *, source_digest: Digest | None) -> Digest:
source = self._repo.get_tree(source_digest) if source_digest else None
def _add_tree(self, path: Path, pstat: stat_result) -> Digest:
tree = Tree(self._repo, []) tree = Tree(self._repo, [])
for item in sorted(path.iterdir()): subpaths = sorted(path.iterdir())
subpath_index = 0
source_item_index = 0
while subpath_index < len(subpaths) or (source and source_item_index < len(source.items)):
subpath = subpaths[subpath_index] if subpath_index < len(subpaths) else None
source_item = source.items[source_item_index] if source and source_item_index < len(source.items) else None
if subpath and source_item:
if subpath.name < source_item.name:
source_item = None
elif subpath.name > source_item.name:
subpath = None
if subpath is not None:
subpath_index += 1
if source_item is not None:
source_item_index += 1
if subpath is not None:
digest = Digest() digest = Digest()
try: try:
istat = item.lstat() istat = subpath.lstat()
if self.ignore(item, istat):
self.report(Action.IGNORE, item, istat, IgnoreCause.IGNORE_RULE) if self.ignore(subpath, istat, source=source_item):
self.report(Action.IGNORE, subpath, istat, IgnoreCause.IGNORE_RULE)
continue continue
if (source_item is not None and
not self._force_hash and
not stat.S_ISDIR(istat.st_mode) and
pstat.st_size == source_item.size and
pstat.st_mtime_ns // (1000 * self._time_rounding_us) == source_item.modification_timestamp_us // self._time_rounding_us
):
self.report(Action.IGNORE, subpath, istat, IgnoreCause.UNCHANGED)
tree.items.append(source_item)
continue
sub_source_digest = source_item and source_item.digest
object_type = object_type_from_mode(istat.st_mode) object_type = object_type_from_mode(istat.st_mode)
if object_type == b"slnk": if object_type == b"slnk":
digest = self._add_symlink(item, istat) digest = self._add_symlink(subpath, istat, source_digest=sub_source_digest)
elif object_type == b"tree": elif object_type == b"tree":
digest = self._add_tree(item, istat) digest = self._add_tree(subpath, istat, source_digest=sub_source_digest)
elif object_type == b"blob": elif object_type == b"blob":
digest = self._add_blob(item, istat) digest = self._add_blob(subpath, istat, source_digest=sub_source_digest)
else: else:
self.report(Action.IGNORE, item, istat, IgnoreCause.UNSUPPORTED_TYPE) self.report(Action.IGNORE, subpath, istat, IgnoreCause.UNSUPPORTED_TYPE)
continue continue
except Exception as err: except Exception as err:
self.report(Action.ERROR, item, None, err) self.report(Action.ERROR, subpath, None, err)
continue continue
if digest: if digest:
self.report(Action.ADD, path, pstat)
tree.items.append(TreeItem( tree.items.append(TreeItem(
digest = digest, digest = digest,
object_type = object_type, object_type = object_type,
size = istat.st_size, size = istat.st_size,
permissions = stat.S_IMODE(istat.st_mode), permissions = stat.S_IMODE(istat.st_mode),
modification_timestamp = istat.st_mtime_ns, modification_timestamp_us = istat.st_mtime_ns // 1000,
name = item.name, name = subpath.name,
)) ))
elif source_item:
self.report(Action.REMOVE, path / source_item.name, None, source_item)
return self._repo.add_tree(tree, dry_run=self._dry_run) digest = self._repo.add_tree(tree, dry_run=self._dry_run)
action, info = Action.from_digests(digest, source_digest)
self.report(action, path, pstat, info)
return digest
def _add_symlink(self, path: Path, pstat: stat_result) -> Digest:
def _add_symlink(self, path: Path, pstat: stat_result, *, source_digest: Digest | None) -> Digest:
# TODO: Store symlink relative to current dir ? # TODO: Store symlink relative to current dir ?
# * What about symlink that points outside of the backup dirs # * What about symlink that points outside of the backup dirs
# * Should symlinks that points inside the backup dirs but in another # * Should symlinks that points inside the backup dirs but in another
# mount-point adjusted ? # mount-point adjusted ?
# * Should absolute symlink be restored as absolute ? # * Should absolute symlink be restored as absolute ?
self.report(Action.ADD, path, pstat) digest = self._repo._cas.write(
return self._repo._cas.write(
b"slnk", b"slnk",
path.readlink().as_posix().encode("utf-8"), path.readlink().as_posix().encode("utf-8"),
dry_run = self._dry_run, dry_run = self._dry_run,
) )
def _add_blob(self, path: Path, pstat: stat_result) -> Digest: action, info = Action.from_digests(digest, source_digest)
self.report(Action.ADD, path, pstat) self.report(action, path, pstat, info)
return digest
def _add_blob(self, path: Path, pstat: stat_result, *, source_digest: Digest | None) -> Digest:
with path.open("rb") as stream: with path.open("rb") as stream:
return self._repo.add_blob(stream, dry_run=self._dry_run) digest = self._repo.add_blob(stream, dry_run=self._dry_run)
action, info = Action.from_digests(digest, source_digest)
self.report(action, path, pstat, info)
return digest
def ignore(self, path: Path, pstat: stat_result) -> bool:
def ignore(self, path: Path, pstat: stat_result, *, source: TreeItem | None=None) -> bool:
return is_bsv_repository(path) return is_bsv_repository(path)
def report(self, action: Action, path: Path, pstat: stat_result | None, info: IgnoreCause | Exception | None=None): def report(self, action: Action, path: Path, pstat: stat_result | None, info: IgnoreCause | Exception | TreeItem | None=None):
match action, info: match action, info:
case (Action.ADD, None): case (Action.ADD, None):
print(f"Add: {path}") print(f"Add: {path}")
case (Action.UPDATE, None):
print(f"Add: {path}")
case (Action.REMOVE, item) if isinstance(item, TreeItem):
print(f"Remove: {path / item.name}")
case (Action.IGNORE, IgnoreCause.IGNORE_RULE): case (Action.IGNORE, IgnoreCause.IGNORE_RULE):
print(f"Ignore (rule): {path}") print(f"Ignore (rule): {path}")
case (Action.IGNORE, IgnoreCause.UNCHANGED): case (Action.IGNORE, IgnoreCause.UNCHANGED):
print(f"Ignore (unchanged): {path}") print(f"Ignore (unchanged): {path}")
case (Action.IGNORE, IgnoreCause.UNSUPPORTED_TYPE) if pstat is not None: case (Action.IGNORE, IgnoreCause.UNSUPPORTED_TYPE) if pstat is not None:
assert pstat is not None
print(f"Ignore (unsupported type {path_type_name(pstat)}): {path}") print(f"Ignore (unsupported type {path_type_name(pstat)}): {path}")
case (Action.ERROR, _) if isinstance(info, Exception): case (Action.ERROR, _) if isinstance(info, Exception):
print(f"Error {info}: {path}") print(f"Error {info}: {path}")

4
src/bsv/util.py

@ -25,10 +25,10 @@ from typing import BinaryIO
EPOCH = DateTime(1970, 1, 1, tzinfo=UTC) EPOCH = DateTime(1970, 1, 1, tzinfo=UTC)
def time_from_timestamp(timestamp: int) -> DateTime: def time_from_timestamp_us(timestamp: int) -> DateTime:
return EPOCH + TimeDelta(microseconds=timestamp) return EPOCH + TimeDelta(microseconds=timestamp)
def timestamp_from_time(time: DateTime) -> int: def timestamp_us_from_time(time: DateTime) -> int:
return (time.astimezone(UTC) - EPOCH) // TimeDelta(microseconds=1) return (time.astimezone(UTC) - EPOCH) // TimeDelta(microseconds=1)

122
tests/test_repository.py

@ -18,12 +18,13 @@ from datetime import UTC, datetime
from os import stat_result from os import stat_result
from pathlib import Path from pathlib import Path
from random import randbytes from random import randbytes
from shutil import rmtree
from typing import Iterator from typing import Iterator
import pytest import pytest
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from bsv.repository import Repository, Snapshot, Tree, TreeItem, create_repository, timestamp_from_time from bsv.repository import Repository, Snapshot, Tree, TreeItem, create_repository, timestamp_us_from_time
from bsv.simple_cas.cas import Digest from bsv.simple_cas.cas import Digest
from bsv.tree_walker import Action, IgnoreCause, TreeWalker from bsv.tree_walker import Action, IgnoreCause, TreeWalker
@ -66,7 +67,7 @@ def test_read_write_tree(repo: Repository):
object_type = b"blob", object_type = b"blob",
size = 123, size = 123,
permissions = 0o744, permissions = 0o744,
modification_timestamp = timestamp_from_time(now), modification_timestamp_us = timestamp_us_from_time(now),
name = "xyz", name = "xyz",
), ),
TreeItem( TreeItem(
@ -74,7 +75,7 @@ def test_read_write_tree(repo: Repository):
object_type = b"slnk", object_type = b"slnk",
size = 42, size = 42,
permissions = 0o777, permissions = 0o777,
modification_timestamp = timestamp_from_time(now), modification_timestamp_us = timestamp_us_from_time(now),
name = "foobar", name = "foobar",
), ),
] ]
@ -90,8 +91,12 @@ def test_read_write_snapshot(repo: Repository):
snapshot = Snapshot( snapshot = Snapshot(
repo = repo, repo = repo,
tree_digest = Digest(bytes([42]) * repo._cas._digest_size), tree_digest = Digest(bytes([42]) * repo._cas._digest_size),
parents = [
Digest(bytes([123]) * repo._cas._digest_size),
Digest(bytes([124]) * repo._cas._digest_size),
],
repo_name = "test_repo", repo_name = "test_repo",
timestamp = timestamp_from_time(datetime.now()), timestamp_us = timestamp_us_from_time(datetime.now()),
) )
assert Snapshot.from_bytes(repo, snapshot.to_bytes()) == snapshot assert Snapshot.from_bytes(repo, snapshot.to_bytes()) == snapshot
@ -104,7 +109,7 @@ class TestTreeWalker(TreeWalker):
reports: list reports: list
def __init__(self, repo: Repository, dry_run: bool=False): def __init__(self, repo: Repository, dry_run: bool=False):
super().__init__(repo, dry_run) super().__init__(repo, dry_run=dry_run)
self.reports = [] self.reports = []
def report( def report(
@ -115,12 +120,12 @@ class TestTreeWalker(TreeWalker):
info: IgnoreCause | Exception | None = None info: IgnoreCause | Exception | None = None
): ):
super().report(action, path, pstat, info) super().report(action, path, pstat, info)
self.reports.append((action, path, pstat, info)) self.reports.append((action, path, info if action != Action.REMOVE else None))
def test_add_tree(tmp_dir: Path, repo: Repository): def test_add_tree(tmp_dir: Path, repo: Repository):
dir = tmp_dir / "test" dir = tmp_dir / "test0"
structure = { structure0 = {
"folder": { "folder": {
"sub_folder": { "sub_folder": {
"empty_folder": {}, "empty_folder": {},
@ -134,11 +139,28 @@ def test_add_tree(tmp_dir: Path, repo: Repository):
"bsv_config.toml": b"[bsv]\n", "bsv_config.toml": b"[bsv]\n",
}, },
} }
structure1 = {
"folder": {
"sub_folder": {
"empty_folder": {},
"foo.txt": b"Hello World!\n",
},
"bar.dat": bytes(range(256)) * 2,
},
"new_file": b"whatever",
"Another test with long name and spaces and a bang !": b"Should works.\n",
"bsv_repo": {
"bsv_config.toml": b"[bsv]\n",
},
}
create_file_structure(dir, structure) expected0 = dict(structure0)
del expected0["bsv_repo"]
walker = TestTreeWalker(repo) expected1 = dict(structure1)
dir_digest = walker.add_tree(dir) del expected1["bsv_repo"]
create_file_structure(dir, structure0)
def check(digest: Digest, value: dict | bytes): def check(digest: Digest, value: dict | bytes):
if isinstance(value, dict): if isinstance(value, dict):
@ -152,19 +174,79 @@ def test_add_tree(tmp_dir: Path, repo: Repository):
data = blob.reader().read() data = blob.reader().read()
assert data == value assert data == value
expected = dict(structure) walker = TestTreeWalker(repo)
del expected["bsv_repo"] dir_digest0 = walker.add_tree(dir)
check(dir_digest, expected) assert walker.reports == [
(Action.ADD, dir / "Another test with long name and spaces and a bang !", None),
(Action.IGNORE, dir / "bsv_repo", IgnoreCause.IGNORE_RULE),
(Action.ADD, dir / "folder/bar.dat", None),
(Action.ADD, dir / "folder/sub_folder/empty_folder", None),
(Action.ADD, dir / "folder/sub_folder/foo.txt", None),
(Action.ADD, dir / "folder/sub_folder", None),
(Action.ADD, dir / "folder/test.py", None),
(Action.ADD, dir / "folder", None),
(Action.ADD, dir, None),
]
check(dir_digest0, expected0)
create_file_structure(dir, structure1)
walker.reports.clear()
dir_digest1 = walker.add_tree(dir, source_digest=dir_digest0)
assert walker.reports == [
(Action.IGNORE, dir / "Another test with long name and spaces and a bang !", IgnoreCause.UNCHANGED),
(Action.IGNORE, dir / "bsv_repo", IgnoreCause.IGNORE_RULE),
(Action.UPDATE, dir / "folder/bar.dat", None),
(Action.IGNORE, dir / "folder/sub_folder/empty_folder", IgnoreCause.UNCHANGED),
(Action.IGNORE, dir / "folder/sub_folder/foo.txt", IgnoreCause.UNCHANGED),
(Action.IGNORE, dir / "folder/sub_folder", IgnoreCause.UNCHANGED),
(Action.REMOVE, dir / "folder/test.py", None),
(Action.UPDATE, dir / "folder", None),
(Action.ADD, dir / "new_file", None),
(Action.UPDATE, dir, None),
]
check(dir_digest1, expected1)
def create_file_structure(dst: Path, value: dict | bytes): def create_file_structure(dst: Path, value: dict | bytes):
assert not dst.exists() if isinstance(value, bytes):
if isinstance(value, dict): if dst.is_dir():
dst.mkdir() rmtree(str(dst))
for name, item in value.items(): if not dst.is_file() or dst.read_bytes() != value:
create_file_structure(dst / name, item)
elif isinstance(value, bytes):
dst.write_bytes(value) dst.write_bytes(value)
elif isinstance(value, dict):
if dst.is_file():
dst.unlink()
if not dst.is_dir():
dst.mkdir()
items = sorted(value.items())
fs_paths = sorted(dst.iterdir())
item_index = 0
fs_path_index = 0
while item_index < len(value) or fs_path_index < len(fs_paths):
name, subitem = items[item_index] if item_index < len(items) else (None, None)
fs_path = fs_paths[fs_path_index] if fs_path_index < len(fs_paths) else None
if name and fs_path:
if name < fs_path.name:
fs_path = None
elif name > fs_path.name:
name = None
if name:
item_index += 1
if fs_path:
fs_path_index += 1
if name:
create_file_structure(dst / name, subitem) # type: ignore
elif fs_path and fs_path.is_dir():
rmtree(fs_path)
elif fs_path:
fs_path.unlink()
else: else:
raise TypeError(f"invalid type {type(value).__name__} for parameter value") raise TypeError(f"invalid type {type(value).__name__} for parameter value")

Loading…
Cancel
Save