TreeWalker.
This commit is contained in:
@@ -20,7 +20,10 @@ from dataclasses import dataclass
|
||||
|
||||
@dataclass(frozen=True, order=True, slots=True)
|
||||
class Digest:
|
||||
digest: bytes
|
||||
digest: bytes = b""
|
||||
|
||||
def __bool__(self) -> bool:
|
||||
return bool(self.digest)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return self.digest.hex()
|
||||
|
||||
@@ -97,32 +97,37 @@ class Repository:
|
||||
with self:
|
||||
return self._read(digest, object_type=b"blob", cls=Blob) # type: ignore
|
||||
|
||||
def add_blob(self, stream: BinaryIO, dry_run: bool=False) -> Digest:
    """Store the content of ``stream`` as a ``blob`` object.

    The work is delegated to ``_write``; ``dry_run`` is forwarded to it
    unchanged. Returns the digest produced by ``_write``.
    """
    with self:
        blob_digest = self._write(b"blob", stream, dry_run=dry_run)
        return blob_digest
|
||||
|
||||
def get_tree(self, digest: Digest) -> Tree:
    """Read the ``tree`` object identified by ``digest`` and decode it."""
    with self:
        stored = self._cas.read(digest, object_type=b"tree")
        return Tree.from_bytes(self, stored.data)
|
||||
|
||||
def add_tree(self, tree: Tree, dry_run: bool=False) -> Digest:
    """Serialize ``tree`` and write it to the CAS as a ``tree`` object.

    ``dry_run`` is forwarded to the CAS ``write`` call. Returns the digest
    that call produces.
    """
    with self:
        payload = tree.to_bytes()
        return self._cas.write(b"tree", payload, dry_run=dry_run)
|
||||
|
||||
def add_tree_from_path(self, path: Path, dry_run: bool=False) -> Digest:
    """Add the directory at ``path`` using a ``TreeWalker`` bound to this repository.

    Returns whatever ``TreeWalker.add_tree`` returns for ``path``.
    """
    # Imported locally: bsv.tree_walker imports names from bsv.repository,
    # so this is presumably here to avoid a circular import at module load.
    from bsv.tree_walker import TreeWalker

    return TreeWalker(self, dry_run=dry_run).add_tree(path)
|
||||
|
||||
def get_snapshot(self, digest: Digest) -> Snapshot:
    """Read the ``snap`` object identified by ``digest`` and decode it."""
    with self:
        stored = self._cas.read(digest, object_type=b"snap")
        return Snapshot.from_bytes(self, stored.data)
|
||||
|
||||
def add_snapshot(self, snapshot: Snapshot, dry_run: bool=False) -> Digest:
    """Serialize ``snapshot`` and write it to the CAS as a ``snap`` object.

    ``dry_run`` is forwarded to the CAS ``write`` call. Returns the digest
    that call produces.
    """
    with self:
        payload = snapshot.to_bytes()
        return self._cas.write(b"snap", payload, dry_run=dry_run)
|
||||
|
||||
def _read(self, digest: Digest, object_type: bytes, cls: Type[ChunkedObject]) -> ChunkedObject:
    """Load the object ``digest`` of type ``object_type`` and build a ``cls`` from it.

    The object's data is wrapped in an in-memory stream and handed to
    ``cls.from_stream`` together with the CAS digest size.
    """
    stored = self._cas.read(digest, object_type=object_type)
    return cls.from_stream(
        self,
        BytesIO(stored.data),
        digest_size=self._cas._digest_size,
    )
|
||||
|
||||
def _write(self, object_type: bytes, stream: BinaryIO) -> Digest:
|
||||
def _write(self, object_type: bytes, stream: BinaryIO, dry_run: bool=False) -> Digest:
|
||||
out = BytesIO()
|
||||
size = 0
|
||||
for chunk in fastcdc(
|
||||
@@ -133,7 +138,7 @@ class Repository:
|
||||
fat = True,
|
||||
):
|
||||
size += chunk.length
|
||||
digest = self._cas.write(b"chnk", chunk.data)
|
||||
digest = self._cas.write(b"chnk", chunk.data, dry_run=dry_run)
|
||||
out.write(digest.digest)
|
||||
out.write(chunk.length.to_bytes(4))
|
||||
return self._cas.write(object_type, size.to_bytes(8) + out.getvalue())
|
||||
@@ -323,34 +328,30 @@ class Tree:
|
||||
|
||||
@dataclass
|
||||
class TreeItem:
|
||||
name: str
|
||||
digest: Digest
|
||||
object_type: bytes
|
||||
size: int
|
||||
permissions: int
|
||||
creation_timestamp: int
|
||||
modification_timestamp: int
|
||||
name: str
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
digest: Digest,
|
||||
object_type: bytes,
|
||||
size: int,
|
||||
permissions: int,
|
||||
creation_timestamp: int,
|
||||
modification_timestamp: int,
|
||||
name: str,
|
||||
):
|
||||
if "/\\" in name:
|
||||
raise ValueError(f"invalid tree item name {name}")
|
||||
self.name = name
|
||||
self.digest = digest
|
||||
self.object_type = object_type
|
||||
self.size = size
|
||||
self.permissions = permissions
|
||||
self.creation_timestamp = creation_timestamp
|
||||
self.modification_timestamp = modification_timestamp
|
||||
|
||||
@property
|
||||
def creation_time(self) -> DateTime:
|
||||
return time_from_timestamp(self.creation_timestamp)
|
||||
@creation_time.setter
|
||||
def creation_time(self, time: DateTime):
|
||||
self.creation_timestamp = timestamp_from_time(time)
|
||||
self.name = name
|
||||
|
||||
@property
|
||||
def modification_time(self) -> DateTime:
|
||||
@@ -366,16 +367,18 @@ class TreeItem:
|
||||
return None
|
||||
return TreeItem(
|
||||
digest = Digest(digest_bytes),
|
||||
object_type = read_exact(stream, 4),
|
||||
size = int.from_bytes(read_exact(stream, 8)),
|
||||
permissions = int.from_bytes(read_exact(stream, 2)),
|
||||
creation_timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
|
||||
modification_timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
|
||||
name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"),
|
||||
)
|
||||
|
||||
def write(self, stream: BinaryIO):
|
||||
stream.write(self.digest.digest)
|
||||
stream.write(self.object_type)
|
||||
stream.write(self.size.to_bytes(8))
|
||||
stream.write(self.permissions.to_bytes(2))
|
||||
stream.write(self.creation_timestamp.to_bytes(8, signed=True))
|
||||
stream.write(self.modification_timestamp.to_bytes(8, signed=True))
|
||||
name_bytes = self.name.encode("utf-8")
|
||||
stream.write(len(name_bytes).to_bytes(2))
|
||||
|
||||
@@ -96,7 +96,7 @@ class SimpleCas:
|
||||
|
||||
return Object(digest, object_type, size, data)
|
||||
|
||||
def write(self, object_type: bytes, data: bytes) -> Digest:
|
||||
def write(self, object_type: bytes, data: bytes, dry_run: bool=False) -> Digest:
|
||||
assert len(object_type) == 4
|
||||
assert len(data) < 2**32
|
||||
|
||||
@@ -108,7 +108,7 @@ class SimpleCas:
|
||||
hash.update(data)
|
||||
digest = Digest(hash.digest())
|
||||
|
||||
if digest not in self:
|
||||
if not dry_run and digest not in self:
|
||||
with self._open_writer(digest, object_type, len(data)) as out:
|
||||
out.write(digest.digest)
|
||||
out.write(object_type)
|
||||
|
||||
157
src/bsv/tree_walker.py
Normal file
157
src/bsv/tree_walker.py
Normal file
@@ -0,0 +1,157 @@
|
||||
# bsv - Backup, Synchronization, Versioning
|
||||
# Copyright (C) 2023 Simon Boyé
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
from __future__ import annotations
|
||||
|
||||
from enum import Enum
|
||||
from os import stat_result
|
||||
from pathlib import Path
|
||||
import stat
|
||||
|
||||
from bsv.object import Digest
|
||||
from bsv.repository import Repository, Tree, TreeItem
|
||||
from bsv.util import is_bsv_repository, object_type_from_mode
|
||||
|
||||
|
||||
class Action(Enum):
    """Kind of event a ``TreeWalker`` hands to its ``report()`` method."""

    # The path is being added to the repository.
    ADD = "add"
    # Not emitted by the walker code visible in this file.
    UPDATE = "update"
    # The path is skipped; an ``IgnoreCause`` accompanies the report.
    IGNORE = "ignore"
    # Processing the path raised an exception, passed along as ``info``.
    ERROR = "error"
|
||||
|
||||
class IgnoreCause(Enum):
    """Reason attached to an ``Action.IGNORE`` report."""

    # ``TreeWalker.ignore()`` returned true for the path.
    IGNORE_RULE = "ignore_rule"
    # Accepted by ``TreeWalker.report()``; not emitted by the walker code
    # visible in this file.
    UNCHANGED = "unchanged"
    # The path is neither a symlink, a directory nor a regular file.
    UNSUPPORTED_TYPE = "unsupported_type"
|
||||
|
||||
|
||||
class TreeWalker:
|
||||
_repo: Repository
|
||||
_dry_run: bool = False
|
||||
|
||||
def __init__(self, repo: Repository, dry_run: bool=False):
|
||||
self._repo = repo
|
||||
self._dry_run = dry_run
|
||||
|
||||
def add_tree(self, path: Path) -> Digest:
|
||||
pstat = path.stat(follow_symlinks=False)
|
||||
if self.ignore(path, pstat):
|
||||
self.report(Action.IGNORE, path, pstat, IgnoreCause.IGNORE_RULE)
|
||||
return Digest()
|
||||
return self._add_tree(path, pstat)
|
||||
|
||||
def _add_tree(self, path: Path, pstat: stat_result) -> Digest:
|
||||
tree = Tree(self._repo, [])
|
||||
for item in sorted(path.iterdir()):
|
||||
digest = Digest()
|
||||
try:
|
||||
istat = item.lstat()
|
||||
if self.ignore(item, istat):
|
||||
self.report(Action.IGNORE, item, istat, IgnoreCause.IGNORE_RULE)
|
||||
continue
|
||||
object_type = object_type_from_mode(istat.st_mode)
|
||||
if object_type == b"slnk":
|
||||
digest = self._add_symlink(item, istat)
|
||||
elif object_type == b"tree":
|
||||
digest = self._add_tree(item, istat)
|
||||
elif object_type == b"blob":
|
||||
digest = self._add_blob(item, istat)
|
||||
else:
|
||||
self.report(Action.IGNORE, item, istat, IgnoreCause.UNSUPPORTED_TYPE)
|
||||
continue
|
||||
except Exception as err:
|
||||
self.report(Action.ERROR, item, None, err)
|
||||
continue
|
||||
|
||||
if digest:
|
||||
self.report(Action.ADD, path, pstat)
|
||||
tree.items.append(TreeItem(
|
||||
digest = digest,
|
||||
object_type = object_type,
|
||||
size = istat.st_size,
|
||||
permissions = stat.S_IMODE(istat.st_mode),
|
||||
modification_timestamp = istat.st_mtime_ns,
|
||||
name = item.name,
|
||||
))
|
||||
|
||||
return self._repo.add_tree(tree, dry_run=self._dry_run)
|
||||
|
||||
|
||||
def _add_symlink(self, path: Path, pstat: stat_result) -> Digest:
|
||||
# TODO: Store symlink relative to current dir ?
|
||||
# * What about symlink that points outside of the backup dirs
|
||||
# * Should symlinks that points inside the backup dirs but in another
|
||||
# mount-point adjusted ?
|
||||
# * Should absolute symlink be restored as absolute ?
|
||||
self.report(Action.ADD, path, pstat)
|
||||
return self._repo._cas.write(
|
||||
b"slnk",
|
||||
path.readlink().as_posix().encode("utf-8"),
|
||||
dry_run = self._dry_run,
|
||||
)
|
||||
|
||||
def _add_blob(self, path: Path, pstat: stat_result) -> Digest:
|
||||
self.report(Action.ADD, path, pstat)
|
||||
with path.open("rb") as stream:
|
||||
return self._repo.add_blob(stream, dry_run=self._dry_run)
|
||||
|
||||
|
||||
def ignore(self, path: Path, pstat: stat_result) -> bool:
|
||||
return is_bsv_repository(path)
|
||||
|
||||
def report(self, action: Action, path: Path, pstat: stat_result | None, info: IgnoreCause | Exception | None=None):
|
||||
match action, info:
|
||||
case (Action.ADD, None):
|
||||
print(f"Add: {path}")
|
||||
case (Action.IGNORE, IgnoreCause.IGNORE_RULE):
|
||||
print(f"Ignore (rule): {path}")
|
||||
case (Action.IGNORE, IgnoreCause.UNCHANGED):
|
||||
print(f"Ignore (unchanged): {path}")
|
||||
case (Action.IGNORE, IgnoreCause.UNSUPPORTED_TYPE) if pstat is not None:
|
||||
assert pstat is not None
|
||||
print(f"Ignore (unsupported type {path_type_name(pstat)}): {path}")
|
||||
case (Action.ERROR, _) if isinstance(info, Exception):
|
||||
print(f"Error {info}: {path}")
|
||||
case _:
|
||||
raise ValueError("TreeWalker.report(): unsupported parameter combination")
|
||||
|
||||
|
||||
def path_type_name(pstat: stat_result) -> str:
    """Return a readable name for the file type encoded in ``pstat.st_mode``.

    Every matching type label is collected and the labels are joined with
    ``", "``; ``"unknown"`` is returned when no type predicate matches.
    """
    mode = pstat.st_mode
    # (predicate, label) pairs, listed in the order labels are emitted.
    type_checks = (
        (stat.S_ISBLK, "block_device"),
        (stat.S_ISCHR, "char_device"),
        (stat.S_ISDIR, "dir"),
        (stat.S_ISDOOR, "door"),
        (stat.S_ISFIFO, "fifo"),
        (stat.S_ISLNK, "symlink"),
        (stat.S_ISPORT, "port"),
        (stat.S_ISREG, "file"),
        (stat.S_ISSOCK, "socket"),
        (stat.S_ISWHT, "whiteout"),
    )
    labels = [label for matches, label in type_checks if matches(mode)]
    return ", ".join(labels) if labels else "unknown"
|
||||
@@ -17,6 +17,8 @@ from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from datetime import UTC, datetime as DateTime, timedelta as TimeDelta
|
||||
from pathlib import Path
|
||||
import stat
|
||||
from typing import BinaryIO
|
||||
|
||||
|
||||
@@ -45,6 +47,23 @@ def read_exact_or_eof(stream: BinaryIO, num_bytes: int) -> bytes | None:
|
||||
return data
|
||||
|
||||
|
||||
def is_bsv_repository(path: Path) -> bool:
    """Tell whether ``path`` contains a ``bsv_config.toml`` file."""
    config_file = path.joinpath("bsv_config.toml")
    return config_file.is_file()
|
||||
|
||||
|
||||
def object_type_from_path(path: Path) -> bytes:
    """Return the object type tag for ``path`` without following symlinks."""
    # lstat() is the same call as stat(follow_symlinks=False).
    mode = path.lstat().st_mode
    return object_type_from_mode(mode)
|
||||
|
||||
def object_type_from_mode(mode: int) -> bytes:
    """Map a ``st_mode`` value to an object type tag.

    Returns ``b"slnk"`` for a symlink, ``b"tree"`` for a directory,
    ``b"blob"`` for a regular file and ``b""`` for anything else.
    """
    # Checked in this order; the first matching predicate wins.
    tags_by_predicate = (
        (stat.S_ISLNK, b"slnk"),
        (stat.S_ISDIR, b"tree"),
        (stat.S_ISREG, b"blob"),
    )
    for matches, tag in tags_by_predicate:
        if matches(mode):
            return tag
    return b""
|
||||
|
||||
|
||||
class Hash(ABC):
|
||||
name: str
|
||||
digest_size: int
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
from __future__ import annotations
|
||||
from datetime import UTC, datetime
|
||||
from io import BytesIO
|
||||
from os import stat_result
|
||||
from pathlib import Path
|
||||
from random import randbytes
|
||||
from typing import Iterator
|
||||
@@ -25,6 +25,7 @@ from tempfile import TemporaryDirectory
|
||||
|
||||
from bsv.repository import Repository, Snapshot, Tree, TreeItem, create_repository, timestamp_from_time
|
||||
from bsv.simple_cas.cas import Digest
|
||||
from bsv.tree_walker import Action, IgnoreCause, TreeWalker
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -61,18 +62,20 @@ def test_read_write_tree(repo: Repository):
|
||||
repo,
|
||||
[
|
||||
TreeItem(
|
||||
"xyz",
|
||||
Digest(bytes([42]) * repo._cas._digest_size),
|
||||
0o744,
|
||||
creation_timestamp = timestamp_from_time(now),
|
||||
digest = Digest(bytes([42]) * repo._cas._digest_size),
|
||||
object_type = b"blob",
|
||||
size = 123,
|
||||
permissions = 0o744,
|
||||
modification_timestamp = timestamp_from_time(now),
|
||||
name = "xyz",
|
||||
),
|
||||
TreeItem(
|
||||
"foobar",
|
||||
Digest(bytes([123]) * repo._cas._digest_size),
|
||||
0o777,
|
||||
creation_timestamp = timestamp_from_time(now),
|
||||
digest = Digest(bytes([123]) * repo._cas._digest_size),
|
||||
object_type = b"slnk",
|
||||
size = 42,
|
||||
permissions = 0o777,
|
||||
modification_timestamp = timestamp_from_time(now),
|
||||
name = "foobar",
|
||||
),
|
||||
]
|
||||
)
|
||||
@@ -97,6 +100,75 @@ def test_read_write_snapshot(repo: Repository):
|
||||
assert repo.get_snapshot(digest) == snapshot
|
||||
|
||||
|
||||
class TestTreeWalker(TreeWalker):
    """``TreeWalker`` that records every ``report()`` call for later inspection."""

    # The class name matches pytest's ``Test*`` collection pattern, but this
    # is a helper with an ``__init__``, not a test class. Opt out explicitly
    # so pytest neither tries to collect it nor warns about it.
    __test__ = False

    # One ``(action, path, pstat, info)`` tuple per ``report()`` call, in call order.
    reports: list

    def __init__(self, repo: Repository, dry_run: bool=False):
        """Create the walker with an empty report log."""
        super().__init__(repo, dry_run)
        self.reports = []

    def report(
        self,
        action: Action,
        path: Path,
        pstat: stat_result | None,
        info: IgnoreCause | Exception | None = None
    ):
        """Forward to ``TreeWalker.report()``, then log the arguments."""
        super().report(action, path, pstat, info)
        self.reports.append((action, path, pstat, info))
|
||||
|
||||
|
||||
def test_add_tree(tmp_dir: Path, repo: Repository):
    """Walk a small directory structure and verify what the repository stored.

    The ``bsv_repo`` folder holds a ``bsv_config.toml`` file, so the walker is
    expected to leave it out of the resulting tree.
    """
    root = tmp_dir / "test"
    structure = {
        "folder": {
            "sub_folder": {
                "empty_folder": {},
                "foo.txt": b"Hello World!\n",
            },
            "test.py": b"print(\"Hello World!\")\n",
            "bar.dat": bytes(range(256)),
        },
        "Another test with long name and spaces and a bang !": b"Should works.\n",
        "bsv_repo": {
            "bsv_config.toml": b"[bsv]\n",
        },
    }
    create_file_structure(root, structure)

    walker = TestTreeWalker(repo)
    root_digest = walker.add_tree(root)

    def check(digest: Digest, value: dict | bytes):
        # Compare the stored object behind `digest` with the expected value,
        # recursing into sub-trees.
        if isinstance(value, bytes):
            stored_bytes = repo.get_blob(digest).reader().read()
            assert stored_bytes == value
            return
        if not isinstance(value, dict):
            return
        tree = repo.get_tree(digest)
        assert tree
        assert [entry.name for entry in tree.items] == sorted(value)
        for entry in tree.items:
            check(entry.digest, value[entry.name])

    expected = {
        name: content
        for name, content in structure.items()
        if name != "bsv_repo"
    }
    check(root_digest, expected)
|
||||
|
||||
|
||||
def create_file_structure(dst: Path, value: dict | bytes):
    """Materialize ``value`` on disk at ``dst``.

    A ``bytes`` value becomes a file with that content. A ``dict`` becomes a
    directory whose entries are created recursively, with keys used as names.
    ``dst`` must not exist yet.

    Raises:
        TypeError: if ``value`` is neither a ``dict`` nor ``bytes``.
    """
    assert not dst.exists()
    if isinstance(value, bytes):
        dst.write_bytes(value)
        return
    if not isinstance(value, dict):
        raise TypeError(f"invalid type {type(value).__name__} for parameter value")
    dst.mkdir()
    for child_name, child_value in value.items():
        create_file_structure(dst / child_name, child_value)
|
||||
|
||||
|
||||
def make_random_file(path: Path, size: int):
|
||||
with path.open("wb") as stream:
|
||||
for chunk_size in iter_chunks(size):
|
||||
|
||||
Reference in New Issue
Block a user