Browse Source

TreeWalker.

master
Draklaw 2 years ago
parent
commit
17bef2e63a
  1. 5
      src/bsv/object.py
  2. 49
      src/bsv/repository.py
  3. 4
      src/bsv/simple_cas/cas.py
  4. 157
      src/bsv/tree_walker.py
  5. 19
      src/bsv/util.py
  6. 90
      tests/test_repository.py

5
src/bsv/object.py

@ -20,7 +20,10 @@ from dataclasses import dataclass
@dataclass(frozen=True, order=True, slots=True)
class Digest:
digest: bytes
digest: bytes = b""
def __bool__(self) -> bool:
return bool(self.digest)
def __repr__(self) -> str:
return self.digest.hex()

49
src/bsv/repository.py

@ -97,32 +97,37 @@ class Repository:
with self:
return self._read(digest, object_type=b"blob", cls=Blob) # type: ignore
def add_blob(self, stream: BinaryIO) -> Digest:
def add_blob(self, stream: BinaryIO, dry_run: bool=False) -> Digest:
with self:
return self._write(b"blob", stream)
return self._write(b"blob", stream, dry_run=dry_run)
def get_tree(self, digest: Digest) -> Tree:
with self:
return Tree.from_bytes(self, self._cas.read(digest, object_type=b"tree").data)
def add_tree(self, tree: Tree) -> Digest:
def add_tree(self, tree: Tree, dry_run: bool=False) -> Digest:
with self:
return self._cas.write(b"tree", tree.to_bytes())
return self._cas.write(b"tree", tree.to_bytes(), dry_run=dry_run)
def add_tree_from_path(self, path: Path, dry_run: bool=False) -> Digest:
from bsv.tree_walker import TreeWalker
walker = TreeWalker(self, dry_run=dry_run)
return walker.add_tree(path)
def get_snapshot(self, digest: Digest) -> Snapshot:
with self:
return Snapshot.from_bytes(self, self._cas.read(digest, object_type=b"snap").data)
def add_snapshot(self, snapshot: Snapshot) -> Digest:
def add_snapshot(self, snapshot: Snapshot, dry_run: bool=False) -> Digest:
with self:
return self._cas.write(b"snap", snapshot.to_bytes())
return self._cas.write(b"snap", snapshot.to_bytes(), dry_run=dry_run)
def _read(self, digest: Digest, object_type: bytes, cls: Type[ChunkedObject]) -> ChunkedObject:
obj = self._cas.read(digest, object_type=object_type)
stream = BytesIO(obj.data)
return cls.from_stream(self, stream, digest_size=self._cas._digest_size)
def _write(self, object_type: bytes, stream: BinaryIO) -> Digest:
def _write(self, object_type: bytes, stream: BinaryIO, dry_run: bool=False) -> Digest:
out = BytesIO()
size = 0
for chunk in fastcdc(
@ -133,7 +138,7 @@ class Repository:
fat = True,
):
size += chunk.length
digest = self._cas.write(b"chnk", chunk.data)
digest = self._cas.write(b"chnk", chunk.data, dry_run=dry_run)
out.write(digest.digest)
out.write(chunk.length.to_bytes(4))
return self._cas.write(object_type, size.to_bytes(8) + out.getvalue())
@ -323,34 +328,30 @@ class Tree:
@dataclass
class TreeItem:
name: str
digest: Digest
object_type: bytes
size: int
permissions: int
creation_timestamp: int
modification_timestamp: int
name: str
def __init__(
self,
name: str,
digest: Digest,
object_type: bytes,
size: int,
permissions: int,
creation_timestamp: int,
modification_timestamp: int,
name: str,
):
if "/\\" in name:
raise ValueError(f"invalid tree item name {name}")
self.name = name
self.digest = digest
self.object_type = object_type
self.size = size
self.permissions = permissions
self.creation_timestamp = creation_timestamp
self.modification_timestamp = modification_timestamp
@property
def creation_time(self) -> DateTime:
return time_from_timestamp(self.creation_timestamp)
@creation_time.setter
def creation_time(self, time: DateTime):
self.creation_timestamp = timestamp_from_time(time)
self.name = name
@property
def modification_time(self) -> DateTime:
@ -366,16 +367,18 @@ class TreeItem:
return None
return TreeItem(
digest = Digest(digest_bytes),
object_type = read_exact(stream, 4),
size = int.from_bytes(read_exact(stream, 8)),
permissions = int.from_bytes(read_exact(stream, 2)),
creation_timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
modification_timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"),
)
def write(self, stream: BinaryIO):
stream.write(self.digest.digest)
stream.write(self.object_type)
stream.write(self.size.to_bytes(8))
stream.write(self.permissions.to_bytes(2))
stream.write(self.creation_timestamp.to_bytes(8, signed=True))
stream.write(self.modification_timestamp.to_bytes(8, signed=True))
name_bytes = self.name.encode("utf-8")
stream.write(len(name_bytes).to_bytes(2))

4
src/bsv/simple_cas/cas.py

@ -96,7 +96,7 @@ class SimpleCas:
return Object(digest, object_type, size, data)
def write(self, object_type: bytes, data: bytes) -> Digest:
def write(self, object_type: bytes, data: bytes, dry_run: bool=False) -> Digest:
assert len(object_type) == 4
assert len(data) < 2**32
@ -108,7 +108,7 @@ class SimpleCas:
hash.update(data)
digest = Digest(hash.digest())
if digest not in self:
if not dry_run and digest not in self:
with self._open_writer(digest, object_type, len(data)) as out:
out.write(digest.digest)
out.write(object_type)

157
src/bsv/tree_walker.py

@ -0,0 +1,157 @@
# bsv - Backup, Synchronization, Versioning
# Copyright (C) 2023 Simon Boyé
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from __future__ import annotations
from enum import Enum
from os import stat_result
from pathlib import Path
import stat
from bsv.object import Digest
from bsv.repository import Repository, Tree, TreeItem
from bsv.util import is_bsv_repository, object_type_from_mode
class Action(Enum):
    """Kind of event passed as the first argument of ``TreeWalker.report()``."""
    # A path is being stored (directory, regular file or symlink).
    ADD = "add"
    # Declared but not emitted by the walker code in this module.
    UPDATE = "update"
    # A path is skipped; the report carries an IgnoreCause as extra info.
    IGNORE = "ignore"
    # Processing a path raised; the report carries the exception as extra info.
    ERROR = "error"
class IgnoreCause(Enum):
    """Reason attached to an ``Action.IGNORE`` report."""
    # ``TreeWalker.ignore()`` returned True for the path.
    IGNORE_RULE = "ignore_rule"
    # Handled by ``TreeWalker.report()`` but not emitted by the walker in this module.
    UNCHANGED = "unchanged"
    # The path is neither a symlink, a directory nor a regular file.
    UNSUPPORTED_TYPE = "unsupported_type"
class TreeWalker:
_repo: Repository
_dry_run: bool = False
def __init__(self, repo: Repository, dry_run: bool=False):
self._repo = repo
self._dry_run = dry_run
def add_tree(self, path: Path) -> Digest:
pstat = path.stat(follow_symlinks=False)
if self.ignore(path, pstat):
self.report(Action.IGNORE, path, pstat, IgnoreCause.IGNORE_RULE)
return Digest()
return self._add_tree(path, pstat)
def _add_tree(self, path: Path, pstat: stat_result) -> Digest:
tree = Tree(self._repo, [])
for item in sorted(path.iterdir()):
digest = Digest()
try:
istat = item.lstat()
if self.ignore(item, istat):
self.report(Action.IGNORE, item, istat, IgnoreCause.IGNORE_RULE)
continue
object_type = object_type_from_mode(istat.st_mode)
if object_type == b"slnk":
digest = self._add_symlink(item, istat)
elif object_type == b"tree":
digest = self._add_tree(item, istat)
elif object_type == b"blob":
digest = self._add_blob(item, istat)
else:
self.report(Action.IGNORE, item, istat, IgnoreCause.UNSUPPORTED_TYPE)
continue
except Exception as err:
self.report(Action.ERROR, item, None, err)
continue
if digest:
self.report(Action.ADD, path, pstat)
tree.items.append(TreeItem(
digest = digest,
object_type = object_type,
size = istat.st_size,
permissions = stat.S_IMODE(istat.st_mode),
modification_timestamp = istat.st_mtime_ns,
name = item.name,
))
return self._repo.add_tree(tree, dry_run=self._dry_run)
def _add_symlink(self, path: Path, pstat: stat_result) -> Digest:
# TODO: Store symlink relative to current dir ?
# * What about symlink that points outside of the backup dirs
# * Should symlinks that points inside the backup dirs but in another
# mount-point adjusted ?
# * Should absolute symlink be restored as absolute ?
self.report(Action.ADD, path, pstat)
return self._repo._cas.write(
b"slnk",
path.readlink().as_posix().encode("utf-8"),
dry_run = self._dry_run,
)
def _add_blob(self, path: Path, pstat: stat_result) -> Digest:
self.report(Action.ADD, path, pstat)
with path.open("rb") as stream:
return self._repo.add_blob(stream, dry_run=self._dry_run)
def ignore(self, path: Path, pstat: stat_result) -> bool:
return is_bsv_repository(path)
def report(self, action: Action, path: Path, pstat: stat_result | None, info: IgnoreCause | Exception | None=None):
match action, info:
case (Action.ADD, None):
print(f"Add: {path}")
case (Action.IGNORE, IgnoreCause.IGNORE_RULE):
print(f"Ignore (rule): {path}")
case (Action.IGNORE, IgnoreCause.UNCHANGED):
print(f"Ignore (unchanged): {path}")
case (Action.IGNORE, IgnoreCause.UNSUPPORTED_TYPE) if pstat is not None:
assert pstat is not None
print(f"Ignore (unsupported type {path_type_name(pstat)}): {path}")
case (Action.ERROR, _) if isinstance(info, Exception):
print(f"Error {info}: {path}")
case _:
raise ValueError("TreeWalker.report(): unsupported parameter combination")
def path_type_name(pstat: stat_result) -> str:
    """Return a human-readable name for the file type encoded in *pstat*.

    Every ``stat.S_IS*`` predicate matching ``pstat.st_mode`` contributes one
    label; labels are joined with ``", "``. ``"unknown"`` is returned when no
    predicate matches.
    """
    mode = pstat.st_mode
    # (predicate, label) pairs, in the order labels appear in the result.
    type_checks = (
        (stat.S_ISBLK, "block_device"),
        (stat.S_ISCHR, "char_device"),
        (stat.S_ISDIR, "dir"),
        (stat.S_ISDOOR, "door"),
        (stat.S_ISFIFO, "fifo"),
        (stat.S_ISLNK, "symlink"),
        (stat.S_ISPORT, "port"),
        (stat.S_ISREG, "file"),
        (stat.S_ISSOCK, "socket"),
        (stat.S_ISWHT, "whiteout"),
    )
    labels = [label for matches, label in type_checks if matches(mode)]
    return ", ".join(labels) if labels else "unknown"

19
src/bsv/util.py

@ -17,6 +17,8 @@ from __future__ import annotations
from abc import ABC, abstractmethod
from datetime import UTC, datetime as DateTime, timedelta as TimeDelta
from pathlib import Path
import stat
from typing import BinaryIO
@ -45,6 +47,23 @@ def read_exact_or_eof(stream: BinaryIO, num_bytes: int) -> bytes | None:
return data
def is_bsv_repository(path: Path) -> bool:
    """Tell whether *path* directly contains a ``bsv_config.toml`` file."""
    config_file = path.joinpath("bsv_config.toml")
    return config_file.is_file()
def object_type_from_path(path: Path) -> bytes:
    """Return the object type tag of *path* itself, without following symlinks."""
    # lstat() stats the link rather than its target.
    mode = path.lstat().st_mode
    return object_type_from_mode(mode)
def object_type_from_mode(mode: int) -> bytes:
    """Map a ``st_mode`` value to a 4-byte object type tag.

    Returns ``b"slnk"`` for symlinks, ``b"tree"`` for directories,
    ``b"blob"`` for regular files and ``b""`` for anything else.
    """
    # Checked in order; the first matching predicate wins.
    type_tags = (
        (stat.S_ISLNK, b"slnk"),
        (stat.S_ISDIR, b"tree"),
        (stat.S_ISREG, b"blob"),
    )
    for is_kind, tag in type_tags:
        if is_kind(mode):
            return tag
    return b""
class Hash(ABC):
name: str
digest_size: int

90
tests/test_repository.py

@ -15,7 +15,7 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from __future__ import annotations
from datetime import UTC, datetime
from io import BytesIO
from os import stat_result
from pathlib import Path
from random import randbytes
from typing import Iterator
@ -25,6 +25,7 @@ from tempfile import TemporaryDirectory
from bsv.repository import Repository, Snapshot, Tree, TreeItem, create_repository, timestamp_from_time
from bsv.simple_cas.cas import Digest
from bsv.tree_walker import Action, IgnoreCause, TreeWalker
@pytest.fixture
@ -61,18 +62,20 @@ def test_read_write_tree(repo: Repository):
repo,
[
TreeItem(
"xyz",
Digest(bytes([42]) * repo._cas._digest_size),
0o744,
creation_timestamp = timestamp_from_time(now),
digest = Digest(bytes([42]) * repo._cas._digest_size),
object_type = b"blob",
size = 123,
permissions = 0o744,
modification_timestamp = timestamp_from_time(now),
name = "xyz",
),
TreeItem(
"foobar",
Digest(bytes([123]) * repo._cas._digest_size),
0o777,
creation_timestamp = timestamp_from_time(now),
digest = Digest(bytes([123]) * repo._cas._digest_size),
object_type = b"slnk",
size = 42,
permissions = 0o777,
modification_timestamp = timestamp_from_time(now),
name = "foobar",
),
]
)
@ -97,6 +100,75 @@ def test_read_write_snapshot(repo: Repository):
assert repo.get_snapshot(digest) == snapshot
class TestTreeWalker(TreeWalker):
    """TreeWalker that records every report so tests can inspect them."""

    # The class name matches pytest's default ``Test*`` collection pattern,
    # but this is a helper with an ``__init__``, not a test case. Opt out of
    # collection explicitly to avoid a collection warning.
    __test__ = False

    # One (action, path, pstat, info) tuple per call to report(), in call order.
    reports: list[tuple[Action, Path, stat_result | None, IgnoreCause | Exception | None]]

    def __init__(self, repo: Repository, dry_run: bool=False):
        super().__init__(repo, dry_run)
        self.reports = []

    def report(
        self,
        action: Action,
        path: Path,
        pstat: stat_result | None,
        info: IgnoreCause | Exception | None = None
    ):
        """Forward the report to the base class, then record its arguments."""
        # The base implementation runs first, so a combination it rejects
        # raises before anything is recorded.
        super().report(action, path, pstat, info)
        self.reports.append((action, path, pstat, info))
def test_add_tree(tmp_dir: Path, repo: Repository):
    """Store a small directory structure and read it back from the repository.

    The ``bsv_repo`` folder contains a ``bsv_config.toml`` file, so it is
    expected to be absent from the stored tree.
    """
    root = tmp_dir / "test"
    structure = {
        "folder": {
            "sub_folder": {
                "empty_folder": {},
                "foo.txt": b"Hello World!\n",
            },
            "test.py": b"print(\"Hello World!\")\n",
            "bar.dat": bytes(range(256)),
        },
        "Another test with long name and spaces and a bang !": b"Should works.\n",
        "bsv_repo": {
            "bsv_config.toml": b"[bsv]\n",
        },
    }
    create_file_structure(root, structure)

    walker = TestTreeWalker(repo)
    root_digest = walker.add_tree(root)

    def check(digest: Digest, value: dict | bytes):
        # Leaf: the stored blob must hold exactly the file content.
        if isinstance(value, bytes):
            stored = repo.get_blob(digest).reader().read()
            assert stored == value
            return
        if not isinstance(value, dict):
            return
        # Directory: item names must match the expected keys, in sorted order.
        tree = repo.get_tree(digest)
        assert tree
        assert [entry.name for entry in tree.items] == sorted(value)
        for entry in tree.items:
            check(entry.digest, value[entry.name])

    expected = dict(structure)
    expected.pop("bsv_repo")
    check(root_digest, expected)
def create_file_structure(dst: Path, value: dict | bytes):
assert not dst.exists()
if isinstance(value, dict):
dst.mkdir()
for name, item in value.items():
create_file_structure(dst / name, item)
elif isinstance(value, bytes):
dst.write_bytes(value)
else:
raise TypeError(f"invalid type {type(value).__name__} for parameter value")
def make_random_file(path: Path, size: int):
with path.open("wb") as stream:
for chunk_size in iter_chunks(size):

Loading…
Cancel
Save