Browse Source

TreeWalker.

master
Draklaw 2 years ago
parent
commit
17bef2e63a
  1. 5
      src/bsv/object.py
  2. 49
      src/bsv/repository.py
  3. 4
      src/bsv/simple_cas/cas.py
  4. 157
      src/bsv/tree_walker.py
  5. 19
      src/bsv/util.py
  6. 90
      tests/test_repository.py

5
src/bsv/object.py

@ -20,7 +20,10 @@ from dataclasses import dataclass
@dataclass(frozen=True, order=True, slots=True) @dataclass(frozen=True, order=True, slots=True)
class Digest: class Digest:
digest: bytes digest: bytes = b""
def __bool__(self) -> bool:
    """Return the truthiness of ``self.digest``.

    A ``Digest`` built with the default ``digest=b""`` is therefore falsy,
    which lets callers use an empty ``Digest()`` as a "no object" marker.
    """
    return bool(self.digest)
def __repr__(self) -> str: def __repr__(self) -> str:
return self.digest.hex() return self.digest.hex()

49
src/bsv/repository.py

@ -97,32 +97,37 @@ class Repository:
with self: with self:
return self._read(digest, object_type=b"blob", cls=Blob) # type: ignore return self._read(digest, object_type=b"blob", cls=Blob) # type: ignore
def add_blob(self, stream: BinaryIO) -> Digest: def add_blob(self, stream: BinaryIO, dry_run: bool=False) -> Digest:
with self: with self:
return self._write(b"blob", stream) return self._write(b"blob", stream, dry_run=dry_run)
def get_tree(self, digest: Digest) -> Tree: def get_tree(self, digest: Digest) -> Tree:
with self: with self:
return Tree.from_bytes(self, self._cas.read(digest, object_type=b"tree").data) return Tree.from_bytes(self, self._cas.read(digest, object_type=b"tree").data)
def add_tree(self, tree: Tree) -> Digest: def add_tree(self, tree: Tree, dry_run: bool=False) -> Digest:
with self: with self:
return self._cas.write(b"tree", tree.to_bytes()) return self._cas.write(b"tree", tree.to_bytes(), dry_run=dry_run)
def add_tree_from_path(self, path: Path, dry_run: bool=False) -> Digest:
    """Store the directory ``path`` using a ``TreeWalker`` bound to this repository.

    ``dry_run`` is forwarded to the walker. Returns whatever
    ``TreeWalker.add_tree()`` returns for ``path``.
    """
    # Imported locally: bsv.tree_walker itself imports from bsv.repository,
    # so a module-level import here would be circular.
    from bsv.tree_walker import TreeWalker

    return TreeWalker(self, dry_run=dry_run).add_tree(path)
def get_snapshot(self, digest: Digest) -> Snapshot: def get_snapshot(self, digest: Digest) -> Snapshot:
with self: with self:
return Snapshot.from_bytes(self, self._cas.read(digest, object_type=b"snap").data) return Snapshot.from_bytes(self, self._cas.read(digest, object_type=b"snap").data)
def add_snapshot(self, snapshot: Snapshot) -> Digest: def add_snapshot(self, snapshot: Snapshot, dry_run: bool=False) -> Digest:
with self: with self:
return self._cas.write(b"snap", snapshot.to_bytes()) return self._cas.write(b"snap", snapshot.to_bytes(), dry_run=dry_run)
def _read(self, digest: Digest, object_type: bytes, cls: Type[ChunkedObject]) -> ChunkedObject: def _read(self, digest: Digest, object_type: bytes, cls: Type[ChunkedObject]) -> ChunkedObject:
obj = self._cas.read(digest, object_type=object_type) obj = self._cas.read(digest, object_type=object_type)
stream = BytesIO(obj.data) stream = BytesIO(obj.data)
return cls.from_stream(self, stream, digest_size=self._cas._digest_size) return cls.from_stream(self, stream, digest_size=self._cas._digest_size)
def _write(self, object_type: bytes, stream: BinaryIO) -> Digest: def _write(self, object_type: bytes, stream: BinaryIO, dry_run: bool=False) -> Digest:
out = BytesIO() out = BytesIO()
size = 0 size = 0
for chunk in fastcdc( for chunk in fastcdc(
@ -133,7 +138,7 @@ class Repository:
fat = True, fat = True,
): ):
size += chunk.length size += chunk.length
digest = self._cas.write(b"chnk", chunk.data) digest = self._cas.write(b"chnk", chunk.data, dry_run=dry_run)
out.write(digest.digest) out.write(digest.digest)
out.write(chunk.length.to_bytes(4)) out.write(chunk.length.to_bytes(4))
return self._cas.write(object_type, size.to_bytes(8) + out.getvalue()) return self._cas.write(object_type, size.to_bytes(8) + out.getvalue())
@ -323,34 +328,30 @@ class Tree:
@dataclass @dataclass
class TreeItem: class TreeItem:
name: str
digest: Digest digest: Digest
object_type: bytes
size: int
permissions: int permissions: int
creation_timestamp: int
modification_timestamp: int modification_timestamp: int
name: str
def __init__( def __init__(
self, self,
name: str,
digest: Digest, digest: Digest,
object_type: bytes,
size: int,
permissions: int, permissions: int,
creation_timestamp: int,
modification_timestamp: int, modification_timestamp: int,
name: str,
): ):
if "/\\" in name: if "/\\" in name:
raise ValueError(f"invalid tree item name {name}") raise ValueError(f"invalid tree item name {name}")
self.name = name
self.digest = digest self.digest = digest
self.object_type = object_type
self.size = size
self.permissions = permissions self.permissions = permissions
self.creation_timestamp = creation_timestamp
self.modification_timestamp = modification_timestamp self.modification_timestamp = modification_timestamp
self.name = name
@property
def creation_time(self) -> DateTime:
return time_from_timestamp(self.creation_timestamp)
@creation_time.setter
def creation_time(self, time: DateTime):
self.creation_timestamp = timestamp_from_time(time)
@property @property
def modification_time(self) -> DateTime: def modification_time(self) -> DateTime:
@ -366,16 +367,18 @@ class TreeItem:
return None return None
return TreeItem( return TreeItem(
digest = Digest(digest_bytes), digest = Digest(digest_bytes),
object_type = read_exact(stream, 4),
size = int.from_bytes(read_exact(stream, 8)),
permissions = int.from_bytes(read_exact(stream, 2)), permissions = int.from_bytes(read_exact(stream, 2)),
creation_timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
modification_timestamp = int.from_bytes(read_exact(stream, 8), signed=True), modification_timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"), name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"),
) )
def write(self, stream: BinaryIO): def write(self, stream: BinaryIO):
stream.write(self.digest.digest) stream.write(self.digest.digest)
stream.write(self.object_type)
stream.write(self.size.to_bytes(8))
stream.write(self.permissions.to_bytes(2)) stream.write(self.permissions.to_bytes(2))
stream.write(self.creation_timestamp.to_bytes(8, signed=True))
stream.write(self.modification_timestamp.to_bytes(8, signed=True)) stream.write(self.modification_timestamp.to_bytes(8, signed=True))
name_bytes = self.name.encode("utf-8") name_bytes = self.name.encode("utf-8")
stream.write(len(name_bytes).to_bytes(2)) stream.write(len(name_bytes).to_bytes(2))

4
src/bsv/simple_cas/cas.py

@ -96,7 +96,7 @@ class SimpleCas:
return Object(digest, object_type, size, data) return Object(digest, object_type, size, data)
def write(self, object_type: bytes, data: bytes) -> Digest: def write(self, object_type: bytes, data: bytes, dry_run: bool=False) -> Digest:
assert len(object_type) == 4 assert len(object_type) == 4
assert len(data) < 2**32 assert len(data) < 2**32
@ -108,7 +108,7 @@ class SimpleCas:
hash.update(data) hash.update(data)
digest = Digest(hash.digest()) digest = Digest(hash.digest())
if digest not in self: if not dry_run and digest not in self:
with self._open_writer(digest, object_type, len(data)) as out: with self._open_writer(digest, object_type, len(data)) as out:
out.write(digest.digest) out.write(digest.digest)
out.write(object_type) out.write(object_type)

157
src/bsv/tree_walker.py

@ -0,0 +1,157 @@
# bsv - Backup, Synchronization, Versioning
# Copyright (C) 2023 Simon Boyé
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from __future__ import annotations
from enum import Enum
from os import stat_result
from pathlib import Path
import stat
from bsv.object import Digest
from bsv.repository import Repository, Tree, TreeItem
from bsv.util import is_bsv_repository, object_type_from_mode
class Action(Enum):
    """Kind of event passed as first argument to ``TreeWalker.report()``."""

    # A path is being stored in the repository.
    ADD = "add"
    # NOTE(review): not emitted nor handled by the code visible in this module.
    UPDATE = "update"
    # A path is skipped; the reason is given as an ``IgnoreCause``.
    IGNORE = "ignore"
    # Processing a path raised an exception; the exception is given as info.
    ERROR = "error"
class IgnoreCause(Enum):
    """Reason attached to an ``Action.IGNORE`` report."""

    # ``TreeWalker.ignore()`` returned True for the path.
    IGNORE_RULE = "ignore_rule"
    # NOTE(review): handled by ``TreeWalker.report()`` but never emitted by
    # the code visible in this module.
    UNCHANGED = "unchanged"
    # The path is neither a symlink, a directory nor a regular file.
    UNSUPPORTED_TYPE = "unsupported_type"
class TreeWalker:
_repo: Repository
_dry_run: bool = False
def __init__(self, repo: Repository, dry_run: bool=False):
self._repo = repo
self._dry_run = dry_run
def add_tree(self, path: Path) -> Digest:
pstat = path.stat(follow_symlinks=False)
if self.ignore(path, pstat):
self.report(Action.IGNORE, path, pstat, IgnoreCause.IGNORE_RULE)
return Digest()
return self._add_tree(path, pstat)
def _add_tree(self, path: Path, pstat: stat_result) -> Digest:
tree = Tree(self._repo, [])
for item in sorted(path.iterdir()):
digest = Digest()
try:
istat = item.lstat()
if self.ignore(item, istat):
self.report(Action.IGNORE, item, istat, IgnoreCause.IGNORE_RULE)
continue
object_type = object_type_from_mode(istat.st_mode)
if object_type == b"slnk":
digest = self._add_symlink(item, istat)
elif object_type == b"tree":
digest = self._add_tree(item, istat)
elif object_type == b"blob":
digest = self._add_blob(item, istat)
else:
self.report(Action.IGNORE, item, istat, IgnoreCause.UNSUPPORTED_TYPE)
continue
except Exception as err:
self.report(Action.ERROR, item, None, err)
continue
if digest:
self.report(Action.ADD, path, pstat)
tree.items.append(TreeItem(
digest = digest,
object_type = object_type,
size = istat.st_size,
permissions = stat.S_IMODE(istat.st_mode),
modification_timestamp = istat.st_mtime_ns,
name = item.name,
))
return self._repo.add_tree(tree, dry_run=self._dry_run)
def _add_symlink(self, path: Path, pstat: stat_result) -> Digest:
# TODO: Store symlink relative to current dir ?
# * What about symlink that points outside of the backup dirs
# * Should symlinks that points inside the backup dirs but in another
# mount-point adjusted ?
# * Should absolute symlink be restored as absolute ?
self.report(Action.ADD, path, pstat)
return self._repo._cas.write(
b"slnk",
path.readlink().as_posix().encode("utf-8"),
dry_run = self._dry_run,
)
def _add_blob(self, path: Path, pstat: stat_result) -> Digest:
self.report(Action.ADD, path, pstat)
with path.open("rb") as stream:
return self._repo.add_blob(stream, dry_run=self._dry_run)
def ignore(self, path: Path, pstat: stat_result) -> bool:
return is_bsv_repository(path)
def report(self, action: Action, path: Path, pstat: stat_result | None, info: IgnoreCause | Exception | None=None):
match action, info:
case (Action.ADD, None):
print(f"Add: {path}")
case (Action.IGNORE, IgnoreCause.IGNORE_RULE):
print(f"Ignore (rule): {path}")
case (Action.IGNORE, IgnoreCause.UNCHANGED):
print(f"Ignore (unchanged): {path}")
case (Action.IGNORE, IgnoreCause.UNSUPPORTED_TYPE) if pstat is not None:
assert pstat is not None
print(f"Ignore (unsupported type {path_type_name(pstat)}): {path}")
case (Action.ERROR, _) if isinstance(info, Exception):
print(f"Error {info}: {path}")
case _:
raise ValueError("TreeWalker.report(): unsupported parameter combination")
def path_type_name(pstat: stat_result) -> str:
    """Describe the file type encoded in ``pstat.st_mode``.

    Returns the names of all matching types joined with ``", "``, or
    ``"unknown"`` when no predicate of the ``stat`` module matches.
    """
    mode = pstat.st_mode
    # (predicate, label) pairs, in the order the labels must appear.
    labelled_checks = (
        (stat.S_ISBLK, "block_device"),
        (stat.S_ISCHR, "char_device"),
        (stat.S_ISDIR, "dir"),
        (stat.S_ISDOOR, "door"),
        (stat.S_ISFIFO, "fifo"),
        (stat.S_ISLNK, "symlink"),
        (stat.S_ISPORT, "port"),
        (stat.S_ISREG, "file"),
        (stat.S_ISSOCK, "socket"),
        (stat.S_ISWHT, "whiteout"),
    )
    names = [label for check, label in labelled_checks if check(mode)]
    return ", ".join(names) if names else "unknown"

19
src/bsv/util.py

@ -17,6 +17,8 @@ from __future__ import annotations
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from datetime import UTC, datetime as DateTime, timedelta as TimeDelta from datetime import UTC, datetime as DateTime, timedelta as TimeDelta
from pathlib import Path
import stat
from typing import BinaryIO from typing import BinaryIO
@ -45,6 +47,23 @@ def read_exact_or_eof(stream: BinaryIO, num_bytes: int) -> bytes | None:
return data return data
def is_bsv_repository(path: Path) -> bool:
    """Tell whether ``path`` holds a ``bsv_config.toml`` file.

    The check relies on ``Path.is_file()``, so a directory named
    ``bsv_config.toml`` does not count.
    """
    config_file = path.joinpath("bsv_config.toml")
    return config_file.is_file()
def object_type_from_path(path: Path) -> bytes:
    """Return the object type tag for ``path`` without following symlinks.

    The mode is read with ``follow_symlinks=False`` and classified by
    ``object_type_from_mode()``.
    """
    mode = path.stat(follow_symlinks=False).st_mode
    return object_type_from_mode(mode)
def object_type_from_mode(mode: int) -> bytes:
    """Map a ``st_mode`` value to a 4-byte object type tag.

    Returns ``b"slnk"`` for symlinks, ``b"tree"`` for directories,
    ``b"blob"`` for regular files and ``b""`` for anything else.
    """
    # Checked in this order; the first matching predicate wins.
    for matches, object_type in (
        (stat.S_ISLNK, b"slnk"),
        (stat.S_ISDIR, b"tree"),
        (stat.S_ISREG, b"blob"),
    ):
        if matches(mode):
            return object_type
    return b""
class Hash(ABC): class Hash(ABC):
name: str name: str
digest_size: int digest_size: int

90
tests/test_repository.py

@ -15,7 +15,7 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>. # along with this program. If not, see <https://www.gnu.org/licenses/>.
from __future__ import annotations from __future__ import annotations
from datetime import UTC, datetime from datetime import UTC, datetime
from io import BytesIO from os import stat_result
from pathlib import Path from pathlib import Path
from random import randbytes from random import randbytes
from typing import Iterator from typing import Iterator
@ -25,6 +25,7 @@ from tempfile import TemporaryDirectory
from bsv.repository import Repository, Snapshot, Tree, TreeItem, create_repository, timestamp_from_time from bsv.repository import Repository, Snapshot, Tree, TreeItem, create_repository, timestamp_from_time
from bsv.simple_cas.cas import Digest from bsv.simple_cas.cas import Digest
from bsv.tree_walker import Action, IgnoreCause, TreeWalker
@pytest.fixture @pytest.fixture
@ -61,18 +62,20 @@ def test_read_write_tree(repo: Repository):
repo, repo,
[ [
TreeItem( TreeItem(
"xyz", digest = Digest(bytes([42]) * repo._cas._digest_size),
Digest(bytes([42]) * repo._cas._digest_size), object_type = b"blob",
0o744, size = 123,
creation_timestamp = timestamp_from_time(now), permissions = 0o744,
modification_timestamp = timestamp_from_time(now), modification_timestamp = timestamp_from_time(now),
name = "xyz",
), ),
TreeItem( TreeItem(
"foobar", digest = Digest(bytes([123]) * repo._cas._digest_size),
Digest(bytes([123]) * repo._cas._digest_size), object_type = b"slnk",
0o777, size = 42,
creation_timestamp = timestamp_from_time(now), permissions = 0o777,
modification_timestamp = timestamp_from_time(now), modification_timestamp = timestamp_from_time(now),
name = "foobar",
), ),
] ]
) )
@ -97,6 +100,75 @@ def test_read_write_snapshot(repo: Repository):
assert repo.get_snapshot(digest) == snapshot assert repo.get_snapshot(digest) == snapshot
class TestTreeWalker(TreeWalker):
    """``TreeWalker`` that records every ``report()`` call.

    Each call is still forwarded to ``TreeWalker.report()`` and is then
    appended to ``reports`` as an ``(action, path, pstat, info)`` tuple.
    """

    # The name starts with "Test" but this is a helper, not a test case.
    # Without this flag pytest tries to collect the class and emits a
    # PytestCollectionWarning because it defines __init__.
    __test__ = False

    reports: list

    def __init__(self, repo: Repository, dry_run: bool=False):
        """Create the walker with an empty ``reports`` list."""
        super().__init__(repo, dry_run)
        self.reports = []

    def report(
        self,
        action: Action,
        path: Path,
        pstat: stat_result | None,
        info: IgnoreCause | Exception | None = None
    ):
        """Forward the report to the base class, then record it.

        The base implementation runs first, so a parameter combination it
        rejects with ``ValueError`` is not recorded.
        """
        super().report(action, path, pstat, info)
        self.reports.append((action, path, pstat, info))
def test_add_tree(tmp_dir: Path, repo: Repository):
    """Walk a small directory structure and read it back from the repository.

    The ``bsv_repo`` directory contains a ``bsv_config.toml`` file, so it is
    expected to be missing from the stored tree.
    """
    structure = {
        "folder": {
            "sub_folder": {
                "empty_folder": {},
                "foo.txt": b"Hello World!\n",
            },
            "test.py": b"print(\"Hello World!\")\n",
            "bar.dat": bytes(range(256)),
        },
        "Another test with long name and spaces and a bang !": b"Should works.\n",
        "bsv_repo": {
            "bsv_config.toml": b"[bsv]\n",
        },
    }
    root = tmp_dir / "test"
    create_file_structure(root, structure)

    root_digest = TestTreeWalker(repo).add_tree(root)

    def check(digest: Digest, value: dict | bytes):
        # Recursively compare the stored object with the expected content.
        if isinstance(value, bytes):
            stored = repo.get_blob(digest).reader().read()
            assert stored == value
        elif isinstance(value, dict):
            tree = repo.get_tree(digest)
            assert tree
            assert [entry.name for entry in tree.items] == sorted(value)
            for entry in tree.items:
                check(entry.digest, value[entry.name])

    expected = {
        name: content
        for name, content in structure.items()
        if name != "bsv_repo"
    }
    check(root_digest, expected)
def create_file_structure(dst: Path, value: dict | bytes):
    """Materialise ``value`` on disk at ``dst``.

    A ``bytes`` value becomes a file with that content. A ``dict`` becomes
    a directory whose entries are created recursively from its items.
    ``dst`` must not exist yet.

    Raises:
        TypeError: if ``value`` is neither a ``dict`` nor ``bytes``.
    """
    assert not dst.exists()

    if isinstance(value, bytes):
        dst.write_bytes(value)
        return

    if not isinstance(value, dict):
        raise TypeError(f"invalid type {type(value).__name__} for parameter value")

    dst.mkdir()
    for child_name, child_value in value.items():
        create_file_structure(dst / child_name, child_value)
def make_random_file(path: Path, size: int): def make_random_file(path: Path, size: int):
with path.open("wb") as stream: with path.open("wb") as stream:
for chunk_size in iter_chunks(size): for chunk_size in iter_chunks(size):

Loading…
Cancel
Save