TreeWalker.

This commit is contained in:
2023-11-19 18:40:30 +01:00
parent 7420d891d4
commit 17bef2e63a
6 changed files with 289 additions and 35 deletions

View File

@@ -20,7 +20,10 @@ from dataclasses import dataclass
@dataclass(frozen=True, order=True, slots=True)
class Digest:
digest: bytes
digest: bytes = b""
def __bool__(self) -> bool:
return bool(self.digest)
def __repr__(self) -> str:
return self.digest.hex()

View File

@@ -97,32 +97,37 @@ class Repository:
with self:
return self._read(digest, object_type=b"blob", cls=Blob) # type: ignore
def add_blob(self, stream: BinaryIO) -> Digest:
def add_blob(self, stream: BinaryIO, dry_run: bool=False) -> Digest:
with self:
return self._write(b"blob", stream)
return self._write(b"blob", stream, dry_run=dry_run)
def get_tree(self, digest: Digest) -> Tree:
    """Read the ``tree`` object stored under ``digest`` and decode it.

    The repository is entered as a context manager for the duration of
    the read.
    """
    with self:
        stored = self._cas.read(digest, object_type=b"tree")
        return Tree.from_bytes(self, stored.data)
def add_tree(self, tree: Tree) -> Digest:
def add_tree(self, tree: Tree, dry_run: bool=False) -> Digest:
with self:
return self._cas.write(b"tree", tree.to_bytes())
return self._cas.write(b"tree", tree.to_bytes(), dry_run=dry_run)
def add_tree_from_path(self, path: Path, dry_run: bool=False) -> Digest:
    """Store the directory at ``path`` using a ``TreeWalker``.

    ``dry_run`` is forwarded unchanged to the walker. Returns whatever
    ``TreeWalker.add_tree`` returns for ``path``.
    """
    # Imported locally: bsv.tree_walker imports names from bsv.repository,
    # so a module-level import here would presumably be circular.
    from bsv.tree_walker import TreeWalker

    return TreeWalker(self, dry_run=dry_run).add_tree(path)
def get_snapshot(self, digest: Digest) -> Snapshot:
    """Read the ``snap`` object stored under ``digest`` and decode it.

    The repository is entered as a context manager for the duration of
    the read.
    """
    with self:
        stored = self._cas.read(digest, object_type=b"snap")
        return Snapshot.from_bytes(self, stored.data)
def add_snapshot(self, snapshot: Snapshot) -> Digest:
def add_snapshot(self, snapshot: Snapshot, dry_run: bool=False) -> Digest:
with self:
return self._cas.write(b"snap", snapshot.to_bytes())
return self._cas.write(b"snap", snapshot.to_bytes(), dry_run=dry_run)
def _read(self, digest: Digest, object_type: bytes, cls: Type[ChunkedObject]) -> ChunkedObject:
    """Read a stored object and rebuild it through ``cls.from_stream``.

    The raw payload read from the CAS is wrapped in an in-memory stream
    and handed to ``cls.from_stream`` together with the CAS digest size.
    """
    stored = self._cas.read(digest, object_type=object_type)
    return cls.from_stream(
        self,
        BytesIO(stored.data),
        digest_size=self._cas._digest_size,
    )
def _write(self, object_type: bytes, stream: BinaryIO) -> Digest:
def _write(self, object_type: bytes, stream: BinaryIO, dry_run: bool=False) -> Digest:
out = BytesIO()
size = 0
for chunk in fastcdc(
@@ -133,7 +138,7 @@ class Repository:
fat = True,
):
size += chunk.length
digest = self._cas.write(b"chnk", chunk.data)
digest = self._cas.write(b"chnk", chunk.data, dry_run=dry_run)
out.write(digest.digest)
out.write(chunk.length.to_bytes(4))
return self._cas.write(object_type, size.to_bytes(8) + out.getvalue())
@@ -323,34 +328,30 @@ class Tree:
@dataclass
class TreeItem:
name: str
digest: Digest
object_type: bytes
size: int
permissions: int
creation_timestamp: int
modification_timestamp: int
name: str
def __init__(
self,
name: str,
digest: Digest,
object_type: bytes,
size: int,
permissions: int,
creation_timestamp: int,
modification_timestamp: int,
name: str,
):
if "/\\" in name:
raise ValueError(f"invalid tree item name {name}")
self.name = name
self.digest = digest
self.object_type = object_type
self.size = size
self.permissions = permissions
self.creation_timestamp = creation_timestamp
self.modification_timestamp = modification_timestamp
@property
def creation_time(self) -> DateTime:
return time_from_timestamp(self.creation_timestamp)
@creation_time.setter
def creation_time(self, time: DateTime):
self.creation_timestamp = timestamp_from_time(time)
self.name = name
@property
def modification_time(self) -> DateTime:
@@ -366,16 +367,18 @@ class TreeItem:
return None
return TreeItem(
digest = Digest(digest_bytes),
object_type = read_exact(stream, 4),
size = int.from_bytes(read_exact(stream, 8)),
permissions = int.from_bytes(read_exact(stream, 2)),
creation_timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
modification_timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"),
)
def write(self, stream: BinaryIO):
stream.write(self.digest.digest)
stream.write(self.object_type)
stream.write(self.size.to_bytes(8))
stream.write(self.permissions.to_bytes(2))
stream.write(self.creation_timestamp.to_bytes(8, signed=True))
stream.write(self.modification_timestamp.to_bytes(8, signed=True))
name_bytes = self.name.encode("utf-8")
stream.write(len(name_bytes).to_bytes(2))

View File

@@ -96,7 +96,7 @@ class SimpleCas:
return Object(digest, object_type, size, data)
def write(self, object_type: bytes, data: bytes) -> Digest:
def write(self, object_type: bytes, data: bytes, dry_run: bool=False) -> Digest:
assert len(object_type) == 4
assert len(data) < 2**32
@@ -108,7 +108,7 @@ class SimpleCas:
hash.update(data)
digest = Digest(hash.digest())
if digest not in self:
if not dry_run and digest not in self:
with self._open_writer(digest, object_type, len(data)) as out:
out.write(digest.digest)
out.write(object_type)

157
src/bsv/tree_walker.py Normal file
View File

@@ -0,0 +1,157 @@
# bsv - Backup, Synchronization, Versioning
# Copyright (C) 2023 Simon Boyé
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from __future__ import annotations
from enum import Enum
from os import stat_result
from pathlib import Path
import stat
from bsv.object import Digest
from bsv.repository import Repository, Tree, TreeItem
from bsv.util import is_bsv_repository, object_type_from_mode
class Action(Enum):
    """Kind of event passed as the first argument of ``TreeWalker.report``."""

    # The path is being stored.
    ADD = "add"
    # Declared for reporting; not emitted by the walker code in this module.
    UPDATE = "update"
    # The path is skipped; the reason is given as an ``IgnoreCause``.
    IGNORE = "ignore"
    # Processing the path raised an exception.
    ERROR = "error"
class IgnoreCause(Enum):
    """Reason attached to an ``Action.IGNORE`` report."""

    # The walker's ``ignore()`` rule matched the path.
    IGNORE_RULE = "ignore_rule"
    # Understood by ``TreeWalker.report``; not emitted by the walker code
    # in this module.
    UNCHANGED = "unchanged"
    # The path is neither a symlink, a directory nor a regular file.
    UNSUPPORTED_TYPE = "unsupported_type"
class TreeWalker:
_repo: Repository
_dry_run: bool = False
def __init__(self, repo: Repository, dry_run: bool=False):
self._repo = repo
self._dry_run = dry_run
def add_tree(self, path: Path) -> Digest:
pstat = path.stat(follow_symlinks=False)
if self.ignore(path, pstat):
self.report(Action.IGNORE, path, pstat, IgnoreCause.IGNORE_RULE)
return Digest()
return self._add_tree(path, pstat)
def _add_tree(self, path: Path, pstat: stat_result) -> Digest:
tree = Tree(self._repo, [])
for item in sorted(path.iterdir()):
digest = Digest()
try:
istat = item.lstat()
if self.ignore(item, istat):
self.report(Action.IGNORE, item, istat, IgnoreCause.IGNORE_RULE)
continue
object_type = object_type_from_mode(istat.st_mode)
if object_type == b"slnk":
digest = self._add_symlink(item, istat)
elif object_type == b"tree":
digest = self._add_tree(item, istat)
elif object_type == b"blob":
digest = self._add_blob(item, istat)
else:
self.report(Action.IGNORE, item, istat, IgnoreCause.UNSUPPORTED_TYPE)
continue
except Exception as err:
self.report(Action.ERROR, item, None, err)
continue
if digest:
self.report(Action.ADD, path, pstat)
tree.items.append(TreeItem(
digest = digest,
object_type = object_type,
size = istat.st_size,
permissions = stat.S_IMODE(istat.st_mode),
modification_timestamp = istat.st_mtime_ns,
name = item.name,
))
return self._repo.add_tree(tree, dry_run=self._dry_run)
def _add_symlink(self, path: Path, pstat: stat_result) -> Digest:
# TODO: Store symlink relative to current dir ?
# * What about symlink that points outside of the backup dirs
# * Should symlinks that points inside the backup dirs but in another
# mount-point adjusted ?
# * Should absolute symlink be restored as absolute ?
self.report(Action.ADD, path, pstat)
return self._repo._cas.write(
b"slnk",
path.readlink().as_posix().encode("utf-8"),
dry_run = self._dry_run,
)
def _add_blob(self, path: Path, pstat: stat_result) -> Digest:
self.report(Action.ADD, path, pstat)
with path.open("rb") as stream:
return self._repo.add_blob(stream, dry_run=self._dry_run)
def ignore(self, path: Path, pstat: stat_result) -> bool:
return is_bsv_repository(path)
def report(self, action: Action, path: Path, pstat: stat_result | None, info: IgnoreCause | Exception | None=None):
match action, info:
case (Action.ADD, None):
print(f"Add: {path}")
case (Action.IGNORE, IgnoreCause.IGNORE_RULE):
print(f"Ignore (rule): {path}")
case (Action.IGNORE, IgnoreCause.UNCHANGED):
print(f"Ignore (unchanged): {path}")
case (Action.IGNORE, IgnoreCause.UNSUPPORTED_TYPE) if pstat is not None:
assert pstat is not None
print(f"Ignore (unsupported type {path_type_name(pstat)}): {path}")
case (Action.ERROR, _) if isinstance(info, Exception):
print(f"Error {info}: {path}")
case _:
raise ValueError("TreeWalker.report(): unsupported parameter combination")
def path_type_name(pstat: stat_result) -> str:
    """Name the file type(s) that ``pstat.st_mode`` tests positive for.

    Every matching name is listed, joined with ``", "``; ``"unknown"`` is
    returned when no test matches.
    """
    mode = pstat.st_mode
    # Order matters: it is the order in which names appear in the result.
    checks = (
        (stat.S_ISBLK, "block_device"),
        (stat.S_ISCHR, "char_device"),
        (stat.S_ISDIR, "dir"),
        (stat.S_ISDOOR, "door"),
        (stat.S_ISFIFO, "fifo"),
        (stat.S_ISLNK, "symlink"),
        (stat.S_ISPORT, "port"),
        (stat.S_ISREG, "file"),
        (stat.S_ISSOCK, "socket"),
        (stat.S_ISWHT, "whiteout"),
    )
    names = [label for is_type, label in checks if is_type(mode)]
    return ", ".join(names) if names else "unknown"

View File

@@ -17,6 +17,8 @@ from __future__ import annotations
from abc import ABC, abstractmethod
from datetime import UTC, datetime as DateTime, timedelta as TimeDelta
from pathlib import Path
import stat
from typing import BinaryIO
@@ -45,6 +47,23 @@ def read_exact_or_eof(stream: BinaryIO, num_bytes: int) -> bytes | None:
return data
def is_bsv_repository(path: Path) -> bool:
    """Return True when ``path`` contains a file named ``bsv_config.toml``."""
    config_file = path.joinpath("bsv_config.toml")
    return config_file.is_file()
def object_type_from_path(path: Path) -> bytes:
    """Return the object type tag for ``path`` itself.

    Symlinks are not followed, so a link is classified as a link and not
    as whatever it points to. See ``object_type_from_mode`` for the
    possible return values.
    """
    mode = path.lstat().st_mode
    return object_type_from_mode(mode)
def object_type_from_mode(mode: int) -> bytes:
    """Map a ``st_mode`` value to a 4-byte object type tag.

    Returns ``b"slnk"`` for a symbolic link, ``b"tree"`` for a directory,
    ``b"blob"`` for a regular file, and ``b""`` for anything else.
    """
    tags = (
        (stat.S_ISLNK, b"slnk"),
        (stat.S_ISDIR, b"tree"),
        (stat.S_ISREG, b"blob"),
    )
    for is_type, tag in tags:
        if is_type(mode):
            return tag
    return b""
class Hash(ABC):
name: str
digest_size: int