Browse Source

Repository now returns *Object instead of Digest.

master
Draklaw 2 years ago
parent
commit
eb6ae85698
  1. 3
      src/bsv/exception.py
  2. 41
      src/bsv/path_map.py
  3. 183
      src/bsv/repository.py
  4. 69
      src/bsv/tree_walker.py
  5. 52
      tests/test_repository.py

3
src/bsv/exception.py

@ -22,6 +22,9 @@ class BsvError(RuntimeError):
class NotFound(BsvError):
    """Error type derived from BsvError.

    NOTE(review): presumably signals a failed lookup; no raise site is
    visible in this view -- confirm against callers.
    """
    pass
class UnmappedPath(BsvError):
    """Raised when a filesystem path is not covered by any path-map entry."""
    pass
class UnexpectedObjectType(BsvError):
    """Error type derived from BsvError.

    NOTE(review): presumably raised when a stored object has a different
    type than the one requested; no raise site is visible here -- confirm.
    """
    pass

41
src/bsv/path_map.py

@ -20,6 +20,8 @@ from itertools import pairwise
from pathlib import Path, PurePosixPath
from typing import Any
from bsv.exception import UnmappedPath
@dataclass(order=True, frozen=True, slots=True)
class PathPair:
@ -29,6 +31,7 @@ class PathPair:
def __post_init__(self):
    """Validate both paths and replace `fs` with its resolved form.

    Raises:
        ValueError: if either `bsv` or `fs` is not an absolute path.
    """
    if not (self.bsv.is_absolute() and self.fs.is_absolute()):
        raise ValueError("paths in path_map must be absolute")
    # The dataclass is frozen, so a plain assignment is rejected. Use
    # object.__setattr__ instead of zero-argument super(): with slots=True
    # the decorator builds a replacement class, and on interpreters that do
    # not re-bind the implicit __class__ cell super() fails at call time.
    object.__setattr__(self, "fs", self.fs.resolve())
@classmethod
def from_obj(cls, obj: dict[str, str]) -> PathPair:
@ -39,11 +42,11 @@ class PathPair:
class PathMap:
paths: list[PathPair]
pairs: list[PathPair]
def __init__(self, paths: list[PathPair]=[]):
self.paths = sorted(paths)
for path0, path1 in pairwise(self.paths):
def __init__(self, pairs: list[PathPair]=[]):
self.pairs = sorted(pairs)
for path0, path1 in pairwise(self.pairs):
if path0 == path1 or path1.bsv.relative_to(path0.bsv):
raise ValueError("bsv paths must be unique and independent")
@ -54,5 +57,33 @@ class PathMap:
for item in obj
])
def mount_point(self, fs_path: Path) -> PathPair:
    """Return the registered pair that most closely contains `fs_path`.

    The path is resolved first. Among the pairs whose `fs` root contains it,
    the one leaving the fewest remaining path components wins; on a tie the
    pair encountered first is kept.

    Raises:
        UnmappedPath: if no pair's `fs` root contains the resolved path.
    """
    resolved = fs_path.resolve()
    winner = None
    winner_depth = None
    for candidate in self.pairs:
        try:
            remainder = resolved.relative_to(candidate.fs)
        except ValueError:
            # Not located under this pair's filesystem root.
            continue
        depth = len(remainder.parts)
        if winner is None or depth < winner_depth:
            winner, winner_depth = candidate, depth

    if winner is None:
        raise UnmappedPath(f"unmapped fs path {resolved}")
    return winner
def relative_bsv_path(self, fs_target: Path, relative_to: Path) -> PurePosixPath:
    """Placeholder for translating a filesystem target into a bsv path.

    Only the argument normalisation and the mount-point lookups exist so
    far; the method always ends by raising.

    Args:
        fs_target: filesystem path to translate.
        relative_to: reference path; when not absolute it is joined onto
            `fs_target` before both are resolved.

    Raises:
        NotImplementedError: always, once both lookups have succeeded.
        Exception: whatever `self.mount_point()` raises for either path.
    """
    if not relative_to.is_absolute():
        relative_to = fs_target / relative_to
    fs_target = fs_target.resolve()
    relative_to = relative_to.resolve()

    # The results are not used yet; the calls are kept so that a failing
    # lookup is still reported before the placeholder error.
    self.mount_point(fs_target)
    self.mount_point(relative_to)

    # NotImplemented is the comparison sentinel and is not callable;
    # NotImplementedError is the exception meant for unfinished code.
    raise NotImplementedError("not implemented yet")
def clone(self) -> PathMap:
return PathMap(self.paths)
return PathMap(self.pairs)

183
src/bsv/repository.py

@ -19,16 +19,17 @@ from dataclasses import dataclass
from datetime import datetime as DateTime
import hashlib
from io import BytesIO
from pathlib import Path
from pathlib import Path, PurePosixPath
import platform
import tomllib
from typing import TYPE_CHECKING, BinaryIO, Callable, Type
from typing import TYPE_CHECKING, BinaryIO, Callable, Self, Type
from fastcdc import fastcdc
import tomlkit
from bsv import __version__
from bsv.exception import ConfigError
from bsv.object import ObjectInfo
from bsv.path_map import PathMap
from bsv.simple_cas import SimpleCas
from bsv.simple_cas.cas import Digest, SimpleCas
@ -94,34 +95,95 @@ class Repository:
def path_map(self) -> PathMap:
return self._path_map.clone()
def get_blob(self, digest: Digest) -> Blob:
def get_blob(self, digest: Digest) -> BlobObject:
with self:
return self._read(digest, object_type=b"blob", cls=Blob) # type: ignore
def add_blob(self, stream: BinaryIO, *, dry_run: bool=False) -> Digest:
obj, blob = self._read(digest, object_type=b"blob")
return BlobObject(
digest = obj.digest,
object_type = obj.object_type,
size = obj.size,
blob = blob,
)
def add_blob(self, stream: BinaryIO, *, dry_run: bool=False) -> BlobObject:
with self:
return self._write(b"blob", stream, dry_run=dry_run)
def get_tree(self, digest: Digest) -> Tree:
def get_symlink(self, digest: Digest) -> SymlinkObject:
with self:
return Tree.from_bytes(self, self._cas.read(digest, object_type=b"tree").data)
def add_tree(self, tree: Tree, *, dry_run: bool=False) -> Digest:
obj = self._cas.read(digest, object_type=b"slnk")
return SymlinkObject(
digest = obj.digest,
object_type = obj.object_type,
size = obj.size,
symlink = Symlink.from_bytes(self, obj.data),
)
def add_symlink(self, symlink: Symlink, *, dry_run: bool=False) -> SymlinkObject:
with self:
return self._cas.write(b"tree", tree.to_bytes(), dry_run=dry_run)
data = symlink.to_bytes()
return SymlinkObject(
digest = self._cas.write(b"slnk", data, dry_run=dry_run),
object_type = b"slnk",
size = len(data),
symlink = symlink,
)
def add_symlink_from_fs_target(self, fs_symlink: Path, fs_target: Path, *, dry_run: bool=False) -> SymlinkObject:
    """Build a Symlink from filesystem paths and store it via add_symlink().

    Args:
        fs_symlink: location of the link on the filesystem; must be absolute.
        fs_target: the link's target as found on the filesystem.
        dry_run: forwarded unchanged to add_symlink().

    Returns:
        The SymlinkObject produced by add_symlink().
    """
    assert fs_symlink.is_absolute()
    # The absolute/relative flag comes from the raw target; the stored
    # target path is whatever the path map computes for it.
    absolute = fs_target.is_absolute()
    bsv_target = self._path_map.relative_bsv_path(fs_target, relative_to=fs_symlink)
    link = Symlink(repo=self, is_absolute=absolute, target=bsv_target)
    return self.add_symlink(link, dry_run=dry_run)
def add_tree_from_path(self, path: Path, *, dry_run: bool=False) -> Digest:
def get_tree(self, digest: Digest) -> TreeObject:
    """Read the b"tree" object stored under `digest` and parse its payload.

    Returns:
        A TreeObject carrying the digest, object type and size reported by
        the CAS read, together with the Tree parsed from the object's data.
    """
    with self:
        raw = self._cas.read(digest, object_type=b"tree")
        parsed = Tree.from_bytes(self, raw.data)
        return TreeObject(
            digest=raw.digest,
            object_type=raw.object_type,
            size=raw.size,
            tree=parsed,
        )
def add_tree(self, tree: Tree, *, dry_run: bool=False) -> TreeObject:
    """Serialize `tree` and write it to the CAS as a b"tree" object.

    Args:
        tree: the tree to store.
        dry_run: forwarded unchanged to the CAS write.

    Returns:
        A TreeObject holding the digest returned by the write, the
        serialized length as `size`, and the `tree` that was passed in.
    """
    with self:
        payload = tree.to_bytes()
        stored_digest = self._cas.write(b"tree", payload, dry_run=dry_run)
        return TreeObject(
            digest=stored_digest,
            object_type=b"tree",
            size=len(payload),
            tree=tree,
        )
def add_tree_from_path(self, path: Path, *, dry_run: bool=False) -> TreeObject:
    """Store the directory at `path` by delegating to a TreeWalker.

    Args:
        path: directory to add.
        dry_run: passed to the TreeWalker constructor.

    Returns:
        The TreeObject returned by TreeWalker.add_tree().
    """
    # Imported inside the function: bsv.tree_walker itself imports from
    # bsv.repository, so a module-level import here would be circular.
    from bsv.tree_walker import TreeWalker

    return TreeWalker(self, dry_run=dry_run).add_tree(path)
def get_snapshot(self, digest: Digest) -> Snapshot:
def get_snapshot(self, digest: Digest) -> SnapshotObject:
with self:
return Snapshot.from_bytes(self, self._cas.read(digest, object_type=b"snap").data)
def add_snapshot(self, snapshot: Snapshot, *, dry_run: bool=False) -> Digest:
obj = self._cas.read(digest, object_type=b"snap")
return SnapshotObject(
digest = obj.digest,
object_type = obj.object_type,
size = obj.size,
snapshot = Snapshot.from_bytes(self, obj.data),
)
def add_snapshot(self, snapshot: Snapshot, *, dry_run: bool=False) -> SnapshotObject:
with self:
return self._cas.write(b"snap", snapshot.to_bytes(), dry_run=dry_run)
data = snapshot.to_bytes()
return SnapshotObject(
digest = self._cas.write(b"snap", data, dry_run=dry_run),
object_type = b"snap",
size = len(data),
snapshot = snapshot,
)
# def take_snapshot(
# self,
@ -151,14 +213,15 @@ class Repository:
def _read(self, digest: Digest, object_type: bytes, cls: Type[ChunkedObject]) -> ChunkedObject:
def _read(self, digest: Digest, object_type: bytes) -> tuple[ObjectInfo, Blob]:
obj = self._cas.read(digest, object_type=object_type)
stream = BytesIO(obj.data)
return cls.from_stream(self, stream, digest_size=self._cas._digest_size)
return obj, Blob.from_stream(self, stream, digest_size=self._cas._digest_size)
def _write(self, object_type: bytes, stream: BinaryIO, *, dry_run: bool=False) -> Digest:
def _write(self, object_type: bytes, stream: BinaryIO, *, dry_run: bool=False) -> BlobObject:
out = BytesIO()
size = 0
chunks = []
for chunk in fastcdc(
stream,
min_size = self._min_chunk_size,
@ -168,9 +231,19 @@ class Repository:
):
size += chunk.length
digest = self._cas.write(b"chnk", chunk.data, dry_run=dry_run)
chunks.append(Chunk(digest, chunk.length))
out.write(digest.digest)
out.write(chunk.length.to_bytes(4))
return self._cas.write(object_type, size.to_bytes(8) + out.getvalue())
return BlobObject(
digest = self._cas.write(object_type, size.to_bytes(8) + out.getvalue()),
object_type = object_type,
size = 8 + len(out.getvalue()),
blob = Blob(
repo = self,
size = size,
chunks = chunks,
)
)
def __enter__(self):
if self._context_depth == 0:
@ -256,7 +329,7 @@ class ChunkedObject:
chunks: list[Chunk]
@classmethod
def from_stream(cls, repo: Repository, stream: BinaryIO, digest_size: int) -> ChunkedObject:
def from_stream(cls, repo: Repository, stream: BinaryIO, digest_size: int) -> Self:
self = cls(
repo = repo,
size = int.from_bytes(read_exact(stream, 8)),
@ -276,7 +349,7 @@ class Chunk:
size: int
@classmethod
def from_stream(cls, stream: BinaryIO, digest_size: int) -> Chunk | None:
def from_stream(cls, stream: BinaryIO, digest_size: int) -> Self | None:
digest_bytes = read_exact_or_eof(stream, digest_size)
if digest_bytes is None:
return None
@ -324,23 +397,65 @@ class ChunkedObjectReader:
class Blob(ChunkedObject):
pass
@dataclass(frozen=True, order=True, slots=True)
class BlobObject(ObjectInfo):
    """ObjectInfo extended with the Blob it describes."""

    # Chunk-level description of the stored blob.
    blob: Blob
@dataclass(slots=True)
class Symlink:
repo: Repository
is_absolute: bool
target: PurePosixPath
@classmethod
def from_stream(cls, repo: Repository, stream: BinaryIO) -> Self:
return cls(
repo = repo,
is_absolute = bool(read_exact(stream, 1)),
target = PurePosixPath(stream.read().decode("utf-8")),
)
@classmethod
def from_bytes(cls, repo: Repository, bytes: bytes) -> Self:
stream = BytesIO(bytes)
return cls.from_stream(repo, stream)
def write(self, stream: BinaryIO):
stream.write(self.is_absolute.to_bytes(1))
stream.write(self.target.as_posix().encode("utf-8"))
def to_bytes(self) -> bytes:
stream = BytesIO()
self.write(stream)
return stream.getvalue()
@dataclass(frozen=True, order=True, slots=True)
class SymlinkObject(ObjectInfo):
    """ObjectInfo extended with the Symlink it describes."""

    # Parsed symlink record (flag and target path).
    symlink: Symlink
@dataclass
class Tree:
repo: Repository
items: list[TreeItem]
@property
def total_size(self) -> int:
return sum(
item.size
for item in self.items
)
@classmethod
def from_stream(cls, repo: Repository, stream: BinaryIO) -> Tree:
def from_stream(cls, repo: Repository, stream: BinaryIO) -> Self:
tree = Tree(repo, [])
while (item := TreeItem.from_stream(stream, repo._cas._digest_size)) is not None:
tree.items.append(item)
return tree
@classmethod
def from_bytes(cls, repo: Repository, data: bytes) -> Tree:
def from_bytes(cls, repo: Repository, data: bytes) -> Self:
stream = BytesIO(data)
return cls.from_stream(repo, stream)
@ -354,6 +469,14 @@ class Tree:
self.write(stream)
return stream.getvalue()
@dataclass(frozen=True, order=True, slots=True)
class TreeObject(ObjectInfo):
    """ObjectInfo extended with the Tree it describes."""

    # Parsed tree whose items this object lists.
    tree: Tree

    @property
    def total_size(self) -> int:
        """Return this object's own `size` plus `tree.total_size`."""
        return self.size + self.tree.total_size
@dataclass
class TreeItem:
@ -390,7 +513,7 @@ class TreeItem:
self.modification_timestamp_us = timestamp_us_from_time(time)
@classmethod
def from_stream(cls, stream: BinaryIO, digest_size: int) -> TreeItem | None:
def from_stream(cls, stream: BinaryIO, digest_size: int) -> Self | None:
digest_bytes = read_exact_or_eof(stream, digest_size)
if digest_bytes is None:
return None
@ -435,7 +558,7 @@ class Snapshot:
self.timestamp_us = timestamp_us_from_time(time)
@classmethod
def from_stream(cls, repo: Repository, stream: BinaryIO) -> Snapshot:
def from_stream(cls, repo: Repository, stream: BinaryIO) -> Self:
return Snapshot(
repo = repo,
tree_digest = Digest(read_exact(stream, repo._cas._digest_size)),
@ -448,7 +571,7 @@ class Snapshot:
)
@classmethod
def from_bytes(cls, repo: Repository, data: bytes) -> Snapshot:
def from_bytes(cls, repo: Repository, data: bytes) -> Self:
stream = BytesIO(data)
return cls.from_stream(repo, stream)
@ -467,3 +590,7 @@ class Snapshot:
stream = BytesIO()
self.write(stream)
return stream.getvalue()
@dataclass(frozen=True, order=True, slots=True)
class SnapshotObject(ObjectInfo):
    """ObjectInfo extended with the Snapshot it describes."""

    # Parsed snapshot record.
    snapshot: Snapshot

69
src/bsv/tree_walker.py

@ -21,9 +21,8 @@ from os import stat_result
from pathlib import Path
import stat
from bsv.object import Digest
from bsv.path_map import PathMap
from bsv.repository import Repository, Tree, TreeItem
from bsv.object import Digest, ObjectInfo
from bsv.repository import BlobObject, Repository, SymlinkObject, Tree, TreeItem, TreeObject
from bsv.util import is_bsv_repository, object_type_from_mode
@ -70,6 +69,16 @@ class TreeWalker:
self._dry_run = dry_run
# def add_virtual_tree(self, paths: PathMap) -> Digest:
# assert paths
# fs_paths = sorted([
# pair.fs
# for pair in paths.pairs
# ])
# tree_map = {
# fs_paths[0]: self.add_tree()
# }
# root = {}
# for pair in paths.paths:
# vdir = root
@ -98,15 +107,16 @@ class TreeWalker:
# ))
# return self._repo.add_tree(tree, dry_run=self._dry_run)
def add_tree(self, path: Path, *, source_digest: Digest | None=None) -> Digest:
def add_tree(self, path: Path, *, source_digest: Digest | None=None) -> TreeObject:
pstat = path.stat(follow_symlinks=False)
if self.ignore(path, pstat):
self.report(Action.IGNORE, path, pstat, IgnoreCause.IGNORE_RULE)
return Digest()
raise ValueError(f"path {path} is ignored")
# self.report(Action.IGNORE, path, pstat, IgnoreCause.IGNORE_RULE)
# return Digest()
return self._add_tree(path, pstat, source_digest=source_digest)
def _add_tree(self, path: Path, pstat: stat_result, *, source_digest: Digest | None) -> Digest:
source = self._repo.get_tree(source_digest) if source_digest else None
def _add_tree(self, path: Path, pstat: stat_result, *, source_digest: Digest | None) -> TreeObject:
source = self._repo.get_tree(source_digest).tree if source_digest else None
tree = Tree(self._repo, [])
subpaths = sorted(path.iterdir())
@ -130,7 +140,7 @@ class TreeWalker:
source_item_index += 1
if subpath is not None:
digest = Digest()
obj: ObjectInfo | None = None
try:
istat = subpath.lstat()
@ -151,11 +161,14 @@ class TreeWalker:
sub_source_digest = source_item and source_item.digest
object_type = object_type_from_mode(istat.st_mode)
if object_type == b"slnk":
digest = self._add_symlink(subpath, istat, source_digest=sub_source_digest)
obj = self._add_symlink(subpath, istat, source_digest=sub_source_digest)
size = obj.size
elif object_type == b"tree":
digest = self._add_tree(subpath, istat, source_digest=sub_source_digest)
obj = self._add_tree(subpath, istat, source_digest=sub_source_digest)
size = obj.total_size
elif object_type == b"blob":
digest = self._add_blob(subpath, istat, source_digest=sub_source_digest)
obj = self._add_blob(subpath, istat, source_digest=sub_source_digest)
size = istat.st_size
else:
self.report(Action.IGNORE, subpath, istat, IgnoreCause.UNSUPPORTED_TYPE)
continue
@ -163,11 +176,11 @@ class TreeWalker:
self.report(Action.ERROR, subpath, None, err)
continue
if digest:
if obj:
tree.items.append(TreeItem(
digest = digest,
digest = obj.digest,
object_type = object_type,
size = istat.st_size,
size = size,
permissions = stat.S_IMODE(istat.st_mode),
modification_timestamp_us = istat.st_mtime_ns // 1000,
name = subpath.name,
@ -175,36 +188,32 @@ class TreeWalker:
elif source_item:
self.report(Action.REMOVE, path / source_item.name, None, source_item)
digest = self._repo.add_tree(tree, dry_run=self._dry_run)
tree_object = self._repo.add_tree(tree, dry_run=self._dry_run)
action, info = Action.from_digests(digest, source_digest)
action, info = Action.from_digests(tree_object.digest, source_digest)
self.report(action, path, pstat, info)
return digest
return tree_object
def _add_symlink(self, path: Path, pstat: stat_result, *, source_digest: Digest | None) -> Digest:
def _add_symlink(self, path: Path, pstat: stat_result, *, source_digest: Digest | None) -> SymlinkObject:
# TODO: Store symlink relative to current dir ?
# * What about symlink that points outside of the backup dirs
# * Should symlinks that points inside the backup dirs but in another
# mount-point adjusted ?
# * Should absolute symlink be restored as absolute ?
digest = self._repo._cas.write(
b"slnk",
path.readlink().as_posix().encode("utf-8"),
dry_run = self._dry_run,
)
obj = self._repo.add_symlink_from_fs_target(path, path.readlink())
action, info = Action.from_digests(digest, source_digest)
action, info = Action.from_digests(obj.digest, source_digest)
self.report(action, path, pstat, info)
return digest
return obj
def _add_blob(self, path: Path, pstat: stat_result, *, source_digest: Digest | None) -> Digest:
def _add_blob(self, path: Path, pstat: stat_result, *, source_digest: Digest | None) -> BlobObject:
with path.open("rb") as stream:
digest = self._repo.add_blob(stream, dry_run=self._dry_run)
obj = self._repo.add_blob(stream, dry_run=self._dry_run)
action, info = Action.from_digests(digest, source_digest)
action, info = Action.from_digests(obj.digest, source_digest)
self.report(action, path, pstat, info)
return digest
return obj
def ignore(self, path: Path, pstat: stat_result, *, source: TreeItem | None=None) -> bool:

52
tests/test_repository.py

@ -48,13 +48,17 @@ def test_read_write_blob(tmp_dir: Path, repo: Repository):
make_random_file(path, 1 << 20)
with path.open("rb") as stream:
digest = repo.add_blob(stream)
blob = repo.get_blob(digest)
data = blob.reader().read()
obj0 = repo.add_blob(stream)
assert obj0.object_type == b"blob"
with path.open("rb") as stream:
assert obj0.blob.reader().read() == stream.read()
obj1 = repo.get_blob(obj0.digest)
assert obj1.digest == obj0.digest
assert obj1.object_type == obj0.object_type
assert obj1.size == obj0.size
with path.open("rb") as stream:
assert data == stream.read()
assert obj1.blob.reader().read() == stream.read()
def test_read_write_tree(repo: Repository):
@ -83,8 +87,15 @@ def test_read_write_tree(repo: Repository):
assert Tree.from_bytes(repo, tree.to_bytes()) == tree
digest = repo.add_tree(tree)
assert repo.get_tree(digest) == tree
obj0 = repo.add_tree(tree)
assert obj0.object_type == b"tree"
assert obj0.tree == tree
obj1 = repo.get_tree(obj0.digest)
assert obj1.digest == obj0.digest
assert obj1.object_type == obj0.object_type
assert obj1.size == obj0.size
assert obj1.tree == obj0.tree
def test_read_write_snapshot(repo: Repository):
@ -101,8 +112,15 @@ def test_read_write_snapshot(repo: Repository):
assert Snapshot.from_bytes(repo, snapshot.to_bytes()) == snapshot
digest = repo.add_snapshot(snapshot)
assert repo.get_snapshot(digest) == snapshot
obj0 = repo.add_snapshot(snapshot)
assert obj0.object_type == b"snap"
assert obj0.snapshot == snapshot
obj1 = repo.get_snapshot(obj0.digest)
assert obj1.digest == obj0.digest
assert obj1.object_type == obj0.object_type
assert obj1.size == obj0.size
assert obj1.snapshot == obj0.snapshot
class TestTreeWalker(TreeWalker):
@ -164,18 +182,19 @@ def test_add_tree(tmp_dir: Path, repo: Repository):
def check(digest: Digest, value: dict | bytes):
if isinstance(value, dict):
tree = repo.get_tree(digest)
tree = repo.get_tree(digest).tree
assert tree
assert list(map(lambda i: i.name, tree.items)) == sorted(value.keys())
for item in tree.items:
check(item.digest, value[item.name])
elif isinstance(value, bytes):
blob = repo.get_blob(digest)
data = blob.reader().read()
blob_obj = repo.get_blob(digest)
data = blob_obj.blob.reader().read()
assert data == value
walker = TestTreeWalker(repo)
dir_digest0 = walker.add_tree(dir)
obj0 = walker.add_tree(dir)
assert obj0.object_type == b"tree"
assert walker.reports == [
(Action.ADD, dir / "Another test with long name and spaces and a bang !", None),
(Action.IGNORE, dir / "bsv_repo", IgnoreCause.IGNORE_RULE),
@ -187,12 +206,13 @@ def test_add_tree(tmp_dir: Path, repo: Repository):
(Action.ADD, dir / "folder", None),
(Action.ADD, dir, None),
]
check(dir_digest0, expected0)
check(obj0.digest, expected0)
create_file_structure(dir, structure1)
walker.reports.clear()
dir_digest1 = walker.add_tree(dir, source_digest=dir_digest0)
obj1 = walker.add_tree(dir, source_digest=obj0.digest)
assert obj0.object_type == b"tree"
assert walker.reports == [
(Action.IGNORE, dir / "Another test with long name and spaces and a bang !", IgnoreCause.UNCHANGED),
(Action.IGNORE, dir / "bsv_repo", IgnoreCause.IGNORE_RULE),
@ -205,7 +225,7 @@ def test_add_tree(tmp_dir: Path, repo: Repository):
(Action.ADD, dir / "new_file", None),
(Action.UPDATE, dir, None),
]
check(dir_digest1, expected1)
check(obj1.digest, expected1)
def create_file_structure(dst: Path, value: dict | bytes):

Loading…
Cancel
Save