@@ -15,20 +15,43 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from __future__ import annotations

from dataclasses import dataclass
from datetime import UTC, datetime as DateTime, timedelta as TimeDelta
import hashlib
from io import BytesIO
from pathlib import Path, PurePosixPath
import platform
import tomllib
from typing import Any, BinaryIO, Callable, Type

from fastcdc import fastcdc
import tomlkit

from bsv import __version__
from bsv.simple_cas.cas import ConfigError, Digest, SimpleCas
from bsv.simple_cas.util import Hash, read_exact, read_exact_or_eof

DEFAULT_MIN_CHUNK_SIZE = 1 << 12
DEFAULT_AVG_CHUNK_SIZE = 1 << 16
DEFAULT_MAX_CHUNK_SIZE = 1 << 20

class Repository:
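    """A content-addressed backup repository.

    Blobs are split into chunks with FastCDC and stored in the
    underlying CAS as ``chnk`` objects; blob, tree, and snapshot
    objects then reference data by digest.

    A minimal usage sketch (the paths are illustrative):

        repo = Repository(Path("/backups/laptop"))
        with repo, open("data.bin", "rb") as f:
            digest = repo.add_blob(f)
    """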
    _path: Path
    _name: str
    _cas: SimpleCas
    _min_chunk_size: int = DEFAULT_MIN_CHUNK_SIZE
    _avg_chunk_size: int = DEFAULT_AVG_CHUNK_SIZE
    _max_chunk_size: int = DEFAULT_MAX_CHUNK_SIZE
    _path_map: list[PathPair]
    # _remotes: list[object]
    _context_depth: int = 0

    def __init__(self, path: Path):
        self._path = path

@@ -39,6 +62,15 @@ class Repository:
        self._name = bsv.get("name") or platform.node()
        self._cas = make_cas(
            bsv.get("cas"),
            self._path,
            lambda: hashlib.new(bsv.get("hash")),  # type: ignore
        )
        # Fall back to the module defaults when the config omits the sizes.
        self._min_chunk_size = bsv.get("min_chunk_size", DEFAULT_MIN_CHUNK_SIZE)
        self._avg_chunk_size = bsv.get("avg_chunk_size", DEFAULT_AVG_CHUNK_SIZE)
        self._max_chunk_size = bsv.get("max_chunk_size", DEFAULT_MAX_CHUNK_SIZE)

        self._path_map = [
            PathPair.from_obj(pair)
            for pair in bsv.get("path_map", [])

@@ -60,6 +92,318 @@ class Repository:
    def path_map(self) -> list[PathPair]:
        return list(self._path_map)

    def get_blob(self, digest: Digest) -> Blob:
        with self:
            return self._read(digest, object_type=b"blob", cls=Blob)  # type: ignore

    def add_blob(self, stream: BinaryIO) -> Digest:
        with self:
            return self._write(b"blob", stream)

    def get_tree(self, digest: Digest) -> Tree:
        with self:
            return Tree.from_bytes(self, self._cas.read(digest, object_type=b"tree").data)

    def add_tree(self, tree: Tree) -> Digest:
        with self:
            return self._cas.write(b"tree", tree.to_bytes())

    def get_snapshot(self, digest: Digest) -> Snapshot:
        with self:
            return Snapshot.from_bytes(self, self._cas.read(digest, object_type=b"snap").data)

    def add_snapshot(self, snapshot: Snapshot) -> Digest:
        with self:
            return self._cas.write(b"snap", snapshot.to_bytes())

    def _read(self, digest: Digest, object_type: bytes, cls: Type[ChunkedObject]) -> ChunkedObject:
        obj = self._cas.read(digest, object_type=object_type)
        stream = BytesIO(obj.data)
        return cls.from_stream(self, stream, digest_size=self._cas._digest_size)

    def _write(self, object_type: bytes, stream: BinaryIO) -> Digest:
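        """Chunk *stream* with FastCDC and store it as a chunk list.

        Wire format (integers big-endian): an 8-byte total size, then
        one record per chunk: the chunk digest followed by its 4-byte
        length. Chunk bodies are stored in the CAS as ``chnk`` objects.
        """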
        out = BytesIO()
        size = 0
        for chunk in fastcdc(
            stream,
            min_size = self._min_chunk_size,
            avg_size = self._avg_chunk_size,
            max_size = self._max_chunk_size,
            fat = True,
        ):
            size += chunk.length
            digest = self._cas.write(b"chnk", chunk.data)
            out.write(digest.digest)
            out.write(chunk.length.to_bytes(4))
        return self._cas.write(object_type, size.to_bytes(8) + out.getvalue())

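    # Re-entrant context manager: only the outermost ``with`` opens and
    # closes the underlying CAS, so the public methods above can nest
    # inside a caller's ``with repo:`` block without reopening it.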
    def __enter__(self):
        if self._context_depth == 0:
            self._cas.__enter__()
        self._context_depth += 1
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._context_depth -= 1
        if self._context_depth == 0:
            return self._cas.__exit__(exc_type, exc_value, traceback)


def create_repository(
    destination: Path,
    name: str,
    cas: str = "simple",
    hash: str = "sha256",
    min_chunk_size: int = DEFAULT_MIN_CHUNK_SIZE,
    avg_chunk_size: int = DEFAULT_AVG_CHUNK_SIZE,
    max_chunk_size: int = DEFAULT_MAX_CHUNK_SIZE,
):
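    """Create an empty repository at *destination* and return it.

    Writes a ``bsv_config.toml`` describing the CAS backend, hash, and
    chunking parameters. A minimal sketch (path and name illustrative):

        repo = create_repository(Path("/backups/laptop"), name="laptop")
    """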
    # getpass.getuser() works even without a controlling terminal.
    from getpass import getuser

    if not name:
        raise RuntimeError("repository name cannot be empty")
    if not destination.parent.exists():
        raise RuntimeError(f"destination directory {destination.parent} does not exist")
    if destination.exists() and not destination.is_dir():
        raise RuntimeError(f"destination {destination} exists but is not a directory")
    if destination.exists() and len(list(destination.iterdir())):
        raise RuntimeError(f"destination directory {destination} is not empty")

    try:
        destination.mkdir(exist_ok=True)
    except OSError as e:
        raise RuntimeError(f"failed to create destination directory {destination}") from e

    bsv_table = tomlkit.table()
    bsv_table.add(tomlkit.comment("Name of the repository."))
    bsv_table.add(tomlkit.comment("Ideally, this should be unique among all connected repositories."))
    bsv_table.add("name", name)
    bsv_table.add(tomlkit.nl())
    bsv_table.add(tomlkit.comment("Mapping between the bsv tree and the actual filesystem."))
    bsv_table.add("path_map", tomlkit.array())
    bsv_table.add("cas", cas)
    bsv_table.add("hash", hash)
    bsv_table.add("min_chunk_size", min_chunk_size)
    bsv_table.add("avg_chunk_size", avg_chunk_size)
    bsv_table.add("max_chunk_size", max_chunk_size)

    doc = tomlkit.document()
    doc.add(tomlkit.comment("bsv repository configuration"))
    doc.add(tomlkit.comment(f"Created by {getuser()} on {DateTime.now().isoformat()}."))
    doc.add(tomlkit.nl())
    doc.add("bsv", bsv_table)

    config_path = destination / "bsv_config.toml"
    try:
        stream = config_path.open("w", encoding="utf-8")
    except OSError as e:
        raise RuntimeError(f"failed to open configuration file {config_path}") from e

    with stream:
        tomlkit.dump(doc, stream)

    return Repository(destination)


def make_cas(cas_name: str, path: Path, hash_factory: Callable[[], Hash]) -> SimpleCas:
    if cas_name == "simple":
        return SimpleCas(path, hash_factory)
    raise ConfigError(f"unknown cas name {cas_name}")


@dataclass
class ChunkedObject:
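    """Base class for objects stored as a size plus a chunk list.

    Mirrors the layout written by Repository._write: an 8-byte
    big-endian total size, then (digest, 4-byte length) records until
    the end of the data.
    """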
    repo: Repository
    size: int
    chunks: list[Chunk]

    @classmethod
    def from_stream(cls, repo: Repository, stream: BinaryIO, digest_size: int) -> ChunkedObject:
        self = cls(
            repo = repo,
            size = int.from_bytes(read_exact(stream, 8)),
            chunks = [],
        )
        while (chunk := Chunk.from_stream(stream, digest_size)) is not None:
            self.chunks.append(chunk)
        return self


@dataclass
class Blob(ChunkedObject):
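    """Lazy, file-like reader over a stored blob.

    Chunks are fetched from the CAS one at a time; bytes left over
    from a partially consumed chunk are buffered in ``_chunk_data``.
    """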
    _chunk_index: int = 0
    _chunk_data: bytes = b""

    def read(self, num_bytes: int = -1) -> bytes:
        parts = [self._chunk_data]
        size = len(parts[-1])
        while (num_bytes < 0 or size < num_bytes) and self._chunk_index < len(self.chunks):
            parts.append(self.read1())
            size += len(parts[-1])
        data = b"".join(parts)
        if num_bytes < 0:
            self._chunk_data = b""
            return data
        # Return exactly num_bytes and buffer the rest for the next call.
        self._chunk_data = data[num_bytes:]
        return data[:num_bytes]

    def read1(self) -> bytes:
        if self._chunk_index == len(self.chunks):
            return b""
        obj = self.repo._cas.read(self.chunks[self._chunk_index].digest, object_type=b"chnk")
        self._chunk_index += 1
        return obj.data


@dataclass
class Tree:
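    """A directory: a flat list of named TreeItem entries.

    Entries are sorted by name on write, so trees with identical
    contents serialize to identical bytes and therefore share a digest.
    """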
    repo: Repository
    items: list[TreeItem]

    @classmethod
    def from_stream(cls, repo: Repository, stream: BinaryIO) -> Tree:
        tree = Tree(repo, [])
        while (item := TreeItem.from_stream(stream, repo._cas._digest_size)) is not None:
            tree.items.append(item)
        return tree

    @classmethod
    def from_bytes(cls, repo: Repository, data: bytes) -> Tree:
        stream = BytesIO(data)
        return cls.from_stream(repo, stream)

    def write(self, stream: BinaryIO):
        self.items.sort(key=lambda i: i.name)
        for item in self.items:
            item.write(stream)

    def to_bytes(self) -> bytes:
        stream = BytesIO()
        self.write(stream)
        return stream.getvalue()


EPOCH = DateTime(1970, 1, 1, tzinfo=UTC)


@dataclass
class TreeItem:
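    """A single tree entry: a named digest plus file metadata.

    Serialized as: digest bytes, 2-byte permissions, two signed 8-byte
    microsecond timestamps (creation, then modification), a 2-byte name
    length, and the UTF-8 name.
    """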
    name: str
    digest: Digest
    permissions: int
    creation_timestamp: int
    modification_timestamp: int

    def __init__(
        self,
        name: str,
        digest: Digest,
        permissions: int,
        creation_timestamp: int,
        modification_timestamp: int,
    ):
if "/\\" in name: |
|
|
|
|
|
raise ValueError(f"invalid tree item name {name}") |
|
|
|
|
|
        self.name = name
        self.digest = digest
        self.permissions = permissions
        self.creation_timestamp = creation_timestamp
        self.modification_timestamp = modification_timestamp

    @property
    def creation_time(self) -> DateTime:
        return time_from_timestamp(self.creation_timestamp)

    @creation_time.setter
    def creation_time(self, time: DateTime):
        self.creation_timestamp = timestamp_from_time(time)

    @property
    def modification_time(self) -> DateTime:
        return time_from_timestamp(self.modification_timestamp)

    @modification_time.setter
    def modification_time(self, time: DateTime):
        self.modification_timestamp = timestamp_from_time(time)

    @classmethod
    def from_stream(cls, stream: BinaryIO, digest_size: int) -> TreeItem | None:
        digest_bytes = read_exact_or_eof(stream, digest_size)
        if digest_bytes is None:
            return None
        return TreeItem(
            digest = Digest(digest_bytes),
            permissions = int.from_bytes(read_exact(stream, 2)),
            creation_timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
            modification_timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
            name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"),
        )

    def write(self, stream: BinaryIO):
        stream.write(self.digest.digest)
        stream.write(self.permissions.to_bytes(2))
        stream.write(self.creation_timestamp.to_bytes(8, signed=True))
        stream.write(self.modification_timestamp.to_bytes(8, signed=True))
        name_bytes = self.name.encode("utf-8")
        stream.write(len(name_bytes).to_bytes(2))
        stream.write(name_bytes)


@dataclass
class Snapshot:
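    """A timestamped pointer from a named repository to a root tree.

    Serialized as: tree digest bytes, 2-byte name length, UTF-8
    repository name, and a signed 8-byte microsecond timestamp.
    """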
    repo: Repository
    tree_digest: Digest
    repo_name: str
    timestamp: int

    @property
    def time(self) -> DateTime:
        return time_from_timestamp(self.timestamp)

    @time.setter
    def time(self, time: DateTime):
        self.timestamp = timestamp_from_time(time)

    @classmethod
    def from_stream(cls, repo: Repository, stream: BinaryIO) -> Snapshot:
        return Snapshot(
            repo = repo,
            tree_digest = Digest(read_exact(stream, repo._cas._digest_size)),
            repo_name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"),
            timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
        )

    @classmethod
    def from_bytes(cls, repo: Repository, data: bytes) -> Snapshot:
        stream = BytesIO(data)
        return cls.from_stream(repo, stream)

    def write(self, stream: BinaryIO):
        stream.write(self.tree_digest.digest)
        repo_name_bytes = self.repo_name.encode("utf-8")
        stream.write(len(repo_name_bytes).to_bytes(2))
        stream.write(repo_name_bytes)
        stream.write(self.timestamp.to_bytes(8, signed=True))

    def to_bytes(self) -> bytes:
        stream = BytesIO()
        self.write(stream)
        return stream.getvalue()


@dataclass
class Chunk:
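    """One (digest, size) record in a chunked object's chunk list."""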
    digest: Digest
    size: int

    @classmethod
    def from_stream(cls, stream: BinaryIO, digest_size: int) -> Chunk | None:
        digest_bytes = read_exact_or_eof(stream, digest_size)
        if digest_bytes is None:
            return None
        digest = Digest(digest_bytes)
        return cls(
            digest = digest,
            size = int.from_bytes(read_exact(stream, 4)),
        )


class PathPair:
    bsv: PurePosixPath

@@ -83,4 +427,11 @@ class PathPair:
        )

    def __lt__(self, rhs: PathPair) -> bool:
        return self.bsv < rhs.bsv


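# Timestamps throughout this module are microseconds since the Unix
# epoch, stored as signed 64-bit integers.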
def time_from_timestamp(timestamp: int) -> DateTime:
    return EPOCH + TimeDelta(microseconds=timestamp)


def timestamp_from_time(time: DateTime) -> int:
    return (time.astimezone(UTC) - EPOCH) // TimeDelta(microseconds=1)