Blob, Tree & Snapshot objects.

This commit is contained in:
2023-11-12 21:54:25 +01:00
parent 8d6f8ad7db
commit 67d15f989a
7 changed files with 546 additions and 24 deletions

View File

@@ -15,20 +15,43 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from __future__ import annotations
from dataclasses import dataclass
from datetime import UTC, datetime as DateTime, timedelta as TimeDelta
import hashlib
from io import BytesIO
from pathlib import Path, PurePosixPath
import platform
import tomllib
from typing import Any
from typing import Any, BinaryIO, Callable, Type
from fastcdc import fastcdc
import tomlkit
from bsv import __version__
from bsv.simple_cas import SimpleCas
from bsv.simple_cas.cas import ConfigError, Digest, SimpleCas
from bsv.simple_cas.util import Hash, read_exact, read_exact_or_eof
DEFAULT_MIN_CHUNK_SIZE = 1 << 12
DEFAULT_AVG_CHUNK_SIZE = 1 << 16
DEFAULT_MAX_CHUNK_SIZE = 1 << 20
class Repository:
_path: Path
_name: str
_cas: SimpleCas
_min_chunk_size: int = DEFAULT_MIN_CHUNK_SIZE
_avg_chunk_size: int = DEFAULT_AVG_CHUNK_SIZE
_max_chunk_size: int = DEFAULT_MAX_CHUNK_SIZE
_path_map: list[PathPair]
# _remotes: list[object]
_context_depth: int = 0
def __init__(self, path: Path):
self._path = path
@@ -39,6 +62,15 @@ class Repository:
self._name = bsv.get("name") or platform.node()
self._cas = make_cas(
bsv.get("cas"),
self._path,
lambda: hashlib.new(bsv.get("hash")), # type: ignore
)
self._min_chunk_size = bsv.get("min_chunk_size")
self._avg_chunk_size = bsv.get("avg_chunk_size")
self._max_chunk_size = bsv.get("max_chunk_size")
self._path_map = [
PathPair.from_obj(pair)
for pair in bsv.get("path_map", [])
@@ -60,6 +92,318 @@ class Repository:
def path_map(self) -> list[PathPair]:
return list(self._path_map)
def get_blob(self, digest: Digest) -> Blob:
with self:
return self._read(digest, object_type=b"blob", cls=Blob) # type: ignore
def add_blob(self, stream: BinaryIO) -> Digest:
with self:
return self._write(b"blob", stream)
def get_tree(self, digest: Digest) -> Tree:
with self:
return Tree.from_bytes(self, self._cas.read(digest, object_type=b"tree").data)
def add_tree(self, tree: Tree) -> Digest:
with self:
return self._cas.write(b"tree", tree.to_bytes())
def get_snapshot(self, digest: Digest) -> Snapshot:
with self:
return Snapshot.from_bytes(self, self._cas.read(digest, object_type=b"snap").data)
def add_snapshot(self, snapshot: Snapshot) -> Digest:
with self:
return self._cas.write(b"snap", snapshot.to_bytes())
def _read(self, digest: Digest, object_type: bytes, cls: Type[ChunkedObject]) -> ChunkedObject:
obj = self._cas.read(digest, object_type=object_type)
stream = BytesIO(obj.data)
return cls.from_stream(self, stream, digest_size=self._cas._digest_size)
def _write(self, object_type: bytes, stream: BinaryIO) -> Digest:
out = BytesIO()
size = 0
for chunk in fastcdc(
stream,
min_size = self._min_chunk_size,
avg_size = self._avg_chunk_size,
max_size = self._max_chunk_size,
fat = True,
):
size += chunk.length
digest = self._cas.write(b"chnk", chunk.data)
out.write(digest.digest)
out.write(chunk.length.to_bytes(4))
return self._cas.write(object_type, size.to_bytes(8) + out.getvalue())
def __enter__(self):
if self._context_depth == 0:
self._cas.__enter__()
self._context_depth += 1
return self
def __exit__(self, exc_type, exc_value, traceback):
self._context_depth -= 1
if self._context_depth == 0:
return self._cas.__exit__(exc_type, exc_value, traceback)
def create_repository(
destination: Path,
name: str,
cas: str = "simple",
hash: str = "sha256",
min_chunk_size: int = DEFAULT_MIN_CHUNK_SIZE,
avg_chunk_size: int = DEFAULT_AVG_CHUNK_SIZE,
max_chunk_size: int = DEFAULT_MAX_CHUNK_SIZE,
):
from datetime import datetime as DateTime
from os import getlogin
if not name:
raise RuntimeError("repository name cannot be empty")
if not destination.parent.exists():
raise RuntimeError(f"destination directory {destination.parent} does not exists")
if destination.exists() and not destination.is_dir():
raise RuntimeError(f"destination {destination} exists but is not a directory")
if destination.exists() and len(list(destination.iterdir())):
raise RuntimeError(f"destination directory {destination} is not empty")
try:
destination.mkdir(exist_ok=True)
except:
raise RuntimeError(f"failed to create destination directory {destination}")
bsv_table = tomlkit.table()
bsv_table.add(tomlkit.comment("Name of the repository."))
bsv_table.add(tomlkit.comment("Ideally, this should be unique among all connected repositories."))
bsv_table.add("name", name)
bsv_table.add(tomlkit.nl())
bsv_table.add(tomlkit.comment("Mapping between bsv tree and the actual filesystem."))
bsv_table.add("path_map", tomlkit.array())
bsv_table.add("cas", cas)
bsv_table.add("hash", hash)
bsv_table.add("min_chunk_size", min_chunk_size)
bsv_table.add("avg_chunk_size", avg_chunk_size)
bsv_table.add("max_chunk_size", max_chunk_size)
doc = tomlkit.document()
doc.add(tomlkit.comment("bsv repository configuration"))
doc.add(tomlkit.comment(f"Created by {getlogin()} on {DateTime.now().isoformat()}."))
doc.add(tomlkit.nl())
doc.add("bsv", bsv_table)
config_path = destination / "bsv_config.toml"
try:
stream = config_path.open("w", encoding="utf-8")
except:
raise RuntimeError("failed to open configuration file {config_path}")
with stream:
tomlkit.dump(doc, stream)
return Repository(destination)
def make_cas(cas_name: str, path: Path, hash_factory: Callable[[], Hash]) -> SimpleCas:
if cas_name == "simple":
return SimpleCas(path, hash_factory)
raise ConfigError(f"unknown cas name {cas_name}")
@dataclass
class ChunkedObject:
repo: Repository
size: int
chunks: list[Chunk]
@classmethod
def from_stream(cls, repo: Repository, stream: BinaryIO, digest_size: int) -> ChunkedObject:
self = cls(
repo = repo,
size = int.from_bytes(read_exact(stream, 8)),
chunks = [],
)
while (chunk := Chunk.from_stream(stream, digest_size)) is not None:
self.chunks.append(chunk)
return self
@dataclass
class Blob(ChunkedObject):
_chunk_index: int = 0
_chunk_data: bytes = b""
def read(self, num_bytes: int = -1) -> bytes:
parts = [self._chunk_data]
size = len(parts[-1])
while (num_bytes < 0 or size < num_bytes) and self._chunk_index < len(self.chunks):
parts.append(self.read1())
size += len(parts[-1])
if num_bytes >= 0:
self._chunk_data = parts[-1][num_bytes - size:]
else:
self._chunk_data = b""
return b"".join(parts)
def read1(self) -> bytes:
if self._chunk_index == len(self.chunks):
return b""
object = self.repo._cas.read(self.chunks[self._chunk_index].digest, object_type=b"chnk")
self._chunk_index += 1
return object.data
@dataclass
class Tree:
repo: Repository
items: list[TreeItem]
@classmethod
def from_stream(cls, repo: Repository, stream: BinaryIO) -> Tree:
tree = Tree(repo, [])
while (item := TreeItem.from_stream(stream, repo._cas._digest_size)) is not None:
tree.items.append(item)
return tree
@classmethod
def from_bytes(cls, repo: Repository, data: bytes) -> Tree:
stream = BytesIO(data)
return cls.from_stream(repo, stream)
def write(self, stream: BinaryIO):
self.items.sort(key=lambda i: i.name)
for item in self.items:
item.write(stream)
def to_bytes(self) -> bytes:
stream = BytesIO()
self.write(stream)
return stream.getvalue()
EPOCH = DateTime(1970, 1, 1, tzinfo=UTC)
@dataclass
class TreeItem:
name: str
digest: Digest
permissions: int
creation_timestamp: int
modification_timestamp: int
def __init__(
self,
name: str,
digest: Digest,
permissions: int,
creation_timestamp: int,
modification_timestamp: int,
):
if "/\\" in name:
raise ValueError(f"invalid tree item name {name}")
self.name = name
self.digest = digest
self.permissions = permissions
self.creation_timestamp = creation_timestamp
self.modification_timestamp = modification_timestamp
@property
def creation_time(self) -> DateTime:
return time_from_timestamp(self.creation_timestamp)
@creation_time.setter
def creation_time(self, time: DateTime):
self.creation_timestamp = timestamp_from_time(time)
@property
def modification_time(self) -> DateTime:
return time_from_timestamp(self.modification_timestamp)
@modification_time.setter
def modification_time(self, time: DateTime):
self.modification_timestamp = timestamp_from_time(time)
@classmethod
def from_stream(cls, stream: BinaryIO, digest_size: int) -> TreeItem | None:
digest_bytes = read_exact_or_eof(stream, digest_size)
if digest_bytes is None:
return None
return TreeItem(
digest = Digest(digest_bytes),
permissions = int.from_bytes(read_exact(stream, 2)),
creation_timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
modification_timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"),
)
def write(self, stream: BinaryIO):
stream.write(self.digest.digest)
stream.write(self.permissions.to_bytes(2))
stream.write(self.creation_timestamp.to_bytes(8, signed=True))
stream.write(self.modification_timestamp.to_bytes(8, signed=True))
name_bytes = self.name.encode("utf-8")
stream.write(len(name_bytes).to_bytes(2))
stream.write(name_bytes)
@dataclass
class Snapshot:
repo: Repository
tree_digest: Digest
repo_name: str
timestamp: int
@property
def time(self) -> DateTime:
return time_from_timestamp(self.timestamp)
@time.setter
def time(self, time: DateTime):
self.timestamp = timestamp_from_time(time)
@classmethod
def from_stream(cls, repo: Repository, stream: BinaryIO) -> Snapshot:
return Snapshot(
repo = repo,
tree_digest = Digest(read_exact(stream, repo._cas._digest_size)),
repo_name = read_exact(stream, int.from_bytes(read_exact(stream, 2))).decode("utf-8"),
timestamp = int.from_bytes(read_exact(stream, 8), signed=True),
)
@classmethod
def from_bytes(cls, repo: Repository, data: bytes) -> Snapshot:
stream = BytesIO(data)
return cls.from_stream(repo, stream)
def write(self, stream: BinaryIO):
stream.write(self.tree_digest.digest)
repo_name_bytes = self.repo_name.encode("utf-8")
stream.write(len(repo_name_bytes).to_bytes(2))
stream.write(repo_name_bytes)
stream.write(self.timestamp.to_bytes(8, signed=True))
def to_bytes(self) -> bytes:
stream = BytesIO()
self.write(stream)
return stream.getvalue()
@dataclass
class Chunk:
digest: Digest
size: int
@classmethod
def from_stream(cls, stream: BinaryIO, digest_size: int) -> Chunk | None:
digest_bytes = read_exact_or_eof(stream, digest_size)
if digest_bytes is None:
return None
digest = Digest(digest_bytes)
return cls(
digest = digest,
size = int.from_bytes(read_exact(stream, 4)),
)
class PathPair:
bsv: PurePosixPath
@@ -83,4 +427,11 @@ class PathPair:
)
def __lt__(self, rhs: PathPair) -> bool:
return self.bsv < rhs.bsv
return self.bsv < rhs.bsv
def time_from_timestamp(timestamp: int) -> DateTime:
return EPOCH + TimeDelta(microseconds=timestamp)
def timestamp_from_time(time: DateTime) -> int:
return (time.astimezone(UTC) - EPOCH) // TimeDelta(microseconds=1)

View File

@@ -16,4 +16,4 @@
from __future__ import annotations
from bsv.simple_cas.cas import SimpleCas as Cas
from bsv.simple_cas.cas import SimpleCas

View File

@@ -18,9 +18,22 @@ from __future__ import annotations
from dataclasses import dataclass
import hashlib
from pathlib import Path
from typing import Any, BinaryIO, Callable, Optional
from typing import Any, BinaryIO, Callable, Iterator
from bsv.simple_cas.util import Hash
from bsv.simple_cas.util import Hash, read_exact_or_eof
class BsvError(RuntimeError):
pass
class NotFound(BsvError):
pass
class UnexpectedObjectType(BsvError):
pass
class ConfigError(BsvError):
pass
class SimpleCas:
@@ -28,7 +41,7 @@ class SimpleCas:
_hash_factory: Callable[[], Hash]
_digest_size: int
_index: dict[bytes, IndexItem]
_index: dict[Digest, IndexItem]
_is_inside_context: bool = False
@@ -41,9 +54,10 @@ class SimpleCas:
if (self._root_dir / "cas.idx").exists():
with (self._root_dir / "cas.idx").open("rb") as stream:
while True:
digest = stream.read(self._digest_size)
if not digest:
digest_bytes = read_exact_or_eof(stream, self._digest_size)
if not digest_bytes:
break
digest = Digest(digest_bytes)
object_type = stream.read(4)
offset = int.from_bytes(stream.read(4))
size = int.from_bytes(stream.read(4))
@@ -67,18 +81,24 @@ class SimpleCas:
def __len__(self) -> int:
return len(self._index)
def __contains__(self, digest: bytes) -> bool:
assert len(digest) == self._digest_size
def __contains__(self, digest: Digest) -> bool:
assert len(digest.digest) == self._digest_size
return digest in self._index
def read(self, digest: bytes) -> Optional[Object]:
def __iter__(self) -> Iterator[ObjectInfo]:
for digest, item in self._index.items():
yield ObjectInfo(digest, item.object_type, item.size)
def read(self, digest: Digest, object_type: bytes | None=None) -> Object:
item = self._index.get(digest)
if item is None:
return None
raise NotFound(f"object {digest} not found")
if object_type is not None and item.object_type != object_type:
raise UnexpectedObjectType(f"expected object of type {object_type.decode()}, got {item.object_type.decode()}")
with (self._root_dir / "cas.dat").open("rb") as stream:
stream.seek(item.offset)
assert stream.read(self._digest_size) == digest
assert stream.read(self._digest_size) == digest.digest
object_type = stream.read(4)
assert object_type == item.object_type
size = int.from_bytes(stream.read(4))
@@ -87,7 +107,7 @@ class SimpleCas:
return Object(object_type, data)
def write(self, object_type: bytes, data: bytes) -> bytes:
def write(self, object_type: bytes, data: bytes) -> Digest:
assert len(object_type) == 4
assert len(data) < 2**32
@@ -97,38 +117,38 @@ class SimpleCas:
hash.update(len(data).to_bytes(4))
hash.update(b"\0")
hash.update(data)
digest = hash.digest()
digest = Digest(hash.digest())
if digest not in self:
with self._open_writer(digest, object_type, len(data)) as out:
out.write(digest)
out.write(digest.digest)
out.write(object_type)
out.write(len(data).to_bytes(4))
out.write(data)
return digest
def get_ref(self, key: str) -> bytes | None:
def get_ref(self, key: str) -> Digest | None:
ref_path = self._ref_path(key)
if not ref_path.is_file():
return None
hex = ref_path.read_text().strip()
if len(hex) != 2 * self._digest_size:
return None
return bytes.fromhex(hex)
return Digest(bytes.fromhex(hex))
def set_ref(self, key: str, digest: bytes):
def set_ref(self, key: str, digest: Digest):
ref_path = self._ref_path(key)
ref_path.parent.mkdir(parents=True, exist_ok=True)
ref_path.write_text(digest.hex())
ref_path.write_text(str(digest))
def _open_writer(self, digest: bytes, object_type: bytes, size: int) -> BinaryIO:
def _open_writer(self, digest: Digest, object_type: bytes, size: int) -> BinaryIO:
dat_file = (self._root_dir / "cas.dat").open("ab")
offset = dat_file.tell()
self._index[digest] = IndexItem(object_type, offset, size)
with (self._root_dir / "cas.idx").open("ab") as idx_file:
idx_file.write(digest)
idx_file.write(digest.digest)
idx_file.write(object_type)
idx_file.write(offset.to_bytes(4))
idx_file.write(size.to_bytes(4))
@@ -144,13 +164,36 @@ class SimpleCas:
return self._root_dir / "refs" / key_path
@dataclass(frozen=True, order=True, slots=True)
class Digest:
digest: bytes
def __repr__(self) -> str:
return self.digest.hex()
@dataclass
class Object:
object_type: bytes
data: bytes
def __repr__(self) -> str:
return f"<Object {self.object_type.decode()}: {len(self.data)}B>"
@dataclass
class IndexItem:
object_type: bytes
offset: int
size: int
def __repr__(self) -> str:
return f"<IndexItem {self.object_type.decode()}: {self.offset}B +{self.size}B>"
@dataclass
class ObjectInfo:
digest: Digest
object_type: bytes
size: int
def __repr__(self) -> str:
return f"<ObjectInfo {self.digest} {self.object_type.decode()} {self.size}B>"

View File

@@ -16,6 +16,22 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import BinaryIO
def read_exact(stream: BinaryIO, num_bytes: int) -> bytes:
data = stream.read(num_bytes)
if len(data) != num_bytes:
raise IOError(f"expected {num_bytes} bytes, got {len(data)}")
return data
def read_exact_or_eof(stream: BinaryIO, num_bytes: int) -> bytes | None:
data = stream.read(num_bytes)
if not data:
return None
if len(data) != num_bytes:
raise IOError(f"expected {num_bytes} bytes, got {len(data)}")
return data
class Hash(ABC):