Skip to content

Commit 4b2edc9

Browse files
committed
fixup! [feature] Add option for computing hashes and storing them as xattrs in the index
1 parent dd65c5e commit 4b2edc9

3 files changed

Lines changed: 41 additions & 70 deletions

File tree

core/ratarmountcore/hashing.py

Lines changed: 24 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
import hashlib
22
import io
3+
import logging
34
import zlib
4-
from collections.abc import Iterable
55
from dataclasses import dataclass
6-
from typing import IO, Any, Optional
6+
from typing import IO, Any
77

8-
from ratarmountcore.utils import RatarmountError
8+
logger = logging.getLogger(__name__)
99

1010

1111
@dataclass(frozen=True)
12-
class HashAlgorithmSpec:
13-
name: str
12+
class HashAlgorithm:
1413
sparse: bool
1514

1615

@@ -25,49 +24,22 @@ def hexdigest(self) -> str:
2524
return f"{self._value & 0xFFFFFFFF:08x}"
2625

2726

28-
def _normalize_algorithm_name(name: str) -> str:
29-
return name.strip().lower().replace('-', '_')
30-
31-
32-
def _discover_hashlib_algorithms() -> set[str]:
27+
def _build_hash_registry() -> dict[str, HashAlgorithm]:
3328
# shake_* require an explicit output length for digest() / hexdigest() and therefore do not fit this interface.
34-
discovered: set[str] = set()
35-
for name in hashlib.algorithms_available:
36-
normalized = _normalize_algorithm_name(name)
37-
if not normalized or normalized.startswith('shake_'):
38-
continue
39-
discovered.add(normalized)
40-
return discovered
41-
42-
43-
def _build_hash_registry() -> dict[str, HashAlgorithmSpec]:
44-
registry: dict[str, HashAlgorithmSpec] = {
45-
name: HashAlgorithmSpec(name=name, sparse=False) for name in sorted(_discover_hashlib_algorithms())
29+
hashlib_names = {name for name in hashlib.algorithms_available if name and not name.startswith('shake')}
30+
registry: dict[str, HashAlgorithm] = {
31+
# Replace sha3_512 with sha3-512 for readability in the --hashes CLI option and the user.hash.sha3-512 key.
32+
name.replace('_', '-'): HashAlgorithm(sparse=False)
33+
for name in sorted(hashlib_names)
4634
}
47-
registry['crc32'] = HashAlgorithmSpec(name='crc32', sparse=False)
48-
registry['smplayer'] = HashAlgorithmSpec(name='smplayer', sparse=True)
35+
registry['crc32'] = HashAlgorithm(sparse=False)
36+
registry['smplayer'] = HashAlgorithm(sparse=True)
4937
return registry
5038

5139

5240
HASH_REGISTRY = _build_hash_registry()
5341

5442

55-
def parse_hash_algorithms(algorithms: Optional[Iterable[str]]) -> list[str]:
56-
if not algorithms:
57-
return []
58-
59-
normalized = [
60-
_normalize_algorithm_name(part)
61-
for algorithm in algorithms
62-
for part in algorithm.split(',')
63-
if part.strip()
64-
]
65-
unknown = [algorithm for algorithm in normalized if algorithm not in HASH_REGISTRY]
66-
if unknown:
67-
raise RatarmountError(f"Unsupported hash algorithm(s): {', '.join(unknown)}")
68-
return list(dict.fromkeys(normalized))
69-
70-
7143
def _zero_pad(data: bytes, size: int) -> bytes:
7244
return data[:size] + b'\0' * max(0, size - len(data))
7345

@@ -111,21 +83,23 @@ def compute_hashes(fileObject: IO[bytes], fileSize: int, algorithms: list[str])
11183
hashers: dict[str, Any] = {}
11284
sparseAlgorithms: list[str] = []
11385
for algorithm in algorithms:
114-
spec = HASH_REGISTRY.get(algorithm, None)
115-
if spec is None:
116-
raise RatarmountError(f"Unsupported hash algorithm(s): {algorithm}")
86+
if not algorithm:
87+
continue
88+
specification = HASH_REGISTRY.get(algorithm, None)
89+
if specification is None:
90+
logger.warning("Unsupported hash algorithm: %s", algorithm)
91+
continue
11792

118-
if spec.sparse:
93+
if specification.sparse:
11994
sparseAlgorithms.append(algorithm)
95+
elif algorithm == 'crc32':
96+
hashers[algorithm] = _CRC32Hasher()
12097
else:
121-
# Create hasher objects for each algorithm.
122-
if algorithm == 'crc32':
123-
hashers[algorithm] = _CRC32Hasher()
124-
continue
12598
try:
99+
# hashlib.new seems to tolerate '_' vs. '-' substitutions in the algorithm name, and probably differing case as well.
126100
hashers[algorithm] = hashlib.new(algorithm)
127-
except ValueError as exception:
128-
raise RatarmountError(f"Unsupported hash algorithm(s): {algorithm}") from exception
101+
except ValueError:
102+
logger.warning("Unsupported hash: %s", algorithm, exc_info=logger.isEnabledFor(logging.DEBUG))
129103

130104
# Read and hash in chunks of 1 MiB.
131105
if hashers:

core/tests/test_SQLiteIndexedTar.py

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -672,27 +672,23 @@ def test_appending_to_large_archive(parallelization, tmpdir):
672672
class TestSQLiteIndexedTarXattrs:
673673
@staticmethod
674674
def test_hash_xattrs_for_regular_file(parallelization):
675+
hashes = ['crc32', 'sha256', 'sha3-256', 'smplayer']
675676
with (
676677
copy_test_file("single-file.tar") as path,
677-
SQLiteIndexedTar(
678-
path, writeIndex=False, parallelization=parallelization, hashes=['sha256', 'sha3_256', 'smplayer']
679-
) as indexedTar,
678+
SQLiteIndexedTar(path, writeIndex=False, parallelization=parallelization, hashes=hashes) as indexedTar,
680679
):
681-
fileInfo = indexedTar.lookup('/bar')
682-
assert fileInfo
683-
keys = indexedTar.list_xattr(fileInfo)
684-
685-
assert 'user.hash.sha256' in keys
686-
assert 'user.hash.sha3_256' in keys
687-
assert 'user.hash.smplayer' in keys
680+
path = '/bar'
681+
fileInfo = indexedTar.lookup(path)
682+
assert fileInfo, path
683+
assert set(indexedTar.list_xattr(fileInfo)) == {f'user.hash.{name}' for name in hashes}
688684

689685
data = indexedTar.open(fileInfo).read()
690-
assert indexedTar.get_xattr(fileInfo, 'user.hash.sha256') == hashlib.sha256(data).hexdigest().encode(
691-
'utf-8'
692-
)
693-
assert indexedTar.get_xattr(fileInfo, 'user.hash.sha3_256') == hashlib.sha3_256(data).hexdigest().encode(
694-
'utf-8'
686+
assert indexedTar.get_xattr(fileInfo, 'user.hash.crc32') == b'7e3265a8'
687+
assert (
688+
indexedTar.get_xattr(fileInfo, 'user.hash.sha256')
689+
== b'b5bb9d8014a0f9b1d61e21e796d78dccdf1352f23cd32812f4850b878ae4944c'
695690
)
691+
assert indexedTar.get_xattr(fileInfo, 'user.hash.sha3-256') == hashlib.sha3_256(data).hexdigest().encode()
696692
assert indexedTar.get_xattr(fileInfo, 'user.hash.smplayer')
697693

698694
@staticmethod
@@ -709,8 +705,9 @@ def test_hashes_persist_in_written_sidecar_index(parallelization, tmpdir):
709705
parallelization=parallelization,
710706
hashes=['sha256'],
711707
) as indexedTar:
712-
fileInfo = indexedTar.lookup('/bar')
713-
assert fileInfo
708+
path = '/bar'
709+
fileInfo = indexedTar.lookup(path)
710+
assert fileInfo, path
714711
assert indexedTar.get_xattr(fileInfo, 'user.hash.sha256')
715712

716713
# Load existing index with stored xattrs.

tests/ratarmount-help.txt

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)