Skip to content

Commit 0508f2d

Browse files
committed
[feature] Add option for computing hashes and storing them as xattrs in the index
1 parent 24d2ab6 commit 0508f2d

8 files changed

Lines changed: 162 additions & 12 deletions

File tree

core/ratarmountcore/mountsource/SQLiteIndexMountSource.py

Lines changed: 52 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,21 @@
22

33
import builtins
44
import json
5+
import logging
56
import re
67
import shutil
8+
import stat
79
import tempfile
810
from collections.abc import Iterable
911
from typing import IO, Any, Callable, Optional, Union
1012

13+
from ratarmountcore.hashing import compute_hashes
1114
from ratarmountcore.mountsource import FileInfo, MountSource
1215
from ratarmountcore.SQLiteIndex import SQLiteIndex
1316
from ratarmountcore.utils import RatarmountError, overrides
1417

18+
logger = logging.getLogger(__name__)
19+
1520

1621
class SQLiteIndexMountSource(MountSource):
1722
def __init__(
@@ -21,6 +26,7 @@ def __init__(
2126
clearIndexCache: bool = False,
2227
checkMetadata: Optional[Callable[[dict[str, Any]], None]] = None,
2328
transform: Optional[tuple[str, str]] = None,
29+
hashes: Optional[list[str]] = None,
2430
writeIndex: bool = False,
2531
verifyModificationTime: bool = False,
2632
indexMinimumFileCount: int = 1000,
@@ -48,6 +54,7 @@ def __init__(
4854
self.writeIndex = writeIndex
4955
self.verifyModificationTime = verifyModificationTime
5056
self.options = options
57+
self.hashes = sorted(set(hashes or []))
5158

5259
# Initialize index
5360
if index is None:
@@ -87,7 +94,7 @@ def __init__(
8794
raise RatarmountError(f"Specified file {self.indexFilePath} is not a valid Ratarmount index.")
8895

8996
def _store_default_metadata(self) -> None:
90-
argumentsToSave = ['encoding', 'transformPattern']
97+
argumentsToSave = ['encoding', 'transformPattern', 'hashes']
9198
argumentsMetadata = json.dumps(
9299
{argument: getattr(self, argument) for argument in argumentsToSave if hasattr(self, argument)}
93100
)
@@ -102,27 +109,63 @@ def _check_metadata(self, metadata: dict[str, Any]) -> None:
102109

103110
if 'arguments' in metadata:
104111
SQLiteIndex.check_metadata_arguments(
105-
json.loads(metadata['arguments']), self, argumentsToCheck=['encoding', 'transformPattern']
112+
json.loads(metadata['arguments']), self, argumentsToCheck=['encoding', 'transformPattern', 'hashes']
106113
)
107114

108115
if 'backendName' not in metadata:
109116
self.index.try_to_open_first_file(lambda path: self.open(self.lookup(path)))
110117

118+
def _compute_and_store_hashes(self) -> None:
    """
    Compute the hashes listed in self.hashes for the indexed regular files and store them as xattrs.

    Each digest is stored under the key "user.hash.<name>" for the row identified by the file's
    offsetheader. Nothing is done when self.hashes is empty. Files for which opening or hashing
    raises are logged with a warning and skipped, so a single unreadable file does not abort the rest.
    """
    if not self.hashes:
        return

    # Number of collected xattr rows after which they are flushed to the index.
    batchLimit = 1000
    pending: list[tuple[int, str, bytes]] = []

    # A flat scan over the file rows avoids an expensive and complicated recursive tree traversal.
    connection = self.index.get_connection()
    query = f'SELECT * {SQLiteIndex.FROM_REGULAR_FILES} AND NOT isgenerated ORDER BY "offsetheader" ASC;'
    for row in connection.execute(query):
        info = self.index._row_to_file_info(row)  # pylint: disable=protected-access
        if not (stat.S_ISREG(info.mode) and info.userdata):
            continue
        entry = info.userdata[-1]
        if entry.isgenerated:
            continue

        try:
            with self.open(info) as stream:
                digests = compute_hashes(stream, info.size, self.hashes)
        except Exception as error:
            # Deliberately best-effort: report the failure and carry on with the next file.
            logger.warning(
                "Failed to compute hashes for indexed file with offsetheader=%s: %s",
                entry.offsetheader,
                error,
            )
            continue

        pending.extend(
            [(entry.offsetheader, f"user.hash.{name}", value.encode('utf-8')) for name, value in digests.items()]
        )
        if len(pending) >= batchLimit:
            self.index.setxattrs(pending)
            pending.clear()

    # Flush whatever is left over from the last, incomplete batch.
    if pending:
        self.index.setxattrs(pending)
155+
111156
def _finalize_index(
112157
self,
113158
create_index: Callable[[], None],
114159
*, # force all parameters after to be keyword-only
115160
store_metadata: Optional[Callable[[], None]] = None,
116161
isFileObject: Optional[bool] = None,
117162
):
118-
"""
119-
metadata
120-
Should either be a list of attributes on 'self' that should be stored or a callable that stores
121-
metadata by calling self.index.store_metadata. If it is None a default selection of attributes
122-
will be saved.
123-
"""
163+
def create_index_and_post_process():
164+
create_index()
165+
self._compute_and_store_hashes()
166+
124167
self.index.finalize_index(
125-
create_index=create_index,
168+
create_index=create_index_and_post_process,
126169
store_metadata=store_metadata if callable(store_metadata) else self._store_default_metadata,
127170
isFileObject=isFileObject,
128171
writeIndex=self.writeIndex,

core/ratarmountcore/mountsource/formats/tar.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,7 @@ class SQLiteIndexedTar(SQLiteIndexMountSource):
511511
'transformRecursiveMountPoint',
512512
'transformPattern',
513513
'ignoreZeros',
514+
'hashes',
514515
)
515516

516517
# fmt: off
@@ -793,7 +794,7 @@ def _get_archive_path(self) -> Optional[str]:
793794
return None if self.tarFileName == '<file object>' else self.tarFileName
794795

795796
def _store_metadata(self) -> None:
796-
argumentsToSave = list(self.INDEX_ARGUMENTS_TO_CHECK) + ['gzipSeekPointSpacing']
797+
argumentsToSave = [*list(self.INDEX_ARGUMENTS_TO_CHECK), 'gzipSeekPointSpacing']
797798
argumentsMetadata = json.dumps({argument: getattr(self, argument) for argument in argumentsToSave})
798799
# The second argument must be a path to a file to call os.stat with, not simply a file name.
799800
self.index.store_metadata(argumentsMetadata, "" if self.isFileObject else self.tarFileName)

core/tests/test_SQLiteIndexedTar.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import bz2
66
import concurrent.futures
7+
import hashlib
78
import io
89
import os
910
import stat
@@ -81,6 +82,9 @@ def test_tar_bz2_with_parallelization(parallelization):
8182
info = file.lookup('/mimi/00105.tar')
8283
assert info.userdata[0].offset == 1248256
8384

85+
# In particular, there should be no hash xattrs unless hashing was explicitly requested via an option.
86+
assert not file.list_xattr(info)
87+
8488
@staticmethod
8589
def test_recursive_tar_bz2_with_parallelization(parallelization):
8690
with (
@@ -661,3 +665,63 @@ def test_appending_to_large_archive(parallelization, tmpdir):
661665
assert indexedTar.exists("/bar")
662666
assert indexedTar.exists("/folder")
663667
assert indexedTar.exists("/bar2")
668+
669+
670+
@pytest.mark.parallel
@pytest.mark.parametrize("parallelization", [1])
class TestSQLiteIndexedTarXattrs:
    """Tests for the hash xattrs that SQLiteIndexedTar exposes when the 'hashes' argument is given."""

    @staticmethod
    def test_hash_xattrs_for_regular_file(parallelization):
        """Request three hash algorithms and check the user.hash.* xattrs of '/bar'."""
        with (
            copy_test_file("single-file.tar") as path,
            SQLiteIndexedTar(
                path, writeIndex=False, parallelization=parallelization, hashes=['sha256', 'sha3_256', 'smplayer']
            ) as indexedTar,
        ):
            fileInfo = indexedTar.lookup('/bar')
            assert fileInfo
            keys = indexedTar.list_xattr(fileInfo)

            assert 'user.hash.sha256' in keys
            assert 'user.hash.sha3_256' in keys
            assert 'user.hash.smplayer' in keys

            # Close the file object deterministically instead of leaving the handle to the garbage collector.
            with indexedTar.open(fileInfo) as file:
                data = file.read()

            assert indexedTar.get_xattr(fileInfo, 'user.hash.sha256') == hashlib.sha256(data).hexdigest().encode(
                'utf-8'
            )
            assert indexedTar.get_xattr(fileInfo, 'user.hash.sha3_256') == hashlib.sha3_256(data).hexdigest().encode(
                'utf-8'
            )
            # No stdlib reference implementation is used for this hash here, so only check that a value exists.
            assert indexedTar.get_xattr(fileInfo, 'user.hash.smplayer')

    @staticmethod
    def test_hashes_persist_in_written_sidecar_index(parallelization, tmpdir):
        """Write an index with sha256 xattrs, reopen it, and check that the stored digest is unchanged."""
        with copy_test_file('single-file.tar') as tarPath:
            indexFilePath = os.path.join(tmpdir, 'single-file.tar.index')

            # Open and force index creation.
            with SQLiteIndexedTar(
                tarFileName=tarPath,
                writeIndex=True,
                clearIndexCache=True,
                indexFilePath=indexFilePath,
                parallelization=parallelization,
                hashes=['sha256'],
            ) as indexedTar:
                fileInfo = indexedTar.lookup('/bar')
                assert fileInfo
                storedDigest = indexedTar.get_xattr(fileInfo, 'user.hash.sha256')
                assert storedDigest

            # Load existing index with stored xattrs.
            with SQLiteIndexedTar(
                tarFileName=tarPath,
                writeIndex=False,
                clearIndexCache=False,
                indexFilePath=indexFilePath,
                parallelization=parallelization,
            ) as indexedTar:
                fileInfo = indexedTar.lookup('/bar')
                assert fileInfo
                digest = indexedTar.get_xattr(fileInfo, 'user.hash.sha256')
                assert digest
                # A merely non-empty value would not prove persistence; it must match what was written.
                assert digest == storedDigest

core/tests/test_ZipMountSource.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# pylint: disable=wrong-import-position
55
# pylint: disable=protected-access
66

7+
import hashlib
78
import os
89
import shutil
910
import stat
@@ -121,6 +122,21 @@ def test_transform():
121122
with mountSource.open(mountSource.lookup(filePath)) as file:
122123
assert file.read() == b'iriya\n'
123124

125+
@staticmethod
def test_hash_xattrs_for_regular_file():
    """Open a ZIP with hashes=['sha256'] and check that only the requested hash xattr is present and correct."""
    target = '/foo/fighter/ufo'
    with ZipMountSource(find_test_file('folder-symlink.zip'), hashes=['sha256']) as source:
        info = source.lookup(target)
        assert info, target

        xattrNames = source.list_xattr(info)
        assert 'user.hash.sha256' in xattrNames
        # Only the explicitly requested algorithm may show up.
        assert 'user.hash.crc32' not in xattrNames

        with source.open(info) as stream:
            contents = stream.read()
        digest = hashlib.sha256(contents).hexdigest()
        assert source.get_xattr(info, 'user.hash.sha256') == digest.encode('utf-8')
139+
124140

125141
def benchmark_fast_zipfile_decryption():
126142
with tempfile.TemporaryDirectory(suffix=".ratarmount-benchmark") as folder:

ratarmount/CLIHelpers.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ def parsed_args_to_options(args) -> dict[str, Any]:
252252
'logFile' : args.log_file,
253253
'enableFileVersions' : args.file_versions,
254254
'controlInterface' : args.control_interface,
255+
'hashes' : args.hashes,
255256
'writeIndex' : True,
256257
'mount' : args.mount,
257258
}

ratarmount/cli.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import textwrap
1717
from typing import Any, Optional
1818

19+
from ratarmountcore.hashing import HASH_REGISTRY
1920
from ratarmountcore.utils import RatarmountError, get_xdg_cache_home
2021

2122
with contextlib.suppress(ImportError):
@@ -303,6 +304,12 @@ def create_parser(useColor: Optional[bool] = True) -> argparse.ArgumentParser:
303304
' --index-folder ~/.ratarmount : will only test ~/.ratarmount as a storage location and nothing else. '
304305
'Instead, it will first try ~/.ratarmount and the folder "foo,9000". ')
305306

307+
indexGroup.add_argument(
308+
'--hashes', type=str, action=AppendCommaSeparatedOverwriteDefault, default=None,
309+
help='Comma-separated list of hashes to compute for indexed files. They are stored as xattrs in the index '
310+
'with key user.hash.<hash_name>. Disabled by default because it is expensive. '
311+
f'Available algorithms: {", ".join(sorted(HASH_REGISTRY.keys()))}.')
312+
306313
# Recursion Options
307314

308315
# TODO The recursion depth is only heeded by AutoMountLayer but not by SQLiteIndexedTar.

tests/ratarmount-help.txt

Lines changed: 8 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/test_cli.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from ratarmountcore.utils import ceil_div
2222

2323
from ratarmount.cli import cli as ratarmountcli
24+
from ratarmount.cli import create_parser
2425

2526
try:
2627
import ext4
@@ -441,3 +442,14 @@ def test_recursive_extensions_patterns(recursiveExtensions, pathInArchive, shoul
441442
with RunRatarmount(mountPoint, args):
442443
path = Path(mountPoint) / pathInArchive
443444
assert path.is_file() == shouldExist
445+
446+
447+
def test_hashes_cli_defaults_and_parsing():
    """Check that --hashes accepts repeated and comma-separated values and is None when not given."""
    parser = create_parser(useColor=False)
    archive = 'single-file.tar'

    # Both spellings must produce the same list of algorithm names.
    cases = [
        (['--hashes', 'sha256', '--hashes', 'smplayer', archive], ['sha256', 'smplayer']),
        (['--hashes', 'sha256,smplayer', archive], ['sha256', 'smplayer']),
    ]
    for arguments, expected in cases:
        assert parser.parse_args(arguments).hashes == expected

    assert parser.parse_args([archive]).hashes is None

0 commit comments

Comments
 (0)