22
33import builtins
44import json
5+ import logging
56import re
67import shutil
8+ import stat
79import tempfile
810from collections .abc import Iterable
911from typing import IO , Any , Callable , Optional , Union
1012
13+ from ratarmountcore .hashing import compute_hashes
1114from ratarmountcore .mountsource import FileInfo , MountSource
1215from ratarmountcore .SQLiteIndex import SQLiteIndex
1316from ratarmountcore .utils import RatarmountError , overrides
1417
18+ logger = logging .getLogger (__name__ )
19+
1520
1621class SQLiteIndexMountSource (MountSource ):
1722 def __init__ (
@@ -21,6 +26,7 @@ def __init__(
2126 clearIndexCache : bool = False ,
2227 checkMetadata : Optional [Callable [[dict [str , Any ]], None ]] = None ,
2328 transform : Optional [tuple [str , str ]] = None ,
29+ hashes : Optional [list [str ]] = None ,
2430 writeIndex : bool = False ,
2531 verifyModificationTime : bool = False ,
2632 indexMinimumFileCount : int = 1000 ,
@@ -48,6 +54,7 @@ def __init__(
4854 self .writeIndex = writeIndex
4955 self .verifyModificationTime = verifyModificationTime
5056 self .options = options
57+ self .hashes = sorted (set (hashes or []))
5158
5259 # Initialize index
5360 if index is None :
@@ -87,7 +94,7 @@ def __init__(
8794 raise RatarmountError (f"Specified file { self .indexFilePath } is not a valid Ratarmount index." )
8895
8996 def _store_default_metadata (self ) -> None :
90- argumentsToSave = ['encoding' , 'transformPattern' ]
97+ argumentsToSave = ['encoding' , 'transformPattern' , 'hashes' ]
9198 argumentsMetadata = json .dumps (
9299 {argument : getattr (self , argument ) for argument in argumentsToSave if hasattr (self , argument )}
93100 )
@@ -102,27 +109,63 @@ def _check_metadata(self, metadata: dict[str, Any]) -> None:
102109
103110 if 'arguments' in metadata :
104111 SQLiteIndex .check_metadata_arguments (
105- json .loads (metadata ['arguments' ]), self , argumentsToCheck = ['encoding' , 'transformPattern' ]
112+ json .loads (metadata ['arguments' ]), self , argumentsToCheck = ['encoding' , 'transformPattern' , 'hashes' ]
106113 )
107114
108115 if 'backendName' not in metadata :
109116 self .index .try_to_open_first_file (lambda path : self .open (self .lookup (path )))
110117
118+ def _compute_and_store_hashes (self ) -> None :
119+ if not self .hashes :
120+ return
121+
122+ # Simply go over all file rows instead of expensive and complicated recursive tree traversal.
123+ rows = self .index .get_connection ().execute (
124+ f'SELECT * { SQLiteIndex .FROM_REGULAR_FILES } AND NOT isgenerated ORDER BY "offsetheader" ASC;'
125+ )
126+ xattrs : list [tuple [int , str , bytes ]] = []
127+ for row in rows :
128+ fileInfo = self .index ._row_to_file_info (row ) # pylint: disable=protected-access
129+ if not stat .S_ISREG (fileInfo .mode ) or not fileInfo .userdata :
130+ continue
131+ userData = fileInfo .userdata [- 1 ]
132+ if userData .isgenerated :
133+ continue
134+
135+ try :
136+ with self .open (fileInfo ) as fileObject :
137+ computed = compute_hashes (fileObject , fileInfo .size , self .hashes )
138+ except Exception as exception :
139+ logger .warning (
140+ "Failed to compute hashes for indexed file with offsetheader=%s: %s" ,
141+ userData .offsetheader ,
142+ exception ,
143+ )
144+ continue
145+
146+ xattrs += [
147+ (userData .offsetheader , f"user.hash.{ name } " , value .encode ('utf-8' )) for name , value in computed .items ()
148+ ]
149+ if len (xattrs ) >= 1000 :
150+ self .index .setxattrs (xattrs )
151+ xattrs .clear ()
152+
153+ if xattrs :
154+ self .index .setxattrs (xattrs )
155+
111156 def _finalize_index (
112157 self ,
113158 create_index : Callable [[], None ],
114159 * , # force all parameters after to be keyword-only
115160 store_metadata : Optional [Callable [[], None ]] = None ,
116161 isFileObject : Optional [bool ] = None ,
117162 ):
118- """
119- metadata
120- Should either be a list of attributes on 'self' that should be stored or a callable that stores
121- metadata by calling self.index.store_metadata. If it is None a default selection of attributes
122- will be saved.
123- """
163+ def create_index_and_post_process ():
164+ create_index ()
165+ self ._compute_and_store_hashes ()
166+
124167 self .index .finalize_index (
125- create_index = create_index ,
168+ create_index = create_index_and_post_process ,
126169 store_metadata = store_metadata if callable (store_metadata ) else self ._store_default_metadata ,
127170 isFileObject = isFileObject ,
128171 writeIndex = self .writeIndex ,
0 commit comments