11import datetime
22import json
3+ import logging
4+ import math
5+ import os
36import re
47import urllib .request
8+ from concurrent .futures import (
9+ as_completed ,
10+ ThreadPoolExecutor ,
11+ )
512from typing import (
613 Any ,
714 cast ,
1118from urllib .error import HTTPError
1219from urllib .parse import quote
1320
21+ log = logging .getLogger (__name__ )
22+
1423from typing_extensions import (
1524 TypedDict ,
1625)
@@ -104,6 +113,53 @@ class RecordLinks(TypedDict):
104113 reserve_doi : str
105114
106115
116+ # AWS S3 multipart limits (used by Invenio RDM)
117+ MIN_UPLOAD_PART_SIZE = 50 * 1024 * 1024 # 50 MiB
118+ MAX_UPLOAD_PART_SIZE = 5 * 1024 ** 3 # 5 GiB
119+ MAX_UPLOAD_PARTS = 10_000
120+
121+
122+ def calculate_multipart_params (file_size : int , preferred_part_size : int | None = None ) -> tuple [int , int ]:
123+ """Calculate optimal parts count and part size for multipart upload.
124+
125+ Args:
126+ file_size: Total file size in bytes
127+ preferred_part_size: Preferred part size in bytes (optional)
128+
129+ Returns:
130+ Tuple of (parts_count, part_size)
131+
132+ Note:
133+ Maximum uploadable file size is MAX_UPLOAD_PARTS * MAX_UPLOAD_PART_SIZE (~48.8 TiB).
134+ Files larger than this will still return valid params but would fail server-side.
135+ """
136+ if file_size == 0 :
137+ return 1 , 0
138+
139+ # Start with preferred or minimum part size
140+ part_size = preferred_part_size or MIN_UPLOAD_PART_SIZE
141+
142+ # Ensure part_size is within bounds
143+ part_size = max (part_size , MIN_UPLOAD_PART_SIZE )
144+ part_size = min (part_size , MAX_UPLOAD_PART_SIZE )
145+
146+ # Calculate parts needed
147+ parts = math .ceil (file_size / part_size )
148+
149+ # If too many parts, increase part size (up to max)
150+ while parts > MAX_UPLOAD_PARTS and part_size < MAX_UPLOAD_PART_SIZE :
151+ part_size = min (part_size * 2 , MAX_UPLOAD_PART_SIZE )
152+ parts = math .ceil (file_size / part_size )
153+
154+ # For extremely large files, cap parts at MAX_UPLOAD_PARTS
155+ # This means part_size may effectively be larger than calculated
156+ # but such files would likely fail server-side anyway
157+ if parts > MAX_UPLOAD_PARTS :
158+ parts = MAX_UPLOAD_PARTS
159+
160+ return parts , part_size
161+
162+
107163class InvenioRecord (TypedDict ):
108164 id : str
109165 title : str
@@ -331,6 +387,25 @@ def upload_file_to_draft_container(
331387 file_path : str ,
332388 context : FilesSourceRuntimeContext [RDMFileSourceConfiguration ],
333389 ):
390+ file_size = os .path .getsize (file_path )
391+ threshold = context .config .multipart_threshold
392+
393+ use_multipart = threshold is not None and threshold > 0 and file_size >= threshold
394+
395+ if use_multipart :
396+ log .info (f"Using multipart upload for file '{ filename } ' ({ file_size } bytes >= threshold { threshold } )" )
397+ self ._upload_file_multipart (record_id , filename , file_path , file_size , context )
398+ else :
399+ self ._upload_file_single (record_id , filename , file_path , context )
400+
401+ def _upload_file_single (
402+ self ,
403+ record_id : str ,
404+ filename : str ,
405+ file_path : str ,
406+ context : FilesSourceRuntimeContext [RDMFileSourceConfiguration ],
407+ ):
408+ """Upload a file using single PUT request."""
334409 record = self ._get_draft_record (record_id , context )
335410 upload_file_url = record ["links" ]["files" ]
336411 headers = self ._get_request_headers (context , auth_required = True )
@@ -352,6 +427,136 @@ def upload_file_to_draft_container(
352427 response = requests .post (commit_file_upload_url , headers = headers )
353428 self ._ensure_response_has_expected_status_code (response , 200 )
354429
def _upload_file_multipart(
    self,
    record_id: str,
    filename: str,
    file_path: str,
    file_size: int,
    context: FilesSourceRuntimeContext[RDMFileSourceConfiguration],
):
    """Upload a file to a draft record using a multipart transfer.

    Flow:
    1. Calculate parts/part_size
    2. POST the file entry with transfer metadata
    3. Server returns links.parts[] with a URL for each part
    4. Upload the parts (parallel for > 2 parts)
    5. POST to the commit URL

    Args:
        record_id: Identifier of the draft record receiving the file.
        filename: Key under which the file is registered in the draft.
        file_path: Local path of the file to upload.
        file_size: Size of the local file in bytes.
        context: Runtime context providing the configuration and credentials.

    Raises:
        Exception: If the server response has no entry for ``filename`` or the
            number of part URLs differs from the requested number of parts.
    """
    preferred_part_size = context.config.multipart_chunk_size
    num_parts, part_size = calculate_multipart_params(file_size, preferred_part_size)

    log.info(f"Multipart upload: { num_parts } parts of { part_size } bytes each for '{ filename } '")

    record = self._get_draft_record(record_id, context)
    upload_file_url = record["links"]["files"]
    headers = self._get_request_headers(context, auth_required=True)

    # Initialize multipart upload with transfer metadata
    file_metadata = {
        "key": filename,
        "size": file_size,
        "transfer": {
            "type": "M",
            "parts": num_parts,
            "part_size": part_size,
        },
    }
    response = requests.post(upload_file_url, json=[file_metadata], headers=headers)
    self._ensure_response_has_expected_status_code(response, 201)

    # Locate the entry for this file; fail with a clear message rather than
    # leaking a bare StopIteration when the server did not echo it back.
    entries = response.json()["entries"]
    file_entry = next((entry for entry in entries if entry["key"] == filename), None)
    if file_entry is None:
        raise Exception(f"Server response did not include an entry for file '{filename}'")

    links = file_entry["links"]
    commit_url = links["commit"]
    # A missing or null "parts" link is treated as "no part URLs" so that the
    # count check below reports the problem.
    part_links = links.get("parts") or []

    if len(part_links) != num_parts:
        raise Exception(
            f"Server returned { len (part_links )} part URLs but expected { num_parts } for file '{ filename } '"
        )

    # Upload parts
    self._upload_parts(file_path, file_size, part_size, part_links, headers)

    # Commit multipart upload
    response = requests.post(commit_url, json={}, headers=headers)
    self._ensure_response_has_expected_status_code(response, 200)
    log.info(f"Multipart upload completed for '{ filename } '")
487+
def _upload_parts(
    self,
    file_path: str,
    file_size: int,
    part_size: int,
    part_links: list[dict],
    headers: dict,
):
    """Upload all parts, sequentially for <=2 parts, parallel otherwise.

    Args:
        file_path: Local path of the file being uploaded.
        file_size: Total size of the file in bytes.
        part_size: Size in bytes of every part except possibly the last.
        part_links: One entry per part, in part order; each is passed to
            ``_upload_single_part`` together with its index.
        headers: Request headers forwarded to every part upload.

    Raises:
        Exception: Re-raises the first part failure that is observed. In the
            parallel path, parts that have not started yet are cancelled
            first, so a failure does not trigger the remaining uploads.
    """
    num_parts = len(part_links)

    if num_parts <= 2:
        # Sequential upload for small number of parts
        for part_index, part_info in enumerate(part_links):
            self._upload_single_part(file_path, file_size, part_size, part_index, part_info, headers)
        return

    # Parallel upload for larger number of parts
    max_workers = min(4, num_parts)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(
                self._upload_single_part,
                file_path,
                file_size,
                part_size,
                part_index,
                part_info,
                headers,
            ): part_index
            for part_index, part_info in enumerate(part_links)
        }

        for future in as_completed(futures):
            part_index = futures[future]
            try:
                future.result()
            except Exception as e:
                # Leaving the ``with`` block waits for every queued task, so
                # drop the parts that have not started; cancel() is a no-op
                # for parts that are already running or finished.
                for pending in futures:
                    pending.cancel()
                log.error(f"Failed to upload part { part_index } : { e } ")
                raise
527+
def _upload_single_part(
    self,
    file_path: str,
    file_size: int,
    part_size: int,
    part_index: int,
    part_info: dict,
    headers: dict,
):
    """PUT one part of a multipart upload to the URL supplied for it.

    Args:
        file_path: Local path of the file being uploaded.
        file_size: Total size of the file in bytes; bounds the last part.
        part_size: Size in bytes of every part except possibly the last.
        part_index: Zero-based position of this part within the file.
        part_info: Entry for this part; its ``"url"`` value is the target.
        headers: Base request headers; a copy is extended with
            ``Content-Length`` and ``Content-Type`` for this request.

    Raises:
        Exception: If ``part_info`` carries no (or an empty) ``"url"``.
    """
    if not (part_url := part_info.get("url")):
        raise Exception(f"No URL provided for part { part_index } ")

    # Byte window [start_byte, end_byte) covered by this part, clipped to the file.
    start_byte = part_index * part_size
    end_byte = min(start_byte + part_size, file_size)
    part_content_length = end_byte - start_byte

    log.debug(f"Uploading part { part_index } : bytes { start_byte } -{ end_byte - 1 } ({ part_content_length } bytes)")

    # The whole part is read into memory before it is sent.
    with open(file_path, "rb") as source:
        source.seek(start_byte)
        payload = source.read(part_content_length)

    # Work on a copy so the caller's headers are not mutated.
    request_headers = headers.copy()
    request_headers.update(
        {
            "Content-Length": str(part_content_length),
            "Content-Type": "application/octet-stream",
        }
    )

    upload_response = requests.put(part_url, data=payload, headers=request_headers)
    self._ensure_response_has_expected_status_code(upload_response, 200)
559+
355560 def download_file_from_container (
356561 self ,
357562 container_id : str ,
0 commit comments