I get this output when I try to download the data from HMP.
import argparse
import os
import time
from urllib.parse import urlparse

import pandas as pd
import requests
from requests.exceptions import RequestException
from tqdm import tqdm
# Command-line handling and manifest loading.
# Produces `manifest_df` (the full manifest table) and
# `list_of_single_column` (the URLs to download, in manifest order).

# Step 1: Initialize Argument Parser
my_parser = argparse.ArgumentParser(description='Make sure your files exist in the data folder!')
# Step 2: Add Argument for Input File
# The manifest is mandatory: without it there is nothing to download, so let
# argparse report the missing option instead of failing later inside pandas.
my_parser.add_argument(
    '-i', '--input',
    action='store',
    metavar='input',
    type=str,
    required=True,
    help='The path to your manifest file (TSV format)',
)
# Step 3: Parse Arguments
args = my_parser.parse_args()
# Step 4: Read the Manifest File
f_name = args.input
manifest_df = pd.read_csv(f_name, sep='\t')
if 'urls' not in manifest_df.columns:
    # Exit with a usage-style message rather than a bare KeyError traceback.
    my_parser.error(f"Manifest '{f_name}' has no 'urls' column")
# Rows whose URL cell is empty are read as NaN; they cannot be downloaded,
# so leave them out of the work list.
list_of_single_column = manifest_df['urls'].dropna().tolist()
# HTTP headers sent with every request: a desktop-browser User-Agent string
# and a Referer pointing at the HMP download site.
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
headers["Referer"] = "https://downloads.hmpdacc.org/"
# Bookkeeping for the run: file names that downloaded / did not download,
# and a running count of files attempted.
success, failed = [], []
cc = 0
# Start-up banner (two lines).
print(
    "Updated: 15_04_2024",
    "Tip: In your manifest file.tsv, if the URL (https://...) works in your browser, this script will help you.",
    sep="\n",
)
# Step 5: File Download Loop
# For every URL in the manifest: derive the local file name from the URL
# path, stream the body to disk with a progress bar, and retry on request
# errors. File names are recorded in `success` or `failed`.
for x in list_of_single_column:
    # Bind these before the try so the handlers below never see an undefined
    # name or a value left over from the previous URL.
    filename = str(x)
    partial_path = None
    try:
        print(f"\nAttempting to download: {x}")
        a = urlparse(x)
        filename = os.path.basename(a.path)
        cc += 1
        print(f"File number {cc}")
        print(f"Filename: {filename}")
        if not filename:
            raise ValueError("URL path does not end in a file name")
        # Stream into a side file and rename only after a complete transfer,
        # so an interrupted download never leaves a truncated file under the
        # final name (and never clobbers a copy from an earlier run).
        partial_path = filename + ".part"
        # Retry mechanism
        max_retries = 3
        retry_delay = 5  # seconds
        for attempt in range(max_retries):
            try:
                # timeout=(connect, read): without it a stalled server would
                # block this script indefinitely.
                with requests.get(x, headers=headers, stream=True, timeout=(10, 60)) as response:
                    response.raise_for_status()  # Raise an error for HTTP issues
                    total_size = int(response.headers.get('content-length', 0))
                    with open(partial_path, 'wb') as f, tqdm(
                        desc=filename,
                        total=total_size,
                        unit='B',
                        unit_scale=True,
                        unit_divisor=1024,
                    ) as bar:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                            bar.update(len(chunk))
                os.replace(partial_path, filename)
                success.append(filename)
                print(f"Downloaded successfully: {filename}")
                break
            except RequestException as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < max_retries - 1:
                    time.sleep(retry_delay)
                else:
                    print("Max retries reached. Skipping this file.")
                    failed.append(filename)
    except Exception as e:
        # Anything other than a request error (bad URL value, disk error, ...)
        # fails this file only; the loop continues with the next URL.
        print(f"Error downloading {filename}: {e}")
        failed.append(filename)
    finally:
        # Discard whatever an unsuccessful attempt left behind.
        if partial_path is not None and os.path.exists(partial_path):
            os.remove(partial_path)
# Step 6: Save Successful and Failed Downloads to Separate Manifest Files
# Rows are selected by comparing the file name at the end of each URL path
# with the recorded names. A regex built with '|'.join(names) is avoided on
# purpose: it becomes the empty pattern (matching every row) when a list is
# empty, and it would treat '.' inside file names as a wildcard.
url_basenames = manifest_df['urls'].map(
    lambda url: os.path.basename(urlparse(url).path) if isinstance(url, str) else None
)
successful = manifest_df[url_basenames.isin(success)]
failed_df = manifest_df[url_basenames.isin(failed)]
successful.to_csv("successful_manifest.tsv", sep='\t', index=False, header=True)
failed_df.to_csv("failed_manifest.tsv", sep='\t', index=False, header=True)
print("\n## Finally finished! ##")
print(f"Successful downloads: {len(success)}")
print(f"Failed downloads: {len(failed)}")
Attempting to download: https://downloads.hmpdacc.org/dacc/hhs/genome/microbiome/wgs/analysis/hmgs/v1/PGA/SRS015072/SRS015072-annotation.slimmap.v2.bz2
File number 1
Filename: SRS015072-annotation.slimmap.v2.bz2
Attempt 1 failed: 403 Client Error: Forbidden for url: https://downloads.hmpdacc.org/dacc/hhs/genome/microbiome/wgs/analysis/hmgs/v1/PGA/SRS015072/SRS015072-annotation.slimmap.v2.bz2
I get this output when I try to download the data from HMP.
This is my current code: