Skip to content

Issue downloading the manifest data with the TSV file. #32

@yyuu33nn

Description

@yyuu33nn

Attempting to download: https://downloads.hmpdacc.org/dacc/hhs/genome/microbiome/wgs/analysis/hmgs/v1/PGA/SRS015072/SRS015072-annotation.slimmap.v2.bz2
File number 1
Filename: SRS015072-annotation.slimmap.v2.bz2
Attempt 1 failed: 403 Client Error: Forbidden for url: https://downloads.hmpdacc.org/dacc/hhs/genome/microbiome/wgs/analysis/hmgs/v1/PGA/SRS015072/SRS015072-annotation.slimmap.v2.bz2

I get this output when I try to download the data from HMP.

This is my current code:

import argparse
import os
import time
from urllib.parse import urlparse

import pandas as pd
import requests
from requests.exceptions import RequestException
from tqdm import tqdm

# Step 1: Initialize Argument Parser
my_parser = argparse.ArgumentParser(description='Make sure your files exist in the data folder!')

# Step 2: Add Argument for Input File
# The manifest path is mandatory: without it pd.read_csv() would be handed
# None and fail with an error that does not mention the missing option, so
# let argparse report the problem together with the usage text.
my_parser.add_argument('-i', '--input',
                       action='store',
                       metavar='input',
                       type=str,
                       required=True,
                       help='The path to your manifest file (TSV format)')

# Step 3: Parse Arguments
args = my_parser.parse_args()

# Step 4: Read the Manifest File
f_name = args.input
manifest_df = pd.read_csv(f_name, sep='\t')

# Fail early with a readable message instead of a bare KeyError when the
# manifest does not have the column the rest of the script relies on.
if 'urls' not in manifest_df.columns:
    my_parser.error(
        f"Manifest '{f_name}' has no 'urls' column (found: {list(manifest_df.columns)})"
    )

# Empty cells are read as NaN (a float), which cannot be parsed as a URL,
# so they are left out of the list of URLs to download.
list_of_single_column = manifest_df['urls'].dropna().tolist()

# Initialize Counters and Lists for Success and Failure Tracking
cc = 0        # number of URLs attempted so far (printed as "File number N")
success = []  # file names that were downloaded completely
failed = []   # file names that could not be downloaded

# Browser-like request headers sent with every download request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Referer": "https://downloads.hmpdacc.org/"
}

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# indefinitely on a stalled connection and the whole script would hang.
request_timeout = (10, 60)

# Retry settings are the same for every file, so they are defined once.
max_retries = 3
retry_delay = 5  # seconds

print("Updated: 15_04_2024")
print("Tip: In your manifest file.tsv, if the URL (https://...) works in your browser, this script will help you.")

# Step 5: File Download Loop
for x in list_of_single_column:
    # Entries that are not non-empty strings (e.g. NaN from an empty cell)
    # cannot be parsed as a URL, so they are reported and skipped.
    if not isinstance(x, str) or not x.strip():
        print(f"\nSkipping manifest entry without a URL: {x!r}")
        continue

    # Bind these before the try block so the handlers below never see an
    # unbound name or a stale value left over from the previous iteration.
    filename = x
    part_path = None
    try:
        print(f"\nAttempting to download: {x}")

        a = urlparse(x)
        filename = os.path.basename(a.path)
        cc += 1
        print(f"File number {cc}")
        print(f"Filename: {filename}")

        if not filename:
            raise ValueError("URL has no file name component")

        # Data is streamed into a temporary ".part" file and only renamed to
        # the final name once the transfer completed, so an interrupted
        # download never leaves a truncated file under the real name.
        part_path = filename + '.part'

        # Retry mechanism
        for attempt in range(max_retries):
            try:
                # The context manager releases the connection even when the
                # transfer fails half-way through.
                with requests.get(x, headers=headers, stream=True,
                                  timeout=request_timeout) as response:
                    response.raise_for_status()  # Raise an error for HTTP issues

                    total_size = int(response.headers.get('content-length', 0))

                    with open(part_path, 'wb') as f, tqdm(
                        desc=filename,
                        total=total_size,
                        unit='B',
                        unit_scale=True,
                        unit_divisor=1024,
                    ) as bar:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                            bar.update(len(chunk))

                os.replace(part_path, filename)
                success.append(filename)
                print(f"Downloaded successfully: {filename}")
                break
            except RequestException as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < max_retries - 1:
                    time.sleep(retry_delay)
                else:
                    print("Max retries reached. Skipping this file.")
                    failed.append(filename)

    except Exception as e:
        # Anything that is not a request error (bad URL, disk error, ...)
        # fails this file only; the loop continues with the next URL.
        print(f"Error downloading {filename}: {e}")
        failed.append(filename)
    finally:
        # Best-effort cleanup of a partial download. After a successful
        # os.replace() the ".part" file no longer exists and this is a no-op.
        if part_path is not None and os.path.exists(part_path):
            try:
                os.remove(part_path)
            except OSError as cleanup_error:
                print(f"Could not remove partial file {part_path}: {cleanup_error}")

# Step 6: Save Successful and Failed Downloads to Separate Manifest Files
def _url_filename(url):
    """Return the last path component of *url*, or None if it cannot be derived."""
    if not isinstance(url, str):
        return None  # empty cells are read as NaN (a float)
    try:
        return os.path.basename(urlparse(url).path)
    except ValueError:
        return None  # urlparse rejects some malformed URLs


# Rows are matched on the exact file name derived from each URL. Building a
# regular expression with '|'.join(...) is wrong in two ways: an empty list
# gives the pattern '' which matches every row, and file names contain regex
# metacharacters such as '.' and would also match as mere substrings.
url_filenames = manifest_df['urls'].map(_url_filename)
successful = manifest_df[url_filenames.isin(success)]
failed_df = manifest_df[url_filenames.isin(failed)]

successful.to_csv("successful_manifest.tsv", sep='\t', index=False, header=True)
failed_df.to_csv("failed_manifest.tsv", sep='\t', index=False, header=True)

print("\n## Finally finished! ##")
print(f"Successful downloads: {len(success)}")
print(f"Failed downloads: {len(failed)}")

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions