# Source: finetune_parakeetv2_arabic/prepare_manifest.py at master · Ahmed-Ezzat20/finetune_parakeetv2_arabic · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import json
import librosa
import os
import wget
import tarfile
import random
from pathlib import Path
import soundfile as sf
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Define paths
base_dir = os.path.join(os.getcwd(), "data", "LJSpeech-1.1")
# The script expects the archive (extracted into base_dir) to unpack here.
dataset_dir = os.path.join(base_dir, "LJSpeech-1.1")
metadata_file = os.path.join(dataset_dir, "metadata.csv")  # Path to LJSpeech metadata
input_audio_dir = os.path.join(dataset_dir, "wavs")  # Original 22.05 kHz WAVs
output_audio_dir = os.path.join(dataset_dir, "wavs_16k")  # Resampled 16 kHz WAVs
manifest_file = os.path.join(dataset_dir, "manifest.json")  # NeMo manifest file

# Creating the output directory also creates base_dir and dataset_dir, which
# are its parents. The input directory is deliberately NOT created here: it
# has to come from the extracted archive, otherwise the existence check below
# could never fail and a missing dataset would go unnoticed.
os.makedirs(output_audio_dir, exist_ok=True)

# Download the LJSpeech dataset if it doesn't exist
dataset_url = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"
dataset_tar = os.path.join(base_dir, "LJSpeech-1.1.tar.bz2")

if not os.path.exists(metadata_file):
    print(f"Downloading LJSpeech dataset from {dataset_url}...")
    # Use the path reported by wget rather than assuming the requested name
    # was honoured (NOTE(review): wget appears to pick a new name when the
    # target already exists, e.g. after an interrupted run - confirm).
    downloaded_tar = wget.download(dataset_url, dataset_tar)
    print("\nExtracting dataset...")
    with tarfile.open(downloaded_tar, "r:bz2") as tar:
        # Use the "data" extraction filter where the running Python provides
        # it, so archive members cannot escape base_dir.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(base_dir, filter="data")
        else:
            tar.extractall(base_dir)
    print(f"Dataset extracted to {dataset_dir}")
else:
    print(f"Dataset already exists at {dataset_dir}")

train_manifest = "train_manifest.json"
val_manifest = "val_manifest.json"
test_manifest = "test_manifest.json"
target_sr = 16000  # Target sample rate for Parakeet v2

# Set random seed for reproducibility
random.seed(42)

# Fail early with a clear message if the source audio is missing
if not os.path.exists(input_audio_dir):
    raise FileNotFoundError(f"Input audio directory {input_audio_dir} does not exist.")

# Step 1: Resample audio to 16kHz if needed
def resample_audio():
    """Convert every ``.wav`` in ``input_audio_dir`` to ``target_sr``.

    Results are written to ``output_audio_dir`` under the same file name.
    Files that already exist there are skipped. A failure on one file is
    reported and does not stop the remaining files from being processed.
    """
    wav_files = [f for f in os.listdir(input_audio_dir) if f.endswith(".wav")]
    for wav_file in tqdm(wav_files, desc="Resampling audio"):
        input_path = os.path.join(input_audio_dir, wav_file)
        output_path = os.path.join(output_audio_dir, wav_file)

        # Skip if already resampled
        if os.path.exists(output_path):
            continue

        try:
            # sr=None: load at the file's own sample rate so it can be
            # compared against the target below.
            audio, sr = librosa.load(input_path, sr=None)
            if sr != target_sr:
                audio_resampled = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
                sf.write(output_path, audio_resampled, target_sr, subtype='PCM_16')
            else:
                # Copy file if already at target sample rate
                with open(input_path, 'rb') as src, open(output_path, 'wb') as dst:
                    dst.write(src.read())
        except Exception as e:
            print(f"Error resampling {wav_file}: {e}")
            # A failed write can leave a partial file behind. Remove it so the
            # "already resampled" check above does not skip this file forever.
            if os.path.exists(output_path):
                try:
                    os.remove(output_path)
                except OSError as cleanup_error:
                    print(f"Could not remove partial file {output_path}: {cleanup_error}")

# Step 2: Read and validate metadata, create entries
def process_metadata():
    """Build manifest entries from the pipe-separated ``metadata_file``.

    Each non-empty line is split on ``|``; the first field is the audio id and
    the second field is the text. A line is skipped, reported and counted when
    it has fewer than two fields, when ``<audio id>.wav`` is missing from
    ``output_audio_dir``, or when its duration cannot be read.

    Returns:
        A list with one dict per usable line, holding ``audio_filepath``
        (forward-slash path), ``text`` (non-ASCII characters removed) and
        ``duration`` (the value returned by ``librosa.get_duration``).
    """
    entries = []
    error_count = 0

    with open(metadata_file, "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                print(f"Skipping empty line {line_number} in {metadata_file}")
                continue

            parts = line.split("|")
            if len(parts) < 2:
                print(f"Skipping malformed line {line_number}: {line}")
                error_count += 1
                continue

            # Index instead of unpacking into exactly three names: a line with
            # two fields, or with extra "|" characters, must not raise.
            # NOTE(review): the second field is used as the transcript. The
            # LJSpeech README lists the raw transcription second and the
            # normalized one third - confirm which one is intended here.
            audio_id, normalized_text = parts[0], parts[1]
            audio_path = os.path.join(output_audio_dir, f"{audio_id}.wav")

            # Check if audio file exists
            if not os.path.exists(audio_path):
                print(f"Line {line_number}: Audio file {audio_path} not found, skipping.")
                error_count += 1
                continue

            # Calculate duration
            try:
                duration = librosa.get_duration(path=audio_path)
            except Exception as e:
                print(f"Line {line_number}: Error calculating duration for {audio_path}: {e}")
                error_count += 1
                continue

            # Clean text to remove non-ASCII characters
            cleaned_text = normalized_text.encode('ascii', 'ignore').decode('ascii')

            # Normalize audio path to use forward slashes
            normalized_audio_path = Path(audio_path).as_posix()

            entries.append({
                "audio_filepath": normalized_audio_path,
                "text": cleaned_text,
                "duration": duration
            })

    if error_count > 0:
        print(f"Found {error_count} errors in {metadata_file}. Please review the logs.")

    return entries

# Step 3: Write manifest file
def write_manifest(entries, filename):
    """Write *entries* to *filename* as one JSON object per line.

    Args:
        entries: sized collection of JSON-serializable records.
        filename: destination path; opened in write mode as UTF-8.
    """
    with open(filename, "w", encoding="utf-8") as manifest:
        # ensure_ascii=False keeps non-ASCII characters as-is instead of
        # emitting \uXXXX escapes.
        manifest.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in entries
        )
    print(f"Created {filename} with {len(entries)} entries")

# Main execution
if __name__ == "__main__":
    # Produce the 16 kHz audio first; the metadata pass looks for those files.
    resample_audio()
    entries = process_metadata()

    # Shuffle, then carve out 80% train and split the remaining 20% evenly
    # into validation and test.
    random.shuffle(entries)
    train_entries, temp_entries = train_test_split(
        entries, test_size=0.2, random_state=42
    )
    val_entries, test_entries = train_test_split(
        temp_entries, test_size=0.5, random_state=42
    )

    # One manifest file per split
    for subset, destination in (
        (train_entries, train_manifest),
        (val_entries, val_manifest),
        (test_entries, test_manifest),
    ):
        write_manifest(subset, destination)

    # Summary
    print(f"Total entries processed: {len(entries)}")
    for label, subset in (
        ("Training set", train_entries),
        ("Validation set", val_entries),
        ("Test set", test_entries),
    ):
        print(f"{label}: {len(subset)} entries")