Skip to content

Commit 79d0fd1

Browse files
committed
Add the main file
1 parent 234d4d9 commit 79d0fd1

File tree

1 file changed

+48
-0
lines changed

1 file changed

+48
-0
lines changed

main.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import os
import sys
from pathlib import Path

# Put the project root on sys.path *before* importing any project-local
# package. Previously `settings` was imported ahead of this step, so that
# import could not benefit from the path insertion.
PROJECT_ROOT = Path(__file__).parent
sys.path.insert(0, str(PROJECT_ROOT))

# Project-local imports: these must stay below the sys.path setup above.
from settings.development import settings
from utils.downloader import download
from utils.tokenizer import tokenize
13+
14+
15+
def main():
    """Download the configured file, tokenize it, and print a short summary.

    Reads ``file_url`` and ``local_folder`` from the project ``settings``
    mapping, calls ``download`` with them, then opens the local copy as UTF-8
    text and passes its contents to ``tokenize``. Progress and failures are
    reported on stdout; the function always returns ``None``.
    """
    # Both the printed label and the slice use this value, so the message can
    # no longer claim "10" while showing a different number of tokens.
    preview_count = 10

    file_url = settings["file_url"]
    local_folder = settings["local_folder"]

    try:
        if download(file_url, local_folder):
            print("✓ File downloaded successfully.")
        else:
            print("✓ File already exists.")
    except ValueError as ve:
        print(f"✗ Download error: {ve}")
        return

    # NOTE(review): assumes download() stores the file under the URL's last
    # path segment — confirm against utils.downloader.
    file_name = file_url.split("/")[-1]
    file_path = os.path.join(local_folder, file_name)

    # Read the file in its own try block so that I/O and decoding problems
    # are not misreported as tokenization failures.
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            raw_text = f.read()
    except FileNotFoundError:
        print(f"✗ File not found: {file_path}")
        return
    except (OSError, UnicodeDecodeError) as e:
        print(f"✗ Could not read file {file_path}: {e}")
        return

    # Top-level boundary: report any tokenizer failure instead of crashing.
    try:
        tokens = tokenize(raw_text)
        print(f"✓ Tokenization complete. Total tokens: {len(tokens)}")
        print(f"First {preview_count} tokens: {tokens[:preview_count]}")
    except Exception as e:
        print(f"✗ Error during tokenization: {e}")
45+
46+
47+
if __name__ == "__main__":
    # Run the pipeline only when this file is executed as a script,
    # not when it is imported as a module.
    main()

0 commit comments

Comments
 (0)