File tree (Expand file tree / Collapse file tree) — 4 files changed, +7 -15 lines changed
Expand file tree Collapse file tree 4 files changed +7
-15
lines changed Original file line number Diff line number Diff line change @@ -27,16 +27,15 @@ def main():
2727 print (f"✗ Download error: { ve } " )
2828 return
2929
30- file_name = file_url .split ("/" )[- 1 ]
31- file_path = os .path .join (local_folder , file_name )
30+ file_path = os .path .join (local_folder , file_url .split ("/" )[- 1 ])
3231
3332 try :
3433 with open (file_path , "r" , encoding = "utf-8" ) as f :
3534 raw_text = f .read ()
3635
3736 tokens = tokenize (raw_text )
3837 print (f"✓ Tokenization complete. Total tokens: { len (tokens )} " )
39- print (f"First 10 tokens: { tokens [:30 ]} " )
38+ print (f"First 30 tokens: { tokens [:30 ]} " )
4039
4140 except FileNotFoundError :
4241 print (f"✗ File not found: { file_path } " )
Original file line number Diff line number Diff line change 22
33settings = {
44 "file_url" : "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt" ,
5- "local_folder" : f" { os .path .join (os .path .expanduser ("~" ), "data" )} " ,
5+ "local_folder" : os .path .join (os .path .expanduser ("~" ), "data" ),
66}
Original file line number Diff line number Diff line change 44from settings .development import settings
55
66
7- def assert_local_folder_exists (local_folder : str ):
8- """Ensure the download folder exists."""
9-
10- if not os .path .exists (local_folder ):
11- os .makedirs (local_folder , exist_ok = True )
12-
13-
14- def download (file_url , local_folder ) -> bool :
7+ def download (file_url : str , local_folder : str ) -> bool :
158 """Download a file from a predefined URL if it does not exist locally."""
169
17- assert_local_folder_exists (local_folder )
10+ os . makedirs (local_folder , exist_ok = True )
1811
1912 file_name = file_url .split ("/" )[- 1 ]
2013 file_path = os .path .join (local_folder , file_name )
Original file line number Diff line number Diff line change 11import re
22
33
4- def tokenize (text ) :
4+ def tokenize (text : str ) -> list [ str ] :
55 """Tokenizes the input text into words.
66
77 Args:
@@ -11,7 +11,7 @@ def tokenize(text):
1111 list: A list of tokens (words).
1212 """
1313
14- result = re .split (r' ([,.:;?_!"()\ ']|--|\s)' , text )
14+ result = re .split (r" ([,.:;?_!\ "()']|--|\s)" , text )
1515 result = [token .strip () for token in result if token .strip ()]
1616 return result
1717
You can’t perform that action at this time.
0 commit comments