-
Notifications
You must be signed in to change notification settings - Fork 20
Expand file tree
/
Copy pathbpe_tokenizer.py
More file actions
27 lines (19 loc) · 876 Bytes
/
bpe_tokenizer.py
File metadata and controls
27 lines (19 loc) · 876 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import os
from os import mkdir

from tokenizers import Tokenizer
from tokenizers import Regex
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Split
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer
def bpe_tokenizer(path="./data/selfies_subset.txt", save_to="./data/bpe/"):
    """Train a byte-pair-encoding tokenizer on a SELFIES corpus and save it.

    Parameters
    ----------
    path : str
        Training text file; presumably one SELFIES string per line — TODO confirm.
    save_to : str
        Directory where the trained tokenizer artifacts are written
        (created, including parents, if missing).

    Returns
    -------
    Tokenizer
        The trained tokenizer. It is also serialized to
        ``<save_to>/bpe.json`` and the BPE model files are written into
        ``save_to`` via ``tokenizer.model.save``.
    """
    # makedirs handles a missing parent directory too; the previous
    # mkdir-based version raised an uncaught FileNotFoundError when
    # e.g. ./data did not exist yet.
    os.makedirs(save_to, exist_ok=True)

    tokenizer = Tokenizer(BPE(unk_token="<unk>"))
    # SELFIES tokens are bracket-delimited (e.g. "[C][=O]"); splitting on
    # the brackets with behavior="removed" yields one token per bracket
    # group. Raw string avoids invalid-escape warnings ("\[" in a plain
    # string literal is the same bytes, so the pattern is unchanged).
    tokenizer.pre_tokenizer = Split(pattern=Regex(r"\[|\]"), behavior="removed")
    tokenizer.post_processor = TemplateProcessing(
        single="<s> $A </s>",
        pair="<s> $A </s> $B:1 </s>:1",
        # ids must match the special-token order given to the trainer below
        special_tokens=[("<s>", 1), ("</s>", 2)],
    )
    trainer = BpeTrainer(special_tokens=["<unk>", "<s>", "</s>", "<pad>", "<mask>"])
    tokenizer.train(files=[path], trainer=trainer)

    tokenizer.save(os.path.join(save_to, "bpe.json"), pretty=True)
    tokenizer.model.save(save_to)
    return tokenizer