-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtokenizer.py
More file actions
44 lines (24 loc) · 777 Bytes
/
tokenizer.py
File metadata and controls
44 lines (24 loc) · 777 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os
from logging import getLogger
import numpy as np
from tqdm import tqdm
import json
from collections import defaultdict
import torch
import torch.nn.functional as F
from utils import *
class AbstractTokenizer:
    """Interface that every concrete tokenizer in this project implements.

    Subclasses must override ``vocab_size``, ``tokenize`` and
    ``max_token_seq_len``; the base versions only raise
    ``NotImplementedError``. The constructor stores the run configuration
    and acquires the root logger.
    """

    def __init__(self, config):
        # End-of-sequence token placeholder; concrete tokenizers assign a
        # real value. Kept None here so the attribute always exists.
        self.eos_token = None
        # Process-wide root logger, shared via utils.log in self.log().
        self.logger = getLogger()
        self.config = config

    @property
    def vocab_size(self):
        """Number of entries in the token vocabulary (override required)."""
        raise NotImplementedError('Vocabulary size not implemented.')

    def tokenize(self, example: dict) -> dict:
        """Map one raw example dict to its tokenized form (override required)."""
        raise NotImplementedError("tokenization not implemented")

    @property
    def max_token_seq_len(self):
        """Upper bound on a tokenized sequence's length (override required)."""
        raise NotImplementedError("Maximum token sequence length not implemented")

    def log(self, message, level='info'):
        """Route *message* through the shared ``log`` helper from ``utils``.

        NOTE(review): assumes ``self.config['accelerator']`` is present —
        confirm against how callers build the config.
        """
        accelerator = self.config['accelerator']
        return log(message, accelerator, self.logger, level=level)