Skip to content

Commit 00e5549

Browse files
jongwook and Majdoddin
authored and committed
Use tiktoken (openai#1044)
* use tiktoken==0.3.0
* formatting
* tuple should be safer
* Update whisper/tokenizer.py

  Co-authored-by: Ruhollah Majdoddin <r.majdodin@gmail.com>

* use tiktoken 0.3.1
* reflecting suggestions
* cleanup
* bypassing load_tiktoken_bpe to avoid blobfile dep

---------

Co-authored-by: Ruhollah Majdoddin <r.majdodin@gmail.com>
1 parent f05f320 commit 00e5549

15 files changed

100,601 additions and 100,096 deletions (lines changed)

MANIFEST.in

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,4 @@ include requirements.txt
22
include README.md
33
include LICENSE
44
include whisper/assets/*
5-
include whisper/assets/gpt2/*
6-
include whisper/assets/multilingual/*
75
include whisper/normalizers/english.json

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@ numpy
33
torch
44
tqdm
55
more-itertools
6-
transformers>=4.19.0
6+
tiktoken==0.3.1
77
ffmpeg-python==0.2.0

tests/test_transcribe.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import torch
55

66
import whisper
7+
from whisper.tokenizer import get_tokenizer
78

89

910
@pytest.mark.parametrize("model_name", whisper.available_models())
@@ -24,14 +25,18 @@ def test_transcribe(model_name: str):
2425
assert "your country" in transcription
2526
assert "do for you" in transcription
2627

28+
tokenizer = get_tokenizer(model.is_multilingual)
29+
all_tokens = [t for s in result["segments"] for t in s["tokens"]]
30+
assert tokenizer.decode(all_tokens) == result["text"]
31+
assert tokenizer.decode_with_timestamps(all_tokens).startswith("<|0.00|>")
32+
2733
timing_checked = False
2834
for segment in result["segments"]:
2935
for timing in segment["words"]:
3036
assert timing["start"] < timing["end"]
3137
if timing["word"].strip(" ,") == "Americans":
3238
assert timing["start"] <= 1.8
3339
assert timing["end"] >= 1.8
34-
print(timing)
3540
timing_checked = True
3641

3742
assert timing_checked

0 commit comments

Comments
 (0)