Skip to content

Commit ff3c02b

Browse files
NielsMayer authored and jongwook committed
Add TSV formatted output in transcript, using integer start/end times in milliseconds. (openai#228)
* Add CSV format output in transcript, containing lines of characters formatted like: <startTime-in-integer-milliseconds>, <endTime-in-integer-milliseconds>, <transcript-including-commas>
* for easier reading by spreadsheets importing CSV, the third column of the CSV file is delimited by quotes, and any quote characters that might be in the transcript (which would interfere with parsing the third column as a string) are converted to "''".
* fix syntax error
* docstring edit

Co-authored-by: Jong Wook Kim <jongwook@openai.com>
Co-authored-by: Jong Wook Kim <jongwook@nyu.edu>
1 parent 91519d4 commit ff3c02b

File tree

2 files changed

+22
-1
lines changed

2 files changed

+22
-1
lines changed

whisper/transcribe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ def cli():
280280
parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
281281
parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
282282
parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
283-
parser.add_argument("--output_format", "-f", type=str, default="all", choices=["txt", "vtt", "srt", "json", "all"], help="format of the output file; if not specified, all available formats will be produced")
283+
parser.add_argument("--output_format", "-f", type=str, default="all", choices=["txt", "vtt", "srt", "tsv", "json", "all"], help="format of the output file; if not specified, all available formats will be produced")
284284
parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")
285285

286286
parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")

whisper/utils.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,25 @@ def write_result(self, result: dict, file: TextIO):
102102
)
103103

104104

105+
class WriteTSV(ResultWriter):
    """
    Emit a transcript as tab-separated values (TSV).

    The output starts with a "start", "end", "text" header row, followed by one row
    per segment laid out as:
    <start time in integer milliseconds><TAB><end time in integer milliseconds><TAB><transcript text>

    Timestamps are written as integer milliseconds so that no locale or language-encoding
    setting can turn the decimal point of a floating point number into a comma; integers
    are also faster and more efficient to parse & store, e.g., in C++.
    """

    extension: str = "tsv"

    def write_result(self, result: dict, file: TextIO):
        # Header row naming the three columns.
        print("start", "end", "text", sep="\t", file=file)
        for seg in result["segments"]:
            # Segment times are scaled by 1000 and rounded to whole numbers.
            start_ms = round(1000 * seg["start"])
            end_ms = round(1000 * seg["end"])
            # A literal tab inside the text would be read as an extra column,
            # so each one is swapped for a space.
            text = seg["text"].strip().replace("\t", " ")
            # One row per segment, flushed as soon as it is written.
            print(start_ms, end_ms, text, sep="\t", file=file, flush=True)
122+
123+
105124
class WriteJSON(ResultWriter):
106125
extension: str = "json"
107126

@@ -114,6 +133,7 @@ def get_writer(output_format: str, output_dir: str) -> Callable[[dict, TextIO],
114133
"txt": WriteTXT,
115134
"vtt": WriteVTT,
116135
"srt": WriteSRT,
136+
"tsv": WriteTSV,
117137
"json": WriteJSON,
118138
}
119139

@@ -127,3 +147,4 @@ def write_all(result: dict, file: TextIO):
127147
return write_all
128148

129149
return writers[output_format](output_dir)
150+

0 commit comments

Comments
 (0)