|
1 | | -""" |
2 | | -Initialize the pie tagger from the python class directly, and use that object to tag. |
3 | | -We use this method instead of calling 'pie tag' on the command line, |
4 | | -because we want to avoid the overhead of reinitializing the tagger. |
5 | | -""" |
6 | | - |
7 | | -# Standard library |
8 | | -import os |
9 | | -import shutil |
10 | | -import tempfile |
11 | | -import sys |
12 | | - |
13 | | -# Some path magic to import pie. |
14 | | -# Because pie mixes all kinds of absolute and relative imports. |
15 | | -script_dir = os.path.dirname(os.path.realpath(__file__)) |
16 | | -sys.path.insert(0, os.path.join(script_dir, "pie")) |
17 | | - |
18 | | -from pie.tagger import Tagger |
19 | | - |
20 | | -# The extension of output files produced by the tagger. |
21 | | -OUTPUT_EXTENSION = ".tsv" |
22 | | -# Expected throughput in chars per sec. |
23 | | -PROCESSING_SPEED = 370 |
24 | | -# Global tagger for the sake of initialization. |
25 | | -tagger = None |
26 | | - |
27 | | - |
28 | | -def init() -> None: |
29 | | - """ |
30 | | - We initialize the PIE tagger class directly. |
31 | | - """ |
32 | | - device = "cpu" if os.getenv("CPU_GPU") == "cpu" else "cuda" |
33 | | - global tagger |
34 | | - tagger = Tagger( |
35 | | - batch_size=50, |
36 | | - lower=False, |
37 | | - max_sent_len=35, |
38 | | - vrt=False, |
39 | | - tokenize=True, |
40 | | - device=device, |
41 | | - ) |
42 | | - for model, tasks in [("model.tar", [])]: |
43 | | - tagger.add_model(model, *tasks) |
44 | | - print("Model initialized.") |
45 | | - |
46 | | - |
47 | | -def process(in_file: str, out_file: str) -> None: |
48 | | - """ |
49 | | - Process the file with the global tagger instance. |
50 | | - Pie outputs to the same directory as the input file. |
51 | | - We process the file in a temporary directory so we don't pollute /input. |
52 | | - """ |
53 | | - with tempfile.TemporaryDirectory() as temp_dir: |
54 | | - # input |
55 | | - temp_in_file = os.path.join(temp_dir, "file.txt") |
56 | | - shutil.copy(in_file, temp_in_file) |
57 | | - # tag |
58 | | - tagger.tag_file( |
59 | | - temp_in_file, use_beam=False, beam_width=10, keep_boundaries=False |
60 | | - ) |
61 | | - # output |
62 | | - temp_result_file = os.path.join(temp_dir, "file-pie.txt") |
63 | | - shutil.move(temp_result_file, out_file) |
| 1 | +""" |
| 2 | +Initialize the pie tagger from the python class directly, and use that object to tag. |
| 3 | +We use this method instead of calling 'pie tag' on the command line, |
| 4 | +because we want to avoid the overhead of reinitializing the tagger. |
| 5 | +""" |
| 6 | + |
| 7 | +# Standard library |
| 8 | +import os |
| 9 | +import shutil |
| 10 | +import tempfile |
| 11 | +import sys |
| 12 | + |
| 13 | +# Some path magic to import pie. |
| 14 | +# Because pie mixes all kinds of absolute and relative imports. |
| 15 | +script_dir = os.path.dirname(os.path.realpath(__file__)) |
| 16 | +sys.path.insert(0, os.path.join(script_dir, "pie")) |
| 17 | + |
| 18 | +from pie.tagger import Tagger |
| 19 | + |
| 20 | +# The extension of output files produced by the tagger. |
| 21 | +OUTPUT_EXTENSION = ".tsv" |
| 22 | +# Expected throughput in chars per sec. |
| 23 | +PROCESSING_SPEED = 370 |
| 24 | +# Global tagger for the sake of initialization. |
| 25 | +tagger = None |
| 26 | + |
| 27 | + |
| 28 | +def init() -> None: |
| 29 | + """ |
| 30 | + We initialize the PIE tagger class directly. |
| 31 | + """ |
| 32 | + device = "cpu" if os.getenv("CPU_GPU") == "cpu" else "cuda" |
| 33 | + global tagger |
| 34 | + tagger = Tagger( |
| 35 | + batch_size=50, |
| 36 | + lower=False, |
| 37 | + max_sent_len=35, |
| 38 | + vrt=False, |
| 39 | + tokenize=True, |
| 40 | + device=device, |
| 41 | + ) |
| 42 | + for model, tasks in [("model.tar", [])]: |
| 43 | + tagger.add_model(model, *tasks) |
| 44 | + print("Model initialized.") |
| 45 | + |
| 46 | + |
| 47 | +def process(in_file: str, out_file: str) -> None: |
| 48 | + """ |
| 49 | + Process the file with the global tagger instance. |
| 50 | + Pie outputs to the same directory as the input file. |
| 51 | + We process the file in a temporary directory so we don't pollute /input. |
| 52 | + """ |
| 53 | + with tempfile.TemporaryDirectory() as temp_dir: |
| 54 | + # input |
| 55 | + temp_in_file = os.path.join(temp_dir, "file.txt") |
| 56 | + shutil.copy(in_file, temp_in_file) |
| 57 | + # tag |
| 58 | + tagger.tag_file( |
| 59 | + temp_in_file, use_beam=False, beam_width=10, keep_boundaries=True |
| 60 | + ) |
| 61 | + # output |
| 62 | + temp_result_file = os.path.join(temp_dir, "file-pie.txt") |
| 63 | + shutil.move(temp_result_file, out_file) |
0 commit comments