Skip to content

Commit 614431e

Browse files
committed
Pie: keep sentence boundaries
1 parent 98be010 commit 614431e

1 file changed

Lines changed: 63 additions & 63 deletions

File tree

pie/base/process.py

Lines changed: 63 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,63 +1,63 @@
1-
"""
2-
Initialize the pie tagger from the python class directly, and use that object to tag.
3-
We use this method instead of calling 'pie tag' on the command line,
4-
because we want to avoid the overhead of reinitializing the tagger.
5-
"""
6-
7-
# Standard library
8-
import os
9-
import shutil
10-
import tempfile
11-
import sys
12-
13-
# Some path magic to import pie.
14-
# Because pie mixes all kinds of absolute and relative imports.
15-
script_dir = os.path.dirname(os.path.realpath(__file__))
16-
sys.path.insert(0, os.path.join(script_dir, "pie"))
17-
18-
from pie.tagger import Tagger
19-
20-
# The extension of output files produced by the tagger.
21-
OUTPUT_EXTENSION = ".tsv"
22-
# Expected throughput in chars per sec.
23-
PROCESSING_SPEED = 370
24-
# Global tagger for the sake of initialization.
25-
tagger = None
26-
27-
28-
def init() -> None:
29-
"""
30-
We initialize the PIE tagger class directly.
31-
"""
32-
device = "cpu" if os.getenv("CPU_GPU") == "cpu" else "cuda"
33-
global tagger
34-
tagger = Tagger(
35-
batch_size=50,
36-
lower=False,
37-
max_sent_len=35,
38-
vrt=False,
39-
tokenize=True,
40-
device=device,
41-
)
42-
for model, tasks in [("model.tar", [])]:
43-
tagger.add_model(model, *tasks)
44-
print("Model initialized.")
45-
46-
47-
def process(in_file: str, out_file: str) -> None:
48-
"""
49-
Process the file with the global tagger instance.
50-
Pie outputs to the same directory as the input file.
51-
We process the file in a temporary directory so we don't pollute /input.
52-
"""
53-
with tempfile.TemporaryDirectory() as temp_dir:
54-
# input
55-
temp_in_file = os.path.join(temp_dir, "file.txt")
56-
shutil.copy(in_file, temp_in_file)
57-
# tag
58-
tagger.tag_file(
59-
temp_in_file, use_beam=False, beam_width=10, keep_boundaries=False
60-
)
61-
# output
62-
temp_result_file = os.path.join(temp_dir, "file-pie.txt")
63-
shutil.move(temp_result_file, out_file)
1+
"""
2+
Initialize the pie tagger from the python class directly, and use that object to tag.
3+
We use this method instead of calling 'pie tag' on the command line,
4+
because we want to avoid the overhead of reinitializing the tagger.
5+
"""
6+
7+
# Standard library
8+
import os
9+
import shutil
10+
import tempfile
11+
import sys
12+
13+
# Some path magic to import pie.
14+
# Because pie mixes all kinds of absolute and relative imports.
15+
script_dir = os.path.dirname(os.path.realpath(__file__))
16+
sys.path.insert(0, os.path.join(script_dir, "pie"))
17+
18+
from pie.tagger import Tagger
19+
20+
# The extension of output files produced by the tagger.
21+
OUTPUT_EXTENSION = ".tsv"
22+
# Expected throughput in chars per sec.
23+
PROCESSING_SPEED = 370
24+
# Global tagger for the sake of initialization.
25+
tagger = None
26+
27+
28+
def init() -> None:
29+
"""
30+
We initialize the PIE tagger class directly.
31+
"""
32+
device = "cpu" if os.getenv("CPU_GPU") == "cpu" else "cuda"
33+
global tagger
34+
tagger = Tagger(
35+
batch_size=50,
36+
lower=False,
37+
max_sent_len=35,
38+
vrt=False,
39+
tokenize=True,
40+
device=device,
41+
)
42+
for model, tasks in [("model.tar", [])]:
43+
tagger.add_model(model, *tasks)
44+
print("Model initialized.")
45+
46+
47+
def process(in_file: str, out_file: str) -> None:
48+
"""
49+
Process the file with the global tagger instance.
50+
Pie outputs to the same directory as the input file.
51+
We process the file in a temporary directory so we don't pollute /input.
52+
"""
53+
with tempfile.TemporaryDirectory() as temp_dir:
54+
# input
55+
temp_in_file = os.path.join(temp_dir, "file.txt")
56+
shutil.copy(in_file, temp_in_file)
57+
# tag
58+
tagger.tag_file(
59+
temp_in_file, use_beam=False, beam_width=10, keep_boundaries=True
60+
)
61+
# output
62+
temp_result_file = os.path.join(temp_dir, "file-pie.txt")
63+
shutil.move(temp_result_file, out_file)

0 commit comments

Comments
 (0)