Skip to content

Commit a64fd5a

Browse files
committed
Address points in #4
Two things: - isascii on strings is actually Python 3.7+ only - 私 was becoming 代名詞. It turns out a small number of words - 私, 君, 余, but not 僕 etc. - have a lemma that looks like 私-代名詞. This is weird.
1 parent 689ade4 commit a64fd5a

File tree

2 files changed

+37
-7
lines changed

2 files changed

+37
-7
lines changed

cutlet/cutlet.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,40 @@
1414
'nihon': NIHONSHIKI,
1515
}
1616

17+
def isascii(word):
    """Return True if `word` can be encoded as ASCII, False otherwise.

    Stand-in for `str.isascii()`, which is only available on Python 3.7+.
    The empty string counts as ASCII.
    """
    try:
        word.encode('ascii')
    except UnicodeEncodeError:
        # At least one character falls outside the 7-bit ASCII range.
        return False
    else:
        return True
23+
24+
def has_foreign_lemma(word):
    """Check if a word has a foreign lemma.

    This doesn't get its own field, the lemma field is overloaded. There are
    also cases where the lemma field is overloaded with non-foreign-lemma
    information.

    Args:
        word: a token exposing `surface` (str) and `feature.lemma` (str, or
            a falsy value when the token has no lemma).

    Returns:
        True if the lemma is hyphenated and the part after its last hyphen
        is pure ASCII; False in every other case.
    """
    if '-' in word.surface:
        # TODO check if this is actually possible in vanilla unidic
        return False

    lemma = word.feature.lemma
    if not lemma:
        # No lemma means no foreign lemma
        return False

    if '-' not in lemma:
        return False

    cand = lemma.split('-')[-1]
    # NOTE: some words have 外国 instead of a foreign spelling. ジル
    # (Jill?) is an example. Unclear why this is the case.
    # NOTE: There are other hyphenated lemmas, like 私-代名詞.
    # Return the check result directly so the non-ASCII case yields an
    # explicit False instead of falling off the end and returning None.
    return isascii(cand)
50+
1751
def load_exceptions():
1852
cdir = pathlib.Path(__file__).parent.absolute()
1953
exceptions = {}
@@ -130,7 +164,7 @@ def romaji_word(self, word):
130164
if word.surface.isdigit():
131165
return word.surface
132166

133-
if word.surface.isascii():
167+
if isascii(word.surface):
134168
return word.surface
135169

136170
if word.feature.pos1 == '補助記号':
@@ -145,13 +179,8 @@ def romaji_word(self, word):
145179
word.feature.pos1 == '助詞' and word.feature.pron == 'オ'):
146180
return 'o'
147181
elif (self.use_foreign_spelling and
148-
'-' not in word.surface and word.feature.lemma and
149-
'-' in word.feature.lemma and
150-
'外国' not in word.feature.lemma):
182+
has_foreign_lemma(word)):
151183
# this is a foreign word with known spelling
152-
153-
#NOTE: some words have 外国 instead of a foreign spelling. ジル
154-
# (Jill?) is an example. Unclear why this is the case.
155184
return word.feature.lemma.split('-')[-1]
156185
elif word.feature.kana:
157186
# for known words

cutlet/test/test_basic.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
"Kokuritsu kokugo kenkyuusho (NINJAL) wa, Nippon gogaku/gengogaku/Nippon go kyouiku kenkyuu wo chuushin to shita kenkyuu kikan desu."),
5252
("やっちゃった!", "Yacchatta!"),
5353
("暖かかった", "Atatakakatta"),
54+
("私はテストです", "Watakushi wa test desu"), # issue #4, 私 -> 代名詞
5455
]
5556

5657
SENTENCES_KUNREI = [

0 commit comments

Comments
 (0)