1414 'nihon' : NIHONSHIKI ,
1515}
1616
def isascii(word):
    """Return True if *word* can be encoded as ASCII, False otherwise.

    Stand-in for ``str.isascii`` (added in Python 3.7) so the check also
    works on older interpreters. The empty string counts as ASCII.
    """
    try:
        # Only the encode call can raise; the result itself is not needed.
        word.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True
23+
def has_foreign_lemma(word):
    """Check if a word has a foreign lemma.

    This doesn't get its own field, the lemma field is overloaded. There are
    also cases where the lemma field is overloaded with non-foreign-lemma
    information.

    Args:
        word: a token exposing ``surface`` and ``feature.lemma``.

    Returns:
        True when the text after the last hyphen of the lemma is a non-empty
        ASCII string; False in every other case.
    """
    if '-' in word.surface:
        # TODO check if this is actually possible in vanilla unidic
        return False

    lemma = word.feature.lemma
    if not lemma:
        # No lemma means no foreign lemma
        return False

    if '-' not in lemma:
        return False

    cand = lemma.split('-')[-1]
    # NOTE: some words have 外国 instead of a foreign spelling. ジル
    # (Jill?) is an example. Unclear why this is the case.
    # NOTE: There are other hyphenated lemmas, like 私-代名詞.
    # An empty candidate (lemma ending in a hyphen) would trivially pass the
    # ASCII check, so reject it explicitly; always hand back a real bool.
    return bool(cand) and isascii(cand)
50+
1751def load_exceptions ():
1852 cdir = pathlib .Path (__file__ ).parent .absolute ()
1953 exceptions = {}
@@ -130,7 +164,7 @@ def romaji_word(self, word):
130164 if word .surface .isdigit ():
131165 return word .surface
132166
133- if word .surface . isascii ( ):
167+ if isascii ( word .surface ):
134168 return word .surface
135169
136170 if word .feature .pos1 == '補助記号' :
@@ -145,13 +179,8 @@ def romaji_word(self, word):
145179 word .feature .pos1 == '助詞' and word .feature .pron == 'オ' ):
146180 return 'o'
147181 elif (self .use_foreign_spelling and
148- '-' not in word .surface and word .feature .lemma and
149- '-' in word .feature .lemma and
150- '外国' not in word .feature .lemma ):
182+ has_foreign_lemma (word )):
151183 # this is a foreign word with known spelling
152-
153- #NOTE: some words have 外国 instead of a foreign spelling. ジル
154- # (Jill?) is an example. Unclear why this is the case.
155184 return word .feature .lemma .split ('-' )[- 1 ]
156185 elif word .feature .kana :
157186 # for known words
0 commit comments