Address points in #4

polm · polm · commit a64fd5a0b1be · 2020-07-13T00:29:43.000+09:00
Two things:

- `str.isascii()` is actually only available in Python 3.7+
- 私 was becoming 代名詞

It turns out a small number of words - 私, 君, 余, but not 僕 etc. -
have a lemma that looks like 私-代名詞. This is weird.
diff --git a/cutlet/cutlet.py b/cutlet/cutlet.py
@@ -14,6 +14,40 @@
         'nihon': NIHONSHIKI,
 }
 
+def isascii(word):
+    try:
+       word.encode('ascii');
+       return True
+    except UnicodeEncodeError:
+       return False
+
+def has_foreign_lemma(word):
+    """Check if a word has a foreign lemma.
+
+    This doesn't get its own field, the lemma field is overloaded. There are
+    also cases where the lemma field is overloaded with non-foreign-lemma
+    information."""
+
+    if '-' in word.surface: 
+        # TODO check if this is actually possible in vanilla unidic
+        return False
+
+    if not word.feature.lemma:
+        # No lemma means no foreign lemma
+        return False
+
+    lemma = word.feature.lemma
+
+    if not '-' in lemma:
+        return False
+
+    cand = lemma.split('-')[-1]
+    # NOTE: some words have 外国 instead of a foreign spelling. ジル
+    # (Jill?) is an example. Unclear why this is the case.
+    # NOTE: There are other hyphenated lemmas, like 私-代名詞. 
+    if isascii(cand):
+        return True
+
 def load_exceptions():
     cdir = pathlib.Path(__file__).parent.absolute()
     exceptions = {}
@@ -130,7 +164,7 @@ def romaji_word(self, word):
         if word.surface.isdigit():
             return word.surface
 
-        if word.surface.isascii():
+        if isascii(word.surface):
             return word.surface
 
         if word.feature.pos1 == '補助記号':
@@ -145,13 +179,8 @@ def romaji_word(self, word):
                 word.feature.pos1 == '助詞' and word.feature.pron == 'オ'):
             return 'o'
         elif (self.use_foreign_spelling and 
-                '-' not in word.surface and word.feature.lemma and
-                '-' in word.feature.lemma and
-                '外国' not in word.feature.lemma):
+                has_foreign_lemma(word)):
             # this is a foreign word with known spelling
-
-            #NOTE: some words have 外国 instead of a foreign spelling. ジル
-            # (Jill?) is an example. Unclear why this is the case.
             return word.feature.lemma.split('-')[-1]
         elif word.feature.kana:
             # for known words
diff --git a/cutlet/test/test_basic.py b/cutlet/test/test_basic.py
@@ -51,6 +51,7 @@
             "Kokuritsu kokugo kenkyuusho (NINJAL) wa, Nippon gogaku/gengogaku/Nippon go kyouiku kenkyuu wo chuushin to shita kenkyuu kikan desu."),
         ("やっちゃった！", "Yacchatta!"),
         ("暖かかった", "Atatakakatta"),
+        ("私はテストです", "Watakushi wa test desu"), # issue #4, 私 -> 代名詞
         ]
 
 SENTENCES_KUNREI = [

Original file line number	Diff line number	Diff line change
`@@ -51,6 +51,7 @@`
`51`	`51`	`"Kokuritsu kokugo kenkyuusho (NINJAL) wa, Nippon gogaku/gengogaku/Nippon go kyouiku kenkyuu wo chuushin to shita kenkyuu kikan desu."),`
`52`	`52`	`("やっちゃった！", "Yacchatta!"),`
`53`	`53`	`("暖かかった", "Atatakakatta"),`
	`54`	`+ ("私はテストです", "Watakushi wa test desu"), # issue #4, 私 -> 代名詞`
`54`	`55`	`]`
`55`	`56`
`56`	`57`	`SENTENCES_KUNREI = [`