-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathmyougiden_api.py
More file actions
325 lines (266 loc) · 12.3 KB
/
Copy pathmyougiden_api.py
File metadata and controls
325 lines (266 loc) · 12.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
#!/usr/bin/env python3
import argparse
import re
import sys
import romkan
from myougiden import (
color,
common,
config,
database,
orm,
search,
texttools as tt
)
def _build_arg_parser():
    """Build the argparse parser describing every myougiden query option."""
    ap = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    ap.add_argument('--version', action='store_true',
                    help='Show version.')

    ag = ap.add_argument_group('Type of query',
                               '''What field to look in. If not provided,
try all of them and return the first
to match.''')
    ag.add_argument('-k', '--kanji', action='store_const', dest='field',
                    const='kanji', default='auto',
                    help='''Return entries matching query on kanji.''')
    ag.add_argument('-r', '--reading', action='store_const', dest='field',
                    const='reading',
                    help='''Return entries matching query on reading
(in kana or rōmaji).''')
    ag.add_argument('-g', '--gloss', '--meaning', action='store_const',
                    dest='field', const='gloss',
                    help='''Return entries matching query on glosses (English
translations/meaning).''')

    ag = ap.add_argument_group('Query options')
    ag.add_argument('--case-sensitive', '--sensitive', action='store_true',
                    help='''Case-sensitive search (distinguish uppercase from
lowercase). Default: Insensitive, unless there's an
uppercase letter in query.''')
    ag.add_argument('-x', '--regexp', action='store_true',
                    help='''Regular expression search. Extent limits (-e) are
respected. Regexps currently don't work
for rōmaji; use kana for readings.''')
    ag.add_argument('-e', '--extent', default='auto',
                    choices=('whole', 'beginning', 'word', 'partial', 'auto'),
                    help='''How much of the field should the query match:
- whole: Query must match the entire field.
- beginning: Query must match the beginning
of the field.
- word: Query must match whole word (at present
only works for English; treated as 'whole'
for kanji or reading fields.)
- partial: Query may match anywhere, even partially
inside words.
- auto (default): Try all four, and return the
first to match something.''')
    ag.add_argument('-w', '--whole', action='store_const',
                    const='whole', dest='extent',
                    help='''Equivalent to --extent=whole.''')
    ag.add_argument('-b', '--beginning', action='store_const',
                    const='beginning', dest='extent',
                    help='''Equivalent to --extent=beginning.''')
    ag.add_argument('--word', action='store_const', const='word',
                    dest='extent',
                    help='''Equivalent to --extent=word.''')
    ag.add_argument('-p', '--partial', action='store_const',
                    const='partial', dest='extent',
                    help='Equivalent to --extent=partial.')
    ag.add_argument('-f', '--frequent', '-P', action='store_true',
                    help='''Restrict to frequent words (equivalent
to EDICT entries marked as ‘(P)’)''')

    ag = ap.add_argument_group('Output control')
    ag.add_argument('--output-mode', default='tab',
                    choices=('human', 'tab', 'auto'),
                    help='''Output mode; one of:
- human: Multiline human-readable output.
- tab: One-line tab-separated.
- auto (default): Human if output is to
terminal, tab if writing
to pipe or file.''')
    ag.add_argument('-t', '--tsv', '--tab', action='store_const',
                    const='tab', dest='output_mode',
                    help="Equivalent to --output-mode=tab")
    ag.add_argument('--human', action='store_const', const='human',
                    dest='output_mode',
                    help="Equivalent to --output-mode=human")
    ag.add_argument('--color', choices=('yes', 'no', 'auto'), default='no',
                    help='''Whether to colorize output. Default 'auto' means to
colorize if writing to a terminal.''')
    ag.add_argument('-c', action='store_const', const='yes', dest='color',
                    help='Equivalent to --color=yes')
    ag.add_argument('--background', '--bg', choices=('dark', 'light', 'auto'),
                    default='auto',
                    help='''Use colorscheme for dark or light background.
Autodetection can be spotty. If it's not working
for you, you can also set it in the BACKGROUND
environment variable.''')
    ag.add_argument('--out-hepburn', '--oh',
                    action='store_const', const=romkan.to_hepburn,
                    dest='out_romaji', default=None,
                    help='Convert reading to Hepburn rōmaji in output.')
    ag.add_argument('--out-kunrei', '--ok',
                    action='store_const', const=romkan.to_kunrei,
                    dest='out_romaji', default=None,
                    help='Convert reading to Kunrei rōmaji in output.')

    ag = ap.add_argument_group('Abbreviations help')
    ag.add_argument('--list-abbrevs', action='store_true',
                    help='''List all abbreviations.''')
    ag.add_argument('-a', '--abbrev', metavar='ABBREV', default=None,
                    help='''Print meaning of an abbreviation.''')

    ap.add_argument('query', help='Text to look for.',
                    metavar='QUERY', nargs='*')
    return ap


def _build_conditions(search_args, fields, extents, regexp_flags, auto_extent):
    """Enumerate search conditions for every regexp/extent/field combination.

    Each condition is a copy of ``search_args`` with its 'field', 'extent'
    and 'regexp' keys overridden.  ``auto_extent`` tells whether the extents
    are being tried exhaustively (the caller's extent setting is 'auto').
    """
    conditions = []
    for regexp in regexp_flags:
        for extent in extents:
            for field in fields:
                effective_extent = extent
                # 'word' on a non-gloss field is the useless combination;
                # avoid wasting time on it.
                if extent == 'word' and field != 'gloss':
                    if auto_extent:
                        # every extent is tried elsewhere in the loop, so
                        # this combination can simply be skipped.
                        continue
                    # this is the only pass over this field, so adjust the
                    # extent instead of dropping the search.
                    effective_extent = 'whole'

                cond = search_args.copy()
                cond['extent'] = effective_extent
                cond['field'] = field
                cond['regexp'] = regexp
                conditions.append(cond)
    return conditions


def _add_kana_conditions(conditions, query):
    """Return a copy of ``conditions`` with kana variants of rōmaji readings.

    For every condition on the 'reading' field, the kana conversions of each
    expansion of its query are inserted *before* that condition, so the
    converted forms are tried first and the as-is query afterwards.
    """
    # an uppercase letter in the query makes katakana the first guess.
    if re.search('[A-Z]', query):
        kana_guess = (romkan.to_katakana, romkan.to_hiragana)
    else:
        kana_guess = (romkan.to_hiragana, romkan.to_katakana)

    expanded = conditions[:]
    for oldcond in conditions:
        if oldcond['field'] != 'reading':
            continue
        for kanafn in kana_guess:
            for romaji in tt.expand_romaji(oldcond['query']):
                newcond = oldcond.copy()
                newcond['query'] = kanafn(romaji)
                expanded.insert(expanded.index(oldcond), newcond)
    return expanded


def run(query):
    """Look up ``query`` in the myougiden database and return formatted text.

    Output is forced to colored, human-readable mode on a dark background;
    as a side effect ``color.use_color`` is set to True.

    Args:
        query: The text to search for, as a string.

    Returns:
        The matching entries formatted for humans, separated by blank lines
        and terminated by a newline, or None when ``query`` is empty or
        nothing matched.  (If the output mode were 'tab', a list of
        tab-separated lines would be returned instead.)

    Note:
        The function calls ``sys.exit`` when the configuration or database
        is unavailable, and after serving ``--version``, ``--list-abbrevs``
        or ``--abbrev`` if those flags are present on the command line.
    """
    # nothing to look for: bail out before building the parser or touching
    # the database.
    if not query:
        return None

    ap = _build_arg_parser()
    # NOTE(review): parse_args() without an argument list reads sys.argv, so
    # flags on the host process' command line influence (or abort) this
    # call.  Kept as-is for compatibility -- confirm whether intended.
    args = ap.parse_args()

    # API overrides: fixed presentation, and the query comes from the
    # caller rather than from the command line.
    args.output_mode = 'human'
    args.query = query
    args.color = 'yes'
    args.background = 'dark'
    color.use_color = True

    # case sensitivity must be handled before opening db
    if not args.case_sensitive and re.search("[A-Z]", args.query):
        args.case_sensitive = True

    if not config:
        # NOTE(review): the original called a bare, undefined `fmt`; this
        # assumes the helper lives in myougiden.color -- confirm.
        print('%s: Could not find config.ini!' % color.fmt('ERROR', 'error'))
        # print version regardless
        if args.version:
            print(common.version(None))
        sys.exit(2)

    # try to open database
    # NOTE(review): `con` is never closed here; verify whether callers in a
    # long-running process need the connection released.
    try:
        con, cur = database.opendb(case_sensitive=args.case_sensitive)
    except database.DatabaseAccessError as e:
        print('''Database error: %s.
Expected database version %s at:
%s
Before using myougiden for the first time, you need to compile the JMdict
(EDICT) dictionary. Try running this command to download and compile it:
updatedb-myougiden -f
It will take a while, but lookups afterwards will be fast.
JMdict is frequently updated. If you'd like to keep up with new entries,
you might want to add the update command to cron (for example, in
/etc/cron.weekly/myougiden ).'''
              % (str(e), config.get('core', 'dbversion'),
                 config.get('paths', 'database')))
        if args.version:
            print()
            print(common.version(None))
        sys.exit(2)

    # handle short commands first.
    if args.version:
        print(common.version(cur))
        sys.exit(0)
    elif args.list_abbrevs:
        print(orm.abbrevs_table(cur))
        sys.exit(0)
    elif args.abbrev:
        a = orm.abbrev_line(cur, args.abbrev)
        print(a if a else 'Not found!')
        sys.exit(0)

    # 'word' doesn't work for Jap. anyway, and 'whole' is much faster.
    if args.extent == 'word' and args.field in ('kanji', 'reading'):
        args.extent = 'whole'

    # a dictionary of options with only the keys understood by the search
    # layer.
    wanted = ('field', 'query', 'extent',
              'regexp', 'case_sensitive', 'frequent')
    search_args = {k: v for k, v in vars(args).items() if k in wanted}

    # we'll iterate over all required 'field' and 'extent' conditions.
    #
    # for code clarity, we always use a tuple of candidates, even if its
    # size is 1.
    if args.field != 'auto':
        fields = (args.field,)
    elif tt.is_latin(args.query):
        # if pure alphabet, try as English first, then as rōmaji
        fields = ('gloss', 'reading', 'kanji')
    elif tt.is_romaji(args.query):
        # latin with special chars; probably rōmaji
        fields = ('reading', 'gloss', 'kanji')
    elif tt.is_kana(args.query):
        fields = ('reading', 'kanji', 'gloss')
    else:
        fields = ('kanji', 'reading', 'gloss')

    if args.extent != 'auto':
        extents = (args.extent,)
    else:
        extents = ('whole', 'word', 'partial')

    if args.regexp:
        regexp_flags = (True,)
    elif tt.has_regexp_special(args.query):
        regexp_flags = (False, True)
    else:
        regexp_flags = (False,)

    conditions = _build_conditions(search_args, fields, extents,
                                   regexp_flags, args.extent == 'auto')

    # deal with rōmaji queries
    if args.field in ('auto', 'reading') and tt.is_romaji(args.query):
        conditions = _add_kana_conditions(conditions, args.query)

    chosen_search, ent_seqs = search.guess(cur, conditions)
    if not chosen_search:
        return None

    entries = [orm.fetch_entry(cur, ent_seq) for ent_seq in ent_seqs]
    if args.output_mode == 'tab':
        # tab mode hands back the individual lines, unjoined.
        return [entry.format_tsv(search_params=chosen_search,
                                 romajifn=args.out_romaji)
                for entry in entries]

    formatted = [entry.format_human(search_params=chosen_search,
                                    romajifn=args.out_romaji)
                 for entry in entries]
    return "\n\n".join(formatted) + "\n"