OCR-Manga/myougiden_api.py at master · klaxa/OCR-Manga · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
#!/usr/bin/env python3

import argparse
import re
import sys

import romkan

from myougiden import (
    color,
    common,
    config,
    database,
    orm,
    search,
    texttools as tt
)


def _build_parser():
    """Build the argparse parser describing the myougiden-style options."""
    ap = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)

    ap.add_argument('--version', action='store_true',
                    help='Show version.')

    ag = ap.add_argument_group('Type of query',
                               '''What field to look in.  If not provided,
                                  try all of them and return the first
                                  to match.''')
    ag.add_argument('-k', '--kanji', action='store_const', dest='field',
                    const='kanji', default='auto',
                    help='''Return entries matching query on kanji.''')

    ag.add_argument('-r', '--reading', action='store_const', dest='field',
                    const='reading',
                    help='''Return entries matching query on reading
                            (in kana or rōmaji).''')

    ag.add_argument('-g', '--gloss', '--meaning', action='store_const',
                    dest='field', const='gloss',
                    help='''Return entries matching query on glosses (English
                            translations/meaning).''')

    ag = ap.add_argument_group('Query options')
    ag.add_argument('--case-sensitive', '--sensitive', action='store_true',
                    help='''Case-sensitive search (distinguish uppercase from
                            lowercase). Default: Insensitive, unless there's an
                            uppercase letter in query.''')

    ag.add_argument('-x', '--regexp', action='store_true',
                    help='''Regular expression search.  Extent limits (-e) are
                            respected.  Regexps currently don't work
                            for rōmaji; use kana for readings.''')

    ag.add_argument('-e', '--extent', default='auto',
                    choices=('whole', 'beginning', 'word', 'partial', 'auto'),
                    help='''How much of the field should the query match:
                            - whole: Query must match the entire field.
                            - beginning: Query must match the beginning
                                         of the field.
                            - word: Query must match whole word (at present
                                    only works for English; treated as 'whole'
                                    for kanji or reading fields.)
                            - partial: Query may match anywhere, even partially
                                       inside words.
                            - auto (default): Try all four, and return the
                                              first to match something.''')

    ag.add_argument('-w', '--whole', action='store_const',
                    const='whole', dest='extent',
                    help='''Equivalent to --extent=whole.''')

    ag.add_argument('-b', '--beginning', action='store_const',
                    const='beginning', dest='extent',
                    help='''Equivalent to --extent=beginning.''')

    ag.add_argument('--word', action='store_const', const='word',
                    dest='extent',
                    help='''Equivalent to --extent=word.''')

    ag.add_argument('-p', '--partial', action='store_const',
                    const='partial', dest='extent',
                    help='Equivalent to --extent=partial.')
    ag.add_argument('-f', '--frequent', '-P', action='store_true',
                    help='''Restrict to frequent words (equivalent
                            to EDICT entries marked as ‘(P)’)''')

    ag = ap.add_argument_group('Output control')
    ag.add_argument('--output-mode', default='tab',
                    choices=('human', 'tab', 'auto'),
                    help='''Output mode; one of:
                            - human: Multiline human-readable output.
                            - tab: One-line tab-separated.
                            - auto (default): Human if output is to
                                              terminal, tab if writing
                                              to pipe or file.''')

    ag.add_argument('-t', '--tsv', '--tab', action='store_const',
                    const='tab', dest='output_mode',
                    help="Equivalent to --output-mode=tab")

    ag.add_argument('--human', action='store_const', const='human',
                    dest='output_mode',
                    help="Equivalent to --output-mode=human")
    ag.add_argument('--color', choices=('yes', 'no', 'auto'), default='no',
                    help='''Whether to colorize output.  Default 'auto' means to
                            colorize if writing to a terminal.''')
    ag.add_argument('-c', action='store_const', const='yes', dest='color',
                    help='Equivalent to --color=yes')
    ag.add_argument('--background', '--bg', choices=('dark', 'light', 'auto'),
                    default='auto',
                    help='''Use colorscheme for dark or light background.
                            Autodetection can be spotty.  If it's not working
                            for you, you can also set it in the BACKGROUND
                            environment variable.''')

    ag.add_argument('--out-hepburn', '--oh',
                    action='store_const', const=romkan.to_hepburn,
                    dest='out_romaji', default=None,
                    help='Convert reading to Hepburn rōmaji in output.')
    ag.add_argument('--out-kunrei', '--ok',
                    action='store_const', const=romkan.to_kunrei,
                    dest='out_romaji', default=None,
                    help='Convert reading to Kunrei rōmaji in output.')

    ag = ap.add_argument_group('Abbreviations help')
    ag.add_argument('--list-abbrevs', action='store_true',
                    help='''List all abbreviations.''')
    ag.add_argument('-a', '--abbrev', metavar='ABBREV', default=None,
                    help='''Print meaning of an abbreviation.''')

    ap.add_argument('query', help='Text to look for.',
                    metavar='QUERY', nargs='*')
    return ap


def run(query):
    """Look up ``query`` in the myougiden dictionary database.

    Options are read from ``sys.argv`` through the parser built by
    ``_build_parser``; the query, output mode ('human'), color ('yes') and
    background ('dark') are then forced regardless of what argv said.

    Args:
        query: Text to search for (a string).

    Returns:
        A string holding the human-formatted entries separated by blank
        lines and terminated by a newline, or ``None`` when ``query`` is
        empty/``None`` or when no search condition matched.

    Raises:
        SystemExit: with status 2 when the configuration is missing or the
            database cannot be opened; with status 0 after serving the
            ``--version``, ``--list-abbrevs`` or ``--abbrev`` options if
            they are present in ``sys.argv``.
    """
    # Nothing to look up; bail out before touching argv or the database.
    if not query:
        return None

    ap = _build_parser()
    # sys.argv belongs to the host application.  Recognized flags are still
    # honoured, but flags this parser does not know must not abort a lookup.
    args, _unknown = ap.parse_known_args()

    args.output_mode = 'human'
    args.query = query
    args.color = 'yes'
    args.background = 'dark'

    color.use_color = True

    # case sensitivity must be handled before opening db
    if not args.case_sensitive and re.search("[A-Z]", args.query):
        args.case_sensitive = True

    if not config:
        print('%s: Could not find config.ini!' % color.fmt('ERROR', 'error'))

        # print version regardless
        if args.version:
            print(common.version(None))
        sys.exit(2)

    # try to open database
    # NOTE(review): `con` is never closed in this function -- confirm whether
    # database.opendb() expects the caller to close it.
    try:
        con, cur = database.opendb(case_sensitive=args.case_sensitive)
    except database.DatabaseAccessError as e:
        print('''Database error: %s.
    Expected database version %s at:
    %s

    Before using myougiden for the first time, you need to compile the JMdict
    (EDICT) dictionary.  Try running this command to download and compile it:

        updatedb-myougiden -f

    It will take a while, but lookups afterwards will be fast.

    JMdict is frequently updated.  If you'd like to keep up with new entries,
    you might want to add the update command to cron (for example, in
    /etc/cron.weekly/myougiden ).'''
              % (str(e), config.get('core', 'dbversion'),
                 config.get('paths', 'database')))

        if args.version:
            print()
            print(common.version(None))
        sys.exit(2)

    # handle short commands first.
    if args.version:
        print(common.version(cur))
        sys.exit(0)

    elif args.list_abbrevs:
        print(orm.abbrevs_table(cur))
        sys.exit(0)

    elif args.abbrev:
        a = orm.abbrev_line(cur, args.abbrev)
        print(a if a else 'Not found!')
        sys.exit(0)

    # 'word' doesn't work for Jap. anyway, and 'whole' is much faster.
    if args.extent == 'word' and args.field in ('kanji', 'reading'):
        args.extent = 'whole'

    # first, we need a dictionary of options with only keys understood
    # by search_by().
    wanted_keys = ('field', 'query', 'extent',
                   'regexp', 'case_sensitive', 'frequent')
    search_args = {k: v for k, v in vars(args).items() if k in wanted_keys}

    # we'll iterate over all required 'field' and 'extent' conditions.
    #
    # for code clarity, we always use a list of search conditions,
    # even if the size of the list is 1.

    if args.field == 'auto':
        if tt.is_latin(args.query):
            # if pure alphabet, try as English first, then as rōmaji
            fields = ('gloss', 'reading', 'kanji')
        elif tt.is_romaji(args.query):
            # latin with special chars; probably rōmaji
            fields = ('reading', 'gloss', 'kanji')
        elif tt.is_kana(args.query):
            fields = ('reading', 'kanji', 'gloss')
        else:
            fields = ('kanji', 'reading', 'gloss')
    else:
        fields = (args.field,)

    if args.extent != 'auto':
        extents = (args.extent,)
    else:
        extents = ('whole', 'word', 'partial')

    if args.regexp:
        regexp_flags = (True,)
    elif tt.has_regexp_special(args.query):
        regexp_flags = (False, True)
    else:
        regexp_flags = (False,)

    conditions = []
    for regexp in regexp_flags:
        for extent in extents:
            for field in fields:
                effective_extent = extent

                # the useless combination; we'll avoid it to avoid wasting
                # time.
                if extent == 'word' and field != 'gloss':
                    if args.extent == 'auto':
                        # we're trying all possibilities, so we can just
                        # skip this one.  other extents were/will be tried
                        # elsewhen in the loop.
                        continue
                    # not trying all possibilities; this is our only
                    # pass in this field, so let's adjust it.
                    effective_extent = 'whole'

                sa = search_args.copy()
                sa['extent'] = effective_extent
                sa['field'] = field
                sa['regexp'] = regexp
                conditions.append(sa)

    # deal with rōmaji queries
    if args.field in ('auto', 'reading') and tt.is_romaji(args.query):

        if re.search('[A-Z]', args.query):
            kana_guess = (romkan.to_katakana, romkan.to_hiragana)
        else:
            kana_guess = (romkan.to_hiragana, romkan.to_katakana)

        new_conditions = conditions[:]
        for oldcond in conditions:
            if oldcond['field'] == 'reading':
                for kanafn in kana_guess:
                    # the query looks like romaji and the field is reading.
                    # so we try it converted to kana _first_, then try as-is.
                    # thus the insert.

                    for romaji in tt.expand_romaji(oldcond['query']):
                        newcond = oldcond.copy()
                        newcond['query'] = kanafn(romaji)
                        new_conditions.insert(new_conditions.index(oldcond),
                                              newcond)
        conditions = new_conditions

    chosen_search, ent_seqs = search.guess(cur, conditions)

    if not chosen_search:
        return None

    entries = [orm.fetch_entry(cur, ent_seq) for ent_seq in ent_seqs]

    if args.output_mode == 'tab':
        # Currently unreachable because output_mode is forced to 'human'
        # above; kept so the tab formatting path is not lost.  Returns the
        # list of per-entry lines, unjoined.
        return [entry.format_tsv(search_params=chosen_search,
                                 romajifn=args.out_romaji)
                for entry in entries]

    out = [entry.format_human(search_params=chosen_search,
                              romajifn=args.out_romaji)
           for entry in entries]
    return ("\n\n".join(out)) + "\n"