-
-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Expand file tree
/
Copy pathmanage-imports.py
More file actions
executable file
·220 lines (181 loc) · 6.86 KB
/
manage-imports.py
File metadata and controls
executable file
·220 lines (181 loc) · 6.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/usr/bin/env python
import datetime
import functools
import json
import logging
import multiprocessing
import os
import sys
import time
from openlibrary.api import OLError, OpenLibrary
from openlibrary.config import load_config
from openlibrary.core.ia import get_candidate_ocaids
from openlibrary.core.imports import Batch, ImportItem
# Module-level logger used by every command in this script.
logger = logging.getLogger("openlibrary.importer")
@functools.cache
def get_ol(servername=None):
    """Return a logged-in OpenLibrary API client.

    When the ``LOCAL_DEV`` environment variable is set to a non-empty value,
    the client targets ``http://localhost:8080`` and logs in with the
    hard-coded admin account. Otherwise it targets ``servername`` and calls
    ``autologin()``.

    The result is memoized by ``functools.cache``, so repeated calls with the
    same ``servername`` reuse one client.
    """
    if not os.getenv("LOCAL_DEV"):
        client = OpenLibrary(base_url=servername)
        client.autologin()
        return client

    # Local development: fixed host and credentials.
    client = OpenLibrary(base_url="http://localhost:8080")
    client.login("admin", "admin123")
    return client
def ol_import_request(item, retries=5, servername=None, require_marc=True):
    """Ask Open Library to import ``item``, retrying after failed attempts.

    Items carrying ``item.data`` are sent through ``import_data``; all others
    are imported by ``item.ia_id`` through ``import_ocaid``.

    Returns:
        The value returned by the import call on success; the ``text`` of an
        ``OLError`` whose ``code`` is below 500; or ``None`` when every
        attempt failed.
    """
    # Items without an ia_id are identified in the logs as batch_id:id.
    label = item.ia_id or f"{item.batch_id}:{item.id}"
    logger.info(f"importing {label}")

    for attempt in range(retries):
        if attempt:
            # Pause before every attempt except the first.
            logger.info("sleeping for 5 seconds before next attempt.")
            time.sleep(5)
        try:
            client = get_ol(servername=servername)
            if not item.data:
                return client.import_ocaid(item.ia_id, require_marc=require_marc)
            return client.import_data(item.data)
        except OSError as err:
            logger.warning(f"Failed to contact OL server. error={err!r}")
        except OLError as err:
            logger.warning(f"Failed to contact OL server. error={err!r}")
            # Codes below 500 are not retried; hand the error body back.
            if err.code < 500:
                return err.text

    return None
def do_import(item, servername=None, require_marc=True):
    """Import one queue item and record the outcome via ``item.set_status``.

    The response from ``ol_import_request`` is handled as follows:

    * empty, or not starting with ``{``: status ``failed`` with error
      ``internal-error``;
    * JSON with a truthy ``success`` and an ``edition`` entry: the edition's
      ``status`` is stored together with its ``key`` as ``ol_key``;
    * any other JSON: status ``failed`` with the reply's ``error_code``
      (``unknown-error`` when absent).
    """
    pid = os.getpid()
    logger.info(f"do_import START (pid:{pid})")
    response = ol_import_request(
        item, servername=servername, require_marc=require_marc
    )

    if not response or not response.startswith("{"):
        logger.error(f"failed with internal error: {response}")
        item.set_status("failed", error="internal-error")
    else:
        reply = json.loads(response)
        imported = reply.get("success") and "edition" in reply
        if not imported:
            error_code = reply.get("error_code", "unknown-error")
            logger.error(f"failed with error code: {error_code}")
            item.set_status("failed", error=error_code)
        else:
            edition = reply["edition"]
            status, key = edition["status"], edition["key"]
            logger.info(f"success: {status} {key}")
            item.set_status(status, ol_key=key)

    logger.info(f"do_import END (pid:{pid})")
def add_items(batch_name, filename):
    """Load the items listed in ``filename`` into the batch ``batch_name``.

    The batch is looked up first and created only when the lookup finds
    nothing.
    """
    batch = Batch.find(batch_name)
    if not batch:
        batch = Batch.new(batch_name)
    batch.load_items(filename)
def import_ocaids(*ocaids, **kwargs):
    r"""Import one or more archive.org items into Open Library by ocaid.

    This method is mostly for testing. The ocaids are queued in a batch named
    ``import-<first ocaid>-<YYYYMM>`` and then imported one by one.

    Usage:
        $ sudo -u openlibrary \
            HOME=/home/openlibrary OPENLIBRARY_RCFILE=/olsystem/etc/olrc-importbot \
            python scripts/manage-imports.py \
                --config /olsystem/etc/openlibrary.yml \
                import-ocaids OCAID [OCAID ...]

    Keyword flags read from ``kwargs``:
        servername: forwarded to ``do_import``.
        no_marc: when truthy, ``do_import`` is called with
            ``require_marc=False``.

    Raises:
        ValueError: if no ocaid is given.
    """
    if not ocaids:
        raise ValueError("Must provide at least one ocaid")

    servername = kwargs.get("servername")
    require_marc = not kwargs.get("no_marc", False)

    today = datetime.date.today()
    batch_name = f"import-{ocaids[0]}-{today.year:04}{today.month:02}"

    # Queueing is best effort: a failure here is logged and the import is
    # still attempted for whatever is already in the queue.
    batch = None
    try:
        # Reuse an existing batch instead of relying on Batch.new() failing,
        # which used to leave `batch` unbound for the add_items() call below.
        batch = Batch.find(batch_name) or Batch.new(batch_name)
    except Exception as e:
        logger.info(repr(e))

    if batch is not None:
        try:
            batch.add_items(ocaids)
        except Exception:
            logger.info("skipping batch adding, already present")

    for ocaid in ocaids:
        item = ImportItem.find_by_identifier(ocaid)
        if item:
            do_import(item, servername=servername, require_marc=require_marc)
        else:
            logger.error(f"{ocaid} is not found in the import queue")
def add_new_scans(args):
    """Queue one day's candidate ocaids into that month's ``new-scans`` batch.

    ``args`` may hold a single date written as year-month-day separated by
    hyphens. When ``args`` is empty, yesterday's date is used. The batch is
    named ``new-scans-<YYYYMM>`` and is created if it does not exist yet.
    """
    if not args:
        # No explicit date: default to yesterday.
        scan_date = datetime.date.today() - datetime.timedelta(days=1)
    else:
        yyyy, mm, dd = args[0].split("-")
        scan_date = datetime.date(int(yyyy), int(mm), int(dd))

    # Collect the candidates before touching the batch table.
    ocaids = list(get_candidate_ocaids(scan_date))

    batch_name = f"new-scans-{scan_date.year:04}{scan_date.month:02}"
    batch = Batch.find(batch_name)
    if not batch:
        batch = Batch.new(batch_name)
    batch.add_items(ocaids)
def import_batch(args, **kwargs):
    """Import every item of the batch named by ``args[0]``.

    Keyword flags read from ``kwargs``:
        servername: forwarded to ``do_import``.
        no_marc: when truthy, ``do_import`` is called with
            ``require_marc=False``.

    Exits the process with status 1 (after printing a message to stderr)
    when no batch with that name is found.
    """
    servername = kwargs.get("servername")
    require_marc = not kwargs.get("no_marc", False)

    batch_name = args[0]
    batch = Batch.find(batch_name)
    if not batch:
        # Report the requested name: `batch` is the falsy lookup result and
        # says nothing about what the user asked for.
        print("Unknown batch", batch_name, file=sys.stderr)
        sys.exit(1)

    for item in batch.get_items():
        do_import(item, servername=servername, require_marc=require_marc)
def import_item(args, **kwargs):
    """Import the single queued item identified by ``args[0]``.

    Keyword flags read from ``kwargs``:
        servername: forwarded to ``do_import``.
        no_marc: when truthy, ``do_import`` is called with
            ``require_marc=False``.

    Logs an error and does nothing else when the identifier is not in the
    import queue.
    """
    servername = kwargs.get("servername")
    require_marc = not kwargs.get("no_marc", False)

    ia_id = args[0]
    item = ImportItem.find_by_identifier(ia_id)
    if not item:
        logger.error(f"{ia_id} is not found in the import queue")
        return

    do_import(item, servername=servername, require_marc=require_marc)
def import_all(args, **kwargs):
    """Loop forever, importing pending items with a pool of 8 processes.

    Each round fetches the pending items and runs ``do_import`` on all of
    them through ``Pool.starmap``. When nothing is pending, the loop sleeps
    for 60 seconds before polling again. ``args`` is unused, and
    ``require_marc`` is always ``False`` here regardless of the flags.
    """
    servername = kwargs.get("servername")
    require_marc = False

    # Use multiprocessing to call do_import on each item
    with multiprocessing.Pool(processes=8) as pool:
        while True:
            logger.info("find_pending START")
            pending = ImportItem.find_pending()
            logger.info("find_pending END")

            if pending:
                logger.info("starmap START")
                jobs = [(item, servername, require_marc) for item in pending]
                pool.starmap(do_import, jobs)
                logger.info("starmap END")
            else:
                logger.info("No pending items found. sleeping for a minute.")
                time.sleep(60)
def main():
    """Command-line entry point: load the config, then dispatch on the command.

    Command line shape::

        manage-imports.py [--config FILE] COMMAND [ARGS...] [--FLAG...]

    ``--config FILE`` is removed from ``sys.argv`` wherever it appears; without
    it the config path defaults to ``openlibrary/conf/openlibrary.yml`` two
    directories above this file. After the command, every argument starting
    with ``--`` becomes a flag set to ``True`` (leading dashes stripped) and
    everything else is a positional argument. ``flags`` always starts with a
    ``servername`` taken from the loaded config, defaulting to
    ``https://openlibrary.org``.

    Exits with status 1 and a usage message when no command is given. An
    unknown command is logged as an error and ``None`` is returned.
    """
    if "--config" in sys.argv:
        index = sys.argv.index("--config")
        configfile = sys.argv[index + 1]
        del sys.argv[index : index + 2]
    else:
        configfile = os.path.abspath(
            os.path.join(
                os.path.dirname(__file__),
                os.pardir,
                os.pardir,
                "openlibrary",
                "conf",
                "openlibrary.yml",
            )
        )

    # Fail with a readable message instead of an IndexError traceback when
    # the command is missing.
    if len(sys.argv) < 2:
        prog = os.path.basename(sys.argv[0]) if sys.argv else "manage-imports.py"
        print(
            f"Usage: {prog} [--config FILE] COMMAND [ARGS...] [--FLAG...]",
            file=sys.stderr,
        )
        sys.exit(1)

    load_config(configfile)

    # Imported here, after load_config() has run, as in the original code.
    from infogami import config

    cmd = sys.argv[1]
    args, flags = [], {"servername": config.get("servername", "https://openlibrary.org")}
    for arg in sys.argv[2:]:
        if arg.startswith("--"):
            flags[arg[2:]] = True
        else:
            args.append(arg)

    # Each command keeps its own calling convention, hence the thunks.
    commands = {
        "import-ocaids": lambda: import_ocaids(*args, **flags),
        "add-items": lambda: add_items(*args),
        "add-new-scans": lambda: add_new_scans(args),
        "import-batch": lambda: import_batch(args, **flags),
        "import-all": lambda: import_all(args, **flags),
        "import-item": lambda: import_item(args, **flags),
    }
    handler = commands.get(cmd)
    if handler is None:
        logger.error(f"Unknown command: {cmd}")
        return None
    return handler()
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()