forked from outbreak-info/outbreak.info-resources
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprotocolsio_json_retriever.py
More file actions
176 lines (132 loc) · 6.21 KB
/
protocolsio_json_retriever.py
File metadata and controls
176 lines (132 loc) · 6.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import csv
import os
import sys
import click
import pathlib
import logging
import calendar
import time
import pathlib
from encodeproject import download as encode_download # type: ignore
from colorama import Fore, Style
from datetime import datetime
from datetime import date
import json
# Test mode: when enabled, only a limited number of terms
# (DEFAULT_TEST_COUNT unless overridden) is processed.
DEFAULT_TEST_MODE = False

# Default number of terms to process when running in test mode.
DEFAULT_TEST_COUNT = 2

# Verbose mode additionally echoes progress messages to stdout.
DEFAULT_VERBOSE = False

# The default terms file is looked up next to this script.
DEFAULT_COVID_TERMS_FILE = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'covid_terms.txt',
)

# Default output directory: /tmp/<script file name>/<timestamp taken at import time>.
DEFAULT_OUTDIR = "/tmp/{}/{}".format(
    os.path.basename(__file__),
    datetime.today().strftime('%Y-%m-%d-%H%M%S'),
)

# Settings handed to logging.basicConfig() by main().
LOGGING_FORMAT = "%(levelname)s : %(asctime)s : %(pathname)s : %(lineno)d : %(message)s"
LOG_LEVEL = logging.INFO

# Run-time switches; main() overwrites these from the command-line options.
g_verbose = False
g_test_mode = False
g_test_count = 0
def retrieve_datasets(terms_list, outdir):
    """Retrieve the datasets (JSON format) from protocols.io
    for the specified terms.

    For every term the search result is downloaded to
    ``<outdir>/<term without spaces>.json`` and a pretty-printed copy
    (indent of 2) is written next to it with the suffix
    ``.reformatted.json``.

    When the module-level ``g_test_mode`` flag is set, at most
    ``g_test_count`` terms are processed.

    :param terms_list: {list} the list of terms
    :param outdir: {str} the output directory
    :returns None:
    """
    for i, term in enumerate(terms_list):
        # Check the limit *before* doing any work so that exactly
        # g_test_count terms are processed in test mode.  int() guards
        # against the count arriving as a string from the command line.
        if g_test_mode and i >= int(g_test_count):
            break

        logging.info("Processing term '{}'".format(term))

        url = 'https://www.protocols.io/api/v3/protocols?filter="public"&key="' + term.lower() + '"'
        logging.info("URL is '{}'".format(url))

        outfile_basename = term.replace(' ', '') + '.json'
        outfile = os.path.join(outdir, outfile_basename)
        logging.info("outfile is '{}'".format(outfile))

        if g_verbose:
            print(Fore.BLUE + "Processing term '{}'".format(term))
            print(Style.RESET_ALL + '', end='')
            print("URL is '{}'".format(url))
            print("outfile is '{}'".format(outfile))

        encode_download(url=url, path=outfile)

        if not os.path.exists(outfile):
            logging.error("output file '{}' does not exist".format(outfile))
            continue

        # Only swap the file extension; str.replace() would also rewrite
        # any '.json' that happens to occur earlier in the path.
        reformatted_outfile = os.path.splitext(outfile)[0] + '.reformatted.json'

        try:
            with open(outfile, encoding='utf-8') as json_fh:
                lookup = json.load(json_fh)
        except json.JSONDecodeError as err:
            # A non-JSON response for one term should not abort the
            # remaining terms; record it and move on.
            logging.error("output file '{}' does not contain valid JSON: {}".format(outfile, err))
            continue

        with open(reformatted_outfile, 'w') as fout:
            json.dump(lookup, fout, indent=2)

        logging.info("Wrote reformatted output file '{}'".format(reformatted_outfile))
        if g_verbose:
            print("Wrote reformatted output file '{}'".format(reformatted_outfile))
def load_terms_list(terms_file):
    """Load the search terms from a control file into a list.

    Every line is stripped of surrounding whitespace.  Blank lines and
    lines whose first non-blank character is '#' are skipped, so neither
    empty strings nor indented comments end up as search terms.

    :param terms_file: {str} path to the newline-separated terms file
    :returns terms_list: {list} the terms, in file order
    """
    terms_list = []
    with open(terms_file, 'r') as fh:
        for line in fh:
            # Strip first so that indented comments and whitespace-only
            # lines are recognised as such.
            term = line.strip()
            if not term or term.startswith('#'):
                continue
            terms_list.append(term)

    logging.info("Loaded '{}' terms from file '{}' into the terms list".format(len(terms_list), terms_file))
    if g_verbose:
        print("Loaded '{}' terms from file '{}' into the terms list\n".format(len(terms_list), terms_file))

    return terms_list
def _print_notice(message):
    """Print *message* to stdout in yellow, then reset the terminal style."""
    print(Fore.YELLOW + message)
    print(Style.RESET_ALL + '', end='')


@click.command()
@click.option('--outdir', help="The output directory - default is {}".format(DEFAULT_OUTDIR))
@click.option('--terms_file', help="The new-line separated list of covid terms - default is {}".format(DEFAULT_COVID_TERMS_FILE))
@click.option('--logfile', help="The log file - if not specified a default will be assigned")
@click.option('--verbose', is_flag=True, help="Whether to execute in verbose mode - default is {}".format(DEFAULT_VERBOSE))
@click.option('--test_mode', is_flag=True, help="Whether to execute in test mode (will limit number of terms processed - default is {})".format(DEFAULT_TEST_MODE))
@click.option('--test_count', type=int, help="The number of terms to process when executing in test mode - default is {}".format(DEFAULT_TEST_COUNT))
def main(outdir, terms_file, logfile, verbose, test_mode, test_count):
    """Retrieve datasets (JSON format) from protocols.io for covid-related terms.
    The terms are specified via the --terms_file option. This should be a newline
    separated list of terms.
    """
    # NOTE: the docstring above is what click shows for --help, so the
    # implementation notes live in comments instead.
    #
    # Steps: resolve every option to its default when it was not given,
    # publish the run-time switches through the g_* module globals, make
    # sure the output directory exists, configure file logging, then load
    # the terms and download one dataset per term.
    global g_verbose, g_test_mode, g_test_count

    if verbose is None:
        verbose = DEFAULT_VERBOSE
        _print_notice("--verbose was not specified and therefore was set to default '{}'".format(verbose))
    g_verbose = verbose

    if test_mode is None:
        test_mode = DEFAULT_TEST_MODE
        _print_notice("--test_mode was not specified and therefore was set to default '{}'".format(test_mode))
    g_test_mode = test_mode

    # The term limit only matters (and is only published) in test mode.
    if test_mode:
        if test_count is None:
            test_count = DEFAULT_TEST_COUNT
            _print_notice("--test_count was not specified and therefore was set to default '{}'".format(test_count))
        g_test_count = test_count

    if outdir is None:
        outdir = DEFAULT_OUTDIR
        _print_notice("--outdir was not specified and therefore was set to default '{}'".format(outdir))
    assert isinstance(outdir, str)

    if not os.path.exists(outdir):
        pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)
        _print_notice("Created output directory '{}'".format(outdir))

    # The default log file is placed inside the output directory and is
    # named after this script.
    if logfile is None:
        logfile = os.path.join(outdir, os.path.basename(__file__) + '.log')
        _print_notice("--logfile was not specified and therefore was set to '{}'".format(logfile))
    assert isinstance(logfile, str)

    if terms_file is None:
        terms_file = DEFAULT_COVID_TERMS_FILE
        _print_notice("--terms_file was not specified and therefore was set to '{}'".format(terms_file))
    assert isinstance(terms_file, str)

    logging.basicConfig(filename=logfile, format=LOGGING_FORMAT, level=LOG_LEVEL)

    terms_list = load_terms_list(terms_file)
    retrieve_datasets(terms_list, outdir)


if __name__ == "__main__":
    main()