Skip to content

Commit 708346f

Browse files
committed
usgs/hydrology scripts
1 parent ebba06b commit 708346f

5 files changed

Lines changed: 565 additions & 1 deletion

File tree

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import json
2+
import click
3+
4+
@click.command()
@click.argument("geojson_file", type=click.Path(exists=True))
@click.argument("output_file", type=click.Path())
def extract_hru_ids(geojson_file, output_file):
    """Extracts hru_id from a GeoJSON file and saves them as a JSON array."""

    with open(geojson_file, "r") as src:
        collection = json.load(src)

    # Collect each feature's hru_id, zero-padded to a fixed width of 8 digits.
    # Features without an hru_id property are skipped.
    hru_ids = []
    for feature in collection.get("features", []):
        props = feature.get("properties", {})
        if "hru_id" in props:
            hru_ids.append(f"{int(props['hru_id']):08d}")

    # Write the collected IDs out as a JSON array.
    with open(output_file, "w") as dst:
        json.dump(hru_ids, dst, indent=4)

    click.echo(f"Extracted {len(hru_ids)} HRU IDs to {output_file}")


if __name__ == "__main__":
    extract_hru_ids()
Lines changed: 314 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,314 @@
1+
import click
2+
import json
3+
import pathlib
4+
import pandas as pd
5+
from dataretrieval import nwis
6+
import numpy as np
7+
import time
8+
import calendar
9+
10+
def get_unix_time(row, year_index, month_index, day_index):
    """Convert the year/month/day columns of a stats row into a Unix timestamp.

    Any index that is None falls back to an epoch default (year 1970,
    month 1, day 1), and out-of-range month/day values are clamped back
    to 1 before parsing.
    """
    year = int(row[year_index]) if year_index is not None else 1970
    month = int(row[month_index]) if month_index is not None else 1
    day = int(row[day_index]) if day_index is not None else 1

    # Guard against out-of-range calendar fields in the source data.
    if not 1 <= month <= 12:
        month = 1
    if not 1 <= day <= 31:
        day = 1

    # timegm interprets the parsed struct_time as UTC.
    return calendar.timegm(time.strptime(f"{year}-{month}-{day}", "%Y-%m-%d"))
29+
# Define the function for fetching and formatting data
@click.command()
@click.option('-p', '--param-codes', default=['00060'], multiple=True, required=True, help="List of parameter codes to query.")
@click.option('--input', required=True, type=click.Path(exists=True), help="JSON file containing an array of site numbers.")
@click.option('--start-date', default='1989-10-01', help="Start date for the data query.")
@click.option('--end-date', default='1999-09-30', help="End date for the data query.")
@click.option('--output', default='output.json', help="Output JSON file where the results will be saved.")
@click.option('--usgs-parameters', default='../nwis/USGSParameters.tsv', type=click.Path(exists=True), help="Path to the USGSParameters.tsv file.")
def fetch_data(param_codes, input, start_date, end_date, output, usgs_parameters):
    """Fetch data from NWIS and save it in a JSON format with descriptions for each table.

    For every site listed in the --input JSON array, downloads daily
    statistics from the NWIS stats service, pivots the per-parameter rows
    into one row per date (columns keyed by parameter code), and writes a
    dict of {site_number: [table_object, ...]} to --output.
    """
    # Load the USGS parameters file and map parameter codes to their names.
    usgs_df = pd.read_csv(usgs_parameters, sep='\t', comment='#')
    param_desc = dict(zip(usgs_df['parm_cd'], usgs_df['parm_nm']))

    # Load the JSON array of site numbers.
    with open(input, 'r') as f:
        site_loaded = json.load(f)  # Expecting a JSON array of site numbers

    site_numbers = [str(item) for item in site_loaded]

    # Split the site numbers into chunks of 10 (because NWIS allows a max
    # of 10 sites per request).
    def split_list(l):
        n = 10
        for i in range(0, len(l), n):
            yield l[i:i + n]

    site_lists = list(split_list(site_numbers))

    # Prepare the result container: site_number -> list of table objects.
    result = {}

    # Only daily reports are fetched for now; add 'monthly'/'annual' here
    # to extend the report coverage.
    report_types = ['daily']

    for report_type in report_types:
        for site_list in site_lists:
            try:
                response = nwis.get_stats(
                    sites=site_list,
                    startDt=start_date,
                    endDt=end_date,
                    statReportType=report_type,
                    parameterCd=",".join(param_codes),
                )
                # get_stats returns a (DataFrame, metadata) tuple.
                df, meta = response
            except Exception as e:
                # Best-effort: skip this chunk of sites and keep going.
                print(f"Error fetching {report_type} data for sites {site_list}: {e}")
                continue
            # NOTE: the duplicate `df, meta = response` that previously sat
            # here (after the except/continue) was dead code and was removed.

            # Replace NaN values with empty strings
            df = df.fillna('')

            # Add the data to the result dictionary with site_number as the key
            for site_number in site_list:
                site_data = df[df['site_no'] == site_number]
                if not site_data.empty:
                    # Drop identifying columns that are redundant inside a
                    # per-site table ('site_no', 'loc_web_ds').
                    site_data = site_data.drop(columns=['site_no', 'loc_web_ds'], errors='ignore')

                    unique_param_codes = site_data['parameter_cd'].unique()
                    # Create a human-readable description of the parameters.
                    param_names = [param_desc.get(code, 'Unknown parameter') for code in unique_param_codes]
                    description = f"This is a table of the mean {report_type} values for the following parameters: {', '.join([f'{code} - {name}' for code, name in zip(unique_param_codes, param_names)])}"
                    header = site_data.columns.tolist()
                    parameter_cd_index = header.index('parameter_cd')
                    mean_va_index = header.index('mean_va')

                    # Raw rows, one per (date, parameter) combination.
                    rows = site_data.values.tolist()

                    # base_string is a YYYYMMDD-style key identifying a date;
                    # which columns supply it depends on the report type.
                    base_set = set()
                    year_index = None
                    month_index = None
                    day_index = None
                    if 'year_nu' in header:
                        year_index = header.index('year_nu')
                    if 'begin_yr' in header:
                        year_index = header.index('begin_yr')
                    if 'month_nu' in header:
                        month_index = header.index('month_nu')
                    if 'day_nu' in header:
                        day_index = header.index('day_nu')

                    # base_string -> {parameter_cd -> mean value}; when the
                    # same date/parameter appears under several time series,
                    # keep only the value from the latest ts_id.
                    base_string_param_map = {}
                    ts_id_index = header.index('ts_id')
                    ts_id_map = {}
                    for row in rows:
                        ts_id = row[ts_id_index]
                        year = row[year_index] if year_index is not None else 0
                        month = row[month_index] if month_index is not None else 0
                        day = row[day_index] if day_index is not None else 0
                        base_string = str(f'{str(year).zfill(4)}{str(month).zfill(2)}{str(day).zfill(2)}')
                        base_set.add(base_string)
                        if base_string_param_map.get(base_string, None) is None:
                            base_string_param_map[base_string] = {}
                            for param in unique_param_codes:
                                base_string_param_map[base_string][param] = None
                        val = row[mean_va_index]
                        for code in unique_param_codes:
                            if row[parameter_cd_index] == code:
                                # First sighting of this date/parameter pair:
                                # record its ts_id and value.
                                if ts_id_map.get(f'{base_string}_{code}', None) is None:
                                    ts_id_map[f'{base_string}_{code}'] = ts_id
                                    if base_string_param_map[base_string][code] is None or val is not None:
                                        base_string_param_map[base_string][code] = val
                                # A newer time series wins over an older one.
                                if ts_id > ts_id_map[f'{base_string}_{code}']:
                                    ts_id_map[f'{base_string}_{code}'] = ts_id
                                    base_string_param_map[base_string][code] = val

                    base_order = list(base_set)
                    base_order.sort()
                    # NOTE(review): time_base_index is never populated, so the
                    # "first row per date" check below is always true — confirm
                    # whether per-date deduplication was intended here.
                    time_base_index = {}

                    # Rewrite the header: add index/unix_time plus one column
                    # per parameter code, and drop the now-pivoted
                    # parameter_cd/mean_va columns.
                    header.append('index')
                    header.append('unix_time')
                    for code in unique_param_codes:
                        header.append(code)
                    parameter_cd_index = header.index('parameter_cd')
                    header.pop(parameter_cd_index)
                    mean_va_index = header.index('mean_va')
                    header.pop(mean_va_index)

                    # unix_timestamp -> merged row for that date.
                    unix_mapping = {}
                    for row in rows:
                        year = row[year_index] if year_index is not None else 0
                        month = row[month_index] if month_index is not None else 0
                        day = row[day_index] if day_index is not None else 0
                        base_string = str(f'{str(year).zfill(4)}{str(month).zfill(2)}{str(day).zfill(2)}')
                        row.append(base_order.index(base_string))
                        unix_timestamp = get_unix_time(row, year_index, month_index, day_index)
                        row.append(unix_timestamp)
                        row.pop(parameter_cd_index)
                        row.pop(mean_va_index)
                        if time_base_index.get(base_string, None) is None:
                            for param in unique_param_codes:
                                param_val = base_string_param_map.get(base_string, {}).get(param, None)
                                row.append(param_val)
                        if unix_mapping.get(unix_timestamp, None) is None:
                            unix_mapping[unix_timestamp] = row
                        else:
                            # Combine duplicate-timestamp rows so we get rid
                            # of any missing data.
                            old_row = unix_mapping[unix_timestamp]
                            for index in range(len(row)):
                                if row[index] is not None and old_row[index] is None:
                                    old_row[index] = row[index]

                    # Rows ordered chronologically by unix timestamp.
                    sorted_values = [value for _, value in sorted(unix_mapping.items())]

                    updated_df = pd.DataFrame(sorted_values, columns=header)
                    table_object = {
                        "name": f"{site_number}_{report_type}",
                        "description": description,
                        "type": f'USGS_gauge_{report_type}_{"_".join(param_codes)}',
                        "header": header,
                        "summary": generate_summary(updated_df, param_desc, updated_df.columns.tolist()),
                        "rows": sorted_values,
                    }
                    if site_number not in result:
                        result[site_number] = []

                    result[site_number].append(table_object)

            print(f"Fetched {report_type} data for {len(site_list)} sites.")

    # Save the result to the output file as JSON
    with open(output, 'w') as f:
        json.dump(result, f, indent=4)

    print(f"Results saved to {output}.")
# String columns with at least this many distinct values are flagged as
# "searchable" instead of listing every value inline.
limit = 100

def generate_summary(df, param_desc, rows):
    """Generate a summary object for the table with column type and stats, focusing on unique parameter_cd.

    For the special 'parameter_cd' column, emits per-parameter min/max/mean
    of 'mean_va'. For every other column, infers a type (bool/number/string)
    from the unique values and records min/max (numbers) or the distinct
    value set (strings). A second pass then compacts the per-column entries
    into their final JSON-friendly shape.
    """
    summary = {}
    for header in rows:
        # The parameter_cd column gets per-parameter statistics rather than
        # the generic type/min/max treatment.
        if header == 'parameter_cd':
            if header not in summary.keys():
                summary[header] = {'type':'parameter_cd'}
            for parameter_cd in df['parameter_cd'].unique():
                param_data = df[df['parameter_cd'] == parameter_cd]

                # Assuming the 'mean_va' column contains the actual data values.
                value_col = param_data['mean_va']

                # Calculate the min, max, and mean for each parameter_cd.
                summary[header][parameter_cd] = {
                    "parameter_cd": parameter_cd,
                    "parameter_name": param_desc.get(parameter_cd, 'Unknown parameter'),
                    "min": float(value_col.min()),
                    "max": float(value_col.max()),
                    "mean": float(value_col.mean())
                }
        else:  # Calculate type/min/max and other fields

            if header not in summary.keys():
                summary[header] = {'type': None, 'values': set(), 'value_count': 0}
            # If the column header itself is a parameter code, attach its
            # human-readable name as the column description.
            parameter_cd = param_desc.get(header, None)
            if parameter_cd:
                summary[header]["description"] = parameter_cd[0] if isinstance(parameter_cd, tuple) else parameter_cd
            for value in df[header].unique():
                # bool is checked before number because bool is a subclass
                # of int in Python.
                if isinstance(value, bool):
                    summary[header]['type'] = 'bool'
                    summary[header]['value_count'] += 1
                elif isinstance(value, (int, float, np.float64, np.int32, np.int64)):
                    summary[header]['type'] = 'number'
                    summary[header]['value_count'] += 1
                    if 'min' not in summary[header] or value < summary[header]['min']:
                        # NOTE(review): `or` binds looser than `and`, so this
                        # reads as isnan(...) or (value is None and min is
                        # None); also float(None) would raise before the None
                        # check — confirm the intended NaN/None handling.
                        if np.isnan(float(value)) or value is None and summary[header].get('min', None) is None:
                            # Placeholder so a later real value can replace it.
                            summary[header]['min'] = float('inf')
                        else:
                            summary[header]['min'] = float(value)
                    if 'max' not in summary[header] or value > summary[header]['max']:
                        if np.isnan(float(value)) or value is None and summary[header].get('max', None) is None:
                            summary[header]['max'] = float('-inf')
                        else:
                            summary[header]['max'] = float(value)
                elif isinstance(value, str):
                    if 'values' not in summary[header]:
                        summary[header]['values'] = set()
                    summary[header]['value_count'] += 1
                    summary[header]['type'] = 'string'
                    summary[header]['values'].add(value)

    # Second pass: compact each column entry into its final shape.
    for header in summary.keys():
        if summary[header]['type'] is None:
            # No recognized values were seen for this column.
            summary[header]['type'] = 'unknown'
            del summary[header]['values']
            continue
        if summary[header]['type'] == 'number':
            if summary[header]['value_count'] == 1:
                # A single numeric value: collapse min/max into 'values'.
                summary[header]['values'] = summary[header].get('min', summary[header].get('max'))
                del summary[header]['min']
                del summary[header]['max']
            elif summary[header]['min'] == summary[header]['max']:
                # Constant column: record it as static with a single value.
                val = summary[header]['min']
                del summary[header]['values']
                del summary[header]['min']
                del summary[header]['max']
                summary[header]['static'] = True
                summary[header]['value'] = val
            else:
                # NaN sentinels are not JSON-serializable; emit null instead.
                if np.isnan(summary[header]['min']):
                    summary[header]['min'] = None
                if np.isnan(summary[header]['max']):
                    summary[header]['max'] = None
                del summary[header]['values']
        elif (
            summary[header]['type'] == 'string'
            and 'values' in summary[header]
            and not summary[header].get('searchable')
        ):
            # Sets are not JSON-serializable; sort into a list, and for very
            # high-cardinality columns keep only a count + searchable flag.
            summary[header]['values'] = sorted(summary[header]['values'])
            if len(summary[header]['values']) >= limit:
                summary[header]['searchable'] = True
                summary[header]['unique'] = len(summary[header]['values'])
                del summary[header]['values']
        elif summary[header]['type'] == 'bool':
            del summary[header]['values']
    # Diagnostic pass: prints any remaining non-JSON-serializable values.
    check_json_validity(summary)
    return summary
291+
292+
def check_json_validity(obj, path="root"):
    """Recursively verify that *obj* contains only JSON-serializable types.

    Prints a diagnostic for every offending value found (numpy scalars are
    the usual culprits) and returns False if any part of the structure is
    invalid, True otherwise. Previously the recursive results for dicts and
    lists were discarded, so nested offenders were printed but never
    reflected in the return value.
    """
    valid_types = (str, int, float, bool, type(None), list, dict)

    # Check numpy scalars explicitly: np.float64 subclasses float and would
    # otherwise pass the generic check. np.float128 does not exist on every
    # platform (e.g. Windows builds of numpy), hence the getattr guard.
    numpy_scalar_types = tuple(
        t for t in (
            np.float64,
            getattr(np, 'float128', None),
            np.int64,
            np.int32,
        ) if t is not None
    )
    if isinstance(obj, numpy_scalar_types):
        print(f"Invalid type at {path}: {type(obj).__name__} (Value: {obj})")
        return False

    if isinstance(obj, dict):
        # List comprehension (not a generator) so every offender is printed
        # before all() aggregates the results.
        return all([check_json_validity(value, path=f"{path}.{key}") for key, value in obj.items()])

    elif isinstance(obj, list):
        return all([check_json_validity(item, path=f"{path}[{idx}]") for idx, item in enumerate(obj)])

    elif not isinstance(obj, valid_types):
        print(f"Invalid type at {path}: {type(obj).__name__} (Value: {obj})")
        return False

    return True


if __name__ == '__main__':
    fetch_data()

0 commit comments

Comments
 (0)