-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaccess_means_update_new.py
More file actions
executable file
·285 lines (258 loc) · 10.9 KB
/
access_means_update_new.py
File metadata and controls
executable file
·285 lines (258 loc) · 10.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# Update files of monthly and annual means using the netCDF files generated by the suite
# post-processing
# Script is given the suite archive directory and the output directory as arguments.
# This will also work with the archived post-processed netCDF files that have name in
# format YYYYMM_mon.nc.
import netCDF4, sys, glob, argparse, re
from pathlib import Path
import numpy as np
import stashvar_cmip6 as stashvar
# Command-line interface.  The parsed options are stored in the
# module-level 'args' namespace, which the rest of the script (and the
# helper functions below) read directly.
parser = argparse.ArgumentParser(description="Create file with full time series of selected variables")
# Stash codes are given as integers of the form 1000*section + item,
# e.g. 3236 for section 3, item 236.
parser.add_argument('-v', dest='var_list', type=int, required=True,
                    nargs = '+', help = 'List of stash codes to include')
parser.add_argument('-r', dest='runid', required=True, help="Run name")
parser.add_argument('-i', dest='input_dir', default='.', help="Input directory")
parser.add_argument('-o', dest='output_dir', required=True, help="Output directory")
# With -n, variables are renamed via stashvar_cmip6 unique names
parser.add_argument('-n', dest='change_names', action='store_true',
                    help='Switch to meaningful variable names')
# With -m, a variable missing from an input file only warns instead of raising
parser.add_argument('-m', dest='allow_missing', action='store_true',
                    help='Allow variables to be missing in some files')
parser.add_argument('--pp', dest='postproc', action='store_true',
                    help='Use post-processed files with _mon.nc suffix')
parser.add_argument('--esm', dest='esm', action='store_true',
                    help='Use ACCESS ESM1-5 files with names RUN_pa-YYYY0MM0DD.nc')
parser.add_argument('--verbose', dest='verbose', action='store_true')
args = parser.parse_args()
def get_year(filename):
    """Return the 4-digit year parsed from an input filename.

    ESM filenames look like RUN?pa-YYYY... and UM filenames like
    RUNIDa.pmYYYY...; the year is the first 4-digit group after the
    stream prefix.

    Raises ValueError if the filename does not match the expected pattern
    (the original code raised an opaque AttributeError on m.group here).
    """
    # re.escape guards against regex metacharacters in the run name.
    # The '.' before 'pa-'/'pm' is deliberately left as a wildcard: the
    # separator may be '.' or '_' depending on the archive naming.
    if args.esm:
        reg = re.compile(r'%s.pa-(\d{4})' % re.escape(args.runid))
    else:
        reg = re.compile(r'%sa.pm(\d{4})' % re.escape(args.runid))
    m = reg.match(Path(filename).name)
    if m is None:
        raise ValueError("Cannot extract year from filename %s" % filename)
    return int(m.group(1))
def get_name(var):
    """Return the output name for a netCDF variable.

    With -n (args.change_names) the STASH itemcode is looked up in
    stashvar_cmip6 and the unique name used if one exists; otherwise the
    variable's original name is kept.
    """
    if not args.change_names:
        return var.name
    # Work out the itemcode (1000*section + item) from whichever
    # attribute convention the file uses.
    if hasattr(var, 'um_stash_source'):
        # um_stash_source has the form m01sXXiYYY
        itemcode = int(var.um_stash_source[4:6] + var.um_stash_source[7:])
    elif hasattr(var, 'stash_item') and hasattr(var, 'stash_section'):
        itemcode = 1000*var.stash_section + var.stash_item
    else:
        return var.name
    umvar = stashvar.StashVar(itemcode)
    return umvar.uniquename if umvar.uniquename else var.name
def reverse_name(var):
    """Map an output variable back to the fld_sXXiYYY name used in the
    raw monthly files (the inverse of the naming done at creation)."""
    if hasattr(var, 'um_stash_source'):
        # 'm01sXXiYYY' -> 'fld_sXXiYYY'
        return 'fld_' + var.um_stash_source[3:]
    # Older files carry separate section/item integer attributes
    if hasattr(var, 'stash_item') and hasattr(var, 'stash_section'):
        return 'fld_s%2.2di%3.3d' % (var.stash_section, var.stash_item)
    # No stash metadata at all (e.g. time): name is unchanged
    return var.name
def check_vlist(var, vlist):
    """Return True if var is one of the requested stash variables.

    vlist holds um_stash_source strings of the form m01sXXiYYY.
    """
    if hasattr(var, 'um_stash_source'):
        return var.um_stash_source in vlist
    # Older files created with the cdms version of um2netcdf4 carry
    # separate section/item attributes; rebuild the m01sXXiYYY code.
    if hasattr(var, 'stash_item') and hasattr(var, 'stash_section'):
        rebuilt = 'm01s%2.2di%3.3d' % (var.stash_section, var.stash_item)
        return rebuilt in vlist
    return False
def createfile(filename, lastfile):
    """Create the output netCDF file and define the required variables.

    filename: path of the file to create.
    lastfile: an existing input file used as a template for the
        dimension and variable definitions (any input file will do).

    Returns the open, writable netCDF4.Dataset.  Time-dependent
    variables are created empty (data appended later); variables without
    a time dimension (grid mappings, coordinates, ...) are copied from
    the template.
    """
    d = netCDF4.Dataset(filename, 'w')
    # Any file will do for the variable definition
    dnew = netCDF4.Dataset(lastfile)
    # First pass to work out which dimensions and variables are required
    dused = set(['bnds'])
    vused = set()
    for vname, var in dnew.variables.items():
        if check_vlist(var, vlist):
            vused.add(var.name)
            # Grid-mapping and auxiliary coordinate variables referenced
            # by the selected variables are needed too
            if hasattr(var, 'grid_mapping'):
                vused.update(set(var.grid_mapping.split()))
            if hasattr(var, 'coordinates'):
                vused.update(set(var.coordinates.split()))
            for dim in var.dimensions:
                dused.add(dim)
    if args.verbose:
        print("DUSED", dused)
        print("VUSED", vused)
    dskip = set()
    for dname, dim in dnew.dimensions.items():
        if args.verbose:
            print("Creating dim", dname, dname in dused)
        if dname in dused or dname.replace('_bnds', '') in dused:
            # Keep bounds dimensions whose parent dimension is used
            if dname.endswith('_bnds'):
                dused.add(dname)
            if dim.isunlimited():
                d.createDimension(dname, None)
            else:
                d.createDimension(dname, dim.size)
        else:
            dskip.add(dname)
    if args.verbose:
        print("DSKIP", dskip)
    # Coordinate variables and their bounds for all used dimensions
    for dname in dused:
        if dname in dnew.variables:
            vused.add(dname)
        if dname + '_bnds' in dnew.variables:
            vused.add(dname + '_bnds')
    if args.verbose:
        print("Final vused", vused)
    for vname in vused:
        var = dnew.variables[vname]
        newname = get_name(var)
        if args.verbose:
            print("Creating %s from %s" % (newname, vname))
        # _FillValue can only be set at creation time, not via setncattr
        if hasattr(var, '_FillValue'):
            newv = d.createVariable(newname, var.datatype, var.dimensions,
                                    zlib=True, fill_value=getattr(var, '_FillValue'))
        else:
            newv = d.createVariable(newname, var.datatype, var.dimensions,
                                    zlib=True)
        for attr in var.ncattrs():
            if attr != '_FillValue':
                newv.setncattr(attr, getattr(var, attr))
        # If this variable doesn't have a time dimension, copy from input.
        # BUG FIX: the original indexed d.variables[vname], but the
        # variable was created as 'newname'; with -n renaming that lookup
        # raised KeyError.  Write through newv directly.
        if 'time' not in var.dimensions:
            newv[:] = var[:]
    # Close the template file (the original leaked this handle)
    dnew.close()
    return d
# Map 0-based month index to the three-letter month name used in
# non-postprocessed UM filenames (RUNIDa.pmYYYYmmm.nc).
mname = {0:'jan', 1:'feb', 2:'mar', 3:'apr',
         4:'may', 5:'jun', 6:'jul', 7:'aug',
         8:'sep', 9:'oct', 10:'nov', 11:'dec'}
if args.postproc:
    suffix = '_mon.nc'
else:
    suffix = '.nc'
# Set up a list of um_stash_source attributes from the stash_codes
vlist = ['m01s%2.2di%3.3d' % (v//1000, v%1000) for v in args.var_list]
# Check for any new complete years: only December files are globbed, so a
# year appears in flist only once its final month exists.
# (sorted(glob.glob(...)) directly; the redundant list() wrappers are gone.)
if args.postproc:
    # Uses month number
    if args.esm:
        flist = sorted(glob.glob('%s/%s.pa-*12%s' % (args.input_dir, args.runid, suffix)))
        # BUG FIX: the debug message was missing its closing quote
        print("GLOB '%s/%s.pa-*12%s'" % (args.input_dir, args.runid, suffix))
    else:
        flist = sorted(glob.glob('%s/%sa.pm*12%s' % (args.input_dir, args.runid, suffix)))
else:
    if args.esm:
        flist = sorted(glob.glob('%s/%s.pa*012001%s' % (args.input_dir, args.runid, suffix)))
    else:
        flist = sorted(glob.glob('%s/%sa.pm*dec%s' % (args.input_dir, args.runid, suffix)))
if not flist:
    print("Nothing to process")
    sys.exit(0)
# ---- Append any new complete years to the monthly time-series file ----
lastfile = flist[-1]
lastfile_year = get_year(lastfile)
output_file = "%s/%s.nc" % (args.output_dir, args.runid)
if Path(output_file).exists():
    # Append to the existing accumulated file
    d = netCDF4.Dataset(output_file, 'r+')
else:
    # First run for this suite: create the file using the newest input
    # file as a template
    d = createfile(output_file, lastfile)
time = d.variables['time']
nt = len(time)  # number of monthly records already present
if nt%12 != 0:
    # Only whole years are ever appended, so a partial year means a
    # previous run was interrupted mid-update
    raise Exception("Unexpected state: nt=%d is not a multiple of 12 in run %s" % (nt, args.runid))
if nt > 0:
    lastdate = netCDF4.num2date(time[-1], time.units,time.calendar)
    lastyear = lastdate.year
else:
    # Set lastyear to year of first file -1 so loop starts correctly
    firstfile = flist[0]
    lastyear = get_year(firstfile) - 1
if lastfile_year > lastyear:
    print('Data to process', lastyear+1, lastfile_year)
# Loop is over expected years, so missing files will cause an
# error.
for year in range(lastyear+1, lastfile_year+1):
    # Processing glitches sometimes mean there's a month missing.
    # Check this first to avoid partial updates
    if args.postproc:
        if args.esm:
            flist_year = sorted(list(glob.glob('%s/%s.pa-%4.4d??%s' % (args.input_dir, args.runid, year, suffix))))
        else:
            flist_year = sorted(list(glob.glob('%s/%sa.pm%4.4d??%s' % (args.input_dir, args.runid, year, suffix))))
    else:
        if args.esm:
            flist_year = glob.glob('%s/%s.pa-%4.4d???001%s' % (args.input_dir, args.runid, year, suffix))
        else:
            flist_year = glob.glob('%s/%sa.pm%4.4d???%s' % (args.input_dir, args.runid, year, suffix))
    if len(flist_year) != 12:
        print("Missing files for year %d:" % year, flist_year, file=sys.stderr)
        raise Exception("Missing files in %s" % args.runid)
    for mon in range(12):
        # Reconstruct the expected input filename for this year/month
        if args.postproc:
            if args.esm:
                fname = '%s/%s.pa-%4.4d%2.2d%s' % (args.input_dir, args.runid, year, mon+1, suffix)
            else:
                fname = '%s/%sa.pm%4.4d%2.2d%s' % (args.input_dir, args.runid, year, mon+1, suffix)
        else:
            if args.esm:
                fname = '%s/%s.pa-%4.4d%3.3d001%s' % (args.input_dir, args.runid, year, mon+1, suffix)
            else:
                fname = '%s/%sa.pm%4.4d%s%s' % (args.input_dir, args.runid, year, mname[mon], suffix)
        print(fname)
        dnew = netCDF4.Dataset(fname)
        # time/time_bnds are always appended; other variables only if
        # they were requested via -v
        for vname in d.variables:
            v = d.variables[vname]
            if check_vlist(v, vlist) or vname in ('time', 'time_bnds'):
                try:
                    if 'time' in v.dimensions:
                        # Get the same variable from the new file
                        vnew = dnew.variables[reverse_name(v)]
                        # Append the data
                        v[nt] = vnew[:]
                except KeyError:
                    if args.allow_missing:
                        print("Warning missing %s in %s" % (vname, fname))
                    else:
                        raise Exception("Missing %s in %s" % (vname, fname))
        nt += 1
        dnew.close()
    d.sync() # Sync to disk once per year
d.close()
# Update the annual means
ann_file = "%s/%s_ann.nc" % (args.output_dir, args.runid)
if Path(ann_file).exists():
    d = netCDF4.Dataset(ann_file, 'r+')
else:
    # The monthly template file works for the annual file too
    d = createfile(ann_file, lastfile)
time = d.variables['time']
nt = len(time)  # number of annual records already present
# Monthly data: the annual means are computed from the monthly time
# series file written in the section above
dm = netCDF4.Dataset(output_file, 'r')
mtime = dm.variables['time']
nmon = len(mtime)
# The monthly file only ever holds whole years (checked when updating it)
assert nmon%12 == 0
def isleap(year):
    """Return True if year is a leap year (proleptic Gregorian rule)."""
    if year % 4 != 0:
        return False
    if year % 100 != 0:
        return True
    # Century years are leap only when divisible by 400
    return year % 400 == 0
if nt > 0:
    lastdate = netCDF4.num2date(time[-1], time.units, time.calendar)
    lastyear = lastdate.year
else:
    # Set lastyear to year of first file -1 so loop starts correctly
    firstdate = netCDF4.num2date(mtime[0], mtime.units, mtime.calendar)
    lastyear = firstdate.year - 1
if lastfile_year > lastyear:
    print('Data to process for annual means', lastyear+1, lastfile_year)
for year in range(lastyear+1, lastfile_year+1):
    # Month-length weights for the annual average.
    # NOTE(review): this assumes a Gregorian/leap calendar; a model run
    # on a 365_day or 360_day calendar would need fixed weights — confirm
    # against time.calendar.
    if isleap(year):
        mwts = np.array([31,29,31,30,31,30,31,31,30,31,30,31])
    else:
        mwts = np.array([31,28,31,30,31,30,31,31,30,31,30,31])
    for vname in d.variables:
        v = d.variables[vname]
        if 'time' in v.dimensions:
            # Corresponding 12 monthly records: annual record nt covers
            # monthly records nt*12 .. nt*12+11
            vmon = dm.variables[vname]
            if vname == 'time_bnds':
                # Annual bounds span from the start of the first month to
                # the end of the last month of the year
                v[nt,0] = vmon[nt*12,0]
                v[nt,1] = vmon[nt*12+11,1]
            else:
                # Days-in-month weighted mean of the 12 monthly values
                v[nt] = np.average(vmon[nt*12:(nt+1)*12], axis=0, weights=mwts)
    nt += 1
    d.sync() # Sync to disk once per year
d.close()