forked from ua-snap/precip-dot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun-pipeline
executable file
·503 lines (438 loc) · 16.3 KB
/
run-pipeline
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
#!/usr/bin/perl
#SBATCH --cpus-per-task=64
#SBATCH --nodes=1
#SBATCH -p main,viz
#SBATCH --account=snap
#SBATCH --export=ALL,SLURMBATCH=yes
use strict;
use warnings;
=head1 NAME
run-pipeline - Execute all or part of the data-processing pipeline
=head1 SYNOPSIS
run-pipeline [-h|--help] [-n|--dry-run] [-p|--procs PROCS]
[-d|--dir DATA_DIR] [-e|--python EXEC] [--script-dir DIR]
[-g|--groups DATA_GROUPS] [-s|--steps STEPS]
[-v|--variant VARIANT]
run-pipeline ls
Options:
-h|--help: Print help information and exit
-n|--dry-run: Show which steps of the pipeline will get run
but don't actually run them.
-e|--python: Override the path to the python executable.
You'll want this if using SLURM so the code
still runs in your virtual environment.
-p|--procs: Maximum number of parallel processes to run. Not
all steps in the pipeline use this.
-d|--dir: Specify the directory where the data for the
pipeline lies. Defaults to:
'/workspace/Shared/Tech_Projects/DOT/project_data/wrf_pcpt/'
--script-dir: Specify the directory where the python scripts
for the pipeline steps lie. Defaults to './pipeline'
relative to the directory that this script is in.
The SLURM sbatch command makes a copy of the script
and puts it somewhere else, so you'll need to use this
if running through SLURM.
-g|--groups: Specify which data groups to run.
Defaults to: 'all'
-s|--steps: Specify which steps to run.
Defaults to: 'all'
-v|--variant: A string to append to the end of each directory in the
pipeline, useful for creating a parallel "branch" in
the pipeline. If an input directory with the variant
name isn't found, the default name will be used.
DATA_GROUPS is a comma-separated list of the data groups/models to execute
the pipeline for. The elements of the list can be any of the following values:
NCAR-CCSM4_historical
NCAR-CCSM4_rcp85
ERA-Interim_historical
GFDL-CM3_historical
GFDL-CM3_rcp85
Alternatively, this argument can simply be the string "all" to execute
for all groups. Technically, the values given here are treated as regular
expressions, so you can specify substrings like 'historical' or 'GFDL-CM3'
to match multiple groups.
Some steps in the pipeline may require specific data groups
or combinations of them. For example, the 'deltas' step and beyond requires
that both the 'rcp85' and 'historical' models for GFDL-CM3 or NCAR-CCSM4
be included.
STEPS is a comma-separated list of steps to run in the pipeline. The format
is analogous to the field selection of the cut(1) command. Each element of the
list can be either the name of a step, or a range in one of the formats
below:
NAME The single step with NAME
NAME- The step with NAME and every step after it
NAME-NAME2 All the steps from NAME to NAME2 (inclusive)
-NAME The steps from the first one up through NAME
Every step will only be run once and always in the order that the script
defines for the pipeline, REGARDLESS of how you specify them in the list.
Like with DATA_GROUPS, this can also be the string "all" to execute all steps.
Execute `run-pipeline ls` to see a list of the names for all the steps in
the pipeline and a description of each step, as well as a list of all
possible data group names.
=cut
#######################
# Global data and flags
#######################
use File::Basename;
use File::Spec qw(rel2abs);
# Pipeline steps, listed in the order in which they are always executed.
my @order = (
    'durations', 'ams', 'intervals', 'diff', 'deltas',
    'warp', 'multiply', 'fudge', 'undiff',
);

# Human-readable description of every step (printed by `run-pipeline ls`).
my %descriptions = (
    durations => 'Compute durations series from raw hourly data',
    ams       => 'Compute annual maximum series form durations series',
    intervals => 'Compute return intervals (with confidence bounds) from annual maximum series',
    diff      => 'Rewrite return interval confidence bounds as differences from the median',
    deltas    => 'Compute ratios of differences between historical and projected data groups',
    warp      => 'Warp grids of deltas to match grid of NOAA Atlas 14 data',
    multiply  => 'Multiply by NOAA Atlas 14 data by deltas for final product.',
    fudge     => 'Fudge values for consistency',
    undiff    => 'Re-apply the diffs created in the diff step and add header information to data',
);

# Flag per step: true when the step should be run. Everything is enabled
# until the --steps option narrows the selection.
my %do_steps;
@do_steps{@order} = (1) x @order;

# Every data group name the pipeline knows about.
my @allowed_data_groups = (
    'NCAR-CCSM4_historical',
    'NCAR-CCSM4_rcp85',
    'ERA-Interim_historical',
    'GFDL-CM3_historical',
    'GFDL-CM3_rcp85',
);

# Flag per data group: true when the group should be processed. Everything
# is enabled until the --groups option narrows the selection.
my %use_group;
@use_group{@allowed_data_groups} = (1) x @allowed_data_groups;

# Defaults for the command-line options.
my ($dry_run, $python_exec, $max_procs) = (0, 'python3', 5);
my $data_dir   = '/workspace/Shared/Tech_Projects/DOT/project_data/wrf_pcpt/';
my $script_dir = join '/', dirname($0), 'pipeline';
my $variant;

# True when the SLURMBATCH environment variable is set (the #SBATCH --export
# line at the top of this file sets it for batch jobs).
my $in_slurm = defined $ENV{SLURMBATCH};
#######################
# MAIN
#######################
# `run-pipeline ls`: when 'ls' is the one and only argument, print every
# step (numbered, in execution order, with its description) followed by
# every allowed data group, then exit successfully.
if (scalar(@ARGV) == 1 and $ARGV[0] eq 'ls') {
    print "The steps of the pipeline are (in order):\n";
    my $number = 0;
    for my $step (@order) {
        $number++;
        printf "\t\033[1m%d: %-10s\033[0m\t%s\n", $number, $step, $descriptions{$step};
    }

    print "The allowed data groups are:\n";
    print "\033[1m\t$_\033[0m\n" for @allowed_data_groups;

    exit 0;
}
use Getopt::Long qw(GetOptions Configure);
use Pod::Usage qw(pod2usage);
# Parse the command line with Getopt::Long's 'auto_help' configuration
# enabled. --groups and --steps are handled by callbacks that update
# %use_group and %do_steps; the other options are stored directly.
Configure('auto_help');

my $options_ok = GetOptions(
    'dry-run|n'    => \$dry_run,
    'procs|p=i'    => \$max_procs,
    'python|e=s'   => \$python_exec,
    'dir|d=s'      => \$data_dir,
    'script-dir=s' => \$script_dir,
    'groups|g=s'   => \&parse_data_groups,
    'steps|s=s'    => \&parse_steps,
    'variant|v=s'  => \$variant,
);
die "Invalid options!" unless $options_ok;

# Anything left over in @ARGV after option parsing is a usage error.
if (@ARGV) {
    pod2usage("Invalid arguments.");
}

# The data directory must exist, except on a dry run where nothing is read.
if (!$dry_run && !-d $data_dir) {
    die "Data directory, $data_dir, is not a directory.";
}
#######################
# STAGE 1
#######################
# Stage 1 covers the steps 'durations' through 'diff' and is repeated for
# every selected data group. Each entry below pairs a key of %do_steps with
# the parameters handed to exec_script() for that step.
my @stage1_steps = (
    [ durations => {
        name   => 'Durations',
        indir  => 'pcpt',
        outdir => 'durations',
        script => 'durations.py',
    } ],
    [ ams => {
        name   => 'AMS',
        indir  => 'durations',
        outdir => 'ams',
        script => 'ams.py',
    } ],
    [ intervals => {
        name       => 'Intervals',
        indir      => 'ams',
        outdir     => 'intervals',
        script     => 'intervals.py',
        extra_args => ['-n', $max_procs],
    } ],
    [ diff => {
        name   => 'Diffs',
        indir  => 'intervals',
        outdir => 'diff',
        script => 'diff.py',
    } ],
);

for my $group (grep { $use_group{$_} } @allowed_data_groups) {
    print "\033[1m>>> $group <<<\033[0m\n";

    for my $entry (@stage1_steps) {
        my ($step, $params) = @$entry;
        next unless $do_steps{$step};
        exec_script(%$params, group => $group);
    }
} # end FOR $group

# If none of the steps from 'deltas' onward were requested, we can stop now.
my @stage2_step_names = @order[ get_order('deltas') .. $#order ];
exit 0 unless grep { $do_steps{$_} } @stage2_step_names;
#######################
# STAGE 2
#######################
#
# The steps from this point forward act on pairs of historical and
# projected (rcp85) models, so for either the NCAR-CCSM4 or GFDL-CM3 models,
# both the historical and rcp85 versions need to have been included.
#
# Print warnings to the user if any specified groups cannot be used.
if ($use_group{'ERA-Interim_historical'}) {
    print STDERR "\033[33;1mWARNING:\033[0m ERA-Interim_historical is not used".
        " for the remaining steps, so it will be ignored.\n";
}

# Models that come as a historical/rcp85 pair. This list fixes the order in
# which the models are reported and processed, so that every run (and every
# dry run) behaves the same instead of following Perl's hash ordering.
my @pair_models = qw(NCAR-CCSM4 GFDL-CM3);

for my $model (@pair_models) {
    # Exactly one half of the pair was selected: it cannot be used on its own.
    next unless ($use_group{"${model}_historical"} xor $use_group{"${model}_rcp85"});
    print STDERR "\033[33;1mWARNING:\033[0m Both the historical and projected".
        " (rcp85) versions of the $model model must be specified.".
        " But only one was specified, so it will be ignored.\n";
}

# Get which pairs of models were specified
my %pairs = map {
    ( $_ => ($use_group{"${_}_historical"} && $use_group{"${_}_rcp85"}) )
} @pair_models;
my @valid_pairs = grep { $pairs{$_} } @pair_models;

unless (@valid_pairs) {
    print STDERR "\033[31;1mERROR:\033[0m No valid sets of data groups were".
        " specified for the remaining steps.\n";
    exit 1;
}

# Run the requested stage 2 steps for each complete model pair. Here the
# "group" handed to the scripts is the model name without the
# _historical/_rcp85 suffix.
for my $group (@valid_pairs) {
    print "\033[1m>>> $group <<<\033[0m\n";

    #
    # Deltas
    #
    exec_script(
        name   => 'Deltas',
        indir  => 'diff',
        outdir => 'deltas',
        script => 'deltas.py',
        group  => $group
    ) if $do_steps{deltas};

    #
    # Warp
    # (the NOAA directory is only checked for existence by exec_script;
    # its path reaches the script through the -a argument)
    #
    exec_script(
        name       => 'Warp',
        indir      => [ 'deltas', 'NOAA' ],
        outdir     => 'warp',
        script     => 'warp.py',
        group      => $group,
        extra_args => ['-a', "$data_dir/NOAA/"]
    ) if $do_steps{warp};

    #
    # Multiply
    #
    exec_script(
        name       => 'Multiply',
        indir      => [ 'warp', 'NOAA' ],
        outdir     => 'multiply',
        script     => 'multiply.py',
        group      => $group,
        extra_args => ['-a', "$data_dir/NOAA/"]
    ) if $do_steps{multiply};

    #
    # Fudge
    #
    exec_script(
        name   => 'Fudging',
        indir  => 'multiply',
        outdir => 'fudge',
        script => 'fudge.py',
        group  => $group
    ) if $do_steps{fudge};

    #
    # Undiff
    #
    exec_script(
        name   => 'Undiff',
        indir  => 'fudge',
        outdir => 'undiff',
        script => 'undiff.py',
        group  => $group
    ) if $do_steps{undiff};
}
#######################
# Helper functions
#######################
use List::Util qw(first);
use File::Path qw(make_path);
=for comment
Execute the python script for a single pipeline step.
Arguments are interpreted as a hash with the following options.
- name: Name displayed in CLI output for this step
- indir: Directory (inside the data directory) containing the input files
for this step. If this is an arrayref, every listed directory is checked
for existence, but only the first one is passed to the script.
- outdir: Directory (inside the data directory) to put output files in
- script: File name of the python script inside the script directory
- group: Data group for this step (passed to the script with -d)
- extra_args: Arrayref of additional arguments to pass to the python script
On a dry run the command is only printed; no directory is checked or created.
Otherwise the command is run (through srun when inside SLURM) and the whole
program exits with status 2 unless the command finishes with status 0.

=cut

sub exec_script {
    my %params = @_;

    print "Step: \033[1m$params{name}\033[0m...\n";

    # Treat 'indir' uniformly as a list; its first entry is the directory
    # that is actually handed to the script.
    my @indirs = ref $params{indir} eq 'ARRAY'
        ? @{ $params{indir} }
        : ($params{indir});

    my $main_indir;
    if ($dry_run) {
        # Dry run: just build the name without checking for existence.
        $main_indir = $indirs[0];
        $main_indir .= "-$variant" if $variant;
    }
    else {
        # Confirm that every input directory exists (check_input_dir exits
        # otherwise) and make sure the output directory is there.
        my @found = map { check_input_dir($_) } @indirs;
        $main_indir = $found[0];
        create_output_dir($params{outdir});
    }

    my $outdir = $variant ? "$params{outdir}-$variant" : $params{outdir};

    # Assemble command
    my @command = (
        "$python_exec", "$script_dir/$params{script}",
        '-p', "$data_dir/$main_indir",
        '-o', "$data_dir/$outdir",
        '-d', $params{group},
    );
    push @command, @{ $params{extra_args} } if $params{extra_args};
    unshift @command, 'srun' if $in_slurm;

    # Dry run: show the command instead of executing it.
    if ($dry_run) {
        print "\033[3;36m@command\033[0m\n";
        return;
    }

    # Execute command (list form, so no shell is involved).
    system @command;
    my ($status, $os_error) = ($?, $!);
    return if $status == 0;

    # Any non-zero wait status is a failure. Checking only the exit code
    # ($? >> 8) would miss a child that was killed by a signal, because the
    # signal number lives in the low bits of $?.
    my $reason = $status == -1    ? "could not be started: $os_error"
               : ($status & 127)  ? 'was terminated by signal ' . ($status & 127)
               :                    'exited with status ' . ($status >> 8);
    print STDERR "\033[31;1mERROR:\033[0m Error occured executing step! Aborting...\n";
    print STDERR "(command $reason)\n";
    exit 2;
}
=for comment
Look up a step name and return its zero-based position in @order.
Dies with an "Invalid step name" message if no step has that name.

=cut

sub get_order {
    my $step = shift @_;

    # Walk the step list and stop at the first exact name match.
    for my $position (0 .. $#order) {
        return $position if $order[$position] eq $step;
    }

    die "Invalid step name '$step'.";
}
=for comment
Parser for DATA_GROUPS option (used as the --groups callback, so the option
value arrives as the second argument).
Parses a comma-separated list of group names. Each name is used as a regular
expression and selects every allowed data group that it matches. Dies if the
list is empty, if a name is empty or is not a valid regular expression, or if
a name matches no data group.

=cut

sub parse_data_groups {
    my $val = $_[1];

    return if ($val eq 'all'); # all is the default already, so do nothing
                               # if 'all' was specified.

    my @names = split /,/, $val;
    die "No data groups specified." unless @names;

    # Otherwise, assume all groups are false and just add the ones
    # that the user specified.
    $use_group{$_} = 0 foreach (keys %use_group);

    foreach my $name (@names) {
        # An empty name must be rejected explicitly: interpolated into a
        # match it becomes the empty pattern, which Perl treats as "reuse
        # the last successfully matched pattern".
        die "Empty name in list of data groups." unless length $name;

        # Compile the pattern up front so that a malformed expression gives
        # a readable message.
        my $pattern = eval { qr/$name/ }
            or die "'$name' is not a valid pattern: $@";

        my @matches = grep { $_ =~ $pattern } @allowed_data_groups
            or die "'$name' did not match any data groups.";

        # Alert the user if a specified name matched more than one group
        # (in case it was accidental)
        if (@matches > 1) {
            print STDERR "\033[36;1mINFO:\033[0m Pattern '$name' matched".
                " multiple groups: " . join(', ', @matches) . "\n";
        }

        $use_group{$_} = 1 foreach (@matches);
    }
}
=for comment
Parser for STEPS option (used as the --steps callback, so the option value
arrives as the second argument).
Parses a comma-separated list of step names and ranges (using the same
syntax as the cut command) and enables exactly the steps that are covered.
Dies if the list is empty, if a step name is invalid, if the syntax is
invalid, or if a range runs backwards (e.g. 'diff-ams').

=cut

sub parse_steps {
    my $val = $_[1];

    return if ($val eq 'all'); # all is the default already, so do nothing
                               # if 'all' was specified.

    my @segments = split /,/, $val;
    die "No steps specified." unless @segments;

    # Otherwise, assume all steps are false and just add the ones
    # that the user specified.
    $do_steps{$_} = 0 foreach (keys %do_steps);

    foreach my $segment (@segments) {
        # For each segment the user specified, determine the first and last
        # index of the matching slice of the step ordering.
        my ($from, $to);
        if ($segment =~ /^\w+$/) {
            $from = $to = get_order($segment);
        }
        elsif ($segment =~ /^-(\w+)$/) {
            ($from, $to) = (0, get_order($1));
        }
        elsif ($segment =~ /^(\w+)-$/) {
            ($from, $to) = (get_order($1), $#order);
        }
        elsif ($segment =~ /^(\w+)-(\w+)$/) {
            my ($first_name, $last_name) = ($1, $2);
            ($from, $to) = (get_order($first_name), get_order($last_name));
        }
        else {
            die "Invalid list syntax.";
        }

        # A backwards range would produce an empty slice and silently
        # select nothing, so report it instead.
        die "Invalid decreasing range '$segment'." if $from > $to;

        $do_steps{$_} = 1 foreach (@order[$from .. $to]);
    }
}
=for comment
Check that a directory in the data directory exists.
Prints an error and exits with status 1 if it does not.
If $variant is set, then looks for a directory name with "-$variant" appended
to it, but a directory name without the variant will still be accepted. Returns
the name of the found directory (relative to the data directory).

=cut

sub check_input_dir {
    my $dir = shift;

    # Use the same "is a variant set?" test as the code that names the
    # output directories, so input and output names always agree.
    if ($variant) {
        my $vardir = "$dir-$variant";
        return $vardir if -d "$data_dir/$vardir";
    }

    # Fall back to the plain directory name.
    return $dir if -d "$data_dir/$dir";

    print STDERR "\033[31;1mERROR:\033[0m Directory '$dir' does not exist in data directory.\n";
    exit 1;
}
=for comment
Create a directory (and any missing parents) inside the data directory.
Prints an error and exits with status 1 if make_path reports a failure.
If $variant is set, "-$variant" is appended to the directory name first.

=cut

sub create_output_dir {
    my ($dir) = @_;

    if ($variant) {
        $dir = "$dir-$variant";
    }

    # make_path stores any failures through the 'error' reference.
    my $failures;
    make_path("$data_dir/$dir", { error => \$failures });
    return unless $failures && @$failures;

    print STDERR "\033[31;1mERROR:\033[0m Unable to create directory '$dir' in data directory.\n";
    exit 1;
}