-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path: finetune_32.2M_mae_autocalibration.yaml
149 lines (143 loc) · 5.48 KB
/
finetune_32.2M_mae_autocalibration.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# finetune_32.2M_mae_autocalibration.yaml

# general
log_level: 'DEBUG'
experiment:
  name: null # generate random name in wandb when set to null
  project: "sdofm"
  task: "finetune" # options: train, evaluate (not implemented)
  model: "autocalibration"
  resuming: false
  checkpoint: null # this is the wandb run_id of the checkpoint to load
  backbone:
    checkpoint: "model-tk45el88:best" #"sdofm/runs/771lx6o3:best"
    model: "mae"
  seed: 0
  disable_cuda: false
  wandb:
    enable: true
    entity: "fdlx"
    group: "sdofm-phase1"
    job_type: "finetune"
    tags: []
    notes: ""
    output_directory: "wandb_output"
    log_model: "all" # can be True (final checkpoint), False (no checkpointing), or "all" (for all epoches)
  gcp_storage: # this will checkpoint all epoches and upload them to a GCP bucket, W&B will store references (TODO: perhaps explain this better)
    enabled: true
    bucket: "sdofm-checkpoints"
    fold: null
  evaluate: false # skip training and only evaluate (requires checkpoint to be set)
  device: null # this is set automatically using the disable_cuda flag and torch.cuda.is_available()
  precision: 'bf16-true' # (32, 64) for cuda, ('32-true', '16-true', 'bf16-true') for tpu
  log_n_batches: 1000 # log every n training batches
  save_results: true # save full results to file and wandb
  accelerator: "auto" # options are "auto", "gpu", "tpu", "ipu", or "cpu"
  profiler: null # options are 'XLAProfiler' (TPU), 'PyTorchProfiler', warning: PyTorchProfiler only works on cpu/gpu according to docs
  distributed:
    enabled: true # set to true to use more than one device
    world_size: "auto" # The "auto" option recognizes the machine you are on, and selects the appropriate number of accelerators.
    strategy: "auto"
  log_every_n_steps: 5
# dataset configuration
data:
  min_date: '2011-10-01 00:00:00.00' # minimum is '2010-09-09 00:00:11.08'
  max_date: '2011-12-31 23:59:59.99' # maximum is '2023-05-26 06:36:08.072'
  month_splits: # non selected months will form training set
    # train: [1,2,3,4,5,6,7,8,9,10]
    val: [11]
    test: [12]
    holdout: []
  num_workers: 8 # set appropriately for your machine
  prefetch_factor: 3 # TODO: not implemented, 2 is default
  num_frames: 1 # WARNING: This is only read for FINETUNING, model num_frames overrides in BACKBONE
  sdoml:
    base_directory: "/mnt/sdoml"
    sub_directory:
      hmi: "HMI.zarr"
      aia: "AIA.zarr"
      eve: "EVE_legacy.zarr"
      cache: "cache"
    components: null # null for select all magnetic components ["Bx", "By", "Bz"]
    wavelengths: null # null for select all wavelengths channels ["131A","1600A","1700A","171A","193A","211A","304A","335A","94A"]
    ions: null # null to select all ion channels ["C III", "Fe IX", "Fe VIII", "Fe X", "Fe XI", "Fe XII", "Fe XIII", "Fe XIV", "Fe XIX", "Fe XV", "Fe XVI", "Fe XVIII", "Fe XVI_2", "Fe XX", "Fe XX_2", "Fe XX_3", "H I", "H I_2", "H I_3", "He I", "He II", "He II_2", "He I_2", "Mg IX", "Mg X", "Mg X_2", "Ne VII", "Ne VIII", "O II", "O III", "O III_2", "O II_2", "O IV", "O IV_2", "O V", "O VI", "S XIV", "Si XII", "Si XII_2"]
    frequency: '12min' # smallest is 12min
    mask_with_hmi_threshold: null # None/null for no mask, float for threshold
# model configurations
model:
  # PRETRAINERS
  mae:
    img_size: 512
    patch_size: 16
    num_frames: 1
    tubelet_size: 1
    in_chans: 9
    embed_dim: 512
    depth: 24
    num_heads: 16
    decoder_embed_dim: 512
    decoder_depth: 8
    decoder_num_heads: 16
    mlp_ratio: 4.0
    norm_layer: 'LayerNorm'
    norm_pix_loss: false # canonical lowercase boolean (same parsed value as "False")
    masking_ratio: 0.5
  samae:
    # uses all parameters as in mae plus these
    masking_type: "random" # 'random' or 'solar_aware'
    active_region_mu_degs: 15.73
    active_region_std_degs: 6.14
    active_region_scale: 1.0
    active_region_abs_lon_max_degs: 60
    active_region_abs_lat_max_degs: 60
  nvae:
    use_se: true
    res_dist: true
    num_x_bits: 8
    num_latent_scales: 3 # 5
    num_groups_per_scale: 1 # 16
    num_latent_per_group: 1 # 10
    ada_groups: true
    min_groups_per_scale: 1
    num_channels_enc: 30
    num_channels_dec: 30
    num_preprocess_blocks: 2 # 1
    num_preprocess_cells: 2
    num_cell_per_cond_enc: 2
    num_postprocess_blocks: 2 # 1
    num_postprocess_cells: 2
    num_cell_per_cond_dec: 2
    num_mixture_dec: 1
    num_nf: 2
    kl_anneal_portion: 0.3
    kl_const_portion: 0.0001
    kl_const_coeff: 0.0001
    # learning_rate: 1e-2
    # weight_decay: 3e-4
    weight_decay_norm_anneal: true
    weight_decay_norm_init: 1.0 # canonical float form (same value as "1.")
    weight_decay_norm: 1e-2
  # FINE-TUNERS
  autocalibration:
    num_neck_filters: 32
    output_dim: 9 # not sure why this is implemented for autocorrelation, should be a scalar
    loss: "mse" # options: "mse", "heteroscedastic"
    freeze_encoder: true
  # ML optimization arguments:
  opt:
    loss: "mse" # options: "mae", "mse", "mape"
    scheduler: "constant" #other options: "cosine", "plateau", "exp"
    scheduler_warmup: 0
    batch_size: 1
    learning_rate: 0.0001
    weight_decay: 3e-4 # 0.0
    optimiser: "adam"
    epochs: 100
    patience: 2
# hydra configuration
hydra:
  mode: RUN
  # run:
  #   dir: ${data.output_directory}/${now:%Y-%m-%d-%H-%M-%S}
  # sweep:
  #   dir: ${hydra.run.dir}
  #   subdir: ${hydra.job.num}