-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path: finetune_32.2M_mae_autocalibration.yaml
149 lines (143 loc) · 5.48 KB
/
finetune_32.2M_mae_autocalibration.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# finetune_32.2M_mae_autocalibration.yaml

# general
log_level: 'DEBUG'
experiment:
  name: null # generate random name in wandb when set to null
  project: "sdofm"
  task: "finetune" # options: train, evaluate (not implemented)
  model: "autocalibration"
  resuming: false
  checkpoint: null # this is the wandb run_id of the checkpoint to load
  backbone:
    checkpoint: "model-tk45el88:best" #"sdofm/runs/771lx6o3:best"
    model: "mae"
  seed: 0
  disable_cuda: false
  wandb:
    enable: true
    entity: "fdlx"
    group: "sdofm-phase1"
    job_type: "finetune"
    tags: []
    notes: ""
    output_directory: "wandb_output"
    log_model: "all" # can be True (final checkpoint), False (no checkpointing), or "all" (for all epoches)
  gcp_storage: # this will checkpoint all epoches and upload them to a GCP bucket, W&B will store references (TODO: perhaps explain this better)
    enabled: true
    bucket: "sdofm-checkpoints"
    fold: null
  evaluate: false # skip training and only evaluate (requires checkpoint to be set)
  device: null # this is set automatically using the disable_cuda flag and torch.cuda.is_available()
  precision: 'bf16-true' # (32, 64) for cuda, ('32-true', '16-true', 'bf16-true') for tpu
  log_n_batches: 1000 # log every n training batches
  save_results: true # save full results to file and wandb
  accelerator: "auto" # options are "auto", "gpu", "tpu", "ipu", or "cpu"
  profiler: null # options are 'XLAProfiler' (TPU), 'PyTorchProfiler', warning: PyTorchProfiler only works on cpu/gpu according to docs
  distributed:
    enabled: true # set to true to use more than one device
    world_size: "auto" # The "auto" option recognizes the machine you are on, and selects the appropriate number of accelerators.
    strategy: "auto"
  log_every_n_steps: 5
# dataset configuration
data:
  min_date: '2011-10-01 00:00:00.00' # minimum is '2010-09-09 00:00:11.08'
  max_date: '2011-12-31 23:59:59.99' # maximum is '2023-05-26 06:36:08.072'
  month_splits: # non selected months will form training set
    # train: [1,2,3,4,5,6,7,8,9,10]
    val: [11]
    test: [12]
    holdout: []
  num_workers: 8 # set appropriately for your machine
  prefetch_factor: 3 # TODO: not implemented, 2 is default
  num_frames: 1 # WARNING: This is only read for FINETUNING, model num_frames overrides in BACKBONE
  sdoml:
    base_directory: "/mnt/sdoml"
    sub_directory:
      hmi: "HMI.zarr"
      aia: "AIA.zarr"
      eve: "EVE_legacy.zarr"
      cache: "cache"
    components: null # null for select all magnetic components ["Bx", "By", "Bz"]
    wavelengths: null # null for select all wavelengths channels ["131A","1600A","1700A","171A","193A","211A","304A","335A","94A"]
    ions: null # null to select all ion channels ["C III", "Fe IX", "Fe VIII", "Fe X", "Fe XI", "Fe XII", "Fe XIII", "Fe XIV", "Fe XIX", "Fe XV", "Fe XVI", "Fe XVIII", "Fe XVI_2", "Fe XX", "Fe XX_2", "Fe XX_3", "H I", "H I_2", "H I_3", "He I", "He II", "He II_2", "He I_2", "Mg IX", "Mg X", "Mg X_2", "Ne VII", "Ne VIII", "O II", "O III", "O III_2", "O II_2", "O IV", "O IV_2", "O V", "O VI", "S XIV", "Si XII", "Si XII_2"]
    frequency: '12min' # smallest is 12min
    mask_with_hmi_threshold: null # None/null for no mask, float for threshold
# model configurations
model:
  # PRETRAINERS
  mae:
    img_size: 512
    patch_size: 16
    num_frames: 1
    tubelet_size: 1
    in_chans: 9
    embed_dim: 512
    depth: 24
    num_heads: 16
    decoder_embed_dim: 512
    decoder_depth: 8
    decoder_num_heads: 16
    mlp_ratio: 4.0
    norm_layer: 'LayerNorm'
    norm_pix_loss: false # canonical lowercase boolean (same parsed value as "False")
    masking_ratio: 0.5
  samae:
    # uses all parameters as in mae plus these
    masking_type: "random" # 'random' or 'solar_aware'
    active_region_mu_degs: 15.73
    active_region_std_degs: 6.14
    active_region_scale: 1.0
    active_region_abs_lon_max_degs: 60
    active_region_abs_lat_max_degs: 60
  nvae:
    use_se: true
    res_dist: true
    num_x_bits: 8
    num_latent_scales: 3 # 5
    num_groups_per_scale: 1 # 16
    num_latent_per_group: 1 # 10
    ada_groups: true
    min_groups_per_scale: 1
    num_channels_enc: 30
    num_channels_dec: 30
    num_preprocess_blocks: 2 # 1
    num_preprocess_cells: 2
    num_cell_per_cond_enc: 2
    num_postprocess_blocks: 2 # 1
    num_postprocess_cells: 2
    num_cell_per_cond_dec: 2
    num_mixture_dec: 1
    num_nf: 2
    kl_anneal_portion: 0.3
    kl_const_portion: 0.0001
    kl_const_coeff: 0.0001
    # learning_rate: 1e-2
    # weight_decay: 3e-4
    weight_decay_norm_anneal: true
    weight_decay_norm_init: 1.0 # canonical float form (same value as "1.")
    weight_decay_norm: 1e-2
  # FINE-TUNERS
  autocalibration:
    num_neck_filters: 32
    output_dim: 9 # not sure why this is implemented for autocorrelation, should be a scalar
    loss: "mse" # options: "mse", "heteroscedastic"
    freeze_encoder: true
  # ML optimization arguments:
  opt:
    loss: "mse" # options: "mae", "mse", "mape"
    scheduler: "constant" #other options: "cosine", "plateau", "exp"
    scheduler_warmup: 0
    batch_size: 1
    learning_rate: 0.0001
    weight_decay: 3e-4 # 0.0
    optimiser: "adam"
    epochs: 100
    patience: 2
# hydra configuration
hydra:
  mode: RUN
  # run:
  #   dir: ${data.output_directory}/${now:%Y-%m-%d-%H-%M-%S}
  # sweep:
  #   dir: ${hydra.run.dir}
  #   subdir: ${hydra.job.num}