Commit: update param

mieskolainen committed Jul 23, 2024
1 parent: dd33f69 · commit: 33479ca
Showing 4 changed files with 62 additions and 54 deletions.
10 changes: 5 additions & 5 deletions configs/zee/active_models.yml
@@ -13,13 +13,13 @@
#- lzmlp3
#- lzmlp4

# Neural Nets
- fastkan0
- lzmlp0
- dmlp0

# BDT based
- iceboost_swd
- iceboost0
#- iceboost_plus
- xgb0

# Neural Nets
- fastkan0
- lzmlp0
- dmlp0
88 changes: 44 additions & 44 deletions configs/zee/models.yml
@@ -50,7 +50,7 @@ iceboost3D:
plot_trees: false

# Read/Write of epochs
evalmode: 10 # Evaluation and saving of the model every n-th epoch (int) during training
evalmode: 5 # Evaluation and saving of the model every n-th epoch (int) during training
readmode: -1 # -1 takes the minimum loss model
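
For orientation, a minimal sketch of what an evalmode/readmode pair of this kind typically amounts to: evaluate and checkpoint every n-th epoch, and with readmode: -1 restore whichever checkpoint had the lowest validation loss. The helper names and paths below are illustrative, not the icenet implementation.

import torch

def train_with_checkpoints(model, train_one_epoch, evaluate, epochs, evalmode, ckpt_dir):
    """Evaluate and save the model every `evalmode`-th epoch."""
    val_loss = {}
    for epoch in range(1, epochs + 1):
        train_one_epoch(model)
        if epoch % evalmode == 0:
            val_loss[epoch] = evaluate(model)                      # validation loss at this epoch
            torch.save(model.state_dict(), f"{ckpt_dir}/epoch_{epoch}.pt")
    return val_loss

def load_checkpoint(model, val_loss, ckpt_dir, readmode=-1):
    """readmode = -1 picks the minimum-loss epoch; a positive value picks that epoch."""
    epoch = min(val_loss, key=val_loss.get) if readmode == -1 else readmode
    model.load_state_dict(torch.load(f"{ckpt_dir}/epoch_{epoch}.pt"))
    return model, epoch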


@@ -71,11 +71,11 @@ xgb0:

# booster parameters
model_param:
num_boost_round: 500 # number of epochs (equal to the number of trees!)
num_boost_round: 500 # number of epochs (equal to the number of trees!)

booster: 'gbtree' # 'gbtree' (default), 'dart' (dropout boosting)
booster: 'gbtree' # 'gbtree' (default), 'dart' (dropout boosting)
tree_method: 'hist'
device: 'auto' # 'auto', 'cpu', 'cuda'
device: 'auto' # 'auto', 'cpu', 'cuda'

learning_rate: 0.08
gamma: 1.5
@@ -99,7 +99,7 @@ xgb0:
plot_trees: false

# Read/Write of epochs
evalmode: 10 # Evaluation and saving of the model every n-th epoch (int) during training
evalmode: 5 # Evaluation and saving of the model every n-th epoch (int) during training
readmode: -1 # -1 takes the minimum loss model


@@ -117,25 +117,25 @@ iceboost0: &ICEBOOST0

# booster parameters
model_param:
num_boost_round: 500 # number of epochs (equal to the number of trees!)
num_boost_round: 500 # number of epochs (equal to the number of trees!)

booster: 'gbtree' # 'gbtree' (default), 'dart' (dropout boosting)
booster: 'gbtree' # 'gbtree' (default), 'dart' (dropout boosting)
tree_method: 'hist'
device: 'auto' # 'auto', 'cpu', 'cuda'
device: 'auto' # 'auto', 'cpu', 'cuda'

learning_rate: 0.08
gamma: 1.5
max_depth: 13
min_child_weight: 1.0
max_delta_step: 1.0
subsample: 1

colsample_bytree: 0.9
colsample_bylevel: 0.9
colsample_bynode: 0.9
learning_rate: 0.08 # Learning rate (slower -> more conservative updates)
gamma: 1.5 # Minimum loss reduction required to make a further partition on a leaf node
max_depth: 13 # Maximum depth of a tree (too high may overfit)
min_child_weight: 1.0 # Minimum sum of instance weight (hessian) needed in a child (higher -> more conservative)
max_delta_step: 1.0 # Constraint on the maximum change in the model's weights
subsample: 1 # Subsample ratio of the training instance (=1 uses all data)

reg_lambda: 2.0 # L2 regularization
reg_alpha: 0.05 # L1 regularization
colsample_bytree: 0.9 # Specifies the fraction of features to be randomly selected for each tree
colsample_bylevel: 0.9 # As 'bytree' but applied to individual tree levels
colsample_bynode: 0.9 # The fraction of features to be randomly selected for each split in a tree

reg_lambda: 2.0 # L2 reg. A penalty to the weights of the features to prevent overfitting
reg_alpha: 0.05 # L1 reg. A penalty to the absolute value of the weights to induce sparsity

# learning task parameters
objective: 'custom:binary_cross_entropy' # Note that 'multi:softprob' does not work with distillation
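
As a rough illustration of how booster parameters of this shape map onto plain xgboost (the icenet pipeline wraps this with its own custom objective and evaluation loop; the toy data below is made up):

import numpy as np
import xgboost as xgb

# Toy stand-in data for the real training set
X = np.random.rand(1000, 20)
y = np.random.randint(0, 2, size=1000)
dtrain = xgb.DMatrix(X, label=y)

params = {
    'booster':           'gbtree',            # 'dart' would enable dropout boosting
    'tree_method':       'hist',
    'device':            'cpu',               # the config's 'auto' is resolved by the framework
    'learning_rate':     0.08,
    'gamma':             1.5,
    'max_depth':         13,
    'min_child_weight':  1.0,
    'max_delta_step':    1.0,
    'subsample':         1,
    'colsample_bytree':  0.9,
    'colsample_bylevel': 0.9,
    'colsample_bynode':  0.9,
    'reg_lambda':        2.0,                 # L2 regularization
    'reg_alpha':         0.05,                # L1 regularization
    'objective':         'binary:logistic',   # stands in for the custom BCE objective
    'eval_metric':       'logloss',
}

# num_boost_round (= number of trees) is an xgb.train() argument, not a booster parameter
booster = xgb.train(params, dtrain, num_boost_round=500)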
@@ -154,7 +154,7 @@ iceboost0: &ICEBOOST0
plot_trees: false

# Read/Write of epochs
evalmode: 10 # Evaluation and saving of the model every n-th epoch (int) during training
evalmode: 5 # Evaluation and saving of the model every n-th epoch (int) during training
readmode: -1 # -1 takes the minimum loss model


@@ -180,7 +180,7 @@ iceboost_swd:
p: 1 # p-norm (1,2, ...)
num_slices: 500 # Number of MC projections (Higher the better)
mode: 'SWD' # 'SWD' (basic), 'EBSW' (see icefit/transport.py)
max_N: 500000 # Max events limit (500k & 500 slices works with 32 GB Nvidia V100)
max_N: 400000 # Max events limit (400k & 500 slices works with 32 GB Nvidia V100)


# ICEBOOST with an additional re-weighting in-the-loop regularization
@@ -233,13 +233,13 @@ lzmlp0: &LZMLP
out_dim: 1 # We want to use sigmoid 1D-output model, comment out for default softmax multiclass

model_param:
mlp_dim: [64, 64, 64, 64] # hidden layer dimensions
mlp_dim: [64, 64, 64, 64] # hidden layer dimensions
activation: 'silu'
layer_norm: True
batch_norm: False # normalization layers & dropout can be ill-posed here (operators not 1-to-1 compatible with weighted events)
dropout: 0.01
act_after_norm: True

# Optimization
opt_param:
lossfunc: 'binary_cross_entropy' # binary_cross_entropy, cross_entropy, focal_entropy, logit_norm_cross_entropy
@@ -248,21 +248,21 @@ lzmlp0: &LZMLP

SWD_beta: 0.01 # Sliced Wasserstein [reweighting regularization]
SWD_p: 1 # p-norm (1,2,..), 1 perhaps more robust
SWD_num_slices: 1000 # Number of MC projections (higher the better)
SWD_num_slices: 10000 # Number of MC projections (higher the better)
SWD_mode: 'SWD' # 'SWD' (basic), 'EBSW' (see icefit/transport.py)

lipschitz_beta: 5.0e-5 # lipschitz regularization (use with 'lzmlp')
#logit_L1_beta: 1.0e-2 # logit norm reg. ~ beta * torch.sum(|logits|)
logit_L2_beta: 5.0e-3 # logit norm reg. ~ beta * torch.sum(logits**2)

#gamma: -0.5 # focal_entropy "exponent"
#gamma: -0.5 # focal_entropy "exponent"
#temperature: 1 # logit_norm_cross_entropy "temperature"

optimizer: 'AdamW'
clip_norm: 1.0

epochs: 300
batch_size: 8096
epochs: 150
batch_size: 4096
lr: 5.0e-4
weight_decay: 1.0e-4 # L2-regularization

@@ -281,7 +281,7 @@ lzmlp0: &LZMLP
tensorboard: true

# Read/Write of epochs
evalmode: 10 # Evaluation and saving of the model every n-th epoch (int) during training
evalmode: 5 # Evaluation and saving of the model every n-th epoch (int) during training
readmode: -1 # -1 takes the minimum loss model

eval_batch_size: 4096
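
The SWD_* options above add a Sliced Wasserstein term on top of the classification loss, using the sliced_wasserstein_distance routine from icefit/transport.py that is touched later in this commit. How the two terms are combined is sketched below under the simplest assumption (loss = BCE + SWD_beta * SWD); treat it as illustrative rather than the exact icenet loss composition.

import torch
import torch.nn.functional as F
from icefit.transport import sliced_wasserstein_distance

def total_loss(logits, targets, weights, x_source, x_target, w_source, w_target,
               SWD_beta=0.01, SWD_p=1, SWD_num_slices=10000, SWD_mode='SWD'):
    # Event-weighted binary cross entropy on the classifier logits
    bce = F.binary_cross_entropy_with_logits(logits, targets.float(), weight=weights)

    # Sliced Wasserstein penalty between the (re-weighted) source and target samples
    swd = sliced_wasserstein_distance(u_values=x_source, v_values=x_target,
                                      u_weights=w_source, v_weights=w_target,
                                      p=SWD_p, num_slices=SWD_num_slices, mode=SWD_mode)
    return bce + SWD_beta * swd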
@@ -330,13 +330,13 @@ fastkan0: &FASTKAN

# Optimization
opt_param:
lossfunc: 'binary_cross_entropy' # binary_cross_entropy, cross_entropy, focal_entropy, logit_norm_cross_entropy
#lossfunc: 'binary_Lq_entropy' # binary_cross_entropy, cross_entropy, focal_entropy, logit_norm_cross_entropy
#q: 0.8 # Lq exponent (q < 1 -> high density vals emphasized, q > 1 then low emphasized)
lossfunc: 'binary_cross_entropy' # binary_cross_entropy, cross_entropy, focal_entropy, logit_norm_cross_entropy
#lossfunc: 'binary_Lq_entropy' # binary_cross_entropy, cross_entropy, focal_entropy, logit_norm_cross_entropy
#q: 0.8 # Lq exponent (q < 1 -> high density vals emphasized, q > 1 then low emphasized)

SWD_beta: 0.01 # Sliced Wasserstein [reweighting regularization]
SWD_p: 1 # p-norm (1,2,..), 1 perhaps more robust
SWD_num_slices: 1000 # Number of MC projections (higher the better)
SWD_num_slices: 10000 # Number of MC projections (higher the better)
SWD_mode: 'SWD' # 'SWD' (basic), 'EBSW' (see icefit/transport.py)

#lipshitz_beta: 1.0e-4 # Lipshitz regularization (use with 'lzmlp')
@@ -349,10 +349,10 @@ fastkan0: &FASTKAN
optimizer: 'AdamW'
clip_norm: 1.0

epochs: 300
epochs: 150
batch_size: 4096
lr: 5.0e-4
weight_decay: 1.0e-2 # L2-regularization
weight_decay: 1.0e-4 # L2-regularization

# Scheduler
scheduler_param:
@@ -368,13 +368,13 @@ fastkan0: &FASTKAN
tensorboard: true

# Read/Write of epochs
evalmode: 10 # Evaluation and saving of the model every n-th epoch (int) during training
evalmode: 5 # Evaluation and saving of the model every n-th epoch (int) during training
readmode: -1 # -1 takes the minimum loss model

eval_batch_size: 4096

# Deploy (or test) mode device
deploy_device: 'cpu' # 'auto', 'cpu', 'cuda'
deploy_device: 'cpu' # 'auto', 'cpu', 'cuda'


## FastKAN
@@ -419,7 +419,7 @@ dmlp0: &DMLP
skip_connections: False
last_tanh: True # Extra tanh layer
last_tanh_scale: 10.0 # Scale after tanh()

# Optimization
opt_param:
lossfunc: 'binary_cross_entropy' # binary_cross_entropy, cross_entropy, focal_entropy, logit_norm_cross_entropy
@@ -428,20 +428,20 @@ dmlp0: &DMLP

SWD_beta: 0.01 # Sliced Wasserstein [reweighting regularization]
SWD_p: 1 # p-norm (1,2,..), 1 perhaps more robust
SWD_num_slices: 1000 # Number of MC projections (higher the better)
SWD_num_slices: 10000 # Number of MC projections (higher the better)
SWD_mode: 'SWD' # 'SWD' (basic), 'EBSW' (see icefit/transport.py)

#logit_L1_beta: 1.0e-2 # logit norm reg. ~ lambda * torch.sum(|logits|)
logit_L2_beta: 5.0e-3 # logit norm reg. ~ lambda * torch.sum(logits**2)

#gamma: 2 # focal_entropy "exponent"
#temperature: 1 # logit_norm_cross_entropy "temperature"

optimizer: 'AdamW'
clip_norm: 1.0

epochs: 300
batch_size: 8096
epochs: 150
batch_size: 4096
lr: 5.0e-4
weight_decay: 1.0e-4 # L2-regularization

@@ -459,7 +459,7 @@ dmlp0: &DMLP
tensorboard: true

# Read/Write of epochs
evalmode: 10 # Evaluation and saving of the model every n-th epoch (int) during training
evalmode: 5 # Evaluation and saving of the model every n-th epoch (int) during training
readmode: -1 # -1 takes the minimum loss model

eval_batch_size: 4096
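The neural-network blocks above share the same optimization recipe: AdamW with lr 5.0e-4, weight_decay as an L2-type penalty, and gradient-norm clipping at clip_norm: 1.0. A self-contained toy training step under those settings; the model, loss, and loader here are placeholders, not the icenet objects.

import torch

model   = torch.nn.Sequential(torch.nn.Linear(8, 64), torch.nn.SiLU(), torch.nn.Linear(64, 1))
loss_fn = torch.nn.BCEWithLogitsLoss()
opt     = torch.optim.AdamW(model.parameters(), lr=5.0e-4, weight_decay=1.0e-4)

def train_epoch(loader, clip_norm=1.0):
    model.train()
    for x, y in loader:
        opt.zero_grad()
        loss = loss_fn(model(x).squeeze(-1), y.float())
        loss.backward()
        # 'clip_norm' in the config corresponds to clipping the global gradient norm
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
        opt.step()
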
4 changes: 2 additions & 2 deletions icefit/transport.py
@@ -167,7 +167,7 @@ def sliced_W_vectorized(u_values: torch.Tensor, v_values: torch.Tensor,

def sliced_wasserstein_distance(u_values: torch.Tensor, v_values: torch.Tensor,
u_weights: torch.Tensor=None, v_weights: torch.Tensor=None,
p: int=1, num_slices: int=1000, mode='EBSW', vectorized=True):
p: int=1, num_slices: int=1000, mode='SWD', vectorized=True):
"""
Sliced Wasserstein Distance over arbitrary dimensional samples
@@ -187,7 +187,7 @@ def sliced_wasserstein_distance(u_values: torch.Tensor, v_values: torch.Tensor,
p: p-norm parameter (p = 1 is 'Earth Movers', 2 = is W-2, ...)
num_slices: number of random MC projections (slices) (higher the better)
mode: 'SWD' (basic uniform MC random)
'EBSW' (faster convergence and smaller variance)
'EBSW' (may have faster convergence and smaller variance)
vectorized: fully vectorized (may take more GPU/CPU memory, but 10x faster)
Returns:
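For readers unfamiliar with the routine being modified here, a compact sketch of a uniform-slice ('SWD' mode) Sliced Wasserstein distance. It assumes equal, unweighted sample sizes and omits the event weights and the 'EBSW' energy-based slice weighting that the repository function supports.

import torch

def swd_sketch(u: torch.Tensor, v: torch.Tensor, p: int = 1, num_slices: int = 1000):
    """u: (N, D), v: (N, D); returns a Monte Carlo estimate of the p-sliced Wasserstein distance."""
    D = u.shape[1]
    theta = torch.randn(num_slices, D)
    theta = theta / theta.norm(dim=1, keepdim=True)          # random unit projection directions

    u_proj = (u @ theta.T).sort(dim=0).values                # (N, num_slices), sorted per slice
    v_proj = (v @ theta.T).sort(dim=0).values                # sorting solves 1D optimal transport

    # W_p^p on each 1D slice (equal sample sizes assumed), then average over slices and take the root
    per_slice = (u_proj - v_proj).abs().pow(p).mean(dim=0)
    return per_slice.mean().pow(1.0 / p)

# Example: two 5-dimensional samples that differ by a small shift
print(swd_sketch(torch.randn(2000, 5), torch.randn(2000, 5) + 0.1))
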
14 changes: 11 additions & 3 deletions icenet/deep/train.py
@@ -513,21 +513,29 @@ def torch_construct(X_trn, Y_trn, X_val, Y_val, X_trn_2D, X_val_2D, trn_weights,
validation_set = optimize.Dataset(X=X_val, Y=Y_val, W=val_weights, Y_DA=Y_val_DA, W_DA=val_weights_DA, X_MI=data_val_MI)

### ** Optimization hyperparameters [possibly from Raytune] **
opt_param = aux.replace_param(default=param['opt_param'], raytune=config['params'])
opt_param = aux.replace_param(default=param['opt_param'], raytune=config['params'])

# N.B. We use 'sampler' with 'BatchSampler', which loads a set of events using multiple event indices (faster) than the default
# one which takes events one-by-one and concatenates the results (slow).

## ------------------------
# If True, then all batches are the same size (i.e. the last small one is skipped)
if 'drop_last' in opt_param:
drop_last = opt_param['drop_last']
else:
drop_last = True

params_train = {'batch_size' : None,
'num_workers' : param['num_workers'],
'sampler' : torch.utils.data.BatchSampler(
torch.utils.data.RandomSampler(training_set), opt_param['batch_size'], drop_last=False
torch.utils.data.RandomSampler(training_set), opt_param['batch_size'], drop_last=drop_last
),
'pin_memory' : True}

params_test = {'batch_size' : None,
'num_workers' : param['num_workers'],
'sampler' : torch.utils.data.BatchSampler(
torch.utils.data.RandomSampler(validation_set), param['eval_batch_size'], drop_last=False
torch.utils.data.RandomSampler(validation_set), param['eval_batch_size'], drop_last=drop_last
),
'pin_memory' : True}

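For context on this change: with batch_size=None the DataLoader disables automatic batching, and the BatchSampler hands a whole list of indices to the dataset in one call, which is why it is faster than per-event fetching; drop_last=True then skips the final, smaller batch so every batch has the same size. A self-contained toy version of the pattern, with TensorDataset standing in for optimize.Dataset:

import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, BatchSampler

X, Y    = torch.randn(1000, 8), torch.randint(0, 2, (1000,))
dataset = TensorDataset(X, Y)      # supports fetching a list of indices in a single call

loader = DataLoader(
    dataset,
    batch_size=None,               # automatic batching off; the sampler already yields index batches
    sampler=BatchSampler(RandomSampler(dataset), batch_size=256, drop_last=True),
    num_workers=0,
    pin_memory=True,
)

for x, y in loader:
    print(x.shape)                 # torch.Size([256, 8]) for every batch when drop_last=True
    break

Note also that the new if/else lookup in the diff is equivalent to drop_last = opt_param.get('drop_last', True).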
