Skip to content

Commit

Permalink
update golden configs
Browse files Browse the repository at this point in the history
  • Loading branch information
apoorvtintin committed Jan 23, 2025
1 parent 33aa361 commit fe96240
Show file tree
Hide file tree
Showing 40 changed files with 3,704 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,101 @@ mesh_axis_names[2]: 'expert'
mesh_axis_names[3]: 'fsdp'
mesh_axis_names[4]: 'seq'
mesh_axis_names[5]: 'model'
mesh_rules[0][0]: 'neuron-(trn2|trn2n).48xlarge-64'
mesh_rules[0][1].config_modifiers[0].klass: 'axlearn.common.trainer_config_modifier.MeshShapeModifier'
mesh_rules[0][1].config_modifiers[0].mesh_shape[0]: 1
mesh_rules[0][1].config_modifiers[0].mesh_shape[1]: 1
mesh_rules[0][1].config_modifiers[0].mesh_shape[2]: 1
mesh_rules[0][1].config_modifiers[0].mesh_shape[3]: -1
mesh_rules[0][1].config_modifiers[0].mesh_shape[4]: 1
mesh_rules[0][1].config_modifiers[0].mesh_shape[5]: 4
mesh_rules[0][1].config_modifiers[1].klass: 'axlearn.common.trainer_config_modifier.ModelConfigModifier'
mesh_rules[0][1].config_modifiers[1].modification.klass: 'axlearn.common.attention.StackedTransformerLayer'
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.activation: 'nn.relu'
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.dropout.klass: 'axlearn.common.layers.Dropout'
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.klass: 'axlearn.common.attention.TransformerFeedForwardLayer'
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.linear1.bias: True
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.linear1.klass: 'axlearn.common.layers.Linear'
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.linear1.param_partition_spec[0]: None
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.linear1.param_partition_spec[1]: 'model'
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.linear2.bias: True
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.linear2.klass: 'axlearn.common.layers.Linear'
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.linear2.param_partition_spec[0]: 'model'
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.linear2.param_partition_spec[1]: None
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.norm.eps: 1e-08
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.norm.forward_dtype: 'jax.numpy.float32'
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.norm.klass: 'axlearn.common.layers.LayerNorm'
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.residual_weight: 1.0
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.stochastic_depth.klass: 'axlearn.common.layers.StochasticDepth'
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.stochastic_depth.mode: 'row'
mesh_rules[0][1].config_modifiers[1].modification.layer.feed_forward.structure: 'prenorm'
mesh_rules[0][1].config_modifiers[1].modification.layer.klass: 'axlearn.common.attention.TransformerLayer'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.attention.dropout.klass: 'axlearn.common.layers.Dropout'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.attention.input_linear.klass: 'axlearn.common.attention.QKVLinear'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.attention.input_linear.layer.bias: True
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.attention.input_linear.layer.klass: 'axlearn.common.attention.MultiheadInputLinear'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.attention.input_linear.layer.param_partition_spec[0]: None
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.attention.input_linear.layer.param_partition_spec[1]: 'model'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.attention.input_linear.layer.param_partition_spec[2]: None
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.attention.key_scale.klass: 'axlearn.common.attention.ScaleKey'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.attention.klass: 'axlearn.common.attention.MultiheadAttention'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.attention.output_linear.bias: True
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.attention.output_linear.klass: 'axlearn.common.attention.MultiheadOutputLinear'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.attention.output_linear.param_partition_spec[0]: None
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.attention.output_linear.param_partition_spec[1]: 'model'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.attention.output_linear.param_partition_spec[2]: None
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.attention.query_scale.klass: 'axlearn.common.attention.ScaleQuery'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.dropout.klass: 'axlearn.common.layers.Dropout'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.klass: 'axlearn.common.attention.TransformerAttentionLayer'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.norm.eps: 1e-08
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.norm.forward_dtype: 'jax.numpy.float32'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.norm.klass: 'axlearn.common.layers.LayerNorm'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.stochastic_depth.klass: 'axlearn.common.layers.StochasticDepth'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.stochastic_depth.mode: 'row'
mesh_rules[0][1].config_modifiers[1].modification.layer.self_attention.structure: 'prenorm'
mesh_rules[0][1].config_modifiers[1].target_config: 'model.decoder.transformer'
mesh_rules[0][1].config_modifiers[2].klass: 'axlearn.common.trainer_config_modifier.ModelConfigModifier'
mesh_rules[0][1].config_modifiers[2].modification.klass: 'axlearn.common.attention.GroupedQKVLinear'
mesh_rules[0][1].config_modifiers[2].modification.layer.bias: True
mesh_rules[0][1].config_modifiers[2].modification.layer.klass: 'axlearn.common.attention.MultiheadInputLinear'
mesh_rules[0][1].config_modifiers[2].modification.layer.param_partition_spec[0]: None
mesh_rules[0][1].config_modifiers[2].modification.layer.param_partition_spec[1]: 'model'
mesh_rules[0][1].config_modifiers[2].modification.layer.param_partition_spec[2]: None
mesh_rules[0][1].config_modifiers[2].target_config: 'model.decoder.transformer.layer.self_attention.attention.input_linear.input_linear'
mesh_rules[0][1].config_modifiers[3].klass: 'axlearn.common.trainer_config_modifier.PartitionSpecModifier'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.emb.token_emb']['param_partition_spec'][0]: 'model'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.emb.token_emb']['param_partition_spec'][1][0]: 'expert'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.emb.token_emb']['param_partition_spec'][1][1]: 'fsdp'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.emb.token_emb']['param_partition_spec'][1][2]: 'seq'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.emb.token_emb']['input_partition_spec'][0]: 'fsdp'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.emb.token_emb']['input_partition_spec'][1]: None
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.emb.token_emb']['output_partition_spec'][0]: 'fsdp'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.emb.token_emb']['output_partition_spec'][1]: 'model'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.emb.token_emb']['embedding_partition_spec'][0]: 'model'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.emb.token_emb']['embedding_partition_spec'][1]: 'fsdp'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.lm_head']['param_partition_spec'][0]: 'model'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.lm_head']['param_partition_spec'][1][0]: 'expert'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.lm_head']['param_partition_spec'][1][1]: 'fsdp'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.lm_head']['param_partition_spec'][1][2]: 'seq'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.transformer.layer.self_attention.norm']['input_partition_spec'][0]: 'fsdp'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.transformer.layer.self_attention.norm']['input_partition_spec'][1]: 'model'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.transformer.layer.self_attention.norm']['input_partition_spec'][2]: None
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.transformer.layer.self_attention.norm']['output_partition_spec'][0]: 'fsdp'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.transformer.layer.self_attention.norm']['output_partition_spec'][1]: None
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.transformer.layer.self_attention.norm']['output_partition_spec'][2]: None
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.transformer.layer.feed_forward.norm']['input_partition_spec'][0]: 'fsdp'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.transformer.layer.feed_forward.norm']['input_partition_spec'][1]: 'model'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.transformer.layer.feed_forward.norm']['input_partition_spec'][2]: None
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.transformer.layer.feed_forward.norm']['output_partition_spec'][0]: 'fsdp'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.transformer.layer.feed_forward.norm']['output_partition_spec'][1]: None
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.transformer.layer.feed_forward.norm']['output_partition_spec'][2]: None
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.output_norm']['input_partition_spec'][0]: 'fsdp'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.output_norm']['input_partition_spec'][1]: 'model'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.output_norm']['input_partition_spec'][2]: None
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.output_norm']['output_partition_spec'][0]: 'fsdp'
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.output_norm']['output_partition_spec'][1]: None
mesh_rules[0][1].config_modifiers[3].partition_specs['model.decoder.output_norm']['output_partition_spec'][2]: None
mesh_rules[0][1].klass: 'axlearn.common.trainer_config_modifier.ChainConfigModifier'
mesh_shape[0]: 1
mesh_shape[1]: -1
mesh_shape[2]: 1
Expand Down
Loading

0 comments on commit fe96240

Please sign in to comment.