diff --git a/bytelatent/configs/debug_internal.yaml b/bytelatent/configs/debug_internal.yaml
deleted file mode 100644
index 8f16a17..0000000
--- a/bytelatent/configs/debug_internal.yaml
+++ /dev/null
@@ -1,110 +0,0 @@
-# Template config, need to change dump_dir, data.root_dir and tokenizer.path
-# Evals can be activated by uncommenting its config
-# python -m launchers.stool config=apps/main/configs/debug.yaml nodes=8 account=fair_amaia_cw_codegen qos=lowest
-
-dump_dir: /tmp/
-name: "debug"
-steps: 100_000
-probe_freq: null
-seed: 777
-optim:
-  lr: 4e-04
-  warmup: 500
-  lr_min_ratio: 0.1
-  clip: 10.0
-
-distributed:
-  fsdp_type: full_shard
-  compile: true
-  model_dtype: bf16
-  matmul_allow_tf32: false
-  selective_activation_checkpointing: false
-  tp_size: 1
-
-model:
-  n_heads: 8
-  dim: 512
-  vocab_size: 260
-  dim_token: 256
-  patch_size: 6
-  tokenization_mode: "bytes"
-  patching_mode: "space"
-  tie_local_encoder_decoder_logits: false
-  data_loader_patching: true
-  max_encoder_seq_length: 12288
-  pad_to_max_length: true
-  patching_threshold: 3.1439168453216553
-  encoder_hash_byte_group_size: [4]
-  encoder_hash_byte_group_vocab: 50002
-  encoder_hash_byte_group_nb_functions: 3
-  encoder_enable_byte_ngrams: false
-  cross_attn_encoder: true # assuming cross_attention is true
-  cross_attn_decoder: true # assuming cross_attention is true
-  cross_attn_window_encoder: 512
-  cross_attn_window_decoder: 512
-  dim_local_encoder: 256
-  dim_local_decoder: 256
-  cross_attn_k: 8
-  cross_attn_nheads: 4
-  cross_attn_all_layers_decoder: true
-  cross_attn_all_layers_encoder: true
-  cross_attn_use_flex_attention: true
-  cross_attn_init_by_pooling: true
-  log_patch_lengths: true
-  non_linearity: "swiglu"
-  use_rope: true
-  recompute_fc1_out: false
-  recompute_fc3_out: false
-  recompute_attn: false
-  custom_bwd: false
-  layer_ckpt: "none"
-  efficient_attn: "sdpa"
-  patch_only_encoder: false
-  patch_only_decoder: false
-  use_local_encoder_transformer: true
-  init_use_gaussian: true
-  init_use_depth: "current"
-  attn_bias_type: "block_causal"
-  alpha_depth: "disabled"
-  max_length: 256
-  local_attention_window_len: 512
-  max_seqlen: 12288
-  downsampling_by_pooling: "max"
-
-data:
-  root_dir: /checkpoint/amaia/explore/datasets/
-  sources:
-    dclm_baseline_1.0: 1.0
-  batch_size: 2
-  prefetch_size: 64
-  seq_len: 4096
-  load_async: true
-  preprocess_dir: /checkpoint/amaia/explore/dynabyte/datasets/corpora/dclm/entropy_preprocess/
-  tokenizer_args:
-    name: blt
-    init_kwargs:
-      bpe_tokenizer_path: /checkpoint/amaia/explore/tokenizers/tokenizer_final_32k.minus_inf_ws.model
-
-profiling:
-  run: false
-
-checkpoint:
-  dump:
-    every: 500
-    keep: 3
-  eval:
-    every: 1000
-    keep: -1
-
-logging:
-  freq: 10
-
-eval_on_gpus: 8
-eval:
-  dataset_dir: /checkpoint/amaia/codegen/datasets/eval
-  tasks: boolq,hellaswag,nq,piqa,siqa,tqa,winogrande,obqa,arc_easy,arc_challenge,race.middle,race.high,gsm8k,math,bbh,copa,human_eval_plus,mbpp,mmlu
-  generator:
-    max_tokens: 65536
-    dtype: bf16
-
-  mp_size: 1