Commit 595e029

Merge pull request #28 from YerevaNN/minor_changes_for_run

Minor changes for run

philippguevorguian authored May 18, 2024
2 parents 7be6d91 + 210ba73 commit 595e029
Showing 12 changed files with 156 additions and 238 deletions.
@@ -14,7 +14,7 @@ train_config:
  fp16: false
  tf32: true
  evaluation_strategy: "steps"
- save_total_limit: 4
+ save_total_limit: 8
  grad_accumulation_scheduler: false
  dynamic_grad_accumulation: false
  grad_accumulation_patience: 4000
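A note on the bumped value: save_total_limit in this train_config appears to mirror the Hugging Face TrainingArguments field of the same name (an assumption based on the matching key names and the CustomArguments(TrainingArguments) subclass touched later in this commit), so raising it from 4 to 8 simply keeps twice as many recent checkpoints on disk before the oldest are pruned. A minimal sketch, not repository code:

from transformers import TrainingArguments

# Hedged illustration: with save_total_limit=8 the Trainer deletes the oldest
# checkpoint directories once more than 8 exist in output_dir.
args = TrainingArguments(
    output_dir="checkpoints",   # hypothetical path
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=8,
)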
@@ -27,6 +27,7 @@ model_config:
  block_size: 2048
  vocab_size: 256000
  separator_token: <bos>
+ separator_token_id: 2
  # tokenizer_path: "./chemlactica/tokenizer/GemmaTokenizer"
  tokenizer_path: "/auto/home/menuab/code/ChemLactica/chemlactica/tokenizer/GemmaTokenizer"
  # tokenizer_path: "google/gemma-2b"
31 changes: 31 additions & 0 deletions chemlactica/config/config_yamls/gemma_2b_sft_config.yaml
@@ -0,0 +1,31 @@
+ train_config:
+   adam_beta1: 0.9
+   adam_beta2: 0.95
+   batch_size: 500000
+   dropout_prob: 0.1
+   eval_step: 256
+   global_gradient_norm: 1.0
+   learning_rate_decay: 0.1
+   max_learning_rate: 2.0e-5
+   warmup_steps: 0
+   weight_decay: 0.1
+   bf16: true
+   bf16_full_eval: true
+   fp16: false
+   tf32: true
+   evaluation_strategy: "steps"
+   save_total_limit: 4
+   grad_accumulation_scheduler: false
+   dynamic_grad_accumulation: false
+   grad_accumulation_patience: 4000
+   grad_accumulation_max: 256
+   grad_accumulation_delta_steps: 100
+   grad_accumulation_delta_percentage: 0.02
+ model_config:
+   n_heads: 12
+   n_layers: 18
+   block_size: 2048
+   vocab_size: 256000
+   separator_token: <bos>
+   separator_token_id: 2
+   tokenizer_path: "/auto/home/menuab/code/ChemLactica/chemlactica/tokenizer/GemmaTokenizer"
1 change: 1 addition & 0 deletions chemlactica/config/default_train_config.py
@@ -6,6 +6,7 @@ class ModelConfig:
  block_size: int = 2048
  vocab_size: int = 50000
  separator_token: str = "</s>"
+ separator_token_id: int = 2
  tokenizer_path: str = "chemlactica/tokenizer/ChemLacticaTokenizer66"
2 changes: 1 addition & 1 deletion chemlactica/config/galactica_accelerate_config.yaml
@@ -5,7 +5,7 @@ fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_offload_params: false
- fsdp_forward_prefetch: false
+ fsdp_forward_prefetch: true
  fsdp_sharding_strategy: 1
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: OPTForCausalLM
2 changes: 1 addition & 1 deletion chemlactica/custom_trainer.py
@@ -38,7 +38,7 @@ class CustomArguments(TrainingArguments):
  class CustomTrainer(Trainer):
  def __init__(self, *args, **kwargs):
  # the number of samples to print when the training begins, for debugging purposes
- self.num_samples_to_print = 5
+ self.num_samples_to_print = 10
  self.tokenizer_path = kwargs["args"].tokenizer_path
  super().__init__(*args, **kwargs)
4 changes: 2 additions & 2 deletions chemlactica/jsonl_dataset.py
@@ -56,8 +56,8 @@ def samples_generator(
  distributed_state.process_index,
  ):
  returned = True
- ret = format_sample(line)
- yield ret
+ ret = format_sample(line)
+ yield ret
  counter = counter + 1
  shared_jsonl_files[file] = state
  line = f.readline()
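The two removed and two re-added lines in samples_generator appear to differ only in indentation (leading whitespace is not preserved in this rendering), so the visible change moves format_sample/yield relative to the surrounding block. A simplified, hedged sketch of the pattern, with names approximated from the diff context rather than copied from the repository:

def samples_generator(path, format_sample, distributed_state):
    # Illustrative only: stream a JSONL file and yield a formatted sample
    # only for lines this process owns, so ranks do not duplicate data.
    counter = 0
    with open(path) as f:
        line = f.readline()
        while line:
            if counter % distributed_state.num_processes == distributed_state.process_index:
                yield format_sample(line)
            counter = counter + 1
            line = f.readline()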
2 changes: 1 addition & 1 deletion chemlactica/utils/dataset_utils.py
@@ -122,7 +122,7 @@ def process_dataset(
  assay=True,
  ):
  tokenizer = get_tokenizer(model_config.tokenizer_path)
- eos_token_id = tokenizer.eos_token_id
+ eos_token_id = model_config.separator_token_id
  rng = np.random.default_rng()

  if assay:
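The switch in process_dataset makes the document-separator id come from the model config rather than from whatever the loaded tokenizer reports as eos. That matters when the two disagree: the configs above pin the separator to <bos> with id 2, which for the Gemma tokenizer is not the same token as eos. A hedged, self-contained illustration of the decoupling (not repository code):

from dataclasses import dataclass

@dataclass
class ModelConfig:
    separator_token: str = "<bos>"
    separator_token_id: int = 2

def pick_separator_id(model_config, tokenizer_eos_token_id):
    # before this commit: return tokenizer_eos_token_id
    return model_config.separator_token_id

# Even if the tokenizer's eos id differs, packing uses the configured separator id.
assert pick_separator_id(ModelConfig(), tokenizer_eos_token_id=1) == 2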
2 changes: 1 addition & 1 deletion chemlactica/utils/text_format_utils.py
@@ -89,7 +89,7 @@ def generate_formatted_string(compound_json, rng, model_config):
  key_value_pairs = []
  key = "SMILES"
  value = compound_json.get(key, "")
- if rng.integers(0, 1) == 0:
+ if rng.integers(2) == 0:
  if value:
  key_value_pairs.append(format_key_value(key, value, rng))
  del compound_json[key]
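This small-looking change is a genuine bug fix. numpy's Generator.integers excludes the upper bound by default, so rng.integers(0, 1) can only ever return 0, meaning the branch that handles the SMILES key first was taken on every sample; rng.integers(2) draws uniformly from {0, 1}, restoring a real coin flip. A quick check:

import numpy as np

rng = np.random.default_rng(0)

# Old call: high=1 is exclusive, so the result is always 0.
assert all(int(rng.integers(0, 1)) == 0 for _ in range(1000))

# New call: uniform over {0, 1}, roughly half the draws are 0.
flips = [int(rng.integers(2)) for _ in range(1000)]
assert set(flips) == {0, 1}
print(sum(flips) / len(flips))  # ~0.5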
321 changes: 103 additions & 218 deletions notebooks/playground.ipynb

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions submit_run.py
@@ -5,24 +5,24 @@
  import submitit

  use_accelerate = True
- rsync_enabled = False
- executor_name = "local" # options are ["slurm", "local"]
+ rsync_enabled = True
+ executor_name = "slurm" # options are ["slurm", "local"]
  root_path = ""
- num_gpus = 2
+ num_gpus = 6
  model_name = "galactica"
  model_size = "125m"
  train_type = "pretrain"
  train_name = "_".join([model_name, model_size, train_type])
- job_name = "gal_relform"
+ job_name = "gal_relform2"

  slurm_params = {
  "slurm_job_name": job_name,
- "timeout_min": 30,
+ "timeout_min": 60 * 24 * 2,
  "nodes": 1,
  "tasks_per_node": 1,
  "gpus_per_node": num_gpus,
  "cpus_per_task": num_gpus * 20,
- "mem_gb": num_gpus * 20.0 + 20.0,
+ "mem_gb": num_gpus * 40.0 + 20.0,
  "stderr_to_stdout": True,
  }

@@ -50,7 +50,7 @@
  "dataloader_num_workers": 1,
  "experiment_name": job_name,
  "checkpoints_root_dir": "/nfs/dgx/raid/chem/checkpoints/",
- "flash_attn": False,
+ "flash_attn": True,
  "track": True,
  "track_dir": "/nfs/dgx/raid/chem/aim/",
  # "profile":,
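With executor_name set to "slurm" and timeout_min raised to two days, the job is sized for a real multi-GPU run rather than a 30-minute local smoke test. For readers unfamiliar with submitit, the slurm_params dict above plausibly flows into an executor roughly as sketched below; AutoExecutor, update_parameters, and submit are real submitit APIs, but the wiring, log folder, and train_entry_point callable are assumptions, not the repository's code.

import submitit

def train_entry_point():
    # hypothetical stand-in for the real training launch
    print("launch training here")

executor = submitit.AutoExecutor(folder="submitit_logs")  # log folder is hypothetical
executor.update_parameters(**slurm_params)                # job name, timeout, GPUs, CPUs, memory
job = executor.submit(train_entry_point)
print(job.job_id)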
12 changes: 6 additions & 6 deletions submit_run_gemma.py
@@ -8,20 +8,20 @@
  rsync_enabled = True
  executor_name = "slurm" # options are ["slurm", "local"]
  root_path = ""
- num_gpus = 4
+ num_gpus = 3
  model_name = "gemma"
  model_size = "2b"
  train_type = "pretrain"
  train_name = "_".join([model_name, model_size, train_type])
- job_name = "gemma_4Btokens"
+ job_name = "gemma_400Mtokens_qedfirst"

  slurm_params = {
  "slurm_job_name": job_name,
  "timeout_min": 60 * 24 * 2,
  "nodes": 1,
  "tasks_per_node": 1,
  "gpus_per_node": num_gpus,
- "cpus_per_task": num_gpus * 11,
+ "cpus_per_task": num_gpus * 17,
  "mem_gb": num_gpus * 30.0 + 20.0,
  "stderr_to_stdout": True,
  }

@@ -43,10 +43,10 @@
  "training_data_dirs": "/nfs/ap/mnt/sxtn/rdkit_computed_rel+form/train_rdkit_computed_rel+form",
  # "training_data_dirs": "/auto/home/menuab/code/data",
  "valid_data_dir": "/nfs/ap/mnt/sxtn/rdkit_computed_rel+form/valid_rdkit_computed_rel+form",
- "max_steps": 30000,
+ "max_steps": 2100,
  # "num_train_epochs": 2,
  "eval_steps": 0,
- "save_steps": 5000,
+ "save_steps": 1000,
  "train_batch_size": 1,
  # "valid_batch_size":,s
  "dataloader_num_workers": 1,

@@ -57,7 +57,7 @@
  "track_dir": "/nfs/dgx/raid/chem/aim/",
  # "profile":,
  # "profile_dir":,
- "gradient_accumulation_steps": 16,
+ "gradient_accumulation_steps": 32,
  # "gradient_checkpointing": False,
  # "evaluate_only":,
  # "check_reproducability":,
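A back-of-the-envelope check that the new Gemma settings match the renamed job, under the assumption that each micro-batch is one packed block of 2048 tokens per GPU per step (train_batch_size is 1 and block_size is 2048 in the configs above): the run processes roughly 400M tokens, consistent with the switch from "gemma_4Btokens" to "gemma_400Mtokens_qedfirst".

# Assumption: one micro-batch = one block of 2048 tokens per GPU per step.
train_batch_size = 1
gradient_accumulation_steps = 32
num_gpus = 3
block_size = 2048
max_steps = 2100

tokens = train_batch_size * gradient_accumulation_steps * num_gpus * block_size * max_steps
print(f"{tokens / 1e6:.0f}M tokens")  # ≈ 413M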
