Skip to content

Commit

Permalink
[update] tokenizer and background arg
Browse files Browse the repository at this point in the history
  • Loading branch information
zhiyi committed Mar 22, 2024
1 parent 556be9d commit 1b8cb69
Showing 1 changed file with 123 additions and 112 deletions.
235 changes: 123 additions & 112 deletions auto_monitor_eval.py
Original file line number Diff line number Diff line change
@@ -1,120 +1,131 @@
import argparse
import copy
import os
import subprocess
import time

# Launch with:
#   nohup python auto_monitor_eval.py > auto_monitor_eval.master.log 2>&1 &

# Template for one model entry in the generated opencompass config.
# 'abbr' and 'path' are overwritten per checkpoint in _build_model_configs().
MODEL_TEMPLATE = {
    'type': 'HuggingFaceCausalLM',
    'abbr': 'exp2.6/20.00B',
    'path': '/workspace/aifs4su/code/checkpoints/hkg_7b_nl_tp2_pp1_mb1_gb1024_gas4/pt2.6/hf_ckpt/20.00B',
    'tokenizer_path': '/workspace/aifs4su/code/checkpoints/hkg_7b_nl_tp2_pp1_mb1_gb1024_gas4/pt2.6/hf_ckpt/hkg_hk50B_hf',
    'tokenizer_kwargs': {
        'padding_side': 'left',
        'truncation_side': 'left',
        'use_fast': False,
        'trust_remote_code': True,
    },
    'max_out_len': 100,
    'max_seq_len': 4096,
    'batch_size': 16,
    'model_kwargs': {
        'device_map': 'auto',
        'trust_remote_code': True,
    },
    'batch_padding': False,
    'run_cfg': {'num_gpus': 1, 'num_procs': 1},
}

MAXIMUM_RUN_HOUR = 300  # stop the monitor loop after this many hours
# megatron_ckpt_path = 'checkpoints/hkg_7b_nl_tp2_pp1_mb1_gb1024_gas4/pt2.6/checkpoint'
hf_ckpt_path = '/workspace/aifs4su/code/checkpoints/hkg_7b_nl_tp2_pp1_mb1_gb1024_gas4/pt2.6/hf_ckpt'
model_config_file = "/workspace/opencompass_new/hf_llama_7b.py"

MINIMUM_TOKEN_TO_TEST = 600.0          # only evaluate checkpoints at/above this token count (B)
MAXIMUM_TOKEN_TO_TEST = float('inf')   # upper bound on token count to evaluate (B)
SKIP_TOKEN_TO_TEST = ['959.93B']       # checkpoint dir names to never evaluate
MAX_INIT_TO_ADD = 600.0                # checkpoints below this are treated as already tested at startup


def _trained_tokens(ckpt_name):
    """Parse the trained-token count (in billions) from a checkpoint dir name like '600.00B'."""
    return float(ckpt_name.replace('B', ''))


def _sorted_by_tokens(names):
    """Sort checkpoint dir names ascending by their token count."""
    return sorted(names, key=_trained_tokens)


def _build_model_configs(new_files, minimum_token, maximum_token):
    """Build one opencompass model config per eligible checkpoint.

    Returns (configs, updated_minimum) where updated_minimum is the highest
    token count scheduled, so already-scheduled checkpoints are filtered out
    on later scans.
    """
    model_configs = []
    for name in new_files:
        trained_token = _trained_tokens(name)
        if trained_token < minimum_token or trained_token > maximum_token:
            continue
        # deepcopy: a shallow copy would share the nested tokenizer/model
        # kwargs dicts between all generated configs
        conf = copy.deepcopy(MODEL_TEMPLATE)
        conf['abbr'] = conf['abbr'].replace(os.path.basename(conf['abbr']), name)
        conf['path'] = os.path.join(hf_ckpt_path, name)
        model_configs.append(conf)
        minimum_token = max(minimum_token, trained_token)
    return model_configs, minimum_token


def _write_config(model_configs):
    """Write the generated model list to hf_llama_7b.py for opencompass to load."""
    with open('hf_llama_7b.py', 'w') as cfg:
        print(f'writing {len(model_configs)} models to hf_llama_7b.py:', model_configs)
        cfg.write('from opencompass.models import HuggingFaceCausalLM\n\n')
        cfg.write('models = ' + repr(model_configs).replace(", ", ",\n") + '\n')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--detect_new_after_finish', action='store_true',
                        help='detect new ckpt after the current evaluation is finished')
    args = parser.parse_args()

    minimum_token_to_test = MINIMUM_TOKEN_TO_TEST
    start_time = time.time()

    # Checkpoints already present below MAX_INIT_TO_ADD count as tested.
    initial_files = _sorted_by_tokens(
        name for name in os.listdir(hf_ckpt_path)
        if name.endswith('B') and _trained_tokens(name) < MAX_INIT_TO_ADD
    )
    print('initial_files:', initial_files)

    while True:
        # Watch for newly generated checkpoint dirs in the folder.
        current_files = _sorted_by_tokens(
            name for name in os.listdir(hf_ckpt_path)
            if name.endswith('B')
            and _trained_tokens(name) >= minimum_token_to_test
            and name not in SKIP_TOKEN_TO_TEST
        )
        print('current_files:', current_files)

        new_files = [name for name in current_files if name not in initial_files]
        if new_files:
            print("New checkpoint(s) detected:")
            for name in new_files:
                print(os.path.join(hf_ckpt_path, name))

            model_configs, minimum_token_to_test = _build_model_configs(
                new_files, minimum_token_to_test, MAXIMUM_TOKEN_TO_TEST)
            _write_config(model_configs)

            print('##### Submitted Evaluation on checkpoint(s): #####')
            print("\n".join(new_files))
            # remember to change the config in
            # hkgai/launcher/scripts/pretrain/pt2/batch_convert_ckpt.pt2.6.sh
            if args.detect_new_after_finish:
                # -l: opencompass waits for the current run before detecting new work
                bash_command = "python run.py eval_llama_7b_test.py -l"
            else:
                current_date_time = time.strftime("%Y%m%d-%H%M%S")
                eval_log_file = f'auto_eval_{current_date_time}.log'
                bash_command = f"bash auto_submit.sh {eval_log_file}"

            print('run command:', bash_command)
            subprocess.run(bash_command.split())

            # Update the tested file list so these checkpoints are not resubmitted.
            print('update tested file list')
            initial_files = current_files
        else:
            print('no new file, hang')
            time.sleep(300)

        # Check total run time every iteration (including idle ones) so the
        # monitor actually terminates after MAXIMUM_RUN_HOUR.
        elapsed = time.time() - start_time
        print(f'The program has run {elapsed} seconds')
        if elapsed / 3600 >= MAXIMUM_RUN_HOUR:
            print(f'exceed maximum run time {MAXIMUM_RUN_HOUR} hour')
            break


if __name__ == '__main__':
    main()

# manually run:
# nohup python run.py eval_llama_7b_test.py > eval_659.95B.log 2>&1 &

0 comments on commit 1b8cb69

Please sign in to comment.