Skip to content

Commit

Permalink
[add] re-scan failed tasks
Browse files Browse the repository at this point in the history
  • Loading branch information
kangreen0210 committed Mar 24, 2024
1 parent 1b8cb69 commit 4d8f42d
Showing 1 changed file with 44 additions and 14 deletions.
58 changes: 44 additions & 14 deletions auto_monitor_eval.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,30 @@
import subprocess
import os
import argparse
from pprint import pprint

def get_folder_size(Folderpath):
    """Return the total size, in bytes, of all files found under a folder.

    The folder is traversed recursively with ``os.walk``. Entries whose size
    cannot be read (for example a dangling symlink, or a file that is removed
    while the walk is in progress) are skipped instead of aborting the scan.

    Args:
        Folderpath: Path of the directory to measure.

    Returns:
        The summed size in bytes of every readable file; ``0`` when the
        folder is empty or does not exist.
    """
    size = 0
    for path, dirs, files in os.walk(Folderpath):
        for f in files:
            fp = os.path.join(path, f)
            try:
                size += os.path.getsize(fp)
            except OSError:
                # The entry vanished or is a broken link; ignore it so one
                # bad file does not abort the whole scan.
                continue
    return size

def dump_model_configs(config_list, config_file='hf_llama_7b.py',):
    """Write a list of model config dicts to a Python config file.

    The generated file starts with the ``HuggingFaceCausalLM`` import line,
    followed by a ``models = [...]`` assignment holding ``config_list``.

    Args:
        config_list: List of model configuration dicts to serialize.
        config_file: Path of the config file to (over)write.
    """
    with open(config_file, 'w', encoding='utf-8') as file:
        # Report the file actually being written, not a hard-coded name.
        print(f'writing {len(config_list)} models to {config_file}:')
        pprint(config_list)
        file.write('from opencompass.models import HuggingFaceCausalLM\n\n')
        # Let pprint lay out the literal: unlike textual replace() on the
        # repr, it never inserts line breaks inside string values, so the
        # generated file stays valid Python whatever the strings contain.
        file.write('models = ')
        pprint(config_list, stream=file)

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--detect_new_after_finish', action='store_true', help='detect new ckpt after the current evaluation is finished')
parser.add_argument('-s', '--serial', action='store_true', help='detect new ckpt after the current evaluation is finished')
args = parser.parse_args()

# run with: nohup python auto_monitor_eval.py > auto_monitor_eval.master.log 2>&1 &
# run with: nohup python -u auto_monitor_eval.py > auto_monitor_eval.master_on_dgx-021.log 2>&1 &
# run with: nohup python -u auto_monitor_eval.py -s > auto_monitor_eval.master_on_dgx-021.log 2>&1 &

MODEL_TEMPLATE = {
'type': 'HuggingFaceCausalLM',
Expand Down Expand Up @@ -55,6 +72,8 @@
]
initial_files = sorted(initial_files, key=lambda x: float(x.replace('B','')), reverse=False)

TESTED_MODELS = []

print('initial_files:', initial_files)
while True:
# watch new generated file in the folder
Expand All @@ -63,6 +82,11 @@
file for file in current_files
if file.endswith('B') and float(file.replace('B',''))>=MINIMUM_TOKEN_TO_TEST and file not in SKIP_TOKEN_TO_TEST
]
# exclude folders that are not larger than 12 GiB
current_files = [
file for file in current_files
if get_folder_size(os.path.join(hf_ckpt_path, file)) > 12*(1024**3)
]
current_files = sorted(current_files, key=lambda x: float(x.replace('B','')), reverse=False)
print('current_files:', current_files)
# Find new files
Expand All @@ -72,7 +96,7 @@
for file in new_files:
print(os.path.join(hf_ckpt_path, file))

model_configs = []
model_configs = [] # new detected checkpoints
for file in new_files:
ckpt_to_eval = os.path.join(hf_ckpt_path,file)
# print(f'evaluating {ckpt_to_eval}')
Expand All @@ -89,11 +113,7 @@
# update the minimum
MINIMUM_TOKEN_TO_TEST = trained_token if trained_token >= MINIMUM_TOKEN_TO_TEST else MINIMUM_TOKEN_TO_TEST

# write to configs
with open('hf_llama_7b.py', 'w') as file:
print(f'writing {len(model_configs)} models to hf_llama_7b.py:', model_configs)
file.write('from opencompass.models import HuggingFaceCausalLM\n\n')
file.write('models = ' + repr(model_configs).replace(", ", ",\n") + '\n')
TESTED_MODELS = TESTED_MODELS + model_configs

print(f'##### Submitted Evaluation on checkpoint(s): #####')
print("\n".join(new_files))
Expand All @@ -102,15 +122,25 @@
# get the current date time
# bash_command = f"nohup python run.py eval_llama_7b_test.py > {eval_log_file} 2>&1 &"
# bash_command = f"nohup python test_print.py > {eval_log_file} 2>&1 &"
if args.detect_new_after_finish:
bash_command = f"python run.py eval_llama_7b_test.py -l"

# write the new checkpoints to configs
dump_model_configs(model_configs, 'hf_llama_7b.py')
if args.serial:
bash_command = f"python -u run.py eval_llama_7b_test.py -l -r 20240324_101010"
print(f'run command:', bash_command)
subprocess.run(bash_command.split())

print("#"*10 + "\nRe-scan the previous failed evalution\n" + "#"*10)
dump_model_configs(TESTED_MODELS, 'hf_llama_7b.py')
bash_command = f"python -u run.py eval_llama_7b_test.py -l -r 20240324_101010"
print(f'run command:', bash_command)
subprocess.run(bash_command.split())
else:
current_date_time = time.strftime("%Y%m%d-%H%M%S")
eval_log_file = f'auto_eval_{current_date_time}.log'
bash_command = f"bash auto_submit.sh {eval_log_file}"

print(f'run command:', bash_command)
subprocess.run(bash_command.split())
print(f'run command:', bash_command)
subprocess.run(bash_command.split())

# Update the initial file list
print('update tested file list')
Expand All @@ -125,7 +155,7 @@
break
else:
print('no new file, hang')
time.sleep(300)
time.sleep(600)

# manually run
# nohup python run.py eval_llama_7b_test.py > eval_659.95B.log 2>&1 &

0 comments on commit 4d8f42d

Please sign in to comment.