Skip to content

Commit

Permalink
Improve handling of DOWN nodes; check that enough client nodes remain before scheduling jobs
Browse files Browse the repository at this point in the history
Signed-off-by: Maureen Jean <[email protected]>
  • Loading branch information
mjean308 committed Sep 18, 2024
1 parent 0981113 commit 9a07efc
Showing 1 changed file with 20 additions and 10 deletions.
30 changes: 20 additions & 10 deletions src/tests/ftest/util/soak_test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ def schedule_jobs(self):
job_queue = multiprocessing.Queue()
jobid_list = []
jobs_not_done = []
processing_jobs = True
# remove any nodes marked as DOWN
node_list = self.hostlist_clients
node_list.difference_update(self.down_nodes)
Expand All @@ -322,10 +323,28 @@ def schedule_jobs(self):
jobs_not_done.append(job_dict["jobid"])
self.log.info("Submitting %s jobs at %s", str(len(jobid_list)), time.ctime())
job_threads = []
while True:
while processing_jobs:
if time.time() > self.end_time or len(jobs_not_done) == 0:
processing_jobs = False
break
job_results = {}
# check if nodes are still configured properly
cmd = f"ls {self.test_env.log_dir}"
node_results = run_remote(log, node_list, cmd, verbose=False)

Check failure on line 333 in src/tests/ftest/util/soak_test_base.py

View workflow job for this annotation

GitHub Actions / Flake8 check

F821 undefined name 'log'

Check failure on line 333 in src/tests/ftest/util/soak_test_base.py

View workflow job for this annotation

GitHub Actions / Pylint check

used-before-assignment, Using variable 'log' before assignment
if node_results.failed_hosts:
node_list.remove(node_results.failed_hosts)
self.down_nodes.update(node_results.failed_hosts)
self.log.info(f"DBG: Nodes {node_results.failed_hosts} are DOWN")

Check warning on line 337 in src/tests/ftest/util/soak_test_base.py

View workflow job for this annotation

GitHub Actions / Pylint check

logging-fstring-interpolation, Use lazy % formatting in logging functions
# verify that there are enough nodes to run remaining jobs
if len(job_threads) == 0:
for job_dict in self.joblist:
job_id = job_dict["jobid"]
if job_id in jobs_not_done:
node_count = job_dict["nodesperjob"]
if len(node_list) < node_count:
processing_jobs = False
raise SoakTestError(
"<<FAILED: There are not enough client nodes to continue")
for job_dict in self.joblist:
job_id = job_dict["jobid"]
if job_id in jobid_list:
Expand Down Expand Up @@ -388,15 +407,6 @@ def schedule_jobs(self):
job_done_id = job_results["handle"]
jobs_not_done.remove(job_done_id)

# check if nodes are still configured properly
cmd = f"ls {self.test_env.log_dir}"
node_results = run_remote(log, job_results["host_list"], cmd, verbose=False)
if node_results.failed_hosts:
node_list.remove(node_results.failed_hosts)
self.down_nodes.update(node_results.failed_hosts)
log.info(
f"DBG: Nodes {node_results.failed_hosts} are DOWN in job {job_done_id}")

debug_logging(
self.log,
self.enable_debug_msg,
Expand Down

0 comments on commit 9a07efc

Please sign in to comment.