Merge remote-tracking branch 'origin/aurora/2.6' into mjean/aurora/2.6/DAOS-16167-09112024
mjean308 committed Sep 18, 2024
2 parents 03e99cd + abb20a9 commit b6fe65e
Showing 12 changed files with 389 additions and 72 deletions.
182 changes: 182 additions & 0 deletions src/tests/ftest/control/dmg_scale.py
@@ -0,0 +1,182 @@
"""
(C) Copyright 2024 Intel Corporation.
SPDX-License-Identifier: BSD-2-Clause-Patent
"""
from apricot import TestWithServers
from telemetry_utils import TelemetryUtils
from test_utils_pool import time_pool_create

ENGINE_POOL_METRICS_SHORT = [
"engine_pool_entries_dtx_batched_degree",
"engine_pool_entries_dtx_batched_total",
"engine_pool_ops_akey_enum",
"engine_pool_ops_akey_punch",
"engine_pool_ops_compound",
"engine_pool_ops_dkey_enum",
"engine_pool_ops_dkey_punch",
"engine_pool_ops_dtx_abort",
"engine_pool_ops_dtx_check",
"engine_pool_ops_dtx_commit",
"engine_pool_ops_dtx_refresh",
"engine_pool_ops_ec_agg",
"engine_pool_ops_ec_rep",
"engine_pool_ops_fetch",
"engine_pool_ops_key_query",
"engine_pool_ops_migrate",
"engine_pool_ops_obj_enum",
"engine_pool_ops_obj_punch",
"engine_pool_ops_obj_sync",
"engine_pool_ops_recx_enum",
"engine_pool_ops_tgt_akey_punch",
"engine_pool_ops_tgt_dkey_punch",
"engine_pool_ops_tgt_punch",
"engine_pool_ops_tgt_update",
"engine_pool_ops_update",
"engine_pool_ops_pool_connect",
"engine_pool_ops_pool_disconnect",
"engine_pool_ops_pool_evict",
"engine_pool_ops_pool_query",
"engine_pool_ops_pool_query_space",
"engine_pool_resent",
"engine_pool_restarted",
"engine_pool_retry",
"engine_pool_scrubber_busy_time",
"engine_pool_scrubber_bytes_scrubbed_current",
"engine_pool_scrubber_bytes_scrubbed_prev",
"engine_pool_scrubber_bytes_scrubbed_total",
"engine_pool_scrubber_corruption_current",
"engine_pool_scrubber_corruption_total",
"engine_pool_scrubber_csums_current",
"engine_pool_scrubber_csums_prev",
"engine_pool_scrubber_csums_total",
"engine_pool_scrubber_next_csum_scrub",
"engine_pool_scrubber_next_tree_scrub",
"engine_pool_scrubber_prev_duration",
"engine_pool_scrubber_prev_duration_max",
"engine_pool_scrubber_prev_duration_mean",
"engine_pool_scrubber_prev_duration_min",
"engine_pool_scrubber_prev_duration_stddev",
"engine_pool_scrubber_scrubber_started",
"engine_pool_scrubber_scrubs_completed",
"engine_pool_started_at",
"engine_pool_vos_aggregation_akey_deleted",
"engine_pool_vos_aggregation_akey_scanned",
"engine_pool_vos_aggregation_akey_skipped",
"engine_pool_vos_aggregation_csum_errors",
"engine_pool_vos_aggregation_deleted_ev",
"engine_pool_vos_aggregation_deleted_sv",
"engine_pool_vos_aggregation_dkey_deleted",
"engine_pool_vos_aggregation_dkey_scanned",
"engine_pool_vos_aggregation_dkey_skipped",
"engine_pool_vos_aggregation_epr_duration",
"engine_pool_vos_aggregation_epr_duration_max",
"engine_pool_vos_aggregation_epr_duration_mean",
"engine_pool_vos_aggregation_epr_duration_min",
"engine_pool_vos_aggregation_epr_duration_stddev",
"engine_pool_vos_aggregation_merged_recs",
"engine_pool_vos_aggregation_merged_size",
"engine_pool_vos_aggregation_obj_deleted",
"engine_pool_vos_aggregation_obj_scanned",
"engine_pool_vos_aggregation_obj_skipped",
"engine_pool_vos_aggregation_uncommitted",
"engine_pool_vos_space_nvme_used",
"engine_pool_vos_space_scm_used",
"engine_pool_xferred_fetch",
"engine_pool_xferred_update",
"engine_pool_EC_update_full_stripe",
"engine_pool_EC_update_partial",
"engine_pool_block_allocator_alloc_hint",
"engine_pool_block_allocator_alloc_large",
"engine_pool_block_allocator_alloc_small",
"engine_pool_block_allocator_frags_aging",
"engine_pool_block_allocator_frags_large",
"engine_pool_block_allocator_frags_small",
"engine_pool_block_allocator_free_blks",
"engine_pool_ops_key2anchor"
]


class DmgScale(TestWithServers):
"""Verify dmg commands works as expected in a large scale system.
:avocado: recursive
"""

def test_dmg_scale(self):
"""Run the following steps and manually collect duration for each step.
0. Format storage
1. System query
2. Create a 100% pool that spans all engines
3. Pool query
4. Pool destroy
5. Create 49 pools spanning all the engines, each pool using 1/50th of the capacity
6. Pool list
7. Query around 80 pool metrics
8. Destroy all 49 pools
9. System stop
10. System start
Jira ID: DAOS-10508.
:avocado: tags=all,manual
:avocado: tags=deployment
:avocado: tags=DmgScale,test_dmg_scale
"""
# This is a manual test and the durations have to be collected from job.log, so "##" is added to
# the step messages to make them easy to search for. The log is usually over 1 million lines.
self.log_step("## System query")
dmg_command = self.get_dmg_command()
dmg_command.system_query()

self.log_step("## Create a 100% pool that spans all engines")
pool = self.get_pool(namespace="/run/pool_100/*", create=False)
duration = time_pool_create(log=self.log, number=1, pool=pool)
self.log.info("## Single pool create duration = %.1f", duration)

self.log_step("## Pool query")
pool.query()

self.log_step("## Pool destroy")
pool.destroy()

quantity = self.params.get("quantity", "/run/pool_small/*", 1)
msg = (f"## Create {quantity} small pools spanning all the engines where the pools fill up "
f"the capacity")
self.log_step(msg)
pool_0 = self.get_pool(namespace="/run/pool_small/*", create=False)
duration_0 = time_pool_create(log=self.log, number=0, pool=pool_0)
pools = [pool_0]
durations = [duration_0]
for count in range(1, quantity):
pools.append(self.get_pool(create=False))
# Use the SCM and NVMe size of the first pool for the rest of the (quantity - 1) pools.
pools[-1].scm_size.update(pool_0.scm_per_rank)
pools[-1].nvme_size.update(pool_0.nvme_per_rank)
durations.append(time_pool_create(log=self.log, number=count, pool=pools[-1]))
msg = (f"Pool {count} created. SCM = {pools[-1].scm_per_rank}; "
f"NVMe = {pools[-1].nvme_per_rank}")
self.log.info(msg)
self.log.info("## durations = %s", durations)
total_duration = sum(durations)
self.log.info("## %d pools create duration = %.1f", quantity, total_duration)

self.log_step("## Pool list")
dmg_command.pool_list()

self.log_step("## Query around 80 pool metrics")
# To save time and logs, call telemetry on the first host only. With the 80 pool metrics
# above, ~100K lines are printed per host.
telemetry_utils = TelemetryUtils(
dmg=dmg_command, servers=[self.server_managers[0].hosts[0]])
telemetry_utils.get_metrics(name=",".join(ENGINE_POOL_METRICS_SHORT))

self.log_step(f"## Destroy all {quantity} pools")
self.destroy_pools(pools=pools)

self.log_step("## System stop")
self.server_managers[0].system_stop()

self.log_step("## System start")
self.server_managers[0].system_start()
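The docstring above says the step durations are collected manually from job.log by searching for the "##" markers the test writes. A minimal sketch of a helper that pulls those marker lines out of the log; the script name, argument handling, and the example log path are assumptions, not part of the test or framework:

#!/usr/bin/env python3
"""Print the '##' step markers that DmgScale writes into an Avocado job.log."""
import sys


def collect_markers(log_path):
    """Return every line of the log that carries the '## ' step marker."""
    markers = []
    with open(log_path, "r", errors="replace") as log:
        for line in log:
            if "## " in line:
                markers.append(line.rstrip())
    return markers


if __name__ == "__main__":
    # Usage (path is only an example): python3 collect_markers.py ~/avocado/job-results/latest/job.log
    for entry in collect_markers(sys.argv[1]):
        print(entry)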
37 changes: 37 additions & 0 deletions src/tests/ftest/control/dmg_scale.yaml
@@ -0,0 +1,37 @@
# Note: We usually use the extra yaml in aurora-tools, but that extra yaml defines test_clients and
# this test doesn't need any clients, so either update the extra yaml or pass a dummy client to -tc.
hosts:
test_servers: 256

timeout: 900

daos_server:
pattern_timeout: 60

server_config:
name: daos_server
engines_per_host: 2
engines:
0:
pinned_numa_node: 0
nr_xs_helpers: 1
fabric_iface: ib0
fabric_iface_port: 31317
log_file: daos_server0.log
storage: auto
targets: 8
1:
pinned_numa_node: 1
nr_xs_helpers: 1
fabric_iface: ib1
fabric_iface_port: 31417
log_file: daos_server1.log
storage: auto
targets: 8

pool_100:
size: 100%
pool_small:
size: 2%
# If we use --size=2% during pool create, we can only create up to 49 pools.
quantity: 49
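A quick sanity check on the quantity above, assuming each pool really reserves its full --size=2% share: 49 pools consume 98% of the usable capacity, which is presumably why a 50th pool does not fit. The arithmetic as a one-liner:

quantity, size_pct = 49, 2  # values from pool_small above
print(f"{quantity} pools x {size_pct}% = {quantity * size_pct}% of usable capacity")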
2 changes: 1 addition & 1 deletion src/tests/ftest/deployment/basic_checkout.yaml
@@ -70,7 +70,7 @@ mdtest_easy: &mdtest_easy_base
write_bytes: 0
num_of_files_dirs: 100000000
stonewall_timer: 30
stonewall_statusfile: "/var/tmp/daos_testing/stoneWallingStatusFile"
stonewall_statusfile: stoneWallingStatusFile
dfs_destroy: false
mdtest_dfs_s1:
<<: *mdtest_easy_base
140 changes: 81 additions & 59 deletions src/tests/ftest/deployment/io_sys_admin.py
@@ -40,66 +40,88 @@ def test_io_sys_admin(self):
new_cont_user = self.params.get("user", "/run/container_set_owner/*")
new_cont_group = self.params.get("group", "/run/container_set_owner/*")

# Toggle independent steps
steps_to_run = {
"pool_create_ownership": True,
"storage_system_query": True,
"io": True,
"snapshot": True,
"datamover": True
}
for step in steps_to_run:
run = self.params.get(step, "/run/io_sys_admin/steps_to_run/*", None)
if run is not None:
steps_to_run[step] = run

dmg = self.get_dmg_command()
daos = self.get_daos_command()

for idx in range(1, 4):
pool = self.get_pool(namespace=f"/run/pool_{idx}/", create=False)
check_pool_creation(self, [pool], 60)
containers = []
for cont_idx in range(1, 4):
containers.append(
self.get_container(pool, namespace=f"/run/container_{cont_idx}/"))
containers[-1].set_owner(f"{new_cont_user}@", f"{new_cont_group}@")

daos.container_list(pool.identifier)
self.destroy_containers(containers)
pool.destroy()

# dmg storage scan
dmg.storage_scan()
dmg.system_query()
dmg.system_leader_query()

# write large data sets
self.run_file_count()
# create snapshot
self.container[-1].create_snap()
# overwrite the last ior file
self.ior_cmd.signature.update('456')
self.processes = self.ior_np
self.ppn = self.ior_ppn
self.run_ior_with_pool(create_pool=False, create_cont=False)

nvme_free_space_before_snap_destroy = self.get_free_space()[1]
# delete snapshot
self.container[-1].destroy_snap(epc=self.container[-1].epoch)
# Now check if the space is returned back.
counter = 1
returned_space = self.get_free_space()[1] - nvme_free_space_before_snap_destroy

data_written = (int(self.ppn) * human_to_bytes(self.ior_cmd.block_size.value))
while returned_space < int(data_written):
# try to wait for 4 x 60 secs for aggregation to be completed or
# else exit the test with a failure.
if counter > 4:
self.log.info("Free space before snapshot destroy: %s",
nvme_free_space_before_snap_destroy)
self.log.info("Free space when test terminated: %s",
self.get_free_space()[1])
self.fail("Aggregation did not complete as expected")

time.sleep(60)
if steps_to_run["pool_create_ownership"]:
self.log_step("Verify pool creation time and container set-owner")
for idx in range(1, 4):
pool = self.get_pool(namespace=f"/run/pool_{idx}/", create=False)
check_pool_creation(self, [pool], 60)
containers = []
for cont_idx in range(1, 4):
containers.append(
self.get_container(pool, namespace=f"/run/container_{cont_idx}/"))
containers[-1].set_owner(f"{new_cont_user}@", f"{new_cont_group}@")

daos.container_list(pool.identifier)
self.destroy_containers(containers)
pool.destroy()

if steps_to_run["storage_system_query"]:
self.log_step("Verify storage scan and system query")
dmg.storage_scan()
dmg.system_query()
dmg.system_leader_query()

if steps_to_run["io"]:
self.log_step("Verifying large dataset IO")
self.run_file_count()

if steps_to_run["snapshot"]:
self.log_step("Verifying snapshot creation and aggregation")
self.container[-1].create_snap()
# overwrite the last ior file
self.ior_cmd.signature.update('456')
self.processes = self.ior_np
self.ppn = self.ior_ppn
self.run_ior_with_pool(create_pool=False, create_cont=False)

nvme_free_space_before_snap_destroy = self.get_free_space()[1]
# delete snapshot
self.container[-1].destroy_snap(epc=self.container[-1].epoch)
# Now check if the space is returned.
counter = 1
returned_space = self.get_free_space()[1] - nvme_free_space_before_snap_destroy
counter += 1

self.log.info("#####Starting FS_COPY Test")
self.run_dm_activities_with_ior("FS_COPY", self.pool, self.container[-1])
self.log.info("#####Starting DCP Test")
self.run_dm_activities_with_ior("DCP", self.pool, self.container[-1])
self.log.info("#####Starting DSERIAL Test")
self.run_dm_activities_with_ior("DSERIAL", self.pool, self.container[-1])
self.log.info("#####Starting CONT_CLONE Test")
self.run_dm_activities_with_ior("CONT_CLONE", self.pool, self.container[-1])
self.log.info("#####Completed all Datamover tests")
self.container.pop(0)

data_written = (int(self.ppn) * human_to_bytes(self.ior_cmd.block_size.value))
while returned_space < int(data_written):
# try to wait for 4 x 60 secs for aggregation to be completed or
# else exit the test with a failure.
if counter > 4:
self.log.info(
"Free space before snapshot destroy: %s",
nvme_free_space_before_snap_destroy)
self.log.info(
"Free space when test terminated: %s", self.get_free_space()[1])
self.fail("Aggregation did not complete as expected")

time.sleep(60)
returned_space = self.get_free_space()[1] - nvme_free_space_before_snap_destroy
counter += 1

if steps_to_run["datamover"]:
self.log_step("Verifying datamover")
self.log.info("#####Starting FS_COPY Test")
self.run_dm_activities_with_ior("FS_COPY", self.pool, self.container[-1])
self.log.info("#####Starting DCP Test")
self.run_dm_activities_with_ior("DCP", self.pool, self.container[-1])
self.log.info("#####Starting DSERIAL Test")
self.run_dm_activities_with_ior("DSERIAL", self.pool, self.container[-1])
self.log.info("#####Starting CONT_CLONE Test")
self.run_dm_activities_with_ior("CONT_CLONE", self.pool, self.container[-1])
self.log.info("#####Completed all Datamover tests")
self.container.pop(0)
8 changes: 8 additions & 0 deletions src/tests/ftest/deployment/io_sys_admin.yaml
@@ -104,3 +104,11 @@ dcp:
np: 16
hdf5_vol:
plugin_path: /usr/lib64/mpich/lib

io_sys_admin:
steps_to_run:
pool_create_ownership: True
storage_system_query: True
io: True
snapshot: True
datamover: True
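For reference, the new steps_to_run block is consumed by the toggle loop added in io_sys_admin.py: every step defaults to True and is only overridden when the yaml sets it explicitly. A standalone sketch of that pattern, with a plain dict standing in for Avocado's params.get (the helper name and the dict are assumptions, not framework code):

def apply_step_overrides(defaults, config):
    """Return a copy of defaults where any step explicitly set in config wins."""
    steps = dict(defaults)
    for step in steps:
        override = config.get(step)  # None means "not set in yaml": keep the default
        if override is not None:
            steps[step] = override
    return steps


# Example: run everything except the datamover step.
defaults = {
    "pool_create_ownership": True,
    "storage_system_query": True,
    "io": True,
    "snapshot": True,
    "datamover": True,
}
print(apply_step_overrides(defaults, {"datamover": False}))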
2 changes: 1 addition & 1 deletion src/tests/ftest/performance/ior_easy.yaml
@@ -28,7 +28,7 @@ server_config:

pool:
size: 95%
properties: ec_cell_sz:128KiB
properties: ec_cell_sz:1MiB

container:
type: POSIX