-
Notifications
You must be signed in to change notification settings - Fork 304
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
DAOS-16076 test: Automate dmg scale test to be run on Aurora (#14616)
Steps:
1. Format storage
2. System query
3. Create a 100% pool that spans all engines
4. Pool query
5. Pool destroy
6. Create 49 pools spanning all the engines, with each pool using 1/50th of the capacity
7. Pool list
8. Get around 80 pool metrics
9. Destroy all 49 pools
10. System stop
11. System start

Skip-unit-tests: true
Skip-fault-injection-test: true
Signed-off-by: Makito Kano <[email protected]>
- Loading branch information
1 parent
4e201a0
commit 8649e3d
Showing
2 changed files
with
219 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
""" | ||
(C) Copyright 2024 Intel Corporation. | ||
SPDX-License-Identifier: BSD-2-Clause-Patent | ||
""" | ||
from apricot import TestWithServers | ||
from telemetry_utils import TelemetryUtils | ||
from test_utils_pool import time_pool_create | ||
|
||
# Names of the per-pool engine telemetry metrics requested by DmgScale.test_dmg_scale. The test
# comma-joins this list and passes it as a single "name" argument to TelemetryUtils.get_metrics().
ENGINE_POOL_METRICS_SHORT = [
    "engine_pool_entries_dtx_batched_degree",
    "engine_pool_entries_dtx_batched_total",
    "engine_pool_ops_akey_enum",
    "engine_pool_ops_akey_punch",
    "engine_pool_ops_compound",
    "engine_pool_ops_dkey_enum",
    "engine_pool_ops_dkey_punch",
    "engine_pool_ops_dtx_abort",
    "engine_pool_ops_dtx_check",
    "engine_pool_ops_dtx_commit",
    "engine_pool_ops_dtx_refresh",
    "engine_pool_ops_ec_agg",
    "engine_pool_ops_ec_rep",
    "engine_pool_ops_fetch",
    "engine_pool_ops_key_query",
    "engine_pool_ops_migrate",
    "engine_pool_ops_obj_enum",
    "engine_pool_ops_obj_punch",
    "engine_pool_ops_obj_sync",
    "engine_pool_ops_recx_enum",
    "engine_pool_ops_tgt_akey_punch",
    "engine_pool_ops_tgt_dkey_punch",
    "engine_pool_ops_tgt_punch",
    "engine_pool_ops_tgt_update",
    "engine_pool_ops_update",
    "engine_pool_ops_pool_connect",
    "engine_pool_ops_pool_disconnect",
    "engine_pool_ops_pool_evict",
    "engine_pool_ops_pool_query",
    "engine_pool_ops_pool_query_space",
    "engine_pool_resent",
    "engine_pool_restarted",
    "engine_pool_retry",
    "engine_pool_scrubber_busy_time",
    "engine_pool_scrubber_bytes_scrubbed_current",
    "engine_pool_scrubber_bytes_scrubbed_prev",
    "engine_pool_scrubber_bytes_scrubbed_total",
    "engine_pool_scrubber_corruption_current",
    "engine_pool_scrubber_corruption_total",
    "engine_pool_scrubber_csums_current",
    "engine_pool_scrubber_csums_prev",
    "engine_pool_scrubber_csums_total",
    "engine_pool_scrubber_next_csum_scrub",
    "engine_pool_scrubber_next_tree_scrub",
    "engine_pool_scrubber_prev_duration",
    "engine_pool_scrubber_prev_duration_max",
    "engine_pool_scrubber_prev_duration_mean",
    "engine_pool_scrubber_prev_duration_min",
    "engine_pool_scrubber_prev_duration_stddev",
    "engine_pool_scrubber_scrubber_started",
    "engine_pool_scrubber_scrubs_completed",
    "engine_pool_started_at",
    "engine_pool_vos_aggregation_akey_deleted",
    "engine_pool_vos_aggregation_akey_scanned",
    "engine_pool_vos_aggregation_akey_skipped",
    "engine_pool_vos_aggregation_csum_errors",
    "engine_pool_vos_aggregation_deleted_ev",
    "engine_pool_vos_aggregation_deleted_sv",
    "engine_pool_vos_aggregation_dkey_deleted",
    "engine_pool_vos_aggregation_dkey_scanned",
    "engine_pool_vos_aggregation_dkey_skipped",
    "engine_pool_vos_aggregation_epr_duration",
    "engine_pool_vos_aggregation_epr_duration_max",
    "engine_pool_vos_aggregation_epr_duration_mean",
    "engine_pool_vos_aggregation_epr_duration_min",
    "engine_pool_vos_aggregation_epr_duration_stddev",
    "engine_pool_vos_aggregation_merged_recs",
    "engine_pool_vos_aggregation_merged_size",
    "engine_pool_vos_aggregation_obj_deleted",
    "engine_pool_vos_aggregation_obj_scanned",
    "engine_pool_vos_aggregation_obj_skipped",
    "engine_pool_vos_aggregation_uncommitted",
    "engine_pool_vos_space_nvme_used",
    "engine_pool_vos_space_scm_used",
    "engine_pool_xferred_fetch",
    "engine_pool_xferred_update",
    "engine_pool_EC_update_full_stripe",
    "engine_pool_EC_update_partial",
    "engine_pool_block_allocator_alloc_hint",
    "engine_pool_block_allocator_alloc_large",
    "engine_pool_block_allocator_alloc_small",
    "engine_pool_block_allocator_frags_aging",
    "engine_pool_block_allocator_frags_large",
    "engine_pool_block_allocator_frags_small",
    "engine_pool_block_allocator_free_blks",
    "engine_pool_ops_key2anchor",
]
|
||
|
||
class DmgScale(TestWithServers):
    """Verify dmg commands work as expected in a large scale system.

    :avocado: recursive
    """

    def test_dmg_scale(self):
        """Run the following steps and manually collect duration for each step.

        0. Format storage (not issued by this method; presumably done during test setup)
        1. System query
        2. Create a 100% pool that spans all engines
        3. Pool query
        4. Pool destroy
        5. Create "quantity" pools spanning all the engines (49 pools, each using about 1/50th of
           the capacity, with the accompanying test yaml)
        6. Pool list
        7. Query around 80 pool metrics
        8. Destroy all the pools created in step 5
        9. System stop
        10. System start

        Jira ID: DAOS-10508.

        :avocado: tags=all,manual
        :avocado: tags=deployment
        :avocado: tags=DmgScale,test_dmg_scale
        """
        # This is a manual test and we need to find the durations from job.log, so add "##" to make
        # it easy to search. The log is usually over 1 million lines.
        self.log_step("## System query")
        dmg_command = self.get_dmg_command()
        dmg_command.system_query()

        self.log_step("## Create a 100% pool that spans all engines")
        pool = self.get_pool(namespace="/run/pool_100/*", create=False)
        duration = time_pool_create(log=self.log, number=1, pool=pool)
        self.log.info("## Single pool create duration = %.1f", duration)

        self.log_step("## Pool query")
        pool.query()

        self.log_step("## Pool destroy")
        pool.destroy()

        # Number of small pools to create; falls back to 1 when the yaml does not define it.
        quantity = self.params.get("quantity", "/run/pool_small/*", 1)
        msg = (f"## Create {quantity} small pools spanning all the engines where the pools fill up "
               f"the capacity")
        self.log_step(msg)
        # The first small pool is created from the /run/pool_small/* settings; the sizes it ends
        # up with are then reused for every remaining pool.
        pool_0 = self.get_pool(namespace="/run/pool_small/*", create=False)
        duration_0 = time_pool_create(log=self.log, number=0, pool=pool_0)
        pools = [pool_0]
        durations = [duration_0]
        for count in range(1, quantity):
            next_pool = self.get_pool(create=False)
            pools.append(next_pool)
            # Use the SCM and NVMe size of the first pool for the rest of the (quantity - 1) pools.
            next_pool.scm_size.update(pool_0.scm_per_rank)
            next_pool.nvme_size.update(pool_0.nvme_per_rank)
            durations.append(time_pool_create(log=self.log, number=count, pool=next_pool))
            # Pass the values as logging arguments so the message is only formatted when emitted.
            self.log.info(
                "Pool %s created. SCM = %s; NVMe = %s",
                count, next_pool.scm_per_rank, next_pool.nvme_per_rank)
        self.log.info("## durations = %s", durations)
        total_duration = sum(durations)
        self.log.info("## %d pools create duration = %.1f", quantity, total_duration)

        self.log_step("## Pool list")
        dmg_command.pool_list()

        self.log_step("## Query around 80 pool metrics")
        # To save time and logs, call telemetry on the first host only. With the 80 pool metrics
        # above, ~100K lines are printed per host.
        telemetry_utils = TelemetryUtils(
            dmg=dmg_command, servers=[self.server_managers[0].hosts[0]])
        telemetry_utils.get_metrics(name=",".join(ENGINE_POOL_METRICS_SHORT))

        self.log_step(f"## Destroy all {quantity} pools")
        self.destroy_pools(pools=pools)

        self.log_step("## System stop")
        self.server_managers[0].system_stop()

        self.log_step("## System start")
        self.server_managers[0].system_start()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Note: We usually run with the extra yaml from aurora-tools, but that extra yaml defines
# test_clients while this test doesn't need any client. Either update the extra yaml or pass a
# dummy client to -tc.
hosts:
  test_servers: 256

timeout: 900

daos_server:
  pattern_timeout: 60

server_config:
  name: daos_server
  engines_per_host: 2
  engines:
    0:
      pinned_numa_node: 0
      nr_xs_helpers: 1
      fabric_iface: ib0
      fabric_iface_port: 31317
      log_file: daos_server0.log
      storage: auto
      targets: 8
    1:
      pinned_numa_node: 1
      nr_xs_helpers: 1
      fabric_iface: ib1
      fabric_iface_port: 31417
      log_file: daos_server1.log
      storage: auto
      targets: 8

# Read by the test through the /run/pool_100/* namespace.
pool_100:
  size: 100%
# Read by the test through the /run/pool_small/* namespace.
pool_small:
  size: 2%
  # If we use --size=2% during pool create, we can only create up to 49 pools.
  quantity: 49