Skip to content

Commit

Permalink
Hog scenario porting from arcaflow to native (#748)
Browse files Browse the repository at this point in the history
* added new native hog scenario

* removed arcaflow dependency + legacy hog scenarios

* config update

* changed hog configuration structure + added average samples

* fix on cpu count

* removed TripleDES warning

* changed selector format

* changed selector syntax

* number of nodes option

* documentation

* functional tests

* exception handling on hog deployment thread

Signed-off-by: Paige Patton <[email protected]>
  • Loading branch information
tsebastiani authored and paigerube14 committed Jan 31, 2025
1 parent c7e068a commit cafa930
Show file tree
Hide file tree
Showing 14 changed files with 294 additions and 129 deletions.
17 changes: 10 additions & 7 deletions krkn/scenario_plugins/abstract_scenario_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def run_scenarios(
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
wait_duration = krkn_config["tunings"]["wait_duration"]
events_backup = krkn_config["telemetry"]["events_backup"]
for scenario_config in scenarios_list:
if isinstance(scenario_config, list):
logging.error(
Expand Down Expand Up @@ -99,13 +100,15 @@ def run_scenarios(
int(scenario_telemetry.start_timestamp),
int(scenario_telemetry.end_timestamp),
)
utils.populate_cluster_events(
scenario_telemetry,
parsed_scenario_config,
telemetry.get_lib_kubernetes(),
int(scenario_telemetry.start_timestamp),
int(scenario_telemetry.end_timestamp),
)

if events_backup:
utils.populate_cluster_events(
scenario_telemetry,
parsed_scenario_config,
telemetry.get_lib_kubernetes(),
int(scenario_telemetry.start_timestamp),
int(scenario_telemetry.end_timestamp),
)

if scenario_telemetry.exit_status != 0:
failed_scenarios.append(scenario_config)
Expand Down
19 changes: 14 additions & 5 deletions krkn/scenario_plugins/node_actions/abstract_node_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,16 @@
import krkn.invoke.command as runcommand
import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction
from krkn_lib.k8s import KrknKubernetes

from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus

# krkn_lib
class abstract_node_scenarios:
kubecli: KrknKubernetes
affected_nodes_status: AffectedNodeStatus

def __init__(self, kubecli: KrknKubernetes):
def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
self.kubecli = kubecli
self.affected_nodes_status = affected_nodes_status

# Node scenario to start the node
def node_start_scenario(self, instance_kill_count, node, timeout):
Expand All @@ -28,6 +30,7 @@ def node_stop_start_scenario(self, instance_kill_count, node, timeout, duration)
logging.info("Waiting for %s seconds before starting the node" % (duration))
time.sleep(duration)
self.node_start_scenario(instance_kill_count, node, timeout)
self.affected_nodes_status.merge_affected_nodes()
logging.info("node_stop_start_scenario has been successfully injected!")

def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout):
Expand Down Expand Up @@ -61,13 +64,15 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout):
# Node scenario to stop the kubelet
def stop_kubelet_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
affected_node = AffectedNode(node)
try:
logging.info("Starting stop_kubelet_scenario injection")
logging.info("Stopping the kubelet of the node %s" % (node))
runcommand.run(
"oc debug node/" + node + " -- chroot /host systemctl stop kubelet"
)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)

logging.info("The kubelet of the node %s has been stopped" % (node))
logging.info("stop_kubelet_scenario has been successfuly injected!")
except Exception as e:
Expand All @@ -77,17 +82,20 @@ def stop_kubelet_scenario(self, instance_kill_count, node, timeout):
)
logging.error("stop_kubelet_scenario injection failed!")
raise e
self.add_affected_node(affected_node)

# Node scenario to stop and start the kubelet
def stop_start_kubelet_scenario(self, instance_kill_count, node, timeout):
logging.info("Starting stop_start_kubelet_scenario injection")
self.stop_kubelet_scenario(instance_kill_count, node, timeout)
self.node_reboot_scenario(instance_kill_count, node, timeout)
self.affected_nodes_status.merge_affected_nodes()
logging.info("stop_start_kubelet_scenario has been successfully injected!")

# Node scenario to restart the kubelet
def restart_kubelet_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
affected_node = AffectedNode(node)
try:
logging.info("Starting restart_kubelet_scenario injection")
logging.info("Restarting the kubelet of the node %s" % (node))
Expand All @@ -96,8 +104,8 @@ def restart_kubelet_scenario(self, instance_kill_count, node, timeout):
+ node
+ " -- chroot /host systemctl restart kubelet &"
)
nodeaction.wait_for_not_ready_status(node, timeout, self.kubecli)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
nodeaction.wait_for_not_ready_status(node, timeout, self.kubecli, affected_node)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli,affected_node)
logging.info("The kubelet of the node %s has been restarted" % (node))
logging.info("restart_kubelet_scenario has been successfuly injected!")
except Exception as e:
Expand All @@ -107,6 +115,7 @@ def restart_kubelet_scenario(self, instance_kill_count, node, timeout):
)
logging.error("restart_kubelet_scenario injection failed!")
raise e
self.add_affected_node(affected_node)

# Node scenario to crash the node
def node_crash_scenario(self, instance_kill_count, node, timeout):
Expand Down
48 changes: 35 additions & 13 deletions krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
abstract_node_scenarios,
)
from krkn_lib.k8s import KrknKubernetes

from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus

class Alibaba:
def __init__(self):
Expand Down Expand Up @@ -161,8 +161,9 @@ def get_vm_status(self, instance_id):
return None

# Wait until the node instance is running
def wait_until_running(self, instance_id, timeout):
def wait_until_running(self, instance_id, timeout, affected_node):
time_counter = 0
start_time = time.time()
status = self.get_vm_status(instance_id)
while status != "Running":
status = self.get_vm_status(instance_id)
Expand All @@ -174,11 +175,15 @@ def wait_until_running(self, instance_id, timeout):
if time_counter >= timeout:
logging.info("ECS %s is still not ready in allotted time" % instance_id)
return False
end_time = time.time()
if affected_node:
affected_node.set_affected_node_status("running", end_time - start_time)
return True

# Wait until the node instance is stopped
def wait_until_stopped(self, instance_id, timeout):
def wait_until_stopped(self, instance_id, timeout, affected_node):
time_counter = 0
start_time = time.time()
status = self.get_vm_status(instance_id)
while status != "Stopped":
status = self.get_vm_status(instance_id)
Expand All @@ -192,10 +197,14 @@ def wait_until_stopped(self, instance_id, timeout):
"Vm %s is still not stopped in allotted time" % instance_id
)
return False
end_time = time.time()
if affected_node:
# This helper waits for the "Stopped" VM state, so record the elapsed time under "stopped", not "running".
affected_node.set_affected_node_status("stopped", end_time - start_time)
return True

# Wait until the node instance is released (terminated)
def wait_until_released(self, instance_id, timeout):
def wait_until_released(self, instance_id, timeout, affected_node):
start_time = time.time()
statuses = self.get_vm_status(instance_id)
time_counter = 0
while statuses and statuses != "Released":
Expand All @@ -210,26 +219,32 @@ def wait_until_released(self, instance_id, timeout):
return False

logging.info("ECS %s is released" % instance_id)
end_time = time.time()
if affected_node:
affected_node.set_affected_node_status("terminated", end_time - start_time)
return True


# krkn_lib
class alibaba_node_scenarios(abstract_node_scenarios):
def __init__(self, kubecli: KrknKubernetes):
def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
super().__init__(kubecli, affected_nodes_status)
self.alibaba = Alibaba()


# Node scenario to start the node
def node_start_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
affected_node = AffectedNode(node)
try:
logging.info("Starting node_start_scenario injection")
vm_id = self.alibaba.get_instance_id(node)
logging.info(
"Starting the node %s with instance ID: %s " % (node, vm_id)
)
self.alibaba.start_instances(vm_id)
self.alibaba.wait_until_running(vm_id, timeout)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
self.alibaba.wait_until_running(vm_id, timeout, affected_node)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
logging.info("Node with instance ID: %s is in running state" % node)
logging.info("node_start_scenario has been successfully injected!")
except Exception as e:
Expand All @@ -239,44 +254,48 @@ def node_start_scenario(self, instance_kill_count, node, timeout):
)
logging.error("node_start_scenario injection failed!")
raise e
self.affected_nodes_status.affected_nodes.append(affected_node)

# Node scenario to stop the node
def node_stop_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
affected_node = AffectedNode(node)
try:
logging.info("Starting node_stop_scenario injection")
vm_id = self.alibaba.get_instance_id(node)
logging.info(
"Stopping the node %s with instance ID: %s " % (node, vm_id)
)
self.alibaba.stop_instances(vm_id)
self.alibaba.wait_until_stopped(vm_id, timeout)
self.alibaba.wait_until_stopped(vm_id, timeout, affected_node)
logging.info("Node with instance ID: %s is in stopped state" % vm_id)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
except Exception as e:
logging.error(
"Failed to stop node instance. Encountered following exception: %s. "
"Test Failed" % e
)
logging.error("node_stop_scenario injection failed!")
raise e
self.affected_nodes_status.affected_nodes.append(affected_node)

# Might need to stop and then release the instance
# Node scenario to terminate the node
def node_termination_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
affected_node = AffectedNode(node)
try:
logging.info(
"Starting node_termination_scenario injection by first stopping instance"
)
vm_id = self.alibaba.get_instance_id(node)
self.alibaba.stop_instances(vm_id)
self.alibaba.wait_until_stopped(vm_id, timeout)
self.alibaba.wait_until_stopped(vm_id, timeout, affected_node)
logging.info(
"Releasing the node %s with instance ID: %s " % (node, vm_id)
)
self.alibaba.release_instance(vm_id)
self.alibaba.wait_until_released(vm_id, timeout)
self.alibaba.wait_until_released(vm_id, timeout, affected_node)
logging.info("Node with instance ID: %s has been released" % node)
logging.info(
"node_termination_scenario has been successfully injected!"
Expand All @@ -288,17 +307,19 @@ def node_termination_scenario(self, instance_kill_count, node, timeout):
)
logging.error("node_termination_scenario injection failed!")
raise e
self.affected_nodes_status.affected_nodes.append(affected_node)

# Node scenario to reboot the node
def node_reboot_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
affected_node = AffectedNode(node)
try:
logging.info("Starting node_reboot_scenario injection")
instance_id = self.alibaba.get_instance_id(node)
logging.info("Rebooting the node with instance ID: %s " % (instance_id))
self.alibaba.reboot_instances(instance_id)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
logging.info(
"Node with instance ID: %s has been rebooted" % (instance_id)
)
Expand All @@ -310,3 +331,4 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout):
)
logging.error("node_reboot_scenario injection failed!")
raise e
self.affected_nodes_status.affected_nodes.append(affected_node)
Loading

0 comments on commit cafa930

Please sign in to comment.