Skip to content

Commit

Permalink
start of node timing
Browse files Browse the repository at this point in the history
  • Loading branch information
paigerube14 committed Jan 10, 2025
1 parent 0372013 commit 67a22d2
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 33 deletions.
31 changes: 16 additions & 15 deletions krkn/scenario_plugins/node_actions/abstract_node_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,23 @@ def __init__(self, kubecli: KrknKubernetes):
self.kubecli = kubecli

# Node scenario to start the node
def node_start_scenario(self, instance_kill_count, node, timeout, affected_node=None):
    """Placeholder for the "start node" scenario; does nothing and returns None.

    All parameters are accepted but unused in this stub.
    ``affected_node`` defaults to None so that both the three-argument and
    the four-argument call shapes are accepted.
    NOTE(review): presumably overridden by provider-specific scenario
    classes -- confirm against the subclasses.
    """
    pass

# Node scenario to stop the node
def node_stop_scenario(self, instance_kill_count, node, timeout, affected_node=None):
    """Placeholder for the "stop node" scenario; does nothing and returns None.

    All parameters are accepted but unused in this stub.
    ``affected_node`` defaults to None so that both the three-argument and
    the four-argument call shapes are accepted.
    NOTE(review): presumably overridden by provider-specific scenario
    classes -- confirm against the subclasses.
    """
    pass

# Node scenario to stop and then start the node
def node_stop_start_scenario(self, instance_kill_count, node, timeout, duration, affected_node=None):
    """Stop the node, sleep for ``duration`` seconds, then start it again.

    Delegates to ``self.node_stop_scenario`` and ``self.node_start_scenario``
    and forwards ``affected_node`` to both, matching their four-argument
    signatures.

    Args:
        instance_kill_count: forwarded unchanged to the stop/start scenarios.
        node: forwarded unchanged to the stop/start scenarios.
        timeout: forwarded unchanged to the stop/start scenarios.
        duration: seconds to sleep between the stop and the start.
        affected_node: optional object handed on to the stop/start
            scenarios; defaults to None so callers using the earlier
            four-positional-argument form keep working.
    """
    logging.info("Starting node_stop_start_scenario injection")
    self.node_stop_scenario(instance_kill_count, node, timeout, affected_node)
    logging.info("Waiting for %s seconds before starting the node", duration)
    time.sleep(duration)
    self.node_start_scenario(instance_kill_count, node, timeout, affected_node)
    logging.info("node_stop_start_scenario has been successfully injected!")

def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout):
def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout, affected_node):
logging.info("Starting helper_node_stop_start_scenario injection")
self.helper_node_stop_scenario(instance_kill_count, node, timeout)
self.helper_node_start_scenario(instance_kill_count, node, timeout)
Expand All @@ -51,23 +51,24 @@ def node_disk_detach_attach_scenario(self, instance_kill_count, node, timeout, d
logging.error("node_disk_detach_attach_scenario failed!")

# Node scenario to terminate the node
def node_termination_scenario(self, instance_kill_count, node, timeout, affected_node=None):
    """Placeholder for the "terminate node" scenario; does nothing and returns None.

    All parameters are accepted but unused in this stub.
    ``affected_node`` defaults to None so that both the three-argument and
    the four-argument call shapes are accepted.
    NOTE(review): presumably overridden by provider-specific scenario
    classes -- confirm against the subclasses.
    """
    pass

# Node scenario to reboot the node
def node_reboot_scenario(self, instance_kill_count, node, timeout, affected_node=None):
    """Placeholder for the "reboot node" scenario; does nothing and returns None.

    All parameters are accepted but unused in this stub.
    ``affected_node`` defaults to None so that both the three-argument and
    the four-argument call shapes are accepted.
    NOTE(review): presumably overridden by provider-specific scenario
    classes -- confirm against the subclasses.
    """
    pass

# Node scenario to stop the kubelet
def stop_kubelet_scenario(self, instance_kill_count, node, timeout):
def stop_kubelet_scenario(self, instance_kill_count, node, timeout, affected_node):
for _ in range(instance_kill_count):
try:
logging.info("Starting stop_kubelet_scenario injection")
logging.info("Stopping the kubelet of the node %s" % (node))
runcommand.run(
"oc debug node/" + node + " -- chroot /host systemctl stop kubelet"
)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)

logging.info("The kubelet of the node %s has been stopped" % (node))
logging.info("stop_kubelet_scenario has been successfuly injected!")
except Exception as e:
Expand All @@ -79,14 +80,14 @@ def stop_kubelet_scenario(self, instance_kill_count, node, timeout):
raise e

# Node scenario to stop and start the kubelet
def stop_start_kubelet_scenario(self, instance_kill_count, node, timeout, affected_node=None):
    """Run the kubelet-stop scenario and then the node-reboot scenario.

    The "start" half is carried out by ``self.node_reboot_scenario``.
    Each step runs exactly once and receives ``affected_node``, matching
    the four-argument signatures of the scenarios it delegates to.

    Args:
        instance_kill_count: forwarded unchanged to both scenarios.
        node: forwarded unchanged to both scenarios.
        timeout: forwarded unchanged to both scenarios.
        affected_node: optional object handed on to both scenarios;
            defaults to None so three-argument callers keep working.
    """
    logging.info("Starting stop_start_kubelet_scenario injection")
    self.stop_kubelet_scenario(instance_kill_count, node, timeout, affected_node)
    self.node_reboot_scenario(instance_kill_count, node, timeout, affected_node)
    logging.info("stop_start_kubelet_scenario has been successfully injected!")

# Node scenario to restart the kubelet
def restart_kubelet_scenario(self, instance_kill_count, node, timeout):
def restart_kubelet_scenario(self, instance_kill_count, node, timeout, affected_node):
for _ in range(instance_kill_count):
try:
logging.info("Starting restart_kubelet_scenario injection")
Expand All @@ -96,8 +97,8 @@ def restart_kubelet_scenario(self, instance_kill_count, node, timeout):
+ node
+ " -- chroot /host systemctl restart kubelet &"
)
nodeaction.wait_for_not_ready_status(node, timeout, self.kubecli)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
nodeaction.wait_for_not_ready_status(node, timeout, self.kubecli, affected_node)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
logging.info("The kubelet of the node %s has been restarted" % (node))
logging.info("restart_kubelet_scenario has been successfuly injected!")
except Exception as e:
Expand All @@ -109,7 +110,7 @@ def restart_kubelet_scenario(self, instance_kill_count, node, timeout):
raise e

# Node scenario to crash the node
def node_crash_scenario(self, instance_kill_count, node, timeout):
def node_crash_scenario(self, instance_kill_count, node, timeout, affected_node):
for _ in range(instance_kill_count):
try:
logging.info("Starting node_crash_scenario injection")
Expand Down
20 changes: 13 additions & 7 deletions krkn/scenario_plugins/node_actions/common_node_functions.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import datetime
import time
import random
import logging
import paramiko
from krkn_lib.models.k8s import AffectedNode
import krkn.invoke.command as runcommand
from krkn_lib.k8s import KrknKubernetes

Expand Down Expand Up @@ -43,20 +45,24 @@ def get_node(label_selector, instance_kill_count, kubecli: KrknKubernetes):

# krkn_lib
# Wait until the node status becomes Ready
def wait_for_ready_status(node, timeout, kubecli: KrknKubernetes, affected_node: AffectedNode = None):
    """Wait for ``node`` to report status "True" and record the result.

    Calls ``kubecli.watch_node_status(node, "True", timeout)`` and, when
    ``affected_node`` is supplied, hands the returned value to it.

    Args:
        node: node passed to ``watch_node_status``.
        timeout: timeout passed to ``watch_node_status``.
        kubecli: client whose ``watch_node_status`` performs the wait.
        affected_node: optional recorder for the observed value; defaults
            to None so three-argument callers keep working.
    """
    # NOTE(review): assumes watch_node_status returns the time value to
    # record -- confirm against krkn-lib.
    ready_time = kubecli.watch_node_status(node, "True", timeout)
    if affected_node is not None:
        # The Ready value was previously stored through set_not_ready_time,
        # overwriting the Not Ready measurement on the same object.
        # NOTE(review): set_ready_time is assumed to exist alongside
        # set_not_ready_time / set_unknown_time -- confirm against AffectedNode.
        affected_node.set_ready_time(ready_time)



# krkn_lib
# Wait until the node status becomes Not Ready
def wait_for_not_ready_status(node, timeout, kubecli: KrknKubernetes, affected_node: AffectedNode = None):
    """Wait for ``node`` to report status "False" and record the result.

    Calls ``kubecli.watch_node_status(node, "False", timeout)`` and, when
    ``affected_node`` is supplied, passes the returned value to
    ``affected_node.set_not_ready_time``.

    Args:
        node: node passed to ``watch_node_status``.
        timeout: timeout passed to ``watch_node_status``.
        kubecli: client whose ``watch_node_status`` performs the wait.
        affected_node: optional recorder for the observed value; defaults
            to None so three-argument callers keep working.
    """
    # NOTE(review): assumes watch_node_status returns the time value to
    # record -- confirm against krkn-lib.
    not_ready_time = kubecli.watch_node_status(node, "False", timeout)
    if affected_node is not None:
        affected_node.set_not_ready_time(not_ready_time)


# krkn_lib
# Wait until the node status becomes Unknown
def wait_for_unknown_status(node, timeout, kubecli: KrknKubernetes, affected_node: AffectedNode = None):
    """Wait for ``node`` to report status "Unknown" and record the result.

    Calls ``kubecli.watch_node_status(node, "Unknown", timeout)`` and, when
    ``affected_node`` is supplied, passes the returned value to
    ``affected_node.set_unknown_time``.

    Args:
        node: node passed to ``watch_node_status``.
        timeout: timeout passed to ``watch_node_status``.
        kubecli: client whose ``watch_node_status`` performs the wait.
        affected_node: optional recorder for the observed value; defaults
            to None so three-argument callers keep working.
    """
    # NOTE(review): assumes watch_node_status returns the time value to
    # record -- confirm against krkn-lib.
    unknown_time = kubecli.watch_node_status(node, "Unknown", timeout)
    if affected_node is not None:
        affected_node.set_unknown_time(unknown_time)


# Get the ip of the cluster node
Expand Down
25 changes: 14 additions & 11 deletions krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import yaml
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.models.k8s import AffectedNode
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_yaml_item_value, log_exception

Expand Down Expand Up @@ -49,6 +50,7 @@ def run(
node_scenario,
node_scenario_object,
lib_telemetry.get_lib_kubernetes(),
scenario_telemetry,
)
end_time = int(time.time())
cerberus.get_status(krkn_config, start_time, end_time)
Expand Down Expand Up @@ -120,7 +122,7 @@ def get_node_scenario_object(self, node_scenario, kubecli: KrknKubernetes):
)

def inject_node_scenario(
self, action, node_scenario, node_scenario_object, kubecli: KrknKubernetes
self, action, node_scenario, node_scenario_object, kubecli: KrknKubernetes, scenario_telemetry: ScenarioTelemetry
):

# Get the node scenario configurations for setting nodes
Expand All @@ -145,6 +147,7 @@ def inject_node_scenario(
else:
for single_node in nodes:
self.run_node(single_node, node_scenario_object, action, node_scenario)
scenario_telemetry.

def multiprocess_nodes(self, nodes, node_scenario_object, action, node_scenario):
try:
Expand Down Expand Up @@ -172,7 +175,7 @@ def run_node(self, single_node, node_scenario_object, action, node_scenario):
node_scenario, "ssh_private_key", "~/.ssh/id_rsa"
)
generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario")

affected_node = AffectedNode()
if node_general and action not in generic_cloud_scenarios:
logging.info(
"Scenario: "
Expand All @@ -182,42 +185,42 @@ def run_node(self, single_node, node_scenario_object, action, node_scenario):
else:
if action == "node_start_scenario":
node_scenario_object.node_start_scenario(
run_kill_count, single_node, timeout
run_kill_count, single_node, timeout, affected_node
)
elif action == "node_stop_scenario":
node_scenario_object.node_stop_scenario(
run_kill_count, single_node, timeout
run_kill_count, single_node, timeout, affected_node
)
elif action == "node_stop_start_scenario":
node_scenario_object.node_stop_start_scenario(
run_kill_count, single_node, timeout, duration
run_kill_count, single_node, timeout, duration, affected_node
)
elif action == "node_termination_scenario":
node_scenario_object.node_termination_scenario(
run_kill_count, single_node, timeout
run_kill_count, single_node, timeout, affected_node
)
elif action == "node_reboot_scenario":
node_scenario_object.node_reboot_scenario(
run_kill_count, single_node, timeout
run_kill_count, single_node, timeout, affected_node
)
elif action == "node_disk_detach_attach_scenario":
node_scenario_object.node_disk_detach_attach_scenario(
run_kill_count, single_node, timeout, duration)
elif action == "stop_start_kubelet_scenario":
node_scenario_object.stop_start_kubelet_scenario(
run_kill_count, single_node, timeout
run_kill_count, single_node, timeout, affected_node
)
elif action == "restart_kubelet_scenario":
node_scenario_object.restart_kubelet_scenario(
run_kill_count, single_node, timeout
run_kill_count, single_node, timeout, affected_node
)
elif action == "stop_kubelet_scenario":
node_scenario_object.stop_kubelet_scenario(
run_kill_count, single_node, timeout
run_kill_count, single_node, timeout, affected_node
)
elif action == "node_crash_scenario":
node_scenario_object.node_crash_scenario(
run_kill_count, single_node, timeout
run_kill_count, single_node, timeout, affected_node
)
elif action == "stop_start_helper_node_scenario":
if node_scenario["cloud_type"] != "openstack":
Expand Down

0 comments on commit 67a22d2

Please sign in to comment.