Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Node timing #747

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions krkn/scenario_plugins/abstract_scenario_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def run_scenarios(
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
wait_duration = krkn_config["tunings"]["wait_duration"]
events_backup = krkn_config["telemetry"]["events_backup"]
for scenario_config in scenarios_list:
if isinstance(scenario_config, list):
logging.error(
Expand Down Expand Up @@ -99,13 +100,15 @@ def run_scenarios(
int(scenario_telemetry.start_timestamp),
int(scenario_telemetry.end_timestamp),
)
utils.populate_cluster_events(
scenario_telemetry,
parsed_scenario_config,
telemetry.get_lib_kubernetes(),
int(scenario_telemetry.start_timestamp),
int(scenario_telemetry.end_timestamp),
)

if events_backup:
utils.populate_cluster_events(
scenario_telemetry,
parsed_scenario_config,
telemetry.get_lib_kubernetes(),
int(scenario_telemetry.start_timestamp),
int(scenario_telemetry.end_timestamp),
)

if scenario_telemetry.exit_status != 0:
failed_scenarios.append(scenario_config)
Expand Down
19 changes: 14 additions & 5 deletions krkn/scenario_plugins/node_actions/abstract_node_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,16 @@
import krkn.invoke.command as runcommand
import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction
from krkn_lib.k8s import KrknKubernetes

from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus

# krkn_lib
class abstract_node_scenarios:
kubecli: KrknKubernetes
affected_nodes_status: AffectedNodeStatus

def __init__(self, kubecli: KrknKubernetes):
def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
    """Store the collaborators shared by every node scenario.

    Args:
        kubecli: Kubernetes client handed to the node wait helpers.
        affected_nodes_status: collector that the scenarios add their
            per-node timing records to.
    """
    # The two attributes are independent; keep the collector first so the
    # timing state is visible next to the signature that introduced it.
    self.affected_nodes_status = affected_nodes_status
    self.kubecli = kubecli

# Node scenario to start the node
def node_start_scenario(self, instance_kill_count, node, timeout):
Expand All @@ -28,6 +30,7 @@ def node_stop_start_scenario(self, instance_kill_count, node, timeout, duration)
logging.info("Waiting for %s seconds before starting the node" % (duration))
time.sleep(duration)
self.node_start_scenario(instance_kill_count, node, timeout)
self.affected_nodes_status.merge_affected_nodes()
logging.info("node_stop_start_scenario has been successfully injected!")

def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout):
Expand Down Expand Up @@ -61,13 +64,15 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout):
# Node scenario to stop the kubelet
def stop_kubelet_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
affected_node = AffectedNode(node)
try:
logging.info("Starting stop_kubelet_scenario injection")
logging.info("Stopping the kubelet of the node %s" % (node))
runcommand.run(
"oc debug node/" + node + " -- chroot /host systemctl stop kubelet"
)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)

logging.info("The kubelet of the node %s has been stopped" % (node))
logging.info("stop_kubelet_scenario has been successfuly injected!")
except Exception as e:
Expand All @@ -77,17 +82,20 @@ def stop_kubelet_scenario(self, instance_kill_count, node, timeout):
)
logging.error("stop_kubelet_scenario injection failed!")
raise e
self.add_affected_node(affected_node)

# Node scenario to stop and start the kubelet
def stop_start_kubelet_scenario(self, instance_kill_count, node, timeout):
    """Stop the kubelet on a node, then bring the node back.

    Runs ``stop_kubelet_scenario`` followed by ``node_reboot_scenario`` with
    the same arguments, then calls ``merge_affected_nodes()`` on
    ``self.affected_nodes_status``.

    Args:
        instance_kill_count: forwarded unchanged to both sub-scenarios
            (``stop_kubelet_scenario`` uses it as its repeat count).
        node: name of the target node.
        timeout: forwarded unchanged to both sub-scenarios, which pass it
            to their node-status wait helpers.

    Raises:
        Exception: whatever either sub-scenario re-raises on failure.
    """
    logging.info("Starting stop_start_kubelet_scenario injection")
    self.stop_kubelet_scenario(instance_kill_count, node, timeout)
    # The "start" half is delegated to the reboot scenario rather than a
    # dedicated kubelet-start command.
    self.node_reboot_scenario(instance_kill_count, node, timeout)
    # NOTE(review): presumably folds the separate stop/reboot records for the
    # same node into one entry -- confirm against krkn_lib.
    self.affected_nodes_status.merge_affected_nodes()
    logging.info("stop_start_kubelet_scenario has been successfully injected!")

# Node scenario to restart the kubelet
def restart_kubelet_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
affected_node = AffectedNode(node)
try:
logging.info("Starting restart_kubelet_scenario injection")
logging.info("Restarting the kubelet of the node %s" % (node))
Expand All @@ -96,8 +104,8 @@ def restart_kubelet_scenario(self, instance_kill_count, node, timeout):
+ node
+ " -- chroot /host systemctl restart kubelet &"
)
nodeaction.wait_for_not_ready_status(node, timeout, self.kubecli)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
nodeaction.wait_for_not_ready_status(node, timeout, self.kubecli, affected_node)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli,affected_node)
logging.info("The kubelet of the node %s has been restarted" % (node))
logging.info("restart_kubelet_scenario has been successfuly injected!")
except Exception as e:
Expand All @@ -107,6 +115,7 @@ def restart_kubelet_scenario(self, instance_kill_count, node, timeout):
)
logging.error("restart_kubelet_scenario injection failed!")
raise e
self.add_affected_node(affected_node)

# Node scenario to crash the node
def node_crash_scenario(self, instance_kill_count, node, timeout):
Expand Down
48 changes: 35 additions & 13 deletions krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
abstract_node_scenarios,
)
from krkn_lib.k8s import KrknKubernetes

from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus

class Alibaba:
def __init__(self):
Expand Down Expand Up @@ -161,8 +161,9 @@ def get_vm_status(self, instance_id):
return None

# Wait until the node instance is running
def wait_until_running(self, instance_id, timeout):
def wait_until_running(self, instance_id, timeout, affected_node):
time_counter = 0
start_time = time.time()
status = self.get_vm_status(instance_id)
while status != "Running":
status = self.get_vm_status(instance_id)
Expand All @@ -174,11 +175,15 @@ def wait_until_running(self, instance_id, timeout):
if time_counter >= timeout:
logging.info("ECS %s is still not ready in allotted time" % instance_id)
return False
end_time = time.time()
if affected_node:
affected_node.set_affected_node_status("running", end_time - start_time)
return True

# Wait until the node instance is stopped
def wait_until_stopped(self, instance_id, timeout):
def wait_until_stopped(self, instance_id, timeout, affected_node):
time_counter = 0
start_time = time.time()
status = self.get_vm_status(instance_id)
while status != "Stopped":
status = self.get_vm_status(instance_id)
Expand All @@ -192,10 +197,14 @@ def wait_until_stopped(self, instance_id, timeout):
"Vm %s is still not stopped in allotted time" % instance_id
)
return False
end_time = time.time()
if affected_node:
affected_node.set_affected_node_status("running", end_time - start_time)
return True

# Wait until the node instance is released (reported as "Released" or no longer found)
def wait_until_released(self, instance_id, timeout):
def wait_until_released(self, instance_id, timeout, affected_node):
start_time = time.time()
statuses = self.get_vm_status(instance_id)
time_counter = 0
while statuses and statuses != "Released":
Expand All @@ -210,26 +219,32 @@ def wait_until_released(self, instance_id, timeout):
return False

logging.info("ECS %s is released" % instance_id)
end_time = time.time()
if affected_node:
affected_node.set_affected_node_status("terminated", end_time - start_time)
return True


# krkn_lib
class alibaba_node_scenarios(abstract_node_scenarios):
def __init__(self, kubecli: KrknKubernetes):
def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
    """Initialise the Alibaba node-scenario implementation.

    Args:
        kubecli: Kubernetes client, passed through to the base class.
        affected_nodes_status: collector for per-node timing records,
            passed through to the base class.
    """
    # The base class stores kubecli and affected_nodes_status on the instance.
    super().__init__(kubecli, affected_nodes_status)
    # Cloud-side helper used for instance lookups and start/stop/release calls.
    self.alibaba = Alibaba()


# Node scenario to start the node
def node_start_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
affected_node = AffectedNode(node)
try:
logging.info("Starting node_start_scenario injection")
vm_id = self.alibaba.get_instance_id(node)
logging.info(
"Starting the node %s with instance ID: %s " % (node, vm_id)
)
self.alibaba.start_instances(vm_id)
self.alibaba.wait_until_running(vm_id, timeout)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
self.alibaba.wait_until_running(vm_id, timeout, affected_node)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
logging.info("Node with instance ID: %s is in running state" % node)
logging.info("node_start_scenario has been successfully injected!")
except Exception as e:
Expand All @@ -239,44 +254,48 @@ def node_start_scenario(self, instance_kill_count, node, timeout):
)
logging.error("node_start_scenario injection failed!")
raise e
self.affected_nodes_status.affected_nodes.append(affected_node)

# Node scenario to stop the node
def node_stop_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
affected_node = AffectedNode(node)
try:
logging.info("Starting node_stop_scenario injection")
vm_id = self.alibaba.get_instance_id(node)
logging.info(
"Stopping the node %s with instance ID: %s " % (node, vm_id)
)
self.alibaba.stop_instances(vm_id)
self.alibaba.wait_until_stopped(vm_id, timeout)
self.alibaba.wait_until_stopped(vm_id, timeout, affected_node)
logging.info("Node with instance ID: %s is in stopped state" % vm_id)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
except Exception as e:
logging.error(
"Failed to stop node instance. Encountered following exception: %s. "
"Test Failed" % e
)
logging.error("node_stop_scenario injection failed!")
raise e
self.affected_nodes_status.affected_nodes.append(affected_node)

# Might need to stop and then release the instance
# Node scenario to terminate the node
def node_termination_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
affected_node = AffectedNode(node)
try:
logging.info(
"Starting node_termination_scenario injection by first stopping instance"
)
vm_id = self.alibaba.get_instance_id(node)
self.alibaba.stop_instances(vm_id)
self.alibaba.wait_until_stopped(vm_id, timeout)
self.alibaba.wait_until_stopped(vm_id, timeout, affected_node)
logging.info(
"Releasing the node %s with instance ID: %s " % (node, vm_id)
)
self.alibaba.release_instance(vm_id)
self.alibaba.wait_until_released(vm_id, timeout)
self.alibaba.wait_until_released(vm_id, timeout, affected_node)
logging.info("Node with instance ID: %s has been released" % node)
logging.info(
"node_termination_scenario has been successfully injected!"
Expand All @@ -288,17 +307,19 @@ def node_termination_scenario(self, instance_kill_count, node, timeout):
)
logging.error("node_termination_scenario injection failed!")
raise e
self.affected_nodes_status.affected_nodes.append(affected_node)

# Node scenario to reboot the node
def node_reboot_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
affected_node = AffectedNode(node)
try:
logging.info("Starting node_reboot_scenario injection")
instance_id = self.alibaba.get_instance_id(node)
logging.info("Rebooting the node with instance ID: %s " % (instance_id))
self.alibaba.reboot_instances(instance_id)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
logging.info(
"Node with instance ID: %s has been rebooted" % (instance_id)
)
Expand All @@ -310,3 +331,4 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout):
)
logging.error("node_reboot_scenario injection failed!")
raise e
self.affected_nodes_status.affected_nodes.append(affected_node)
Loading
Loading