From 42c7dc4f47c835b0b8a72af670efeda5823fb640 Mon Sep 17 00:00:00 2001 From: Uma Annamalai Date: Thu, 24 Oct 2024 15:23:24 -0700 Subject: [PATCH 01/14] Initial commit. --- newrelic/config.py | 57 +++++++- newrelic/core/agent.py | 12 ++ newrelic/core/agent_protocol.py | 23 ++++ newrelic/core/application.py | 12 +- newrelic/core/config.py | 1 + newrelic/core/super_agent_health.py | 203 ++++++++++++++++++++++++++++ 6 files changed, 306 insertions(+), 2 deletions(-) create mode 100644 newrelic/core/super_agent_health.py diff --git a/newrelic/config.py b/newrelic/config.py index b4368fe70..a2cbde06f 100644 --- a/newrelic/config.py +++ b/newrelic/config.py @@ -16,7 +16,10 @@ import fnmatch import logging import os +import sched import sys +import threading +import time import traceback import newrelic.api.application @@ -46,6 +49,8 @@ default_host, fetch_config_setting, ) +from newrelic.core.super_agent_health import super_agent_health_instance, is_valid_file_delivery_location, HEALTH_FILE_LOCATION + __all__ = ["initialize", "filter_app_factory"] @@ -100,6 +105,7 @@ def _map_aws_account_id(s): # all the settings have been read. _cache_object = [] +super_agent_instance = super_agent_health_instance() def _reset_config_parser(): @@ -224,7 +230,6 @@ def _map_default_host_value(license_key): # to be the region aware host _default_host = default_host(license_key) _settings.host = os.environ.get("NEW_RELIC_HOST", _default_host) - return license_key @@ -4815,6 +4820,46 @@ def _setup_agent_console(): newrelic.core.agent.Agent.run_on_startup(_startup_agent_console) +# def schedule_super_agent(): +# reporting_frequency = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_FREQUENCY", 5) +# scheduler = sched.scheduler() + +# while True: +# scheduler.enter(reporting_frequency, 1, super_agent.write_to_health_file) +# scheduler.run() + + +def super_agent_healthcheck_loop(): + reporting_frequency = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_FREQUENCY", 5) + scheduler = sched.scheduler(time.time, time.sleep) + + scheduler.enter(reporting_frequency, 1, super_agent_healthcheck, (scheduler, reporting_frequency)) + scheduler.run() + + +def super_agent_healthcheck(scheduler, reporting_frequency): + scheduler.enter(reporting_frequency, 1, super_agent_healthcheck, (scheduler, reporting_frequency)) + + super_agent_instance.write_to_health_file() + + +super_agent_health_thread = threading.Thread(target=super_agent_healthcheck_loop, name="NR-Super-Agent") +super_agent_health_thread.daemon = True + + +def _setup_super_agent_health(): + health_check_enabled = os.environ.get("NEW_RELIC_SUPERAGENT_FLEET_ID", None) + if not health_check_enabled: + _logger.warning("Super Agent fleet ID not found in environment. Health reporting will not be enabled.") + return + # + valid_file_location = is_valid_file_delivery_location(HEALTH_FILE_LOCATION) + if not valid_file_location: + return + + super_agent_health_thread.start() + + def initialize( config_file=None, environment=None, @@ -4822,6 +4867,8 @@ def initialize( log_file=None, log_level=None, ): + super_agent_instance.start_time_unix_nano = time.time_ns() + if config_file is None: config_file = os.environ.get("NEW_RELIC_CONFIG_FILE", None) @@ -4831,8 +4878,15 @@ def initialize( if ignore_errors is None: ignore_errors = newrelic.core.config._environ_as_bool("NEW_RELIC_IGNORE_STARTUP_ERRORS", True) + _load_configuration(config_file, environment, ignore_errors, log_file, log_level) + _setup_super_agent_health() + + if _settings.monitor_mode: + if not _settings.license_key: + super_agent_instance.set_health_status("missing_license") + if _settings.monitor_mode or _settings.developer_mode: _settings.enabled = True _setup_instrumentation() @@ -4841,6 +4895,7 @@ def initialize( _setup_agent_console() else: _settings.enabled = False + super_agent_instance.set_health_status("agent_disabled") def filter_app_factory(app, global_conf, config_file, environment=None): diff --git a/newrelic/core/agent.py b/newrelic/core/agent.py index 1b81bb9b4..2ee7cd2ac 100644 --- a/newrelic/core/agent.py +++ b/newrelic/core/agent.py @@ -35,6 +35,7 @@ from newrelic.samplers.cpu_usage import cpu_usage_data_source from newrelic.samplers.gc_data import garbage_collector_data_source from newrelic.samplers.memory_usage import memory_usage_data_source +from newrelic.core.super_agent_health import super_agent_healthcheck_loop _logger = logging.getLogger(__name__) @@ -189,6 +190,7 @@ def agent_singleton(): return Agent._instance + def __init__(self, config): """Initialises the agent and attempt to establish a connection to the core application. Will start the harvest loop running but @@ -208,6 +210,9 @@ def __init__(self, config): self._harvest_thread.daemon = True self._harvest_shutdown = threading.Event() + # self._super_agent_health_thread = threading.Thread(target=super_agent_healthcheck_loop, name="NR-Control-Harvest-Thread") + # self._super_agent_health_thread.daemon = True + self._default_harvest_count = 0 self._flexible_harvest_count = 0 self._last_default_harvest = 0.0 @@ -261,6 +266,7 @@ def dump(self, file): print(f"Agent Shutdown: {self._harvest_shutdown.isSet()}", file=file) print(f"Applications: {sorted(self._applications.keys())!r}", file=file) + def global_settings(self): """Returns the global default settings object. If access is needed to this prior to initialising the agent, use the @@ -603,6 +609,12 @@ def _harvest_shutdown_is_set(self): return self._harvest_shutdown.isSet() def _harvest_flexible(self, shutdown=False): + # Stop existing thread that is running super agent health checks + # running_threads = threading.enumerate() + # for thread in running_threads: + # if thread.name == "NR-Control-Main-Thread": + # pass + if not self._harvest_shutdown_is_set(): event_harvest_config = self.global_settings().event_harvest_config diff --git a/newrelic/core/agent_protocol.py b/newrelic/core/agent_protocol.py index ddbfa051d..db62028ac 100644 --- a/newrelic/core/agent_protocol.py +++ b/newrelic/core/agent_protocol.py @@ -47,6 +47,7 @@ NetworkInterfaceException, RetryDataForRequest, ) +from newrelic.core.super_agent_health import super_agent_health_instance _logger = logging.getLogger(__name__) @@ -188,6 +189,7 @@ def __init__(self, settings, host=None, client_cls=ApplicationModeClient): "marshal_format": "json", } self._headers = {} + self._license_key = settings.license_key # In Python 2, the JSON is loaded with unicode keys and values; # however, the header name must be a non-unicode value when given to @@ -209,6 +211,7 @@ def __init__(self, settings, host=None, client_cls=ApplicationModeClient): # Do not access configuration anywhere inside the class self.configuration = settings + self._super_agent = super_agent_health_instance() def __enter__(self): self.client.__enter__() @@ -242,7 +245,27 @@ def send( f"Supportability/Python/Collector/MaxPayloadSizeLimit/{method}", 1, ) + if status == 401: + # Check for license key presence again so the original missing license key status set in the + # initialize function doesn't get overridden with invalid_license as a missing license key is also + # treated as a 401 status code + if not self._license_key: + self._super_agent.set_health_status("missing_license") + else: + self._super_agent.set_health_status("invalid_license") + + if status == 407: + self._super_agent.set_health_status("proxy_error", status) + + if status == 410: + self._super_agent.set_health_status("forced_disconnect") + level, message = self.LOG_MESSAGES.get(status, self.LOG_MESSAGES["default"]) + + # If the default error message was used, then we know we have a general HTTP error + if message.startswith("Received a non 200 or 202"): + self._super_agent.set_health_status("http_error", status, method) + _logger.log( level, message, diff --git a/newrelic/core/application.py b/newrelic/core/application.py index 8b444321c..7f0fb56b5 100644 --- a/newrelic/core/application.py +++ b/newrelic/core/application.py @@ -41,6 +41,7 @@ from newrelic.core.profile_sessions import profile_session_manager from newrelic.core.rules_engine import RulesEngine, SegmentCollapseEngine from newrelic.core.stats_engine import CustomMetrics, StatsEngine +from newrelic.core.super_agent_health import super_agent_health_instance from newrelic.network.exceptions import ( DiscardDataForRequest, ForceAgentDisconnect, @@ -49,6 +50,7 @@ RetryDataForRequest, ) from newrelic.samplers.data_sampler import DataSampler +from newrelic.core.super_agent_health import super_agent_healthcheck_loop _logger = logging.getLogger(__name__) @@ -110,6 +112,10 @@ def __init__(self, app_name, linked_applications=None): self._remaining_plugins = True + self._super_agent_health_thread = threading.Thread(target=super_agent_healthcheck_loop, name="NR-Control-Harvest-Thread") + self._super_agent_health_thread.daemon = True + + # We setup empty rules engines here even though they will be # replaced when application first registered. This is done to # avoid a race condition in setting it later. Otherwise we have @@ -195,6 +201,7 @@ def activate_session(self, activate_agent=None, timeout=0.0): to be activated. """ + self._super_agent_health_thread.start() if self._agent_shutdown: return @@ -225,7 +232,6 @@ def activate_session(self, activate_agent=None, timeout=0.0): # timeout has likely occurred. deadlock_timeout = 0.1 - if timeout >= deadlock_timeout: self._detect_deadlock = True @@ -1688,6 +1694,9 @@ def internal_agent_shutdown(self, restart=False): optionally triggers activation of a new session. """ + super_agent = super_agent_health_instance() + super_agent.set_health_status("agent_shutdown") + super_agent.write_to_health_file() # We need to stop any thread profiler session related to this # application. @@ -1749,6 +1758,7 @@ def internal_agent_shutdown(self, restart=False): else: self._agent_shutdown = True + def process_agent_commands(self): """Fetches agents commands from data collector and process them.""" diff --git a/newrelic/core/config.py b/newrelic/core/config.py index 02096401c..cd2aa8f06 100644 --- a/newrelic/core/config.py +++ b/newrelic/core/config.py @@ -33,6 +33,7 @@ from newrelic.common.object_names import parse_exc_info from newrelic.core.attribute import MAX_ATTRIBUTE_LENGTH from newrelic.core.attribute_filter import AttributeFilter +from newrelic.core.super_agent_health import super_agent_health_instance try: import grpc diff --git a/newrelic/core/super_agent_health.py b/newrelic/core/super_agent_health.py new file mode 100644 index 000000000..ec5c36aa5 --- /dev/null +++ b/newrelic/core/super_agent_health.py @@ -0,0 +1,203 @@ +# Copyright 2010 New Relic, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import threading +import time +from pathlib import Path, PurePath +from urllib.parse import urlparse +import sched + +_logger = logging.getLogger(__name__) + + +HEALTH_CHECK_STATUSES = { + "healthy": ("NR-APM-000", "Healthy"), + "invalid_license": ("NR-APM-001", "Invalid license key (HTTP status code 401)"), + "missing_license": ("NR-APM-002", "License key missing in configuration"), + "forced_disconnect": ("NR-APM-003", "Forced disconnect received from New Relic (HTTP status code 410)"), + "http_error": ("NR-APM-004", "HTTP error response code received from New Relic"), + "max_app_names": ("NR-APM-006", "The maximum number of configured app names (3) exceeded"), + "proxy_error": ("NR-APM-007", "HTTP Proxy configuration error"), + "agent_disabled": ("NR-APM-008", "Agent is disabled via configuration"), + "agent_shutdown": ("NR-APM-099", "Agent has shutdown"), +} + +HEALTH_FILE_LOCATION = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", None) +PARSED_URI = urlparse(HEALTH_FILE_LOCATION) + + +class SuperAgentHealth(): + _instance_lock = threading.Lock() + _instance = None + + @staticmethod + def super_agent_health_singleton(): + if SuperAgentHealth._instance: + return SuperAgentHealth._instance + + with SuperAgentHealth._instance_lock: + if not SuperAgentHealth._instance: + instance = SuperAgentHealth() + + SuperAgentHealth._instance = instance + + return SuperAgentHealth._instance + + def __init__(self): + self.last_error = "NR-APM-000" + self.status = "Healthy" + self.start_time_unix_nano = None + self.agent_start_time = None + + def set_health_status(self, health_status, response_code=None, info=None): + last_error, current_status = HEALTH_CHECK_STATUSES[health_status] + + if health_status == "http_error" and response_code and info: + current_status = ( + f"HTTP error response code {response_code} received from New Relic while sending data type {info}" + ) + + if health_status == "proxy_error" and response_code: + current_status = f"HTTP Proxy configuration error; response code {response_code}" + + if health_status == "agent_shutdown" and self.status != "Healthy": + pass + + else: + self.last_error = last_error + self.status = current_status + + def write_to_health_file(self): + is_healthy = True if self.status == "Healthy" else False + status_time_unix_nano = time.time_ns() + + file_path = urlparse(HEALTH_FILE_LOCATION).path + pid = os.getpid() + file_name = f"health_{pid}.yml" + full_path = os.path.join(file_path, file_name) + + try: + with open(full_path, "w") as f: + f.write(f"healthy: {is_healthy}\n") + f.write(f"status: {self.status}\n") + f.write(f"start_time_unix_nano: {self.start_time_unix_nano}\n") + f.write(f"status_time_unix_nano: {status_time_unix_nano}\n") + f.write(f"last_error: {self.last_error}\n") + except: + _logger.warning("Unable to write agent health.") + + +def super_agent_health_instance(): + return SuperAgentHealth.super_agent_health_singleton() + + +def super_agent_healthcheck_loop(): + reporting_frequency = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_FREQUENCY", 5) + scheduler = sched.scheduler(time.time, time.sleep) + + scheduler.enter(reporting_frequency, 1, super_agent_healthcheck, (scheduler, reporting_frequency)) + scheduler.run() + + +def super_agent_healthcheck(scheduler, reporting_frequency): + scheduler.enter(reporting_frequency, 1, super_agent_healthcheck, (scheduler, reporting_frequency)) + + super_agent_health_instance().write_to_health_file() + +# class SuperAgentHealthCheck: +# _last_error = "NR-APM-000" +# _status = "Healthy" +# _start_time_unix_nano = None +# +# def __init__(self): +# pass +# +# @classmethod +# def set_health_status(cls, health_status, response_code=None, info=None): +# last_error, current_status = HEALTH_CHECK_STATUSES[health_status] +# +# if health_status == "http_error" and response_code and info: +# current_status = ( +# f"HTTP error response code {response_code} received from New Relic while sending data type {info}" +# ) +# +# if health_status == "proxy_error" and response_code: +# current_status = f"HTTP Proxy configuration error; response code {response_code}" +# +# if health_status == "agent_shutdown" and cls._status != "Healthy": +# pass +# +# else: +# cls._last_error = last_error +# cls._status = current_status +# +# @classmethod +# def set_agent_start_time(cls, agent_start_time): +# cls._start_time_unix_nano = agent_start_time +# +# @classmethod +# def write_to_health_file(cls): +# is_healthy = True if cls._status == "Healthy" else False +# status_time_unix_nano = time.time_ns() +# file_path = urlparse(HEALTH_FILE_LOCATION).path +# +# with open(file_path, "w") as f: +# f.write(f"healthy: {is_healthy}\n") +# f.write(f"status: {cls._status}\n") +# f.write(f"start_time_unix_nano: {cls._start_time_unix_nano}\n") +# f.write(f"status_time_unix_nano: {status_time_unix_nano}\n") +# f.write(f"last_error: {cls._last_error}\n") +# +# + +def is_valid_file_delivery_location(file_uri): + if not file_uri: + _logger.warning( + "Configured Super Agent health delivery location is empty. Super Agent health check will not be enabled." + ) + return False + + try: + parsed_uri = urlparse(file_uri) + + if not parsed_uri.scheme or not parsed_uri.path: + _logger.warning( + "Configured Super Agent health delivery location is not a complete file URI. Super Agent health check will not be enabled." + ) + return False + + if parsed_uri.scheme != "file": + _logger.warning( + "Configured Super Agent health delivery location does not have a valid scheme. Super Agent health check will not be enabled." + ) + return False + + path = Path(parsed_uri.path) + + # Check if the path exists + if not path.parent.exists(): + _logger.warning( + "Configured Super Agent health delivery location does not exist. Super Agent health check will not be enabled." + ) + return False + + return True + + except Exception as e: + _logger.warning( + "Configured Super Agent health delivery location is not valid. Super Agent health check will not be enabled." + ) + return False From d1ad566e8cf630ba80b6dc4e4d44b1851188cdd2 Mon Sep 17 00:00:00 2001 From: Uma Annamalai Date: Wed, 30 Oct 2024 13:33:39 -1000 Subject: [PATCH 02/14] Add status change logic. --- newrelic/config.py | 39 +--- newrelic/core/agent.py | 6 - newrelic/core/agent_protocol.py | 1 + newrelic/core/application.py | 12 +- newrelic/core/super_agent_health.py | 177 ++++++++--------- .../test_super_agent_health_check.py | 179 ++++++++++++++++++ 6 files changed, 273 insertions(+), 141 deletions(-) create mode 100644 tests/agent_features/test_super_agent_health_check.py diff --git a/newrelic/config.py b/newrelic/config.py index a2cbde06f..60aa0d8ff 100644 --- a/newrelic/config.py +++ b/newrelic/config.py @@ -49,7 +49,7 @@ default_host, fetch_config_setting, ) -from newrelic.core.super_agent_health import super_agent_health_instance, is_valid_file_delivery_location, HEALTH_FILE_LOCATION +from newrelic.core.super_agent_health import super_agent_health_instance, super_agent_healthcheck_loop, HEALTH_CHECK_ENABLED, should_start_health_check __all__ = ["initialize", "filter_app_factory"] @@ -4820,44 +4820,13 @@ def _setup_agent_console(): newrelic.core.agent.Agent.run_on_startup(_startup_agent_console) -# def schedule_super_agent(): -# reporting_frequency = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_FREQUENCY", 5) -# scheduler = sched.scheduler() - -# while True: -# scheduler.enter(reporting_frequency, 1, super_agent.write_to_health_file) -# scheduler.run() - - -def super_agent_healthcheck_loop(): - reporting_frequency = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_FREQUENCY", 5) - scheduler = sched.scheduler(time.time, time.sleep) - - scheduler.enter(reporting_frequency, 1, super_agent_healthcheck, (scheduler, reporting_frequency)) - scheduler.run() - - -def super_agent_healthcheck(scheduler, reporting_frequency): - scheduler.enter(reporting_frequency, 1, super_agent_healthcheck, (scheduler, reporting_frequency)) - - super_agent_instance.write_to_health_file() - - -super_agent_health_thread = threading.Thread(target=super_agent_healthcheck_loop, name="NR-Super-Agent") +super_agent_health_thread = threading.Thread(target=super_agent_healthcheck_loop, name="NR-Control-Health-Main-Thread") super_agent_health_thread.daemon = True def _setup_super_agent_health(): - health_check_enabled = os.environ.get("NEW_RELIC_SUPERAGENT_FLEET_ID", None) - if not health_check_enabled: - _logger.warning("Super Agent fleet ID not found in environment. Health reporting will not be enabled.") - return - # - valid_file_location = is_valid_file_delivery_location(HEALTH_FILE_LOCATION) - if not valid_file_location: - return - - super_agent_health_thread.start() + if should_start_health_check(): + super_agent_health_thread.start() def initialize( diff --git a/newrelic/core/agent.py b/newrelic/core/agent.py index 2ee7cd2ac..d27b614d9 100644 --- a/newrelic/core/agent.py +++ b/newrelic/core/agent.py @@ -609,12 +609,6 @@ def _harvest_shutdown_is_set(self): return self._harvest_shutdown.isSet() def _harvest_flexible(self, shutdown=False): - # Stop existing thread that is running super agent health checks - # running_threads = threading.enumerate() - # for thread in running_threads: - # if thread.name == "NR-Control-Main-Thread": - # pass - if not self._harvest_shutdown_is_set(): event_harvest_config = self.global_settings().event_harvest_config diff --git a/newrelic/core/agent_protocol.py b/newrelic/core/agent_protocol.py index db62028ac..dd5c5dbfe 100644 --- a/newrelic/core/agent_protocol.py +++ b/newrelic/core/agent_protocol.py @@ -284,6 +284,7 @@ def send( exception = self.STATUS_CODE_RESPONSE.get(status, DiscardDataForRequest) raise exception if status == 200: + self._super_agent.check_for_healthy_status() return self.decode_response(data) def decode_response(self, response): diff --git a/newrelic/core/application.py b/newrelic/core/application.py index 7f0fb56b5..2ea13f0db 100644 --- a/newrelic/core/application.py +++ b/newrelic/core/application.py @@ -50,7 +50,7 @@ RetryDataForRequest, ) from newrelic.samplers.data_sampler import DataSampler -from newrelic.core.super_agent_health import super_agent_healthcheck_loop +from newrelic.core.super_agent_health import super_agent_healthcheck_loop, HEALTH_CHECK_ENABLED, should_start_health_check _logger = logging.getLogger(__name__) @@ -112,7 +112,7 @@ def __init__(self, app_name, linked_applications=None): self._remaining_plugins = True - self._super_agent_health_thread = threading.Thread(target=super_agent_healthcheck_loop, name="NR-Control-Harvest-Thread") + self._super_agent_health_thread = threading.Thread(target=super_agent_healthcheck_loop, name="NR-Control-Health-Session-Thread") self._super_agent_health_thread.daemon = True @@ -201,8 +201,6 @@ def activate_session(self, activate_agent=None, timeout=0.0): to be activated. """ - self._super_agent_health_thread.start() - if self._agent_shutdown: return @@ -212,6 +210,9 @@ def activate_session(self, activate_agent=None, timeout=0.0): if self._active_session: return + if should_start_health_check(): + self._super_agent_health_thread.start() + self._process_id = os.getpid() self._connected_event.clear() @@ -1696,7 +1697,8 @@ def internal_agent_shutdown(self, restart=False): """ super_agent = super_agent_health_instance() super_agent.set_health_status("agent_shutdown") - super_agent.write_to_health_file() + if HEALTH_CHECK_ENABLED: + super_agent.write_to_health_file() # We need to stop any thread profiler session related to this # application. diff --git a/newrelic/core/super_agent_health.py b/newrelic/core/super_agent_health.py index ec5c36aa5..b99600f13 100644 --- a/newrelic/core/super_agent_health.py +++ b/newrelic/core/super_agent_health.py @@ -14,6 +14,7 @@ import logging import os +import uuid import threading import time from pathlib import Path, PurePath @@ -29,20 +30,75 @@ "missing_license": ("NR-APM-002", "License key missing in configuration"), "forced_disconnect": ("NR-APM-003", "Forced disconnect received from New Relic (HTTP status code 410)"), "http_error": ("NR-APM-004", "HTTP error response code received from New Relic"), - "max_app_names": ("NR-APM-006", "The maximum number of configured app names (3) exceeded"), + # "max_app_names": ("NR-APM-006", "The maximum number of configured app names (3) exceeded"), "proxy_error": ("NR-APM-007", "HTTP Proxy configuration error"), "agent_disabled": ("NR-APM-008", "Agent is disabled via configuration"), "agent_shutdown": ("NR-APM-099", "Agent has shutdown"), } -HEALTH_FILE_LOCATION = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", None) -PARSED_URI = urlparse(HEALTH_FILE_LOCATION) +def is_valid_file_delivery_location(file_uri): + if not file_uri: + _logger.warning( + "Configured Super Agent health delivery location is empty. Super Agent health check will not be enabled." + ) + return False -class SuperAgentHealth(): + try: + parsed_uri = urlparse(file_uri) + + if not parsed_uri.scheme or not parsed_uri.path: + _logger.warning( + "Configured Super Agent health delivery location is not a complete file URI. Super Agent health check will not be enabled." + ) + return False + + if parsed_uri.scheme != "file": + _logger.warning( + "Configured Super Agent health delivery location does not have a valid scheme. Super Agent health check will not be enabled." + ) + return False + + path = Path(parsed_uri.path) + + # Check if the path exists + if not path.exists(): + _logger.warning( + "Configured Super Agent health delivery location does not exist. Super Agent health check will not be enabled." + ) + return False + + return True + + except Exception as e: + _logger.warning( + "Configured Super Agent health delivery location is not valid. Super Agent health check will not be enabled." + ) + return False + + +def should_start_health_check(): + health_check_enabled = os.environ.get("NEW_RELIC_SUPERAGENT_FLEET_ID", None) + if not health_check_enabled: + _logger.warning("Super Agent fleet ID not found in environment. Health reporting will not be enabled.") + return False + + health_file_location = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", None) + valid_file_location = is_valid_file_delivery_location(health_file_location) + if not valid_file_location: + return False + + return True + + +HEALTH_CHECK_ENABLED = should_start_health_check() + + +class SuperAgentHealth: _instance_lock = threading.Lock() _instance = None + # Define a way to access/create a single super agent object instance similar to the agent_singleton @staticmethod def super_agent_health_singleton(): if SuperAgentHealth._instance: @@ -60,11 +116,11 @@ def __init__(self): self.last_error = "NR-APM-000" self.status = "Healthy" self.start_time_unix_nano = None - self.agent_start_time = None def set_health_status(self, health_status, response_code=None, info=None): last_error, current_status = HEALTH_CHECK_STATUSES[health_status] + # Update status messages to be more descriptive if necessary data is present if health_status == "http_error" and response_code and info: current_status = ( f"HTTP error response code {response_code} received from New Relic while sending data type {info}" @@ -73,6 +129,7 @@ def set_health_status(self, health_status, response_code=None, info=None): if health_status == "proxy_error" and response_code: current_status = f"HTTP Proxy configuration error; response code {response_code}" + # Do not override status with agent_shutdown unless the agent was previously healthy if health_status == "agent_shutdown" and self.status != "Healthy": pass @@ -80,13 +137,25 @@ def set_health_status(self, health_status, response_code=None, info=None): self.last_error = last_error self.status = current_status + def check_for_healthy_status(self): + # If our unhealthy status code was not config related, it is possible it could be resolved during an active + # session We determine the status is resolved by calling this function when a 200 status code is received to + # check if the current status is resolvable + + # Checking for forced disconnects or proxy/ HTTP errors + non_config_error_codes = frozenset("NR-APM-003", "NR-APM-004", "NR-APM-007") + if self.last_error in non_config_error_codes: + self.last_error = "NR-APM-000" + self.status = "Healthy" + def write_to_health_file(self): is_healthy = True if self.status == "Healthy" else False status_time_unix_nano = time.time_ns() + health_file_location = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", None) - file_path = urlparse(HEALTH_FILE_LOCATION).path - pid = os.getpid() - file_name = f"health_{pid}.yml" + file_path = urlparse(health_file_location).path + file_id = str(uuid.uuid4()).replace("-", "") + file_name = f"health_{file_id}.yml" full_path = os.path.join(file_path, file_name) try: @@ -95,12 +164,14 @@ def write_to_health_file(self): f.write(f"status: {self.status}\n") f.write(f"start_time_unix_nano: {self.start_time_unix_nano}\n") f.write(f"status_time_unix_nano: {status_time_unix_nano}\n") - f.write(f"last_error: {self.last_error}\n") + if not is_healthy: + f.write(f"last_error: {self.last_error}\n") except: - _logger.warning("Unable to write agent health.") + _logger.warning("Unable to write to agent health file.") def super_agent_health_instance(): + # Helper function directly returns the singleton instance similar to agent_instance() return SuperAgentHealth.super_agent_health_singleton() @@ -108,6 +179,7 @@ def super_agent_healthcheck_loop(): reporting_frequency = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_FREQUENCY", 5) scheduler = sched.scheduler(time.time, time.sleep) + # Target this function when starting super agent health check threads to keep the scheduler running scheduler.enter(reporting_frequency, 1, super_agent_healthcheck, (scheduler, reporting_frequency)) scheduler.run() @@ -116,88 +188,3 @@ def super_agent_healthcheck(scheduler, reporting_frequency): scheduler.enter(reporting_frequency, 1, super_agent_healthcheck, (scheduler, reporting_frequency)) super_agent_health_instance().write_to_health_file() - -# class SuperAgentHealthCheck: -# _last_error = "NR-APM-000" -# _status = "Healthy" -# _start_time_unix_nano = None -# -# def __init__(self): -# pass -# -# @classmethod -# def set_health_status(cls, health_status, response_code=None, info=None): -# last_error, current_status = HEALTH_CHECK_STATUSES[health_status] -# -# if health_status == "http_error" and response_code and info: -# current_status = ( -# f"HTTP error response code {response_code} received from New Relic while sending data type {info}" -# ) -# -# if health_status == "proxy_error" and response_code: -# current_status = f"HTTP Proxy configuration error; response code {response_code}" -# -# if health_status == "agent_shutdown" and cls._status != "Healthy": -# pass -# -# else: -# cls._last_error = last_error -# cls._status = current_status -# -# @classmethod -# def set_agent_start_time(cls, agent_start_time): -# cls._start_time_unix_nano = agent_start_time -# -# @classmethod -# def write_to_health_file(cls): -# is_healthy = True if cls._status == "Healthy" else False -# status_time_unix_nano = time.time_ns() -# file_path = urlparse(HEALTH_FILE_LOCATION).path -# -# with open(file_path, "w") as f: -# f.write(f"healthy: {is_healthy}\n") -# f.write(f"status: {cls._status}\n") -# f.write(f"start_time_unix_nano: {cls._start_time_unix_nano}\n") -# f.write(f"status_time_unix_nano: {status_time_unix_nano}\n") -# f.write(f"last_error: {cls._last_error}\n") -# -# - -def is_valid_file_delivery_location(file_uri): - if not file_uri: - _logger.warning( - "Configured Super Agent health delivery location is empty. Super Agent health check will not be enabled." - ) - return False - - try: - parsed_uri = urlparse(file_uri) - - if not parsed_uri.scheme or not parsed_uri.path: - _logger.warning( - "Configured Super Agent health delivery location is not a complete file URI. Super Agent health check will not be enabled." - ) - return False - - if parsed_uri.scheme != "file": - _logger.warning( - "Configured Super Agent health delivery location does not have a valid scheme. Super Agent health check will not be enabled." - ) - return False - - path = Path(parsed_uri.path) - - # Check if the path exists - if not path.parent.exists(): - _logger.warning( - "Configured Super Agent health delivery location does not exist. Super Agent health check will not be enabled." - ) - return False - - return True - - except Exception as e: - _logger.warning( - "Configured Super Agent health delivery location is not valid. Super Agent health check will not be enabled." - ) - return False diff --git a/tests/agent_features/test_super_agent_health_check.py b/tests/agent_features/test_super_agent_health_check.py new file mode 100644 index 000000000..8365944a5 --- /dev/null +++ b/tests/agent_features/test_super_agent_health_check.py @@ -0,0 +1,179 @@ +# Copyright 2010 New Relic, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import pytest +import threading + +from testing_support.fixtures import override_application_settings +from newrelic.core.super_agent_health import is_valid_file_delivery_location, super_agent_health_instance +from newrelic.config import initialize +from newrelic.core.application import Application + +_disable_agent_settings = { + "monitor_mode": False, + "developer_mode": False +} + + +@pytest.mark.parametrize("file_uri", ["", "file://", "/test/dir", "foo:/test/dir"]) +def test_invalid_file_directory_supplied(file_uri): + assert is_valid_file_delivery_location(file_uri) is False + + +def test_write_to_file_healthy_status(monkeypatch, tmp_path): + # Setup expected env vars to run super agent health check + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_FLEET_ID", "1234") + file_path = tmp_path.as_uri() + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", file_path) + + # Write to health YAML file + super_agent_instance = super_agent_health_instance() + super_agent_instance.start_time_unix_nano = "1234567890" + super_agent_instance.write_to_health_file() + + # Grab the file we just wrote to and read its contents + health_files = os.listdir(tmp_path) + path_to_written_file = f"{tmp_path}/{health_files[0]}" + with open(path_to_written_file, 'r') as f: + contents = f.readlines() + + # Assert on contents of health file + assert len(contents) == 4 + assert contents[0] == "healthy: True\n" + assert contents[1] == "status: Healthy\n" + assert contents[2] == "start_time_unix_nano: 1234567890\n" + assert contents[3].startswith("status_time_unix_nano:") is True + + +def test_write_to_file_unhealthy_status(monkeypatch, tmp_path): + # Setup expected env vars to run super agent health check + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_FLEET_ID", "1234") + file_path = tmp_path.as_uri() + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", file_path) + + # Write to health YAML file + super_agent_instance = super_agent_health_instance() + super_agent_instance.start_time_unix_nano = "1234567890" + super_agent_instance.set_health_status("invalid_license") + super_agent_instance.write_to_health_file() + + # Grab the file we just wrote to and read its contents + health_files = os.listdir(tmp_path) + path_to_written_file = f"{tmp_path}/{health_files[0]}" + with open(path_to_written_file, 'r') as f: + contents = f.readlines() + + # Assert on contents of health file + assert len(contents) == 5 + assert contents[0] == "healthy: False\n" + assert contents[1] == "status: Invalid license key (HTTP status code 401)\n" + assert contents[2] == "start_time_unix_nano: 1234567890\n" + assert contents[3].startswith("status_time_unix_nano:") is True + assert contents[4] == "last_error: NR-APM-001\n" + + +def test_no_override_on_unhealthy_shutdown(monkeypatch, tmp_path): + # Setup expected env vars to run super agent health check + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_FLEET_ID", "1234") + file_path = tmp_path.as_uri() + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", file_path) + + # Write to health YAML file + super_agent_instance = super_agent_health_instance() + super_agent_instance.start_time_unix_nano = "1234567890" + super_agent_instance.set_health_status("invalid_license") + # Attempt to override a previously unhealthy status + super_agent_instance.set_health_status("agent_shutdown") + super_agent_instance.write_to_health_file() + + # Grab the file we just wrote to and read its contents + health_files = os.listdir(tmp_path) + path_to_written_file = f"{tmp_path}/{health_files[0]}" + with open(path_to_written_file, 'r') as f: + contents = f.readlines() + + # Assert on contents of health file + assert len(contents) == 5 + assert contents[0] == "healthy: False\n" + assert contents[1] == "status: Invalid license key (HTTP status code 401)\n" + assert contents[4] == "last_error: NR-APM-001\n" + + +def test_health_check_running_threads(monkeypatch, tmp_path): + running_threads = threading.enumerate() + # Only the main thread should be running since not super agent env vars are set + assert len(running_threads) == 1 + + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_FLEET_ID", "1234") + file_path = tmp_path.as_uri() + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", file_path) + + # Re-initialize the agent to allow the health check thread to start and assert that it did + initialize() + running_threads = threading.enumerate() + + assert len(running_threads) == 2 + assert running_threads[1].name == "NR-Control-Health-Main-Thread" + + +def test_multiple_activations_running_threads(monkeypatch, tmp_path): + # Setup expected env vars to run super agent health check + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_FLEET_ID", "1234") + file_path = tmp_path.as_uri() + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", file_path) + + initialize() + application_1 = Application("Test App 1") + application_2 = Application("Test App 2") + + application_1.activate_session() + application_2.activate_session() + + running_threads = threading.enumerate() + + # + assert len(running_threads) == 6 + assert running_threads[1].name == "NR-Control-Health-Main-Thread" + assert running_threads[2].name == "NR-Control-Health-Session-Thread" + assert running_threads[4].name == "NR-Control-Health-Session-Thread" + + + +def test_missing_license_key(monkeypatch, tmp_path): + # Setup expected env vars to run super agent health check + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_FLEET_ID", "1234") + file_path = tmp_path.as_uri() + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", file_path) + #monkeypatch.setenv("NEW_RELIC_LICENSE_KEY", "") + + #initialize() + # Write to health YAML file + super_agent_instance = super_agent_health_instance() + super_agent_instance.write_to_health_file() + + # Grab the file we just wrote to and read its contents + health_files = os.listdir(tmp_path) + path_to_written_file = f"{tmp_path}/{health_files[0]}" + with open(path_to_written_file, 'r') as f: + contents = f.readlines() + + # Assert on contents of health file + assert len(contents) == 5 + assert contents[0] == "healthy: False\n" + assert contents[1] == "status: Invalid license key (HTTP status code 401)\n" + assert contents[2] == "start_time_unix_nano: 1234567890\n" + assert contents[3].startswith("status_time_unix_nano:") is True + assert contents[4] == "last_error: NR-APM-001\n" + + From 1238fbcdf68dbc623cc236a311e7bcb00837f346 Mon Sep 17 00:00:00 2001 From: Uma Annamalai Date: Wed, 6 Nov 2024 12:13:48 -0800 Subject: [PATCH 03/14] Checkpoint. --- newrelic/config.py | 12 ++- newrelic/core/agent.py | 5 -- newrelic/core/application.py | 15 ++-- newrelic/core/config.py | 1 - newrelic/core/super_agent_health.py | 49 ++++++++---- .../test_super_agent_health_check.py | 74 +++++++++++-------- 6 files changed, 92 insertions(+), 64 deletions(-) diff --git a/newrelic/config.py b/newrelic/config.py index 60aa0d8ff..a3a0b7a88 100644 --- a/newrelic/config.py +++ b/newrelic/config.py @@ -16,6 +16,7 @@ import fnmatch import logging import os +import uuid import sched import sys import threading @@ -49,7 +50,7 @@ default_host, fetch_config_setting, ) -from newrelic.core.super_agent_health import super_agent_health_instance, super_agent_healthcheck_loop, HEALTH_CHECK_ENABLED, should_start_health_check +from newrelic.core.super_agent_health import super_agent_health_instance, super_agent_healthcheck_loop, health_check_enabled __all__ = ["initialize", "filter_app_factory"] @@ -230,6 +231,7 @@ def _map_default_host_value(license_key): # to be the region aware host _default_host = default_host(license_key) _settings.host = os.environ.get("NEW_RELIC_HOST", _default_host) + return license_key @@ -4820,12 +4822,15 @@ def _setup_agent_console(): newrelic.core.agent.Agent.run_on_startup(_startup_agent_console) -super_agent_health_thread = threading.Thread(target=super_agent_healthcheck_loop, name="NR-Control-Health-Main-Thread") +super_agent_health_thread = threading.Thread(name="APM-Control-Health-Main-Thread", target=super_agent_healthcheck_loop) super_agent_health_thread.daemon = True def _setup_super_agent_health(): - if should_start_health_check(): + if super_agent_health_thread.is_alive(): + return + + if health_check_enabled(): super_agent_health_thread.start() @@ -4847,7 +4852,6 @@ def initialize( if ignore_errors is None: ignore_errors = newrelic.core.config._environ_as_bool("NEW_RELIC_IGNORE_STARTUP_ERRORS", True) - _load_configuration(config_file, environment, ignore_errors, log_file, log_level) _setup_super_agent_health() diff --git a/newrelic/core/agent.py b/newrelic/core/agent.py index d27b614d9..b20bda93d 100644 --- a/newrelic/core/agent.py +++ b/newrelic/core/agent.py @@ -35,7 +35,6 @@ from newrelic.samplers.cpu_usage import cpu_usage_data_source from newrelic.samplers.gc_data import garbage_collector_data_source from newrelic.samplers.memory_usage import memory_usage_data_source -from newrelic.core.super_agent_health import super_agent_healthcheck_loop _logger = logging.getLogger(__name__) @@ -210,9 +209,6 @@ def __init__(self, config): self._harvest_thread.daemon = True self._harvest_shutdown = threading.Event() - # self._super_agent_health_thread = threading.Thread(target=super_agent_healthcheck_loop, name="NR-Control-Harvest-Thread") - # self._super_agent_health_thread.daemon = True - self._default_harvest_count = 0 self._flexible_harvest_count = 0 self._last_default_harvest = 0.0 @@ -266,7 +262,6 @@ def dump(self, file): print(f"Agent Shutdown: {self._harvest_shutdown.isSet()}", file=file) print(f"Applications: {sorted(self._applications.keys())!r}", file=file) - def global_settings(self): """Returns the global default settings object. If access is needed to this prior to initialising the agent, use the diff --git a/newrelic/core/application.py b/newrelic/core/application.py index 2ea13f0db..f736173f1 100644 --- a/newrelic/core/application.py +++ b/newrelic/core/application.py @@ -19,6 +19,7 @@ import logging import os import sys +import uuid import threading import time import traceback @@ -50,7 +51,7 @@ RetryDataForRequest, ) from newrelic.samplers.data_sampler import DataSampler -from newrelic.core.super_agent_health import super_agent_healthcheck_loop, HEALTH_CHECK_ENABLED, should_start_health_check +from newrelic.core.super_agent_health import super_agent_healthcheck_loop, health_check_enabled _logger = logging.getLogger(__name__) @@ -112,8 +113,9 @@ def __init__(self, app_name, linked_applications=None): self._remaining_plugins = True - self._super_agent_health_thread = threading.Thread(target=super_agent_healthcheck_loop, name="NR-Control-Health-Session-Thread") + self._super_agent_health_thread = threading.Thread(name="NR-Control-Health-Session-Thread", target=super_agent_healthcheck_loop) self._super_agent_health_thread.daemon = True + self._super_agent = super_agent_health_instance() # We setup empty rules engines here even though they will be @@ -210,7 +212,7 @@ def activate_session(self, activate_agent=None, timeout=0.0): if self._active_session: return - if should_start_health_check(): + if health_check_enabled() and not self._super_agent_health_thread.is_alive(): self._super_agent_health_thread.start() self._process_id = os.getpid() @@ -672,7 +674,7 @@ def validate_process(self): self._process_id = 0 def normalize_name(self, name, rule_type): - """Applies the agent normalization rules of the the specified + """Applies the agent normalization rules of the specified rule type to the supplied name. """ @@ -1697,8 +1699,8 @@ def internal_agent_shutdown(self, restart=False): """ super_agent = super_agent_health_instance() super_agent.set_health_status("agent_shutdown") - if HEALTH_CHECK_ENABLED: - super_agent.write_to_health_file() + if health_check_enabled(): + super_agent.write_to_health_file(self._super_agent_file_id) # We need to stop any thread profiler session related to this # application. @@ -1760,7 +1762,6 @@ def internal_agent_shutdown(self, restart=False): else: self._agent_shutdown = True - def process_agent_commands(self): """Fetches agents commands from data collector and process them.""" diff --git a/newrelic/core/config.py b/newrelic/core/config.py index cd2aa8f06..02096401c 100644 --- a/newrelic/core/config.py +++ b/newrelic/core/config.py @@ -33,7 +33,6 @@ from newrelic.common.object_names import parse_exc_info from newrelic.core.attribute import MAX_ATTRIBUTE_LENGTH from newrelic.core.attribute_filter import AttributeFilter -from newrelic.core.super_agent_health import super_agent_health_instance try: import grpc diff --git a/newrelic/core/super_agent_health.py b/newrelic/core/super_agent_health.py index b99600f13..045706e35 100644 --- a/newrelic/core/super_agent_health.py +++ b/newrelic/core/super_agent_health.py @@ -17,7 +17,7 @@ import uuid import threading import time -from pathlib import Path, PurePath +from pathlib import Path from urllib.parse import urlparse import sched @@ -40,7 +40,7 @@ def is_valid_file_delivery_location(file_uri): if not file_uri: _logger.warning( - "Configured Super Agent health delivery location is empty. Super Agent health check will not be enabled." + "Configured APM Control health delivery location is empty. APM Control health check will not be enabled." ) return False @@ -49,13 +49,15 @@ def is_valid_file_delivery_location(file_uri): if not parsed_uri.scheme or not parsed_uri.path: _logger.warning( - "Configured Super Agent health delivery location is not a complete file URI. Super Agent health check will not be enabled." + "Configured Super Agent health delivery location is not a complete file URI. Super Agent health check " + "will not be enabled. " ) return False if parsed_uri.scheme != "file": _logger.warning( - "Configured Super Agent health delivery location does not have a valid scheme. Super Agent health check will not be enabled." + "Configured Super Agent health delivery location does not have a valid scheme. Super Agent health " + "check will not be enabled. " ) return False @@ -64,7 +66,8 @@ def is_valid_file_delivery_location(file_uri): # Check if the path exists if not path.exists(): _logger.warning( - "Configured Super Agent health delivery location does not exist. Super Agent health check will not be enabled." + "Configured Super Agent health delivery location does not exist. Super Agent health check will not be " + "enabled. " ) return False @@ -72,14 +75,15 @@ def is_valid_file_delivery_location(file_uri): except Exception as e: _logger.warning( - "Configured Super Agent health delivery location is not valid. Super Agent health check will not be enabled." + "Configured Super Agent health delivery location is not valid. Super Agent health check will not be " + "enabled. " ) return False -def should_start_health_check(): - health_check_enabled = os.environ.get("NEW_RELIC_SUPERAGENT_FLEET_ID", None) - if not health_check_enabled: +def health_check_enabled(): + fleet_id_present = os.environ.get("NEW_RELIC_SUPERAGENT_FLEET_ID", None) + if not fleet_id_present: _logger.warning("Super Agent fleet ID not found in environment. Health reporting will not be enabled.") return False @@ -91,9 +95,6 @@ def should_start_health_check(): return True -HEALTH_CHECK_ENABLED = should_start_health_check() - - class SuperAgentHealth: _instance_lock = threading.Lock() _instance = None @@ -116,6 +117,7 @@ def __init__(self): self.last_error = "NR-APM-000" self.status = "Healthy" self.start_time_unix_nano = None + self.pid_file_id_map = {} def set_health_status(self, health_status, response_code=None, info=None): last_error, current_status = HEALTH_CHECK_STATUSES[health_status] @@ -139,11 +141,11 @@ def set_health_status(self, health_status, response_code=None, info=None): def check_for_healthy_status(self): # If our unhealthy status code was not config related, it is possible it could be resolved during an active - # session We determine the status is resolved by calling this function when a 200 status code is received to + # session. We determine the status is resolved by calling this function when a 200 status code is received to # check if the current status is resolvable # Checking for forced disconnects or proxy/ HTTP errors - non_config_error_codes = frozenset("NR-APM-003", "NR-APM-004", "NR-APM-007") + non_config_error_codes = frozenset(["NR-APM-003", "NR-APM-004", "NR-APM-007"]) if self.last_error in non_config_error_codes: self.last_error = "NR-APM-000" self.status = "Healthy" @@ -153,9 +155,12 @@ def write_to_health_file(self): status_time_unix_nano = time.time_ns() health_file_location = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", None) + health_file_location = str(health_file_location) file_path = urlparse(health_file_location).path - file_id = str(uuid.uuid4()).replace("-", "") - file_name = f"health_{file_id}.yml" + pid = os.getpid() + file_ids = self.get_file_id(pid) + + file_name = f"health-{file_ids}.yml" full_path = os.path.join(file_path, file_name) try: @@ -169,6 +174,18 @@ def write_to_health_file(self): except: _logger.warning("Unable to write to agent health file.") + def get_file_id(self, pid): + # Each file name should have a UUID with hyphens stripped appended to it + file_id = str(uuid.uuid4()).replace("-", "") + + # Map the UUID to the process ID to ensure each agent instance has one UUID associated with it + if pid in self.pid_file_id_map: + pass + else: + self.pid_file_id_map[pid] = file_id + + return self.pid_file_id_map[pid] + def super_agent_health_instance(): # Helper function directly returns the singleton instance similar to agent_instance() diff --git a/tests/agent_features/test_super_agent_health_check.py b/tests/agent_features/test_super_agent_health_check.py index 8365944a5..59b4adf8a 100644 --- a/tests/agent_features/test_super_agent_health_check.py +++ b/tests/agent_features/test_super_agent_health_check.py @@ -12,13 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +import time import pytest import threading -from testing_support.fixtures import override_application_settings +from testing_support.fixtures import reset_core_stats_engine +from testing_support.fixtures import override_generic_settings +from newrelic.core.config import finalize_application_settings +from agent_unittests.test_agent_protocol import HttpClientRecorder from newrelic.core.super_agent_health import is_valid_file_delivery_location, super_agent_health_instance from newrelic.config import initialize +from newrelic.core.agent_protocol import AgentProtocol from newrelic.core.application import Application +from newrelic.core.config import global_settings +from newrelic.network.exceptions import DiscardDataForRequest _disable_agent_settings = { "monitor_mode": False, @@ -124,43 +131,31 @@ def test_health_check_running_threads(monkeypatch, tmp_path): running_threads = threading.enumerate() assert len(running_threads) == 2 - assert running_threads[1].name == "NR-Control-Health-Main-Thread" + assert running_threads[1].name == "APM-Control-Health-Main-Thread" -def test_multiple_activations_running_threads(monkeypatch, tmp_path): +def test_proxy_error_status(monkeypatch, tmp_path): # Setup expected env vars to run super agent health check monkeypatch.setenv("NEW_RELIC_SUPERAGENT_FLEET_ID", "1234") file_path = tmp_path.as_uri() monkeypatch.setenv("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", file_path) initialize() - application_1 = Application("Test App 1") - application_2 = Application("Test App 2") - - application_1.activate_session() - application_2.activate_session() - running_threads = threading.enumerate() + # Mock a 407 error to generate a proxy error health status + HttpClientRecorder.STATUS_CODE = 407 + settings = finalize_application_settings( + { + "license_key": "123LICENSEKEY", + } + ) + protocol = AgentProtocol(settings, client_cls=HttpClientRecorder) - # - assert len(running_threads) == 6 - assert running_threads[1].name == "NR-Control-Health-Main-Thread" - assert running_threads[2].name == "NR-Control-Health-Session-Thread" - assert running_threads[4].name == "NR-Control-Health-Session-Thread" + with pytest.raises(DiscardDataForRequest): + protocol.send("analytic_event_data") - - -def test_missing_license_key(monkeypatch, tmp_path): - # Setup expected env vars to run super agent health check - monkeypatch.setenv("NEW_RELIC_SUPERAGENT_FLEET_ID", "1234") - file_path = tmp_path.as_uri() - monkeypatch.setenv("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", file_path) - #monkeypatch.setenv("NEW_RELIC_LICENSE_KEY", "") - - #initialize() - # Write to health YAML file - super_agent_instance = super_agent_health_instance() - super_agent_instance.write_to_health_file() + # Give time for the scheduler to kick in and write to the health file + time.sleep(5) # Grab the file we just wrote to and read its contents health_files = os.listdir(tmp_path) @@ -171,9 +166,26 @@ def test_missing_license_key(monkeypatch, tmp_path): # Assert on contents of health file assert len(contents) == 5 assert contents[0] == "healthy: False\n" - assert contents[1] == "status: Invalid license key (HTTP status code 401)\n" - assert contents[2] == "start_time_unix_nano: 1234567890\n" - assert contents[3].startswith("status_time_unix_nano:") is True - assert contents[4] == "last_error: NR-APM-001\n" + assert contents[1] == "status: HTTP Proxy configuration error; response code 407\n" + assert contents[4] == "last_error: NR-APM-007\n" +def test_multiple_activations_running_threads(monkeypatch, tmp_path): + # Setup expected env vars to run super agent health check + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_FLEET_ID", "1234") + file_path = tmp_path.as_uri() + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", file_path) + + initialize() + application_1 = Application("Test App 1") + application_2 = Application("Test App 2") + + application_1.activate_session() + application_2.activate_session() + + running_threads = threading.enumerate() + + assert len(running_threads) == 6 + assert running_threads[1].name == "APM-Control-Health-Main-Thread" + assert running_threads[2].name == "APM-Control-Health-Session-Thread" + assert running_threads[4].name == "APM-Control-Health-Session-Thread" \ No newline at end of file From 41a1b8d11f79c68752e64676154465506116221e Mon Sep 17 00:00:00 2001 From: Uma Annamalai Date: Wed, 6 Nov 2024 12:40:15 -0800 Subject: [PATCH 04/14] Remove unused arg. --- newrelic/core/application.py | 2 +- tests/agent_features/test_super_agent_health_check.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/newrelic/core/application.py b/newrelic/core/application.py index f736173f1..84e451344 100644 --- a/newrelic/core/application.py +++ b/newrelic/core/application.py @@ -1700,7 +1700,7 @@ def internal_agent_shutdown(self, restart=False): super_agent = super_agent_health_instance() super_agent.set_health_status("agent_shutdown") if health_check_enabled(): - super_agent.write_to_health_file(self._super_agent_file_id) + super_agent.write_to_health_file() # We need to stop any thread profiler session related to this # application. diff --git a/tests/agent_features/test_super_agent_health_check.py b/tests/agent_features/test_super_agent_health_check.py index 59b4adf8a..1cccea2fb 100644 --- a/tests/agent_features/test_super_agent_health_check.py +++ b/tests/agent_features/test_super_agent_health_check.py @@ -188,4 +188,4 @@ def test_multiple_activations_running_threads(monkeypatch, tmp_path): assert len(running_threads) == 6 assert running_threads[1].name == "APM-Control-Health-Main-Thread" assert running_threads[2].name == "APM-Control-Health-Session-Thread" - assert running_threads[4].name == "APM-Control-Health-Session-Thread" \ No newline at end of file + assert running_threads[4].name == "APM-Control-Health-Session-Thread" From a0498e2e083cb983e5f9c281c7196db6cec090ac Mon Sep 17 00:00:00 2001 From: Uma Annamalai Date: Fri, 8 Nov 2024 08:42:11 -0800 Subject: [PATCH 05/14] Clear configuration leaking from test_configuration.py. --- newrelic/core/agent_protocol.py | 15 ++++++++------- .../test_super_agent_health_check.py | 13 ++++--------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/newrelic/core/agent_protocol.py b/newrelic/core/agent_protocol.py index dd5c5dbfe..a432741f4 100644 --- a/newrelic/core/agent_protocol.py +++ b/newrelic/core/agent_protocol.py @@ -211,7 +211,7 @@ def __init__(self, settings, host=None, client_cls=ApplicationModeClient): # Do not access configuration anywhere inside the class self.configuration = settings - self._super_agent = super_agent_health_instance() + self.super_agent = super_agent_health_instance() def __enter__(self): self.client.__enter__() @@ -250,21 +250,21 @@ def send( # initialize function doesn't get overridden with invalid_license as a missing license key is also # treated as a 401 status code if not self._license_key: - self._super_agent.set_health_status("missing_license") + self.super_agent.set_health_status("missing_license") else: - self._super_agent.set_health_status("invalid_license") + self.super_agent.set_health_status("invalid_license") if status == 407: - self._super_agent.set_health_status("proxy_error", status) + self.super_agent.set_health_status("proxy_error", status) if status == 410: - self._super_agent.set_health_status("forced_disconnect") + self.super_agent.set_health_status("forced_disconnect") level, message = self.LOG_MESSAGES.get(status, self.LOG_MESSAGES["default"]) # If the default error message was used, then we know we have a general HTTP error if message.startswith("Received a non 200 or 202"): - self._super_agent.set_health_status("http_error", status, method) + self.super_agent.set_health_status("http_error", status, method) _logger.log( level, @@ -284,7 +284,7 @@ def send( exception = self.STATUS_CODE_RESPONSE.get(status, DiscardDataForRequest) raise exception if status == 200: - self._super_agent.check_for_healthy_status() + self.super_agent.check_for_healthy_status() return self.decode_response(data) def decode_response(self, response): @@ -614,6 +614,7 @@ def __init__(self, settings, host=None, client_cls=ApplicationModeClient): # Do not access configuration anywhere inside the class self.configuration = settings + self.super_agent = super_agent_health_instance() @classmethod def connect( diff --git a/tests/agent_features/test_super_agent_health_check.py b/tests/agent_features/test_super_agent_health_check.py index 1cccea2fb..a7591526e 100644 --- a/tests/agent_features/test_super_agent_health_check.py +++ b/tests/agent_features/test_super_agent_health_check.py @@ -16,22 +16,14 @@ import pytest import threading -from testing_support.fixtures import reset_core_stats_engine -from testing_support.fixtures import override_generic_settings from newrelic.core.config import finalize_application_settings from agent_unittests.test_agent_protocol import HttpClientRecorder from newrelic.core.super_agent_health import is_valid_file_delivery_location, super_agent_health_instance -from newrelic.config import initialize +from newrelic.config import initialize, _reset_configuration_done from newrelic.core.agent_protocol import AgentProtocol from newrelic.core.application import Application -from newrelic.core.config import global_settings from newrelic.network.exceptions import DiscardDataForRequest -_disable_agent_settings = { - "monitor_mode": False, - "developer_mode": False -} - @pytest.mark.parametrize("file_uri", ["", "file://", "/test/dir", "foo:/test/dir"]) def test_invalid_file_directory_supplied(file_uri): @@ -127,6 +119,7 @@ def test_health_check_running_threads(monkeypatch, tmp_path): monkeypatch.setenv("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", file_path) # Re-initialize the agent to allow the health check thread to start and assert that it did + _reset_configuration_done() initialize() running_threads = threading.enumerate() @@ -140,6 +133,7 @@ def test_proxy_error_status(monkeypatch, tmp_path): file_path = tmp_path.as_uri() monkeypatch.setenv("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", file_path) + _reset_configuration_done() initialize() # Mock a 407 error to generate a proxy error health status @@ -176,6 +170,7 @@ def test_multiple_activations_running_threads(monkeypatch, tmp_path): file_path = tmp_path.as_uri() monkeypatch.setenv("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", file_path) + _reset_configuration_done() initialize() application_1 = Application("Test App 1") application_2 = Application("Test App 2") From 29fac89fcdcebdca9c350daf8e70368db72999d2 Mon Sep 17 00:00:00 2001 From: Uma Annamalai Date: Fri, 8 Nov 2024 10:51:31 -0800 Subject: [PATCH 06/14] Remove log statement when fleet ID is not found. --- newrelic/config.py | 2 -- newrelic/core/agent.py | 1 - newrelic/core/application.py | 1 - newrelic/core/super_agent_health.py | 1 - 4 files changed, 5 deletions(-) diff --git a/newrelic/config.py b/newrelic/config.py index a3a0b7a88..c3cfeb7f6 100644 --- a/newrelic/config.py +++ b/newrelic/config.py @@ -16,8 +16,6 @@ import fnmatch import logging import os -import uuid -import sched import sys import threading import time diff --git a/newrelic/core/agent.py b/newrelic/core/agent.py index b20bda93d..1b81bb9b4 100644 --- a/newrelic/core/agent.py +++ b/newrelic/core/agent.py @@ -189,7 +189,6 @@ def agent_singleton(): return Agent._instance - def __init__(self, config): """Initialises the agent and attempt to establish a connection to the core application. Will start the harvest loop running but diff --git a/newrelic/core/application.py b/newrelic/core/application.py index 84e451344..d0d8efc36 100644 --- a/newrelic/core/application.py +++ b/newrelic/core/application.py @@ -19,7 +19,6 @@ import logging import os import sys -import uuid import threading import time import traceback diff --git a/newrelic/core/super_agent_health.py b/newrelic/core/super_agent_health.py index 045706e35..ba4cce0d9 100644 --- a/newrelic/core/super_agent_health.py +++ b/newrelic/core/super_agent_health.py @@ -84,7 +84,6 @@ def is_valid_file_delivery_location(file_uri): def health_check_enabled(): fleet_id_present = os.environ.get("NEW_RELIC_SUPERAGENT_FLEET_ID", None) if not fleet_id_present: - _logger.warning("Super Agent fleet ID not found in environment. Health reporting will not be enabled.") return False health_file_location = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", None) From d5cd9c43a875356096ec0c23f329199597719500 Mon Sep 17 00:00:00 2001 From: Uma Annamalai Date: Tue, 3 Dec 2024 13:41:57 -0800 Subject: [PATCH 07/14] Add support testing and support for new status codes. --- newrelic/config.py | 1 + newrelic/console.py | 3 ++ newrelic/core/agent_protocol.py | 4 +- newrelic/core/application.py | 13 +++-- newrelic/core/super_agent_health.py | 51 ++++++++++++------- .../test_super_agent_health_check.py | 43 ++++++++++++++-- tests/agent_unittests/test_agent_protocol.py | 13 +++-- 7 files changed, 96 insertions(+), 32 deletions(-) diff --git a/newrelic/config.py b/newrelic/config.py index c3cfeb7f6..0942ac7ed 100644 --- a/newrelic/config.py +++ b/newrelic/config.py @@ -1052,6 +1052,7 @@ def _load_configuration( raise newrelic.api.exceptions.ConfigurationError("New Relic configuration not found in TOML file.") _config_object.read_dict(_toml_config_to_configparser_dict(newrelic_section)) elif not _config_object.read([config_file]): + super_agent_instance.set_health_status("invalid_config") raise newrelic.api.exceptions.ConfigurationError(f"Unable to open configuration file {config_file}.") _settings.config_file = config_file diff --git a/newrelic/console.py b/newrelic/console.py index 2e527dd9e..3b80d3033 100644 --- a/newrelic/console.py +++ b/newrelic/console.py @@ -36,6 +36,7 @@ from newrelic.common.object_wrapper import ObjectProxy from newrelic.core.agent import agent_instance from newrelic.core.config import flatten_settings, global_settings +from newrelic.core.super_agent_health import super_agent_health_instance from newrelic.core.trace_cache import trace_cache @@ -512,6 +513,8 @@ def __init__(self, config_file, stdin=None, stdout=None, log=None): self.__log_object = log if not self.__config_object.read([config_file]): + super_agent_instance = super_agent_health_instance() + super_agent_instance.set_health_status("invalid_config") raise RuntimeError(f"Unable to open configuration file {config_file}.") listener_socket = self.__config_object.get("newrelic", "console.listener_socket") % {"pid": "*"} diff --git a/newrelic/core/agent_protocol.py b/newrelic/core/agent_protocol.py index a432741f4..f7c0522cd 100644 --- a/newrelic/core/agent_protocol.py +++ b/newrelic/core/agent_protocol.py @@ -281,10 +281,12 @@ def send( "agent_run_id": self._run_token, }, ) + exception = self.STATUS_CODE_RESPONSE.get(status, DiscardDataForRequest) raise exception if status == 200: - self.super_agent.check_for_healthy_status() + # Check if we previously had a protocol related error and update to a healthy status + self.super_agent.update_to_healthy_agent_protocol_status(protocol_error=True) return self.decode_response(data) def decode_response(self, response): diff --git a/newrelic/core/application.py b/newrelic/core/application.py index d0d8efc36..8b04633db 100644 --- a/newrelic/core/application.py +++ b/newrelic/core/application.py @@ -41,7 +41,6 @@ from newrelic.core.profile_sessions import profile_session_manager from newrelic.core.rules_engine import RulesEngine, SegmentCollapseEngine from newrelic.core.stats_engine import CustomMetrics, StatsEngine -from newrelic.core.super_agent_health import super_agent_health_instance from newrelic.network.exceptions import ( DiscardDataForRequest, ForceAgentDisconnect, @@ -50,7 +49,7 @@ RetryDataForRequest, ) from newrelic.samplers.data_sampler import DataSampler -from newrelic.core.super_agent_health import super_agent_healthcheck_loop, health_check_enabled +from newrelic.core.super_agent_health import super_agent_healthcheck_loop, health_check_enabled, super_agent_health_instance _logger = logging.getLogger(__name__) @@ -370,6 +369,7 @@ def connect_to_data_collector(self, activate_agent): None, self._app_name, self.linked_applications, environment_settings() ) except ForceAgentDisconnect: + self._super_agent.set_health_status("failed_nr_connection") # Any disconnect exception means we should stop trying to connect _logger.error( "The New Relic service has requested that the agent " @@ -380,6 +380,7 @@ def connect_to_data_collector(self, activate_agent): ) return except NetworkInterfaceException: + self._super_agent.set_health_status("failed_nr_connection") active_session = None except Exception: # If an exception occurs after agent has been flagged to be @@ -389,6 +390,7 @@ def connect_to_data_collector(self, activate_agent): # the application is still running. if not self._agent_shutdown and not self._pending_shutdown: + self._super_agent.set_health_status("failed_nr_connection") _logger.exception( "Unexpected exception when registering " "agent with the data collector. If this problem " @@ -499,6 +501,8 @@ def connect_to_data_collector(self, activate_agent): # data from a prior agent run for this application. configuration = active_session.configuration + # Check if the agent previously had an unhealthy status related to the data collector and update + self._super_agent.update_to_healthy_agent_protocol_status(collector_error=True) with self._stats_lock: self._stats_engine.reset_stats(configuration, reset_stream=True) @@ -1696,10 +1700,9 @@ def internal_agent_shutdown(self, restart=False): optionally triggers activation of a new session. """ - super_agent = super_agent_health_instance() - super_agent.set_health_status("agent_shutdown") + self._super_agent.set_health_status("agent_shutdown") if health_check_enabled(): - super_agent.write_to_health_file() + self._super_agent.write_to_health_file() # We need to stop any thread profiler session related to this # application. diff --git a/newrelic/core/super_agent_health.py b/newrelic/core/super_agent_health.py index ba4cce0d9..70c39b501 100644 --- a/newrelic/core/super_agent_health.py +++ b/newrelic/core/super_agent_health.py @@ -15,11 +15,12 @@ import logging import os import uuid +import sched import threading import time from pathlib import Path from urllib.parse import urlparse -import sched + _logger = logging.getLogger(__name__) @@ -30,17 +31,19 @@ "missing_license": ("NR-APM-002", "License key missing in configuration"), "forced_disconnect": ("NR-APM-003", "Forced disconnect received from New Relic (HTTP status code 410)"), "http_error": ("NR-APM-004", "HTTP error response code received from New Relic"), - # "max_app_names": ("NR-APM-006", "The maximum number of configured app names (3) exceeded"), "proxy_error": ("NR-APM-007", "HTTP Proxy configuration error"), "agent_disabled": ("NR-APM-008", "Agent is disabled via configuration"), + "failed_nr_connection": ("NR-APM-009", "Failed to connect to New Relic data collector"), + "invalid_config": ("NR-APM-010", "Agent config file is not able to be parsed"), "agent_shutdown": ("NR-APM-099", "Agent has shutdown"), } def is_valid_file_delivery_location(file_uri): + # Verify whether file directory provided to agent via env var is a valid file URI to determine whether health check should run if not file_uri: _logger.warning( - "Configured APM Control health delivery location is empty. APM Control health check will not be enabled." + "Configured NR Control health delivery location is empty. APM Control health check will not be enabled." ) return False @@ -49,15 +52,15 @@ def is_valid_file_delivery_location(file_uri): if not parsed_uri.scheme or not parsed_uri.path: _logger.warning( - "Configured Super Agent health delivery location is not a complete file URI. Super Agent health check " + "Configured NR Control health delivery location is not a complete file URI. Health check " "will not be enabled. " ) return False if parsed_uri.scheme != "file": _logger.warning( - "Configured Super Agent health delivery location does not have a valid scheme. Super Agent health " - "check will not be enabled. " + "Configured NR Control health delivery location does not have a valid scheme. Health check will not be" + "enabled." ) return False @@ -66,8 +69,7 @@ def is_valid_file_delivery_location(file_uri): # Check if the path exists if not path.exists(): _logger.warning( - "Configured Super Agent health delivery location does not exist. Super Agent health check will not be " - "enabled. " + "Configured NR Control health delivery location does not exist. Health check will not be enabled." ) return False @@ -75,8 +77,7 @@ def is_valid_file_delivery_location(file_uri): except Exception as e: _logger.warning( - "Configured Super Agent health delivery location is not valid. Super Agent health check will not be " - "enabled. " + "Configured NR Control health delivery location is not valid. Health check will not be enabled." ) return False @@ -113,6 +114,7 @@ def super_agent_health_singleton(): return SuperAgentHealth._instance def __init__(self): + # Initialize health check with a healthy status that can be updated as issues are encountered self.last_error = "NR-APM-000" self.status = "Healthy" self.start_time_unix_nano = None @@ -138,14 +140,25 @@ def set_health_status(self, health_status, response_code=None, info=None): self.last_error = last_error self.status = current_status - def check_for_healthy_status(self): + def update_to_healthy_agent_protocol_status(self, protocol_error=False, collector_error=False): # If our unhealthy status code was not config related, it is possible it could be resolved during an active - # session. We determine the status is resolved by calling this function when a 200 status code is received to - # check if the current status is resolvable - - # Checking for forced disconnects or proxy/ HTTP errors - non_config_error_codes = frozenset(["NR-APM-003", "NR-APM-004", "NR-APM-007"]) - if self.last_error in non_config_error_codes: + # session. This function allows us to update to a healthy status if so + + if not protocol_error and not collector_error: + return + + # For protocol errors, we determine the status is resolved by calling this function when a 200 status code is + # received to check if the current status is resolvable + if protocol_error: + error_codes = frozenset(["NR-APM-003", "NR-APM-004", "NR-APM-007"]) + # Also check if we had a failure connecting to NR as this could resolve itself if the collector becomes + # reachable during an active session + if collector_error: + error_codes = frozenset(["NR-APM-009"]) + + # Since this function is only called when we are in scenario where the agent functioned as expected, we check to + # see if the previous status was unhealthy so we know to update it + if self.last_error in error_codes: self.last_error = "NR-APM-000" self.status = "Healthy" @@ -157,9 +170,9 @@ def write_to_health_file(self): health_file_location = str(health_file_location) file_path = urlparse(health_file_location).path pid = os.getpid() - file_ids = self.get_file_id(pid) + file_id = self.get_file_id(pid) - file_name = f"health-{file_ids}.yml" + file_name = f"health-{file_id}.yml" full_path = os.path.join(file_path, file_name) try: diff --git a/tests/agent_features/test_super_agent_health_check.py b/tests/agent_features/test_super_agent_health_check.py index a7591526e..b73efa8f5 100644 --- a/tests/agent_features/test_super_agent_health_check.py +++ b/tests/agent_features/test_super_agent_health_check.py @@ -44,7 +44,7 @@ def test_write_to_file_healthy_status(monkeypatch, tmp_path): # Grab the file we just wrote to and read its contents health_files = os.listdir(tmp_path) path_to_written_file = f"{tmp_path}/{health_files[0]}" - with open(path_to_written_file, 'r') as f: + with open(path_to_written_file, "r") as f: contents = f.readlines() # Assert on contents of health file @@ -70,7 +70,7 @@ def test_write_to_file_unhealthy_status(monkeypatch, tmp_path): # Grab the file we just wrote to and read its contents health_files = os.listdir(tmp_path) path_to_written_file = f"{tmp_path}/{health_files[0]}" - with open(path_to_written_file, 'r') as f: + with open(path_to_written_file, "r") as f: contents = f.readlines() # Assert on contents of health file @@ -99,7 +99,7 @@ def test_no_override_on_unhealthy_shutdown(monkeypatch, tmp_path): # Grab the file we just wrote to and read its contents health_files = os.listdir(tmp_path) path_to_written_file = f"{tmp_path}/{health_files[0]}" - with open(path_to_written_file, 'r') as f: + with open(path_to_written_file, "r") as f: contents = f.readlines() # Assert on contents of health file @@ -154,7 +154,7 @@ def test_proxy_error_status(monkeypatch, tmp_path): # Grab the file we just wrote to and read its contents health_files = os.listdir(tmp_path) path_to_written_file = f"{tmp_path}/{health_files[0]}" - with open(path_to_written_file, 'r') as f: + with open(path_to_written_file, "r") as f: contents = f.readlines() # Assert on contents of health file @@ -164,6 +164,41 @@ def test_proxy_error_status(monkeypatch, tmp_path): assert contents[4] == "last_error: NR-APM-007\n" +def test_update_to_healthy(monkeypatch, tmp_path): + # Setup expected env vars to run super agent health check + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_FLEET_ID", "1234") + file_path = tmp_path.as_uri() + monkeypatch.setenv("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", file_path) + + _reset_configuration_done() + # Write to health YAML file + super_agent_instance = super_agent_health_instance() + super_agent_instance.start_time_unix_nano = "1234567890" + super_agent_instance.set_health_status("forced_disconnect") + + # Mock a 407 error to generate a proxy error health status + HttpClientRecorder.STATUS_CODE = 200 + settings = finalize_application_settings( + { + "license_key": "123LICENSEKEY", + } + ) + protocol = AgentProtocol(settings, client_cls=HttpClientRecorder) + protocol.send("analytic_event_data") + + super_agent_instance.write_to_health_file() + + # Grab the file we just wrote to and read its contents + health_files = os.listdir(tmp_path) + path_to_written_file = f"{tmp_path}/{health_files[0]}" + with open(path_to_written_file, "r") as f: + contents = f.readlines() + + # Assert on contents of health file + assert contents[0] == "healthy: True\n" + assert contents[1] == "status: Healthy\n" + + def test_multiple_activations_running_threads(monkeypatch, tmp_path): # Setup expected env vars to run super agent health check monkeypatch.setenv("NEW_RELIC_SUPERAGENT_FLEET_ID", "1234") diff --git a/tests/agent_unittests/test_agent_protocol.py b/tests/agent_unittests/test_agent_protocol.py index 839e479bf..49d62f2a1 100644 --- a/tests/agent_unittests/test_agent_protocol.py +++ b/tests/agent_unittests/test_agent_protocol.py @@ -22,7 +22,7 @@ from newrelic.common import certs, system_info from newrelic.common.agent_http import DeveloperModeClient -from newrelic.common.encoding_utils import json_decode, serverless_payload_decode +from newrelic.common.encoding_utils import json_decode, json_encode, serverless_payload_decode from newrelic.common.utilization import CommonUtilization from newrelic.core.agent_protocol import AgentProtocol, ServerlessModeProtocol from newrelic.core.config import finalize_application_settings, global_settings @@ -80,6 +80,11 @@ def send_request( request = Request(method=method, path=path, params=params, headers=headers, payload=payload) self.SENT.append(request) if self.STATUS_CODE: + # Define behavior for a 200 status code for use in test_super_agent_health.py + if self.STATUS_CODE == 200: + payload = {"return_value": "Hello World!"} + response_data = json_encode(payload).encode("utf-8") + return self.STATUS_CODE, response_data return self.STATUS_CODE, b"" return super(HttpClientRecorder, self).send_request(method, path, params, headers, payload) @@ -332,7 +337,9 @@ def connect_payload_asserts( else: assert "ip_address" not in payload_data["utilization"] - utilization_len = utilization_len + any([with_aws, with_ecs, with_pcf, with_gcp, with_azure, with_docker, with_kubernetes]) + utilization_len = utilization_len + any( + [with_aws, with_ecs, with_pcf, with_gcp, with_azure, with_docker, with_kubernetes] + ) assert len(payload_data["utilization"]) == utilization_len assert payload_data["utilization"]["hostname"] == HOST @@ -580,7 +587,7 @@ def test_audit_logging(): ) def test_ca_bundle_path(monkeypatch, ca_bundle_path): # Pretend CA certificates are not available - class DefaultVerifyPaths(): + class DefaultVerifyPaths: cafile = None capath = None From 6eb724bcd45c27de3890bebbd1e467d27842f43f Mon Sep 17 00:00:00 2001 From: Uma Annamalai Date: Tue, 3 Dec 2024 20:22:54 -0800 Subject: [PATCH 08/14] Update thread names to use NR-Control. --- newrelic/config.py | 2 +- tests/agent_features/test_super_agent_health_check.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/newrelic/config.py b/newrelic/config.py index 0942ac7ed..47273a684 100644 --- a/newrelic/config.py +++ b/newrelic/config.py @@ -4821,7 +4821,7 @@ def _setup_agent_console(): newrelic.core.agent.Agent.run_on_startup(_startup_agent_console) -super_agent_health_thread = threading.Thread(name="APM-Control-Health-Main-Thread", target=super_agent_healthcheck_loop) +super_agent_health_thread = threading.Thread(name="NR-Control-Health-Main-Thread", target=super_agent_healthcheck_loop) super_agent_health_thread.daemon = True diff --git a/tests/agent_features/test_super_agent_health_check.py b/tests/agent_features/test_super_agent_health_check.py index b73efa8f5..a45b2eb67 100644 --- a/tests/agent_features/test_super_agent_health_check.py +++ b/tests/agent_features/test_super_agent_health_check.py @@ -124,7 +124,7 @@ def test_health_check_running_threads(monkeypatch, tmp_path): running_threads = threading.enumerate() assert len(running_threads) == 2 - assert running_threads[1].name == "APM-Control-Health-Main-Thread" + assert running_threads[1].name == "NR-Control-Health-Main-Thread" def test_proxy_error_status(monkeypatch, tmp_path): @@ -216,6 +216,6 @@ def test_multiple_activations_running_threads(monkeypatch, tmp_path): running_threads = threading.enumerate() assert len(running_threads) == 6 - assert running_threads[1].name == "APM-Control-Health-Main-Thread" - assert running_threads[2].name == "APM-Control-Health-Session-Thread" - assert running_threads[4].name == "APM-Control-Health-Session-Thread" + assert running_threads[1].name == "NR-Control-Health-Main-Thread" + assert running_threads[2].name == "NR-Control-Health-Session-Thread" + assert running_threads[4].name == "NR-Control-Health-Session-Thread" From d62ecba7a1d1ab4b33e9c151b09b831c273527c1 Mon Sep 17 00:00:00 2001 From: Uma Annamalai Date: Thu, 5 Dec 2024 13:28:37 -0800 Subject: [PATCH 09/14] Refactoring. --- newrelic/config.py | 4 +- newrelic/core/agent_protocol.py | 3 +- newrelic/core/application.py | 8 +-- newrelic/core/super_agent_health.py | 52 ++++++++++--------- .../test_super_agent_health_check.py | 46 +++++++--------- 5 files changed, 55 insertions(+), 58 deletions(-) diff --git a/newrelic/config.py b/newrelic/config.py index 47273a684..c840d6466 100644 --- a/newrelic/config.py +++ b/newrelic/config.py @@ -48,7 +48,7 @@ default_host, fetch_config_setting, ) -from newrelic.core.super_agent_health import super_agent_health_instance, super_agent_healthcheck_loop, health_check_enabled +from newrelic.core.super_agent_health import super_agent_health_instance, super_agent_healthcheck_loop __all__ = ["initialize", "filter_app_factory"] @@ -4829,7 +4829,7 @@ def _setup_super_agent_health(): if super_agent_health_thread.is_alive(): return - if health_check_enabled(): + if super_agent_instance.health_check_enabled: super_agent_health_thread.start() diff --git a/newrelic/core/agent_protocol.py b/newrelic/core/agent_protocol.py index f7c0522cd..724f7688d 100644 --- a/newrelic/core/agent_protocol.py +++ b/newrelic/core/agent_protocol.py @@ -213,6 +213,7 @@ def __init__(self, settings, host=None, client_cls=ApplicationModeClient): self.configuration = settings self.super_agent = super_agent_health_instance() + def __enter__(self): self.client.__enter__() return self @@ -286,7 +287,7 @@ def send( raise exception if status == 200: # Check if we previously had a protocol related error and update to a healthy status - self.super_agent.update_to_healthy_agent_protocol_status(protocol_error=True) + self.super_agent.update_to_healthy_status(protocol_error=True) return self.decode_response(data) def decode_response(self, response): diff --git a/newrelic/core/application.py b/newrelic/core/application.py index 8b04633db..a78f5360c 100644 --- a/newrelic/core/application.py +++ b/newrelic/core/application.py @@ -49,7 +49,7 @@ RetryDataForRequest, ) from newrelic.samplers.data_sampler import DataSampler -from newrelic.core.super_agent_health import super_agent_healthcheck_loop, health_check_enabled, super_agent_health_instance +from newrelic.core.super_agent_health import super_agent_healthcheck_loop, super_agent_health_instance _logger = logging.getLogger(__name__) @@ -210,7 +210,7 @@ def activate_session(self, activate_agent=None, timeout=0.0): if self._active_session: return - if health_check_enabled() and not self._super_agent_health_thread.is_alive(): + if self._super_agent.health_check_enabled and not self._super_agent_health_thread.is_alive(): self._super_agent_health_thread.start() self._process_id = os.getpid() @@ -502,7 +502,7 @@ def connect_to_data_collector(self, activate_agent): configuration = active_session.configuration # Check if the agent previously had an unhealthy status related to the data collector and update - self._super_agent.update_to_healthy_agent_protocol_status(collector_error=True) + self._super_agent.update_to_healthy_status(collector_error=True) with self._stats_lock: self._stats_engine.reset_stats(configuration, reset_stream=True) @@ -1701,7 +1701,7 @@ def internal_agent_shutdown(self, restart=False): """ self._super_agent.set_health_status("agent_shutdown") - if health_check_enabled(): + if self._super_agent.health_check_enabled: self._super_agent.write_to_health_file() # We need to stop any thread profiler session related to this diff --git a/newrelic/core/super_agent_health.py b/newrelic/core/super_agent_health.py index 70c39b501..22664fb1d 100644 --- a/newrelic/core/super_agent_health.py +++ b/newrelic/core/super_agent_health.py @@ -14,10 +14,10 @@ import logging import os -import uuid import sched import threading import time +import uuid from pathlib import Path from urllib.parse import urlparse @@ -40,10 +40,11 @@ def is_valid_file_delivery_location(file_uri): - # Verify whether file directory provided to agent via env var is a valid file URI to determine whether health check should run + # Verify whether file directory provided to agent via env var is a valid file URI to determine whether health + # check should run if not file_uri: _logger.warning( - "Configured NR Control health delivery location is empty. APM Control health check will not be enabled." + "Configured NR Control health delivery location is empty. Health check will not be enabled." ) return False @@ -52,8 +53,8 @@ def is_valid_file_delivery_location(file_uri): if not parsed_uri.scheme or not parsed_uri.path: _logger.warning( - "Configured NR Control health delivery location is not a complete file URI. Health check " - "will not be enabled. " + "Configured NR Control health delivery location is not a complete file URI. Health check will not be" + "enabled. " ) return False @@ -82,19 +83,6 @@ def is_valid_file_delivery_location(file_uri): return False -def health_check_enabled(): - fleet_id_present = os.environ.get("NEW_RELIC_SUPERAGENT_FLEET_ID", None) - if not fleet_id_present: - return False - - health_file_location = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", None) - valid_file_location = is_valid_file_delivery_location(health_file_location) - if not valid_file_location: - return False - - return True - - class SuperAgentHealth: _instance_lock = threading.Lock() _instance = None @@ -120,9 +108,21 @@ def __init__(self): self.start_time_unix_nano = None self.pid_file_id_map = {} + @property + def health_check_enabled(self): + fleet_id_present = os.environ.get("NEW_RELIC_SUPERAGENT_FLEET_ID", None) + if not fleet_id_present: + return False + + health_file_location = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", None) + valid_file_location = is_valid_file_delivery_location(health_file_location) + if not valid_file_location: + return False + + return True + def set_health_status(self, health_status, response_code=None, info=None): last_error, current_status = HEALTH_CHECK_STATUSES[health_status] - # Update status messages to be more descriptive if necessary data is present if health_status == "http_error" and response_code and info: current_status = ( @@ -132,15 +132,19 @@ def set_health_status(self, health_status, response_code=None, info=None): if health_status == "proxy_error" and response_code: current_status = f"HTTP Proxy configuration error; response code {response_code}" + + license_key_error = True if self.status == "Invalid license key (HTTP status code 401)" or "License key missing in configuration" else False + + if health_status == "failed_nr_connection" and license_key_error: + pass # Do not override status with agent_shutdown unless the agent was previously healthy - if health_status == "agent_shutdown" and self.status != "Healthy": + elif health_status == "agent_shutdown" and self.status != "Healthy": pass - else: self.last_error = last_error self.status = current_status - def update_to_healthy_agent_protocol_status(self, protocol_error=False, collector_error=False): + def update_to_healthy_status(self, protocol_error=False, collector_error=False): # If our unhealthy status code was not config related, it is possible it could be resolved during an active # session. This function allows us to update to a healthy status if so @@ -191,9 +195,7 @@ def get_file_id(self, pid): file_id = str(uuid.uuid4()).replace("-", "") # Map the UUID to the process ID to ensure each agent instance has one UUID associated with it - if pid in self.pid_file_id_map: - pass - else: + if pid not in self.pid_file_id_map: self.pid_file_id_map[pid] = file_id return self.pid_file_id_map[pid] diff --git a/tests/agent_features/test_super_agent_health_check.py b/tests/agent_features/test_super_agent_health_check.py index a45b2eb67..5599dc62a 100644 --- a/tests/agent_features/test_super_agent_health_check.py +++ b/tests/agent_features/test_super_agent_health_check.py @@ -25,6 +25,15 @@ from newrelic.network.exceptions import DiscardDataForRequest +def get_health_file_contents(tmp_path): + # Grab the file we just wrote to and read its contents + health_files = os.listdir(tmp_path) + path_to_written_file = f"{tmp_path}/{health_files[0]}" + with open(path_to_written_file, "r") as f: + contents = f.readlines() + return contents + + @pytest.mark.parametrize("file_uri", ["", "file://", "/test/dir", "foo:/test/dir"]) def test_invalid_file_directory_supplied(file_uri): assert is_valid_file_delivery_location(file_uri) is False @@ -41,11 +50,7 @@ def test_write_to_file_healthy_status(monkeypatch, tmp_path): super_agent_instance.start_time_unix_nano = "1234567890" super_agent_instance.write_to_health_file() - # Grab the file we just wrote to and read its contents - health_files = os.listdir(tmp_path) - path_to_written_file = f"{tmp_path}/{health_files[0]}" - with open(path_to_written_file, "r") as f: - contents = f.readlines() + contents = get_health_file_contents(tmp_path) # Assert on contents of health file assert len(contents) == 4 @@ -65,13 +70,10 @@ def test_write_to_file_unhealthy_status(monkeypatch, tmp_path): super_agent_instance = super_agent_health_instance() super_agent_instance.start_time_unix_nano = "1234567890" super_agent_instance.set_health_status("invalid_license") + super_agent_instance.write_to_health_file() - # Grab the file we just wrote to and read its contents - health_files = os.listdir(tmp_path) - path_to_written_file = f"{tmp_path}/{health_files[0]}" - with open(path_to_written_file, "r") as f: - contents = f.readlines() + contents = get_health_file_contents(tmp_path) # Assert on contents of health file assert len(contents) == 5 @@ -92,15 +94,12 @@ def test_no_override_on_unhealthy_shutdown(monkeypatch, tmp_path): super_agent_instance = super_agent_health_instance() super_agent_instance.start_time_unix_nano = "1234567890" super_agent_instance.set_health_status("invalid_license") + # Attempt to override a previously unhealthy status super_agent_instance.set_health_status("agent_shutdown") super_agent_instance.write_to_health_file() - # Grab the file we just wrote to and read its contents - health_files = os.listdir(tmp_path) - path_to_written_file = f"{tmp_path}/{health_files[0]}" - with open(path_to_written_file, "r") as f: - contents = f.readlines() + contents = get_health_file_contents(tmp_path) # Assert on contents of health file assert len(contents) == 5 @@ -121,6 +120,7 @@ def test_health_check_running_threads(monkeypatch, tmp_path): # Re-initialize the agent to allow the health check thread to start and assert that it did _reset_configuration_done() initialize() + running_threads = threading.enumerate() assert len(running_threads) == 2 @@ -151,11 +151,7 @@ def test_proxy_error_status(monkeypatch, tmp_path): # Give time for the scheduler to kick in and write to the health file time.sleep(5) - # Grab the file we just wrote to and read its contents - health_files = os.listdir(tmp_path) - path_to_written_file = f"{tmp_path}/{health_files[0]}" - with open(path_to_written_file, "r") as f: - contents = f.readlines() + contents = get_health_file_contents(tmp_path) # Assert on contents of health file assert len(contents) == 5 @@ -171,12 +167,13 @@ def test_update_to_healthy(monkeypatch, tmp_path): monkeypatch.setenv("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", file_path) _reset_configuration_done() + # Write to health YAML file super_agent_instance = super_agent_health_instance() super_agent_instance.start_time_unix_nano = "1234567890" super_agent_instance.set_health_status("forced_disconnect") - # Mock a 407 error to generate a proxy error health status + # Send a successful data batch to enable health status to update to "healthy" HttpClientRecorder.STATUS_CODE = 200 settings = finalize_application_settings( { @@ -188,11 +185,7 @@ def test_update_to_healthy(monkeypatch, tmp_path): super_agent_instance.write_to_health_file() - # Grab the file we just wrote to and read its contents - health_files = os.listdir(tmp_path) - path_to_written_file = f"{tmp_path}/{health_files[0]}" - with open(path_to_written_file, "r") as f: - contents = f.readlines() + contents = get_health_file_contents(tmp_path) # Assert on contents of health file assert contents[0] == "healthy: True\n" @@ -207,6 +200,7 @@ def test_multiple_activations_running_threads(monkeypatch, tmp_path): _reset_configuration_done() initialize() + application_1 = Application("Test App 1") application_2 = Application("Test App 2") From 922b69cc961faae6e1b61968af9142af41bec3a9 Mon Sep 17 00:00:00 2001 From: Uma Annamalai Date: Wed, 11 Dec 2024 09:59:25 -0800 Subject: [PATCH 10/14] Address review feedback. --- newrelic/config.py | 47 ++++---- newrelic/console.py | 4 +- newrelic/core/agent_protocol.py | 12 +- newrelic/core/application.py | 10 +- newrelic/core/super_agent_health.py | 109 ++++++++++-------- .../test_super_agent_health_check.py | 20 ++-- tests/agent_unittests/test_agent_protocol.py | 41 +------ tests/testing_support/http_client_recorder.py | 40 +++++++ 8 files changed, 152 insertions(+), 131 deletions(-) create mode 100644 tests/testing_support/http_client_recorder.py diff --git a/newrelic/config.py b/newrelic/config.py index c840d6466..51e6e00c8 100644 --- a/newrelic/config.py +++ b/newrelic/config.py @@ -48,7 +48,7 @@ default_host, fetch_config_setting, ) -from newrelic.core.super_agent_health import super_agent_health_instance, super_agent_healthcheck_loop +from newrelic.core.super_agent_health import HealthStatus, super_agent_health_instance, super_agent_healthcheck_loop __all__ = ["initialize", "filter_app_factory"] @@ -104,7 +104,7 @@ def _map_aws_account_id(s): # all the settings have been read. _cache_object = [] -super_agent_instance = super_agent_health_instance() +super_agent_health = super_agent_health_instance() def _reset_config_parser(): @@ -1038,22 +1038,25 @@ def _load_configuration( # Now read in the configuration file. Cache the config file # name in internal settings object as indication of succeeding. - if config_file.endswith(".toml"): - try: - import tomllib - except ImportError: - raise newrelic.api.exceptions.ConfigurationError( - "TOML configuration file can only be used if tomllib is available (Python 3.11+)." - ) - with open(config_file, "rb") as f: - content = tomllib.load(f) - newrelic_section = content.get("tool", {}).get("newrelic") - if not newrelic_section: - raise newrelic.api.exceptions.ConfigurationError("New Relic configuration not found in TOML file.") - _config_object.read_dict(_toml_config_to_configparser_dict(newrelic_section)) - elif not _config_object.read([config_file]): - super_agent_instance.set_health_status("invalid_config") - raise newrelic.api.exceptions.ConfigurationError(f"Unable to open configuration file {config_file}.") + try: + if config_file.endswith(".toml"): + try: + import tomllib + except ImportError: + raise newrelic.api.exceptions.ConfigurationError( + "TOML configuration file can only be used if tomllib is available (Python 3.11+)." + ) + with open(config_file, "rb") as f: + content = tomllib.load(f) + newrelic_section = content.get("tool", {}).get("newrelic") + if not newrelic_section: + raise newrelic.api.exceptions.ConfigurationError("New Relic configuration not found in TOML file.") + _config_object.read_dict(_toml_config_to_configparser_dict(newrelic_section)) + elif not _config_object.read([config_file]): + raise newrelic.api.exceptions.ConfigurationError(f"Unable to open configuration file {config_file}.") + except Exception: + super_agent_health.set_health_status(HealthStatus.INVALID_CONFIG.value) + raise _settings.config_file = config_file @@ -4829,7 +4832,7 @@ def _setup_super_agent_health(): if super_agent_health_thread.is_alive(): return - if super_agent_instance.health_check_enabled: + if super_agent_health.health_check_enabled: super_agent_health_thread.start() @@ -4840,7 +4843,7 @@ def initialize( log_file=None, log_level=None, ): - super_agent_instance.start_time_unix_nano = time.time_ns() + super_agent_health.start_time_unix_nano = time.time_ns() if config_file is None: config_file = os.environ.get("NEW_RELIC_CONFIG_FILE", None) @@ -4857,7 +4860,7 @@ def initialize( if _settings.monitor_mode: if not _settings.license_key: - super_agent_instance.set_health_status("missing_license") + super_agent_health.set_health_status(HealthStatus.MISSING_LICENSE.value) if _settings.monitor_mode or _settings.developer_mode: _settings.enabled = True @@ -4867,7 +4870,7 @@ def initialize( _setup_agent_console() else: _settings.enabled = False - super_agent_instance.set_health_status("agent_disabled") + super_agent_health.set_health_status(HealthStatus.AGENT_DISABLED.value) def filter_app_factory(app, global_conf, config_file, environment=None): diff --git a/newrelic/console.py b/newrelic/console.py index 3b80d3033..f7f7de506 100644 --- a/newrelic/console.py +++ b/newrelic/console.py @@ -36,7 +36,7 @@ from newrelic.common.object_wrapper import ObjectProxy from newrelic.core.agent import agent_instance from newrelic.core.config import flatten_settings, global_settings -from newrelic.core.super_agent_health import super_agent_health_instance +from newrelic.core.super_agent_health import HealthStatus, super_agent_health_instance from newrelic.core.trace_cache import trace_cache @@ -514,7 +514,7 @@ def __init__(self, config_file, stdin=None, stdout=None, log=None): if not self.__config_object.read([config_file]): super_agent_instance = super_agent_health_instance() - super_agent_instance.set_health_status("invalid_config") + super_agent_instance.set_health_status(HealthStatus.INVALID_CONFIG.value) raise RuntimeError(f"Unable to open configuration file {config_file}.") listener_socket = self.__config_object.get("newrelic", "console.listener_socket") % {"pid": "*"} diff --git a/newrelic/core/agent_protocol.py b/newrelic/core/agent_protocol.py index 724f7688d..2984144b0 100644 --- a/newrelic/core/agent_protocol.py +++ b/newrelic/core/agent_protocol.py @@ -47,7 +47,7 @@ NetworkInterfaceException, RetryDataForRequest, ) -from newrelic.core.super_agent_health import super_agent_health_instance +from newrelic.core.super_agent_health import HealthStatus, super_agent_health_instance _logger = logging.getLogger(__name__) @@ -251,21 +251,21 @@ def send( # initialize function doesn't get overridden with invalid_license as a missing license key is also # treated as a 401 status code if not self._license_key: - self.super_agent.set_health_status("missing_license") + self.super_agent.set_health_status(HealthStatus.MISSING_LICENSE.value) else: - self.super_agent.set_health_status("invalid_license") + self.super_agent.set_health_status(HealthStatus.INVALID_LICENSE.value) if status == 407: - self.super_agent.set_health_status("proxy_error", status) + self.super_agent.set_health_status(HealthStatus.PROXY_ERROR.value, status) if status == 410: - self.super_agent.set_health_status("forced_disconnect") + self.super_agent.set_health_status(HealthStatus.FORCED_DISCONNECT.value) level, message = self.LOG_MESSAGES.get(status, self.LOG_MESSAGES["default"]) # If the default error message was used, then we know we have a general HTTP error if message.startswith("Received a non 200 or 202"): - self.super_agent.set_health_status("http_error", status, method) + self.super_agent.set_health_status(HealthStatus.HTTP_ERROR.value, status, method) _logger.log( level, diff --git a/newrelic/core/application.py b/newrelic/core/application.py index a78f5360c..91637bf2d 100644 --- a/newrelic/core/application.py +++ b/newrelic/core/application.py @@ -49,7 +49,7 @@ RetryDataForRequest, ) from newrelic.samplers.data_sampler import DataSampler -from newrelic.core.super_agent_health import super_agent_healthcheck_loop, super_agent_health_instance +from newrelic.core.super_agent_health import HealthStatus, super_agent_healthcheck_loop, super_agent_health_instance _logger = logging.getLogger(__name__) @@ -369,7 +369,7 @@ def connect_to_data_collector(self, activate_agent): None, self._app_name, self.linked_applications, environment_settings() ) except ForceAgentDisconnect: - self._super_agent.set_health_status("failed_nr_connection") + self._super_agent.set_health_status(HealthStatus.FAILED_NR_CONNECTION.value) # Any disconnect exception means we should stop trying to connect _logger.error( "The New Relic service has requested that the agent " @@ -380,7 +380,7 @@ def connect_to_data_collector(self, activate_agent): ) return except NetworkInterfaceException: - self._super_agent.set_health_status("failed_nr_connection") + self._super_agent.set_health_status(HealthStatus.FAILED_NR_CONNECTION.value) active_session = None except Exception: # If an exception occurs after agent has been flagged to be @@ -390,7 +390,7 @@ def connect_to_data_collector(self, activate_agent): # the application is still running. if not self._agent_shutdown and not self._pending_shutdown: - self._super_agent.set_health_status("failed_nr_connection") + self._super_agent.set_health_status(HealthStatus.FAILED_NR_CONNECTION.value) _logger.exception( "Unexpected exception when registering " "agent with the data collector. If this problem " @@ -1700,7 +1700,7 @@ def internal_agent_shutdown(self, restart=False): optionally triggers activation of a new session. """ - self._super_agent.set_health_status("agent_shutdown") + self._super_agent.set_health_status(HealthStatus.AGENT_SHUTDOWN.value) if self._super_agent.health_check_enabled: self._super_agent.write_to_health_file() diff --git a/newrelic/core/super_agent_health.py b/newrelic/core/super_agent_health.py index 22664fb1d..924c243e2 100644 --- a/newrelic/core/super_agent_health.py +++ b/newrelic/core/super_agent_health.py @@ -18,6 +18,7 @@ import threading import time import uuid +from enum import Enum from pathlib import Path from urllib.parse import urlparse @@ -25,19 +26,35 @@ _logger = logging.getLogger(__name__) +class HealthStatus(Enum): + HEALTHY = 1 + INVALID_LICENSE = 2 + MISSING_LICENSE = 3 + FORCED_DISCONNECT = 4 + HTTP_ERROR = 5 + PROXY_ERROR = 6 + AGENT_DISABLED = 7 + FAILED_NR_CONNECTION = 8 + INVALID_CONFIG = 9 + AGENT_SHUTDOWN = 10 + +# Set enum integer values as dict keys to reduce performance impact of string copies HEALTH_CHECK_STATUSES = { - "healthy": ("NR-APM-000", "Healthy"), - "invalid_license": ("NR-APM-001", "Invalid license key (HTTP status code 401)"), - "missing_license": ("NR-APM-002", "License key missing in configuration"), - "forced_disconnect": ("NR-APM-003", "Forced disconnect received from New Relic (HTTP status code 410)"), - "http_error": ("NR-APM-004", "HTTP error response code received from New Relic"), - "proxy_error": ("NR-APM-007", "HTTP Proxy configuration error"), - "agent_disabled": ("NR-APM-008", "Agent is disabled via configuration"), - "failed_nr_connection": ("NR-APM-009", "Failed to connect to New Relic data collector"), - "invalid_config": ("NR-APM-010", "Agent config file is not able to be parsed"), - "agent_shutdown": ("NR-APM-099", "Agent has shutdown"), + HealthStatus.HEALTHY.value: ("NR-APM-000", "Healthy"), + HealthStatus.INVALID_LICENSE.value: ("NR-APM-001", "Invalid license key (HTTP status code 401)"), + HealthStatus.MISSING_LICENSE.value: ("NR-APM-002", "License key missing in configuration"), + HealthStatus.FORCED_DISCONNECT.value: ("NR-APM-003", "Forced disconnect received from New Relic (HTTP status code 410)"), + HealthStatus.HTTP_ERROR.value: ("NR-APM-004", "HTTP error response code received from New Relic"), + HealthStatus.PROXY_ERROR.value: ("NR-APM-007", "HTTP Proxy configuration error"), + HealthStatus.AGENT_DISABLED.value: ("NR-APM-008", "Agent is disabled via configuration"), + HealthStatus.FAILED_NR_CONNECTION.value: ("NR-APM-009", "Failed to connect to New Relic data collector"), + HealthStatus.INVALID_CONFIG.value: ("NR-APM-010", "Agent config file is not able to be parsed"), + HealthStatus.AGENT_SHUTDOWN.value: ("NR-APM-099", "Agent has shutdown"), } +PROTOCOL_ERROR_CODES = frozenset(["NR-APM-003", "NR-APM-004", "NR-APM-007"]) +COLLECTOR_ERROR_CODES = frozenset(["NR-APM-009"]) + def is_valid_file_delivery_location(file_uri): # Verify whether file directory provided to agent via env var is a valid file URI to determine whether health @@ -115,88 +132,82 @@ def health_check_enabled(self): return False health_file_location = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", None) - valid_file_location = is_valid_file_delivery_location(health_file_location) - if not valid_file_location: - return False - return True + return is_valid_file_delivery_location(health_file_location) + + @property + def is_healthy(self): + return True if self.status == "Healthy" else False def set_health_status(self, health_status, response_code=None, info=None): + license_key_error = True if self.status == "Invalid license key (HTTP status code 401)" or "License key missing in configuration" else False + + if health_status == HealthStatus.FAILED_NR_CONNECTION.value and license_key_error: + return + + # Do not override status with agent_shutdown unless the agent was previously healthy + elif health_status == HealthStatus.AGENT_SHUTDOWN.value and self.status != "Healthy": + return + last_error, current_status = HEALTH_CHECK_STATUSES[health_status] + # Update status messages to be more descriptive if necessary data is present - if health_status == "http_error" and response_code and info: + if health_status == HealthStatus.HTTP_ERROR.value and response_code and info: current_status = ( f"HTTP error response code {response_code} received from New Relic while sending data type {info}" ) - if health_status == "proxy_error" and response_code: + if health_status == HealthStatus.PROXY_ERROR.value and response_code: current_status = f"HTTP Proxy configuration error; response code {response_code}" - - license_key_error = True if self.status == "Invalid license key (HTTP status code 401)" or "License key missing in configuration" else False - - if health_status == "failed_nr_connection" and license_key_error: - pass - # Do not override status with agent_shutdown unless the agent was previously healthy - elif health_status == "agent_shutdown" and self.status != "Healthy": - pass - else: - self.last_error = last_error - self.status = current_status + self.last_error = last_error + self.status = current_status def update_to_healthy_status(self, protocol_error=False, collector_error=False): # If our unhealthy status code was not config related, it is possible it could be resolved during an active - # session. This function allows us to update to a healthy status if so - - if not protocol_error and not collector_error: - return - - # For protocol errors, we determine the status is resolved by calling this function when a 200 status code is - # received to check if the current status is resolvable - if protocol_error: - error_codes = frozenset(["NR-APM-003", "NR-APM-004", "NR-APM-007"]) - # Also check if we had a failure connecting to NR as this could resolve itself if the collector becomes - # reachable during an active session - if collector_error: - error_codes = frozenset(["NR-APM-009"]) - + # session. This function allows us to update to a healthy status if so based on the error type # Since this function is only called when we are in scenario where the agent functioned as expected, we check to # see if the previous status was unhealthy so we know to update it - if self.last_error in error_codes: + if protocol_error and self.last_error in PROTOCOL_ERROR_CODES or collector_error and self.last_error in COLLECTOR_ERROR_CODES: self.last_error = "NR-APM-000" self.status = "Healthy" def write_to_health_file(self): - is_healthy = True if self.status == "Healthy" else False status_time_unix_nano = time.time_ns() health_file_location = os.environ.get("NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION", None) - health_file_location = str(health_file_location) + # Additional safeguard though health delivery location contents were initially checked to determine if health + # check should be enabled + if not health_file_location: + return + file_path = urlparse(health_file_location).path - pid = os.getpid() - file_id = self.get_file_id(pid) + file_id = self.get_file_id() file_name = f"health-{file_id}.yml" full_path = os.path.join(file_path, file_name) try: with open(full_path, "w") as f: - f.write(f"healthy: {is_healthy}\n") + f.write(f"healthy: {self.is_healthy}\n") f.write(f"status: {self.status}\n") f.write(f"start_time_unix_nano: {self.start_time_unix_nano}\n") f.write(f"status_time_unix_nano: {status_time_unix_nano}\n") - if not is_healthy: + if not self.is_healthy: f.write(f"last_error: {self.last_error}\n") except: _logger.warning("Unable to write to agent health file.") - def get_file_id(self, pid): + def get_file_id(self): + pid = os.getpid() + # Each file name should have a UUID with hyphens stripped appended to it file_id = str(uuid.uuid4()).replace("-", "") # Map the UUID to the process ID to ensure each agent instance has one UUID associated with it if pid not in self.pid_file_id_map: self.pid_file_id_map[pid] = file_id + return file_id return self.pid_file_id_map[pid] diff --git a/tests/agent_features/test_super_agent_health_check.py b/tests/agent_features/test_super_agent_health_check.py index 5599dc62a..5b400b989 100644 --- a/tests/agent_features/test_super_agent_health_check.py +++ b/tests/agent_features/test_super_agent_health_check.py @@ -13,12 +13,13 @@ # limitations under the License. import os import time +import re import pytest import threading from newrelic.core.config import finalize_application_settings -from agent_unittests.test_agent_protocol import HttpClientRecorder -from newrelic.core.super_agent_health import is_valid_file_delivery_location, super_agent_health_instance +from testing_support.http_client_recorder import HttpClientRecorder +from newrelic.core.super_agent_health import HealthStatus, is_valid_file_delivery_location, super_agent_health_instance from newrelic.config import initialize, _reset_configuration_done from newrelic.core.agent_protocol import AgentProtocol from newrelic.core.application import Application @@ -56,7 +57,7 @@ def test_write_to_file_healthy_status(monkeypatch, tmp_path): assert len(contents) == 4 assert contents[0] == "healthy: True\n" assert contents[1] == "status: Healthy\n" - assert contents[2] == "start_time_unix_nano: 1234567890\n" + assert re.search(r"status_time_unix_nano:\s\d+", contents[3]) is not None assert contents[3].startswith("status_time_unix_nano:") is True @@ -69,7 +70,7 @@ def test_write_to_file_unhealthy_status(monkeypatch, tmp_path): # Write to health YAML file super_agent_instance = super_agent_health_instance() super_agent_instance.start_time_unix_nano = "1234567890" - super_agent_instance.set_health_status("invalid_license") + super_agent_instance.set_health_status(HealthStatus.INVALID_LICENSE.value) super_agent_instance.write_to_health_file() @@ -80,7 +81,7 @@ def test_write_to_file_unhealthy_status(monkeypatch, tmp_path): assert contents[0] == "healthy: False\n" assert contents[1] == "status: Invalid license key (HTTP status code 401)\n" assert contents[2] == "start_time_unix_nano: 1234567890\n" - assert contents[3].startswith("status_time_unix_nano:") is True + assert re.search(r"status_time_unix_nano:\s\d+", contents[3]) is not None assert contents[4] == "last_error: NR-APM-001\n" @@ -93,10 +94,10 @@ def test_no_override_on_unhealthy_shutdown(monkeypatch, tmp_path): # Write to health YAML file super_agent_instance = super_agent_health_instance() super_agent_instance.start_time_unix_nano = "1234567890" - super_agent_instance.set_health_status("invalid_license") + super_agent_instance.set_health_status(HealthStatus.INVALID_LICENSE.value) # Attempt to override a previously unhealthy status - super_agent_instance.set_health_status("agent_shutdown") + super_agent_instance.set_health_status(HealthStatus.AGENT_SHUTDOWN.value) super_agent_instance.write_to_health_file() contents = get_health_file_contents(tmp_path) @@ -123,6 +124,7 @@ def test_health_check_running_threads(monkeypatch, tmp_path): running_threads = threading.enumerate() + # Two expected threads: One main agent thread and one main health thread since we have no additional active sessions assert len(running_threads) == 2 assert running_threads[1].name == "NR-Control-Health-Main-Thread" @@ -171,7 +173,7 @@ def test_update_to_healthy(monkeypatch, tmp_path): # Write to health YAML file super_agent_instance = super_agent_health_instance() super_agent_instance.start_time_unix_nano = "1234567890" - super_agent_instance.set_health_status("forced_disconnect") + super_agent_instance.set_health_status(HealthStatus.FORCED_DISCONNECT.value) # Send a successful data batch to enable health status to update to "healthy" HttpClientRecorder.STATUS_CODE = 200 @@ -209,6 +211,8 @@ def test_multiple_activations_running_threads(monkeypatch, tmp_path): running_threads = threading.enumerate() + # 6 threads expected: One main agent thread, two active session threads, one main health check thread, and two + # active session health threads assert len(running_threads) == 6 assert running_threads[1].name == "NR-Control-Health-Main-Thread" assert running_threads[2].name == "NR-Control-Health-Session-Thread" diff --git a/tests/agent_unittests/test_agent_protocol.py b/tests/agent_unittests/test_agent_protocol.py index 49d62f2a1..c56f3c519 100644 --- a/tests/agent_unittests/test_agent_protocol.py +++ b/tests/agent_unittests/test_agent_protocol.py @@ -16,13 +16,12 @@ import os import ssl import tempfile -from collections import namedtuple import pytest from newrelic.common import certs, system_info from newrelic.common.agent_http import DeveloperModeClient -from newrelic.common.encoding_utils import json_decode, json_encode, serverless_payload_decode +from newrelic.common.encoding_utils import json_decode, serverless_payload_decode from newrelic.common.utilization import CommonUtilization from newrelic.core.agent_protocol import AgentProtocol, ServerlessModeProtocol from newrelic.core.config import finalize_application_settings, global_settings @@ -35,8 +34,7 @@ NetworkInterfaceException, RetryDataForRequest, ) - -Request = namedtuple("Request", ("method", "path", "params", "headers", "payload")) +from testing_support.http_client_recorder import HttpClientRecorder # Global constants used in tests @@ -64,41 +62,6 @@ ERROR_EVENT_DATA = 100 -class HttpClientRecorder(DeveloperModeClient): - SENT = [] - STATUS_CODE = None - STATE = 0 - - def send_request( - self, - method="POST", - path="/agent_listener/invoke_raw_method", - params=None, - headers=None, - payload=None, - ): - request = Request(method=method, path=path, params=params, headers=headers, payload=payload) - self.SENT.append(request) - if self.STATUS_CODE: - # Define behavior for a 200 status code for use in test_super_agent_health.py - if self.STATUS_CODE == 200: - payload = {"return_value": "Hello World!"} - response_data = json_encode(payload).encode("utf-8") - return self.STATUS_CODE, response_data - return self.STATUS_CODE, b"" - - return super(HttpClientRecorder, self).send_request(method, path, params, headers, payload) - - def __enter__(self): - HttpClientRecorder.STATE += 1 - - def __exit__(self, exc, value, tb): - HttpClientRecorder.STATE -= 1 - - def close_connection(self): - HttpClientRecorder.STATE -= 1 - - class HttpClientException(DeveloperModeClient): def send_request( self, diff --git a/tests/testing_support/http_client_recorder.py b/tests/testing_support/http_client_recorder.py new file mode 100644 index 000000000..c3bf189f1 --- /dev/null +++ b/tests/testing_support/http_client_recorder.py @@ -0,0 +1,40 @@ +from collections import namedtuple + +from newrelic.common.agent_http import DeveloperModeClient +from newrelic.common.encoding_utils import json_encode + +Request = namedtuple("Request", ("method", "path", "params", "headers", "payload")) + +class HttpClientRecorder(DeveloperModeClient): + SENT = [] + STATUS_CODE = None + STATE = 0 + + def send_request( + self, + method="POST", + path="/agent_listener/invoke_raw_method", + params=None, + headers=None, + payload=None, + ): + request = Request(method=method, path=path, params=params, headers=headers, payload=payload) + self.SENT.append(request) + if self.STATUS_CODE: + # Define behavior for a 200 status code for use in test_super_agent_health.py + if self.STATUS_CODE == 200: + payload = {"return_value": "Hello World!"} + response_data = json_encode(payload).encode("utf-8") + return self.STATUS_CODE, response_data + return self.STATUS_CODE, b"" + + return super(HttpClientRecorder, self).send_request(method, path, params, headers, payload) + + def __enter__(self): + HttpClientRecorder.STATE += 1 + + def __exit__(self, exc, value, tb): + HttpClientRecorder.STATE -= 1 + + def close_connection(self): + HttpClientRecorder.STATE -= 1 From 155bc17455781ae3dbe0151ecd0956f3ead16d4a Mon Sep 17 00:00:00 2001 From: Uma Annamalai Date: Wed, 11 Dec 2024 22:53:42 -0800 Subject: [PATCH 11/14] Capture urlparse logic in try except. --- newrelic/core/super_agent_health.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/newrelic/core/super_agent_health.py b/newrelic/core/super_agent_health.py index 924c243e2..225534754 100644 --- a/newrelic/core/super_agent_health.py +++ b/newrelic/core/super_agent_health.py @@ -140,7 +140,7 @@ def is_healthy(self): return True if self.status == "Healthy" else False def set_health_status(self, health_status, response_code=None, info=None): - license_key_error = True if self.status == "Invalid license key (HTTP status code 401)" or "License key missing in configuration" else False + license_key_error = True if self.status == "License key missing in configuration" or self.status == "Invalid license key (HTTP status code 401)" else False if health_status == HealthStatus.FAILED_NR_CONNECTION.value and license_key_error: return @@ -181,13 +181,12 @@ def write_to_health_file(self): if not health_file_location: return - file_path = urlparse(health_file_location).path - file_id = self.get_file_id() - - file_name = f"health-{file_id}.yml" - full_path = os.path.join(file_path, file_name) - try: + file_path = urlparse(health_file_location).path + file_id = self.get_file_id() + file_name = f"health-{file_id}.yml" + full_path = os.path.join(file_path, file_name) + with open(full_path, "w") as f: f.write(f"healthy: {self.is_healthy}\n") f.write(f"status: {self.status}\n") From 922a51b6f9557f137098e27678d883a86a5bc40f Mon Sep 17 00:00:00 2001 From: Uma Annamalai Date: Thu, 12 Dec 2024 12:59:20 -0800 Subject: [PATCH 12/14] Add additional shutdown check in shutdown_agent. --- newrelic/core/agent.py | 8 ++++++++ newrelic/core/super_agent_health.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/newrelic/core/agent.py b/newrelic/core/agent.py index 1b81bb9b4..89e27f6f4 100644 --- a/newrelic/core/agent.py +++ b/newrelic/core/agent.py @@ -35,6 +35,8 @@ from newrelic.samplers.cpu_usage import cpu_usage_data_source from newrelic.samplers.gc_data import garbage_collector_data_source from newrelic.samplers.memory_usage import memory_usage_data_source +from newrelic.core.super_agent_health import HealthStatus, super_agent_health_instance + _logger = logging.getLogger(__name__) @@ -217,6 +219,7 @@ def __init__(self, config): self._scheduler = sched.scheduler(self._harvest_timer, self._harvest_shutdown.wait) self._process_shutdown = False + self._super_agent = super_agent_health_instance() self._lock = threading.Lock() @@ -734,6 +737,11 @@ def shutdown_agent(self, timeout=None): if self._harvest_shutdown_is_set(): return + self._super_agent.set_health_status(HealthStatus.AGENT_SHUTDOWN.value) + + if self._super_agent.health_check_enabled: + self._super_agent.write_to_health_file() + if timeout is None: timeout = self._config.shutdown_timeout diff --git a/newrelic/core/super_agent_health.py b/newrelic/core/super_agent_health.py index 225534754..934470817 100644 --- a/newrelic/core/super_agent_health.py +++ b/newrelic/core/super_agent_health.py @@ -194,7 +194,7 @@ def write_to_health_file(self): f.write(f"status_time_unix_nano: {status_time_unix_nano}\n") if not self.is_healthy: f.write(f"last_error: {self.last_error}\n") - except: + except Exception: _logger.warning("Unable to write to agent health file.") def get_file_id(self): From 3ac48d57b52b06bf9c7c014efcec66944a897e90 Mon Sep 17 00:00:00 2001 From: Tim Pansino Date: Fri, 13 Dec 2024 11:58:10 -0800 Subject: [PATCH 13/14] Refactor status code and message usage in superagent --- newrelic/core/super_agent_health.py | 99 ++++++++++++++--------------- 1 file changed, 46 insertions(+), 53 deletions(-) diff --git a/newrelic/core/super_agent_health.py b/newrelic/core/super_agent_health.py index 934470817..e256a6b3a 100644 --- a/newrelic/core/super_agent_health.py +++ b/newrelic/core/super_agent_health.py @@ -18,7 +18,7 @@ import threading import time import uuid -from enum import Enum +from enum import IntEnum from pathlib import Path from urllib.parse import urlparse @@ -26,34 +26,36 @@ _logger = logging.getLogger(__name__) -class HealthStatus(Enum): - HEALTHY = 1 - INVALID_LICENSE = 2 - MISSING_LICENSE = 3 - FORCED_DISCONNECT = 4 - HTTP_ERROR = 5 - PROXY_ERROR = 6 - AGENT_DISABLED = 7 - FAILED_NR_CONNECTION = 8 - INVALID_CONFIG = 9 - AGENT_SHUTDOWN = 10 +class HealthStatus(IntEnum): + HEALTHY = 0 + INVALID_LICENSE = 1 + MISSING_LICENSE = 2 + FORCED_DISCONNECT = 3 + HTTP_ERROR = 4 + PROXY_ERROR = 7 + AGENT_DISABLED = 8 + FAILED_NR_CONNECTION = 9 + INVALID_CONFIG = 10 + AGENT_SHUTDOWN = 99 # Set enum integer values as dict keys to reduce performance impact of string copies HEALTH_CHECK_STATUSES = { - HealthStatus.HEALTHY.value: ("NR-APM-000", "Healthy"), - HealthStatus.INVALID_LICENSE.value: ("NR-APM-001", "Invalid license key (HTTP status code 401)"), - HealthStatus.MISSING_LICENSE.value: ("NR-APM-002", "License key missing in configuration"), - HealthStatus.FORCED_DISCONNECT.value: ("NR-APM-003", "Forced disconnect received from New Relic (HTTP status code 410)"), - HealthStatus.HTTP_ERROR.value: ("NR-APM-004", "HTTP error response code received from New Relic"), - HealthStatus.PROXY_ERROR.value: ("NR-APM-007", "HTTP Proxy configuration error"), - HealthStatus.AGENT_DISABLED.value: ("NR-APM-008", "Agent is disabled via configuration"), - HealthStatus.FAILED_NR_CONNECTION.value: ("NR-APM-009", "Failed to connect to New Relic data collector"), - HealthStatus.INVALID_CONFIG.value: ("NR-APM-010", "Agent config file is not able to be parsed"), - HealthStatus.AGENT_SHUTDOWN.value: ("NR-APM-099", "Agent has shutdown"), + HealthStatus.HEALTHY.value: "Healthy", + HealthStatus.INVALID_LICENSE.value: "Invalid license key (HTTP status code 401)", + HealthStatus.MISSING_LICENSE.value: "License key missing in configuration", + HealthStatus.FORCED_DISCONNECT.value: "Forced disconnect received from New Relic (HTTP status code 410)", + HealthStatus.HTTP_ERROR.value: "HTTP error response code {response_code} received from New Relic while sending data type {info}", + HealthStatus.PROXY_ERROR.value: "HTTP Proxy configuration error; response code {response_code}", + HealthStatus.AGENT_DISABLED.value: "Agent is disabled via configuration", + HealthStatus.FAILED_NR_CONNECTION.value: "Failed to connect to New Relic data collector", + HealthStatus.INVALID_CONFIG.value: "Agent config file is not able to be parsed", + HealthStatus.AGENT_SHUTDOWN.value: "Agent has shutdown", } +UNKNOWN_STATUS_MESSAGE = "Unknown health status code." +HEALTHY_STATUS_MESSAGE = HEALTH_CHECK_STATUSES[HealthStatus.HEALTHY.value] # Assign most used status a symbol -PROTOCOL_ERROR_CODES = frozenset(["NR-APM-003", "NR-APM-004", "NR-APM-007"]) -COLLECTOR_ERROR_CODES = frozenset(["NR-APM-009"]) +PROTOCOL_ERROR_CODES = frozenset([HealthStatus.FORCED_DISCONNECT.value, HealthStatus.HTTP_ERROR.value, HealthStatus.PROXY_ERROR.value]) +LICENSE_KEY_ERROR_CODES = frozenset([HealthStatus.INVALID_LICENSE.value, HealthStatus.MISSING_LICENSE.value]) def is_valid_file_delivery_location(file_uri): @@ -120,8 +122,8 @@ def super_agent_health_singleton(): def __init__(self): # Initialize health check with a healthy status that can be updated as issues are encountered - self.last_error = "NR-APM-000" - self.status = "Healthy" + self.status_code = HealthStatus.HEALTHY.value + self.status_message = HEALTHY_STATUS_MESSAGE self.start_time_unix_nano = None self.pid_file_id_map = {} @@ -137,40 +139,30 @@ def health_check_enabled(self): @property def is_healthy(self): - return True if self.status == "Healthy" else False + return self.status_code == HealthStatus.HEALTHY.value - def set_health_status(self, health_status, response_code=None, info=None): - license_key_error = True if self.status == "License key missing in configuration" or self.status == "Invalid license key (HTTP status code 401)" else False + def set_health_status(self, status_code, response_code=None, info=None): + previous_status_code = self.status_code - if health_status == HealthStatus.FAILED_NR_CONNECTION.value and license_key_error: + if status_code == HealthStatus.FAILED_NR_CONNECTION.value and previous_status_code in LICENSE_KEY_ERROR_CODES: + # Do not update to failed connection status when license key is the issue return - - # Do not override status with agent_shutdown unless the agent was previously healthy - elif health_status == HealthStatus.AGENT_SHUTDOWN.value and self.status != "Healthy": + elif status_code == HealthStatus.AGENT_SHUTDOWN.value and not self.is_healthy: + # Do not override status with agent_shutdown unless the agent was previously healthy return - last_error, current_status = HEALTH_CHECK_STATUSES[health_status] - - # Update status messages to be more descriptive if necessary data is present - if health_status == HealthStatus.HTTP_ERROR.value and response_code and info: - current_status = ( - f"HTTP error response code {response_code} received from New Relic while sending data type {info}" - ) - - if health_status == HealthStatus.PROXY_ERROR.value and response_code: - current_status = f"HTTP Proxy configuration error; response code {response_code}" - - self.last_error = last_error - self.status = current_status + status_message = HEALTH_CHECK_STATUSES.get(status_code, UNKNOWN_STATUS_MESSAGE) + self.status_message = status_message.format(response_code=response_code, info=info) + self.status_code = status_code def update_to_healthy_status(self, protocol_error=False, collector_error=False): # If our unhealthy status code was not config related, it is possible it could be resolved during an active # session. This function allows us to update to a healthy status if so based on the error type # Since this function is only called when we are in scenario where the agent functioned as expected, we check to # see if the previous status was unhealthy so we know to update it - if protocol_error and self.last_error in PROTOCOL_ERROR_CODES or collector_error and self.last_error in COLLECTOR_ERROR_CODES: - self.last_error = "NR-APM-000" - self.status = "Healthy" + if protocol_error and self.status_code in PROTOCOL_ERROR_CODES or collector_error and self.status_code == HealthStatus.FAILED_NR_CONNECTION.value: + self.status_code = HealthStatus.HEALTHY.value + self.status_message = HEALTHY_STATUS_MESSAGE def write_to_health_file(self): status_time_unix_nano = time.time_ns() @@ -186,14 +178,15 @@ def write_to_health_file(self): file_id = self.get_file_id() file_name = f"health-{file_id}.yml" full_path = os.path.join(file_path, file_name) + is_healthy = self.is_healthy with open(full_path, "w") as f: - f.write(f"healthy: {self.is_healthy}\n") - f.write(f"status: {self.status}\n") + f.write(f"healthy: {is_healthy}\n") + f.write(f"status: {self.status_message}\n") f.write(f"start_time_unix_nano: {self.start_time_unix_nano}\n") f.write(f"status_time_unix_nano: {status_time_unix_nano}\n") - if not self.is_healthy: - f.write(f"last_error: {self.last_error}\n") + if not is_healthy: + f.write(f"last_error: NR-APM-{self.status_code:03d}\n") except Exception: _logger.warning("Unable to write to agent health file.") From 3594297dfaa9e2f44d001f6106271cd4f8d7b8e7 Mon Sep 17 00:00:00 2001 From: Uma Annamalai Date: Fri, 13 Dec 2024 12:55:52 -0800 Subject: [PATCH 14/14] Update regex assertion Co-authored-by: Timothy Pansino <11214426+TimPansino@users.noreply.github.com> --- tests/agent_features/test_super_agent_health_check.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/agent_features/test_super_agent_health_check.py b/tests/agent_features/test_super_agent_health_check.py index 5b400b989..c6b6756dc 100644 --- a/tests/agent_features/test_super_agent_health_check.py +++ b/tests/agent_features/test_super_agent_health_check.py @@ -57,8 +57,7 @@ def test_write_to_file_healthy_status(monkeypatch, tmp_path): assert len(contents) == 4 assert contents[0] == "healthy: True\n" assert contents[1] == "status: Healthy\n" - assert re.search(r"status_time_unix_nano:\s\d+", contents[3]) is not None - assert contents[3].startswith("status_time_unix_nano:") is True + assert int(re.search(r"status_time_unix_nano: (\d+)", contents[3]).group(1)) > 0 def test_write_to_file_unhealthy_status(monkeypatch, tmp_path): @@ -81,7 +80,7 @@ def test_write_to_file_unhealthy_status(monkeypatch, tmp_path): assert contents[0] == "healthy: False\n" assert contents[1] == "status: Invalid license key (HTTP status code 401)\n" assert contents[2] == "start_time_unix_nano: 1234567890\n" - assert re.search(r"status_time_unix_nano:\s\d+", contents[3]) is not None + assert int(re.search(r"status_time_unix_nano: (\d+)", contents[3]).group(1)) > 0 assert contents[4] == "last_error: NR-APM-001\n"