Skip to content

Commit

Permalink
Merge pull request #1250 from newrelic/super-agent-health-status
Browse files Browse the repository at this point in the history
NR Control Health Checks
  • Loading branch information
umaannamalai authored Dec 18, 2024
2 parents 5e48f62 + 9f57085 commit 1279a52
Show file tree
Hide file tree
Showing 9 changed files with 589 additions and 53 deletions.
60 changes: 45 additions & 15 deletions newrelic/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import logging
import os
import sys
import threading
import time
import traceback

import newrelic.api.application
Expand Down Expand Up @@ -46,6 +48,8 @@
default_host,
fetch_config_setting,
)
from newrelic.core.super_agent_health import HealthStatus, super_agent_health_instance, super_agent_healthcheck_loop


__all__ = ["initialize", "filter_app_factory"]

Expand Down Expand Up @@ -100,6 +104,7 @@ def _map_aws_account_id(s):
# all the settings have been read.

_cache_object = []
super_agent_health = super_agent_health_instance()


def _reset_config_parser():
Expand Down Expand Up @@ -1033,21 +1038,25 @@ def _load_configuration(

# Now read in the configuration file. Cache the config file
# name in internal settings object as indication of succeeding.
if config_file.endswith(".toml"):
try:
import tomllib
except ImportError:
raise newrelic.api.exceptions.ConfigurationError(
"TOML configuration file can only be used if tomllib is available (Python 3.11+)."
)
with open(config_file, "rb") as f:
content = tomllib.load(f)
newrelic_section = content.get("tool", {}).get("newrelic")
if not newrelic_section:
raise newrelic.api.exceptions.ConfigurationError("New Relic configuration not found in TOML file.")
_config_object.read_dict(_toml_config_to_configparser_dict(newrelic_section))
elif not _config_object.read([config_file]):
raise newrelic.api.exceptions.ConfigurationError(f"Unable to open configuration file {config_file}.")
try:
if config_file.endswith(".toml"):
try:
import tomllib
except ImportError:
raise newrelic.api.exceptions.ConfigurationError(
"TOML configuration file can only be used if tomllib is available (Python 3.11+)."
)
with open(config_file, "rb") as f:
content = tomllib.load(f)
newrelic_section = content.get("tool", {}).get("newrelic")
if not newrelic_section:
raise newrelic.api.exceptions.ConfigurationError("New Relic configuration not found in TOML file.")
_config_object.read_dict(_toml_config_to_configparser_dict(newrelic_section))
elif not _config_object.read([config_file]):
raise newrelic.api.exceptions.ConfigurationError(f"Unable to open configuration file {config_file}.")
except Exception:
super_agent_health.set_health_status(HealthStatus.INVALID_CONFIG.value)
raise

_settings.config_file = config_file

Expand Down Expand Up @@ -4818,13 +4827,27 @@ def _setup_agent_console():
newrelic.core.agent.Agent.run_on_startup(_startup_agent_console)


# Background thread that runs the NR Control health check loop. It is created
# here but only started later by _setup_super_agent_health(). Marked as a
# daemon thread so it does not keep the interpreter alive at exit.
super_agent_health_thread = threading.Thread(
    target=super_agent_healthcheck_loop,
    name="NR-Control-Health-Main-Thread",
    daemon=True,
)


def _setup_super_agent_health():
    """Start the module-level health check thread if it is needed.

    The thread is started only when it is not already running and
    ``super_agent_health.health_check_enabled`` is truthy; otherwise this
    function does nothing.
    """
    # The liveness check comes first so that health_check_enabled is only
    # consulted when the thread is not currently running.
    # NOTE(review): a threading.Thread can be started at most once. If the
    # health check loop can ever exit, a later call here would raise
    # RuntimeError from start() -- confirm the loop runs for the process lifetime.
    thread_idle = not super_agent_health_thread.is_alive()
    if thread_idle and super_agent_health.health_check_enabled:
        super_agent_health_thread.start()


def initialize(
config_file=None,
environment=None,
ignore_errors=None,
log_file=None,
log_level=None,
):
super_agent_health.start_time_unix_nano = time.time_ns()

if config_file is None:
config_file = os.environ.get("NEW_RELIC_CONFIG_FILE", None)

Expand All @@ -4836,6 +4859,12 @@ def initialize(

_load_configuration(config_file, environment, ignore_errors, log_file, log_level)

_setup_super_agent_health()

if _settings.monitor_mode:
if not _settings.license_key:
super_agent_health.set_health_status(HealthStatus.MISSING_LICENSE.value)

if _settings.monitor_mode or _settings.developer_mode:
_settings.enabled = True
_setup_instrumentation()
Expand All @@ -4844,6 +4873,7 @@ def initialize(
_setup_agent_console()
else:
_settings.enabled = False
super_agent_health.set_health_status(HealthStatus.AGENT_DISABLED.value)


def filter_app_factory(app, global_conf, config_file, environment=None):
Expand Down
3 changes: 3 additions & 0 deletions newrelic/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from newrelic.common.object_wrapper import ObjectProxy
from newrelic.core.agent import agent_instance
from newrelic.core.config import flatten_settings, global_settings
from newrelic.core.super_agent_health import HealthStatus, super_agent_health_instance
from newrelic.core.trace_cache import trace_cache


Expand Down Expand Up @@ -512,6 +513,8 @@ def __init__(self, config_file, stdin=None, stdout=None, log=None):
self.__log_object = log

if not self.__config_object.read([config_file]):
super_agent_instance = super_agent_health_instance()
super_agent_instance.set_health_status(HealthStatus.INVALID_CONFIG.value)
raise RuntimeError(f"Unable to open configuration file {config_file}.")

listener_socket = self.__config_object.get("newrelic", "console.listener_socket") % {"pid": "*"}
Expand Down
8 changes: 8 additions & 0 deletions newrelic/core/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
from newrelic.samplers.cpu_usage import cpu_usage_data_source
from newrelic.samplers.gc_data import garbage_collector_data_source
from newrelic.samplers.memory_usage import memory_usage_data_source
from newrelic.core.super_agent_health import HealthStatus, super_agent_health_instance


_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -217,6 +219,7 @@ def __init__(self, config):
self._scheduler = sched.scheduler(self._harvest_timer, self._harvest_shutdown.wait)

self._process_shutdown = False
self._super_agent = super_agent_health_instance()

self._lock = threading.Lock()

Expand Down Expand Up @@ -734,6 +737,11 @@ def shutdown_agent(self, timeout=None):
if self._harvest_shutdown_is_set():
return

self._super_agent.set_health_status(HealthStatus.AGENT_SHUTDOWN.value)

if self._super_agent.health_check_enabled:
self._super_agent.write_to_health_file()

if timeout is None:
timeout = self._config.shutdown_timeout

Expand Down
28 changes: 28 additions & 0 deletions newrelic/core/agent_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
NetworkInterfaceException,
RetryDataForRequest,
)
from newrelic.core.super_agent_health import HealthStatus, super_agent_health_instance

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -188,6 +189,7 @@ def __init__(self, settings, host=None, client_cls=ApplicationModeClient):
"marshal_format": "json",
}
self._headers = {}
self._license_key = settings.license_key

# In Python 2, the JSON is loaded with unicode keys and values;
# however, the header name must be a non-unicode value when given to
Expand All @@ -209,6 +211,8 @@ def __init__(self, settings, host=None, client_cls=ApplicationModeClient):

# Do not access configuration anywhere inside the class
self.configuration = settings
self.super_agent = super_agent_health_instance()


def __enter__(self):
self.client.__enter__()
Expand Down Expand Up @@ -242,7 +246,27 @@ def send(
f"Supportability/Python/Collector/MaxPayloadSizeLimit/{method}",
1,
)
if status == 401:
# Check for license key presence again so the original missing license key status set in the
# initialize function doesn't get overridden with invalid_license as a missing license key is also
# treated as a 401 status code
if not self._license_key:
self.super_agent.set_health_status(HealthStatus.MISSING_LICENSE.value)
else:
self.super_agent.set_health_status(HealthStatus.INVALID_LICENSE.value)

if status == 407:
self.super_agent.set_health_status(HealthStatus.PROXY_ERROR.value, status)

if status == 410:
self.super_agent.set_health_status(HealthStatus.FORCED_DISCONNECT.value)

level, message = self.LOG_MESSAGES.get(status, self.LOG_MESSAGES["default"])

# If the default error message was used, then we know we have a general HTTP error
if message.startswith("Received a non 200 or 202"):
self.super_agent.set_health_status(HealthStatus.HTTP_ERROR.value, status, method)

_logger.log(
level,
message,
Expand All @@ -258,9 +282,12 @@ def send(
"agent_run_id": self._run_token,
},
)

exception = self.STATUS_CODE_RESPONSE.get(status, DiscardDataForRequest)
raise exception
if status == 200:
# Check if we previously had a protocol related error and update to a healthy status
self.super_agent.update_to_healthy_status(protocol_error=True)
return self.decode_response(data)

def decode_response(self, response):
Expand Down Expand Up @@ -590,6 +617,7 @@ def __init__(self, settings, host=None, client_cls=ApplicationModeClient):

# Do not access configuration anywhere inside the class
self.configuration = settings
self.super_agent = super_agent_health_instance()

@classmethod
def connect(
Expand Down
21 changes: 18 additions & 3 deletions newrelic/core/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
RetryDataForRequest,
)
from newrelic.samplers.data_sampler import DataSampler
from newrelic.core.super_agent_health import HealthStatus, super_agent_healthcheck_loop, super_agent_health_instance

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -110,6 +111,11 @@ def __init__(self, app_name, linked_applications=None):

self._remaining_plugins = True

self._super_agent_health_thread = threading.Thread(name="NR-Control-Health-Session-Thread", target=super_agent_healthcheck_loop)
self._super_agent_health_thread.daemon = True
self._super_agent = super_agent_health_instance()


# We setup empty rules engines here even though they will be
# replaced when application first registered. This is done to
# avoid a race condition in setting it later. Otherwise we have
Expand Down Expand Up @@ -195,7 +201,6 @@ def activate_session(self, activate_agent=None, timeout=0.0):
to be activated.
"""

if self._agent_shutdown:
return

Expand All @@ -205,6 +210,9 @@ def activate_session(self, activate_agent=None, timeout=0.0):
if self._active_session:
return

if self._super_agent.health_check_enabled and not self._super_agent_health_thread.is_alive():
self._super_agent_health_thread.start()

self._process_id = os.getpid()

self._connected_event.clear()
Expand All @@ -225,7 +233,6 @@ def activate_session(self, activate_agent=None, timeout=0.0):
# timeout has likely occurred.

deadlock_timeout = 0.1

if timeout >= deadlock_timeout:
self._detect_deadlock = True

Expand Down Expand Up @@ -362,6 +369,7 @@ def connect_to_data_collector(self, activate_agent):
None, self._app_name, self.linked_applications, environment_settings()
)
except ForceAgentDisconnect:
self._super_agent.set_health_status(HealthStatus.FAILED_NR_CONNECTION.value)
# Any disconnect exception means we should stop trying to connect
_logger.error(
"The New Relic service has requested that the agent "
Expand All @@ -372,6 +380,7 @@ def connect_to_data_collector(self, activate_agent):
)
return
except NetworkInterfaceException:
self._super_agent.set_health_status(HealthStatus.FAILED_NR_CONNECTION.value)
active_session = None
except Exception:
# If an exception occurs after agent has been flagged to be
Expand All @@ -381,6 +390,7 @@ def connect_to_data_collector(self, activate_agent):
# the application is still running.

if not self._agent_shutdown and not self._pending_shutdown:
self._super_agent.set_health_status(HealthStatus.FAILED_NR_CONNECTION.value)
_logger.exception(
"Unexpected exception when registering "
"agent with the data collector. If this problem "
Expand Down Expand Up @@ -491,6 +501,8 @@ def connect_to_data_collector(self, activate_agent):
# data from a prior agent run for this application.

configuration = active_session.configuration
# Check if the agent previously had an unhealthy status related to the data collector and, if so, update it to healthy
self._super_agent.update_to_healthy_status(collector_error=True)

with self._stats_lock:
self._stats_engine.reset_stats(configuration, reset_stream=True)
Expand Down Expand Up @@ -672,7 +684,7 @@ def validate_process(self):
self._process_id = 0

def normalize_name(self, name, rule_type):
"""Applies the agent normalization rules of the the specified
"""Applies the agent normalization rules of the specified
rule type to the supplied name.
"""
Expand Down Expand Up @@ -1695,6 +1707,9 @@ def internal_agent_shutdown(self, restart=False):
optionally triggers activation of a new session.
"""
self._super_agent.set_health_status(HealthStatus.AGENT_SHUTDOWN.value)
if self._super_agent.health_check_enabled:
self._super_agent.write_to_health_file()

# We need to stop any thread profiler session related to this
# application.
Expand Down
Loading

0 comments on commit 1279a52

Please sign in to comment.