Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NR Control Health Checks #1250

Merged
merged 16 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 45 additions & 15 deletions newrelic/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import logging
import os
import sys
import threading
import time
import traceback

import newrelic.api.application
Expand Down Expand Up @@ -46,6 +48,8 @@
default_host,
fetch_config_setting,
)
from newrelic.core.super_agent_health import HealthStatus, super_agent_health_instance, super_agent_healthcheck_loop


__all__ = ["initialize", "filter_app_factory"]

Expand Down Expand Up @@ -100,6 +104,7 @@ def _map_aws_account_id(s):
# all the settings have been read.

_cache_object = []
super_agent_health = super_agent_health_instance()


def _reset_config_parser():
Expand Down Expand Up @@ -1033,21 +1038,25 @@ def _load_configuration(

# Now read in the configuration file. Cache the config file
# name in internal settings object as indication of succeeding.
if config_file.endswith(".toml"):
try:
import tomllib
except ImportError:
raise newrelic.api.exceptions.ConfigurationError(
"TOML configuration file can only be used if tomllib is available (Python 3.11+)."
)
with open(config_file, "rb") as f:
content = tomllib.load(f)
newrelic_section = content.get("tool", {}).get("newrelic")
if not newrelic_section:
raise newrelic.api.exceptions.ConfigurationError("New Relic configuration not found in TOML file.")
_config_object.read_dict(_toml_config_to_configparser_dict(newrelic_section))
elif not _config_object.read([config_file]):
raise newrelic.api.exceptions.ConfigurationError(f"Unable to open configuration file {config_file}.")
try:
if config_file.endswith(".toml"):
try:
import tomllib
except ImportError:
raise newrelic.api.exceptions.ConfigurationError(
"TOML configuration file can only be used if tomllib is available (Python 3.11+)."
)
with open(config_file, "rb") as f:
content = tomllib.load(f)
newrelic_section = content.get("tool", {}).get("newrelic")
if not newrelic_section:
raise newrelic.api.exceptions.ConfigurationError("New Relic configuration not found in TOML file.")
_config_object.read_dict(_toml_config_to_configparser_dict(newrelic_section))
elif not _config_object.read([config_file]):
raise newrelic.api.exceptions.ConfigurationError(f"Unable to open configuration file {config_file}.")
except Exception:
super_agent_health.set_health_status(HealthStatus.INVALID_CONFIG.value)
raise

_settings.config_file = config_file

Expand Down Expand Up @@ -4815,13 +4824,27 @@ def _setup_agent_console():
newrelic.core.agent.Agent.run_on_startup(_startup_agent_console)


# Background thread that runs the health check loop. It is created here but
# only started later, by _setup_super_agent_health(). Marked as a daemon
# thread so that it does not keep the interpreter alive at exit.
super_agent_health_thread = threading.Thread(
    name="NR-Control-Health-Main-Thread",
    target=super_agent_healthcheck_loop,
    daemon=True,
)


def _setup_super_agent_health():
    """Start the module-level health check thread when it is needed.

    Does nothing if ``super_agent_health_thread`` is already running or if
    ``super_agent_health.health_check_enabled`` is falsy. A failure to start
    the thread is logged as a warning instead of being raised, so that a
    problem with health reporting does not abort the caller.
    """
    if super_agent_health_thread.is_alive():
        return

    if not super_agent_health.health_check_enabled:
        return

    try:
        super_agent_health_thread.start()
    except RuntimeError:
        # Thread.start() may be called at most once per thread object and
        # raises RuntimeError otherwise. is_alive() is False both before the
        # thread is started and after it has exited, and another caller can
        # start the thread between the check above and this call, so the
        # guard alone cannot rule this out.
        logging.getLogger(__name__).warning(
            "Unable to start the NR Control health check thread.",
            exc_info=True,
        )


def initialize(
config_file=None,
environment=None,
ignore_errors=None,
log_file=None,
log_level=None,
):
super_agent_health.start_time_unix_nano = time.time_ns()

if config_file is None:
config_file = os.environ.get("NEW_RELIC_CONFIG_FILE", None)

Expand All @@ -4833,6 +4856,12 @@ def initialize(

_load_configuration(config_file, environment, ignore_errors, log_file, log_level)

_setup_super_agent_health()

if _settings.monitor_mode:
if not _settings.license_key:
super_agent_health.set_health_status(HealthStatus.MISSING_LICENSE.value)

if _settings.monitor_mode or _settings.developer_mode:
_settings.enabled = True
_setup_instrumentation()
Expand All @@ -4841,6 +4870,7 @@ def initialize(
_setup_agent_console()
else:
_settings.enabled = False
super_agent_health.set_health_status(HealthStatus.AGENT_DISABLED.value)


def filter_app_factory(app, global_conf, config_file, environment=None):
Expand Down
3 changes: 3 additions & 0 deletions newrelic/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from newrelic.common.object_wrapper import ObjectProxy
from newrelic.core.agent import agent_instance
from newrelic.core.config import flatten_settings, global_settings
from newrelic.core.super_agent_health import HealthStatus, super_agent_health_instance
from newrelic.core.trace_cache import trace_cache


Expand Down Expand Up @@ -512,6 +513,8 @@ def __init__(self, config_file, stdin=None, stdout=None, log=None):
self.__log_object = log

if not self.__config_object.read([config_file]):
super_agent_instance = super_agent_health_instance()
super_agent_instance.set_health_status(HealthStatus.INVALID_CONFIG.value)
raise RuntimeError(f"Unable to open configuration file {config_file}.")

listener_socket = self.__config_object.get("newrelic", "console.listener_socket") % {"pid": "*"}
Expand Down
8 changes: 8 additions & 0 deletions newrelic/core/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
from newrelic.samplers.cpu_usage import cpu_usage_data_source
from newrelic.samplers.gc_data import garbage_collector_data_source
from newrelic.samplers.memory_usage import memory_usage_data_source
from newrelic.core.super_agent_health import HealthStatus, super_agent_health_instance


_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -217,6 +219,7 @@ def __init__(self, config):
self._scheduler = sched.scheduler(self._harvest_timer, self._harvest_shutdown.wait)

self._process_shutdown = False
self._super_agent = super_agent_health_instance()

self._lock = threading.Lock()

Expand Down Expand Up @@ -734,6 +737,11 @@ def shutdown_agent(self, timeout=None):
if self._harvest_shutdown_is_set():
return

self._super_agent.set_health_status(HealthStatus.AGENT_SHUTDOWN.value)

if self._super_agent.health_check_enabled:
self._super_agent.write_to_health_file()

if timeout is None:
timeout = self._config.shutdown_timeout

Expand Down
28 changes: 28 additions & 0 deletions newrelic/core/agent_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
NetworkInterfaceException,
RetryDataForRequest,
)
from newrelic.core.super_agent_health import HealthStatus, super_agent_health_instance

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -188,6 +189,7 @@ def __init__(self, settings, host=None, client_cls=ApplicationModeClient):
"marshal_format": "json",
}
self._headers = {}
self._license_key = settings.license_key

# In Python 2, the JSON is loaded with unicode keys and values;
# however, the header name must be a non-unicode value when given to
Expand All @@ -209,6 +211,8 @@ def __init__(self, settings, host=None, client_cls=ApplicationModeClient):

# Do not access configuration anywhere inside the class
self.configuration = settings
self.super_agent = super_agent_health_instance()


def __enter__(self):
self.client.__enter__()
Expand Down Expand Up @@ -242,7 +246,27 @@ def send(
f"Supportability/Python/Collector/MaxPayloadSizeLimit/{method}",
1,
)
if status == 401:
# Check for license key presence again so the original missing license key status set in the
# initialize function doesn't get overridden with invalid_license as a missing license key is also
# treated as a 401 status code
if not self._license_key:
self.super_agent.set_health_status(HealthStatus.MISSING_LICENSE.value)
else:
self.super_agent.set_health_status(HealthStatus.INVALID_LICENSE.value)

if status == 407:
self.super_agent.set_health_status(HealthStatus.PROXY_ERROR.value, status)

if status == 410:
self.super_agent.set_health_status(HealthStatus.FORCED_DISCONNECT.value)

level, message = self.LOG_MESSAGES.get(status, self.LOG_MESSAGES["default"])

# If the default error message was used, then we know we have a general HTTP error
if message.startswith("Received a non 200 or 202"):
self.super_agent.set_health_status(HealthStatus.HTTP_ERROR.value, status, method)

_logger.log(
level,
message,
Expand All @@ -258,9 +282,12 @@ def send(
"agent_run_id": self._run_token,
},
)

exception = self.STATUS_CODE_RESPONSE.get(status, DiscardDataForRequest)
raise exception
if status == 200:
# Check if we previously had a protocol related error and update to a healthy status
self.super_agent.update_to_healthy_status(protocol_error=True)
return self.decode_response(data)

def decode_response(self, response):
Expand Down Expand Up @@ -590,6 +617,7 @@ def __init__(self, settings, host=None, client_cls=ApplicationModeClient):

# Do not access configuration anywhere inside the class
self.configuration = settings
self.super_agent = super_agent_health_instance()

@classmethod
def connect(
Expand Down
21 changes: 18 additions & 3 deletions newrelic/core/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
RetryDataForRequest,
)
from newrelic.samplers.data_sampler import DataSampler
from newrelic.core.super_agent_health import HealthStatus, super_agent_healthcheck_loop, super_agent_health_instance

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -110,6 +111,11 @@ def __init__(self, app_name, linked_applications=None):

self._remaining_plugins = True

self._super_agent_health_thread = threading.Thread(name="NR-Control-Health-Session-Thread", target=super_agent_healthcheck_loop)
self._super_agent_health_thread.daemon = True
self._super_agent = super_agent_health_instance()


# We setup empty rules engines here even though they will be
# replaced when application first registered. This is done to
# avoid a race condition in setting it later. Otherwise we have
Expand Down Expand Up @@ -195,7 +201,6 @@ def activate_session(self, activate_agent=None, timeout=0.0):
to be activated.

"""

if self._agent_shutdown:
return

Expand All @@ -205,6 +210,9 @@ def activate_session(self, activate_agent=None, timeout=0.0):
if self._active_session:
return

if self._super_agent.health_check_enabled and not self._super_agent_health_thread.is_alive():
self._super_agent_health_thread.start()

self._process_id = os.getpid()

self._connected_event.clear()
Expand All @@ -225,7 +233,6 @@ def activate_session(self, activate_agent=None, timeout=0.0):
# timeout has likely occurred.

deadlock_timeout = 0.1

if timeout >= deadlock_timeout:
self._detect_deadlock = True

Expand Down Expand Up @@ -362,6 +369,7 @@ def connect_to_data_collector(self, activate_agent):
None, self._app_name, self.linked_applications, environment_settings()
)
except ForceAgentDisconnect:
self._super_agent.set_health_status(HealthStatus.FAILED_NR_CONNECTION.value)
# Any disconnect exception means we should stop trying to connect
_logger.error(
"The New Relic service has requested that the agent "
Expand All @@ -372,6 +380,7 @@ def connect_to_data_collector(self, activate_agent):
)
return
except NetworkInterfaceException:
self._super_agent.set_health_status(HealthStatus.FAILED_NR_CONNECTION.value)
active_session = None
except Exception:
# If an exception occurs after agent has been flagged to be
Expand All @@ -381,6 +390,7 @@ def connect_to_data_collector(self, activate_agent):
# the application is still running.

if not self._agent_shutdown and not self._pending_shutdown:
self._super_agent.set_health_status(HealthStatus.FAILED_NR_CONNECTION.value)
_logger.exception(
"Unexpected exception when registering "
"agent with the data collector. If this problem "
Expand Down Expand Up @@ -491,6 +501,8 @@ def connect_to_data_collector(self, activate_agent):
# data from a prior agent run for this application.

configuration = active_session.configuration
# Check if the agent previously had an unhealthy status related to the data collector and update
self._super_agent.update_to_healthy_status(collector_error=True)

with self._stats_lock:
self._stats_engine.reset_stats(configuration, reset_stream=True)
Expand Down Expand Up @@ -665,7 +677,7 @@ def validate_process(self):
self._process_id = 0

def normalize_name(self, name, rule_type):
"""Applies the agent normalization rules of the the specified
"""Applies the agent normalization rules of the specified
rule type to the supplied name.

"""
Expand Down Expand Up @@ -1688,6 +1700,9 @@ def internal_agent_shutdown(self, restart=False):
optionally triggers activation of a new session.

"""
self._super_agent.set_health_status(HealthStatus.AGENT_SHUTDOWN.value)
if self._super_agent.health_check_enabled:
self._super_agent.write_to_health_file()

# We need to stop any thread profiler session related to this
# application.
Expand Down
Loading
Loading