Skip to content

Commit

Permalink
[DPE-4359] TLS ca/certificate rotation without downtime (#447)
Browse files Browse the repository at this point in the history
## Issue
We currently make use of PEM files for the:

- CA
- Certificates (admin, unit-http/unit-transport)
- private keys

This is fine in all cases but the case where the CA is rotated, in which
case downtime will be incurred as soon as the first unit gets a new CA
and restarts.

We need to switch from individual certificate resources to:

- Java keystores: for the private keys and the admin/unit-http/unit-transport
certificates
- Java truststores: for the CA

We need to implement the whole 2-stages rolling restart routine for this
case.
## Solution
### Implementation:

Ensure that:

1. TLS certificate rotation on expiration works as expected without
downtime

2. TLS CA certificate rotation on expiration works as expected without
downtime

3. TLS certificate rotation works as expected after a CA rotation,
without downtime

4. All of the above hold for both simple and large deployments

### Testing:

- unit tests 

- integration tests coverage for the 4 previous cases

### Documentation:
Document on charmhub the whole TLS rotation workflow for both deployment
modes:
- simple 
- large

---------

Co-authored-by: René Radoi <[email protected]>
Co-authored-by: Mehdi Bendriss <[email protected]>
Co-authored-by: Judit Novak <[email protected]>
  • Loading branch information
4 people authored Sep 24, 2024
1 parent 04f14e7 commit 7614a18
Show file tree
Hide file tree
Showing 18 changed files with 1,903 additions and 115 deletions.
1 change: 1 addition & 0 deletions lib/charms/opensearch/v0/constants_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
SecurityIndexInitProgress = "Initializing the security index..."
AdminUserInitProgress = "Configuring admin user..."
TLSNewCertsRequested = "Requesting new TLS certificates..."
TLSCaRotation = "Applying new CA certificate..."
HorizontalScaleUpSuggest = "Horizontal scale up advised: {} shards unassigned."
WaitingForOtherUnitServiceOps = "Waiting for other units to complete the ops on their service."
NewIndexRequested = "new index {index} requested"
Expand Down
34 changes: 34 additions & 0 deletions lib/charms/opensearch/v0/helper_conf_setter.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,20 @@ def replace(
"""
pass

@abstractmethod
def append(
    self,
    config_file: str,
    text_to_append: str,
) -> None:
    """Append any string to a text file.

    Abstract contract: concrete implementations append ``text_to_append``
    verbatim to the file at ``config_file`` (resolved against the setter's
    base path).

    Args:
        config_file (str): Path to the source config file
        text_to_append (str): The str to append to the config file

    Raises:
        FileNotFoundError: implementations raise this when the config
            file does not exist (see the concrete YAML implementation).
    """
    pass

@staticmethod
def __clean_base_path(base_path: str):
if base_path is None:
Expand Down Expand Up @@ -283,6 +297,26 @@ def replace(
with open(output_file, "w") as g:
g.write(data)

@override
def append(
self,
config_file: str,
text_to_append: str,
) -> None:
"""Append any string to a text file.
Args:
config_file (str): Path to the source config file
text_to_append (str): The str to append to the config file
"""
path = f"{self.base_path}{config_file}"

if not exists(path):
raise FileNotFoundError(f"{path} not found.")

with open(path, "a") as f:
f.write("\n" + text_to_append)

def __dump(self, data: Dict[str, any], output_type: OutputType, target_file: str):
"""Write the YAML data on the corresponding "output_type" stream."""
if not data:
Expand Down
111 changes: 86 additions & 25 deletions lib/charms/opensearch/v0/opensearch_base_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,14 @@
ServiceIsStopping,
ServiceStartError,
ServiceStopped,
TLSCaRotation,
TLSNewCertsRequested,
TLSNotFullyConfigured,
TLSRelationBrokenError,
TLSRelationMissing,
WaitingToStart,
)
from charms.opensearch.v0.constants_tls import TLS_RELATION, CertType
from charms.opensearch.v0.constants_tls import CertType
from charms.opensearch.v0.helper_charm import Status, all_units, format_unit_name
from charms.opensearch.v0.helper_cluster import ClusterTopology, Node
from charms.opensearch.v0.helper_networking import get_host_ip, units_ips
Expand Down Expand Up @@ -189,7 +190,7 @@ def __init__(self, *args, distro: Type[OpenSearchDistribution] = None):
self.peers_data = RelationDataStore(self, PeerRelationName)
self.secrets = OpenSearchSecrets(self, PeerRelationName)
self.tls = OpenSearchTLS(
self, TLS_RELATION, self.opensearch.paths.jdk, self.opensearch.paths.certs
self, PeerRelationName, self.opensearch.paths.jdk, self.opensearch.paths.certs
)
self.status = Status(self)
self.health = OpenSearchHealth(self)
Expand Down Expand Up @@ -448,9 +449,6 @@ def _on_peer_relation_created(self, event: RelationCreatedEvent):
"Adding units during an upgrade is not supported. The charm may be in a broken, unrecoverable state"
)

# Store the "Admin" certificate, key and CA on the disk of the new unit
self.tls.store_admin_tls_secrets_if_applies()

def _on_peer_relation_joined(self, event: RelationJoinedEvent):
"""Event received by all units when a new node joins the cluster."""
if self.upgrade_in_progress:
Expand All @@ -460,8 +458,6 @@ def _on_peer_relation_joined(self, event: RelationJoinedEvent):

def _on_peer_relation_changed(self, event: RelationChangedEvent):
"""Handle peer relation changes."""
self.tls.store_admin_tls_secrets_if_applies()

if self.unit.is_leader() and self.opensearch.is_node_up():
health = self.health.apply()
if self._is_peer_rel_changed_deferred:
Expand Down Expand Up @@ -576,7 +572,7 @@ def _on_opensearch_data_storage_detaching(self, _: StorageDetachingEvent): # no
# release lock
self.node_lock.release()

def _on_update_status(self, event: UpdateStatusEvent):
def _on_update_status(self, event: UpdateStatusEvent): # noqa: C901
"""On update status event.
We want to periodically check for the following:
Expand Down Expand Up @@ -758,6 +754,11 @@ def _on_get_password_action(self, event: ActionEvent):
}
)

def on_tls_ca_rotation(self):
    """Handle the addition of a new CA certificate to the truststore.

    Surfaces the rotation to the operator via a maintenance status, then
    triggers a rolling restart so the node picks up the new CA.
    """
    rotation_status = MaintenanceStatus(TLSCaRotation)
    self.status.set(rotation_status)
    self._restart_opensearch_event.emit()

def on_tls_conf_set(
self, event: CertificateAvailableEvent, scope: Scope, cert_type: CertType, renewal: bool
):
Expand Down Expand Up @@ -790,12 +791,25 @@ def on_tls_conf_set(
self.tls.store_admin_tls_secrets_if_applies()

# In case of renewal of the unit transport layer cert - restart opensearch
if renewal and self.is_admin_user_configured() and self.tls.is_fully_configured():
try:
self.tls.reload_tls_certificates()
except OpenSearchHttpError:
logger.error("Could not reload TLS certificates via API, will restart.")
self._restart_opensearch_event.emit()
if renewal and self.is_admin_user_configured():
if self.tls.is_fully_configured():
try:
self.tls.reload_tls_certificates()
except OpenSearchHttpError:
logger.error("Could not reload TLS certificates via API, will restart.")
self._restart_opensearch_event.emit()
else:
# the chain.pem file should only be updated after applying the new certs
# otherwise there could be TLS verification errors after renewing the CA
self.tls.update_request_ca_bundle()
self.status.clear(TLSNotFullyConfigured)
self.tls.reset_ca_rotation_state()
# cleaning the former CA certificate from the truststore
# must only be done AFTER all renewed certificates are available and loaded
self.tls.remove_old_ca()
else:
event.defer()
return

def on_tls_relation_broken(self, _: RelationBrokenEvent):
"""As long as all certificates are produced, we don't do anything."""
Expand Down Expand Up @@ -827,6 +841,18 @@ def is_every_unit_marked_as_started(self) -> bool:
except OpenSearchHttpError:
return False

def is_tls_full_configured_in_cluster(self) -> bool:
    """Check if TLS is configured in all the units of the current cluster.

    A unit counts as fully configured when it advertises
    ``tls_configured == "True"`` on the peer relation and carries neither
    of the in-flight CA-rotation markers (``tls_ca_renewing`` /
    ``tls_ca_renewed``).
    """
    rel = self.model.get_relation(PeerRelationName)

    def _unit_done(unit) -> bool:
        # Per-unit databag on the peer relation.
        unit_data = rel.data[unit]
        return (
            unit_data.get("tls_configured") == "True"
            and "tls_ca_renewing" not in unit_data
            and "tls_ca_renewed" not in unit_data
        )

    return all(_unit_done(unit) for unit in all_units(self))

def is_admin_user_configured(self) -> bool:
"""Check if admin user configured."""
# In case the initialisation of the admin user is not finished yet
Expand Down Expand Up @@ -895,18 +921,23 @@ def _start_opensearch(self, event: _StartOpenSearch) -> None: # noqa: C901

self.peers_data.delete(Scope.UNIT, "started")

if not self.node_lock.acquired:
# (Attempt to acquire lock even if `event.ignore_lock`)
if event.ignore_lock:
# Only used for force upgrades
logger.debug("Starting without lock")
else:
logger.debug("Lock to start opensearch not acquired. Will retry next event")
event.defer()
return
if event.ignore_lock:
# Only used for force upgrades
logger.debug("Starting without lock")
elif not self.node_lock.acquired:
logger.debug("Lock to start opensearch not acquired. Will retry next event")
event.defer()
return

if not self._can_service_start():
# after rotating the CA and certificates:
# the last host in the cluster to restart might not be able to connect to the other
# hosts anymore, because it is the last to renew the pem-file for requests
# in this case we update the pem-file to be able to connect and start the host
if self.peers_data.get(Scope.UNIT, "tls_ca_renewed", False):
self.tls.update_request_ca_bundle()
self.node_lock.release()
logger.info("Could not start opensearch service. Will retry next event.")
event.defer()
return

Expand Down Expand Up @@ -939,6 +970,11 @@ def _start_opensearch(self, event: _StartOpenSearch) -> None: # noqa: C901
self.unit.status = BlockedStatus(str(e))
return

# we should update the chain.pem file to avoid TLS verification errors
# this happens on restarts after applying a new admin cert on CA rotation
if self.peers_data.get(Scope.UNIT, "tls_ca_renewed", False):
self.tls.update_request_ca_bundle()

try:
self.opensearch.start(
wait_until_http_200=(
Expand All @@ -947,14 +983,19 @@ def _start_opensearch(self, event: _StartOpenSearch) -> None: # noqa: C901
)
)
self._post_start_init(event)
except (OpenSearchHttpError, OpenSearchStartTimeoutError, OpenSearchNotFullyReadyError):
except (
OpenSearchHttpError,
OpenSearchStartTimeoutError,
OpenSearchNotFullyReadyError,
) as e:
self.node_lock.release()
# In large deployments with cluster-manager-only-nodes, the startup might fail
# for the cluster-manager if a joining data node did not yet initialize the
# security index. We still want to update and broadcast the latest relation data.
if self.opensearch_peer_cm.is_provider(typ="main"):
self.peer_cluster_provider.refresh_relation_data(event, can_defer=False)
event.defer()
logger.warning(e)
except (OpenSearchStartError, OpenSearchUserMgmtError) as e:
logger.warning(e)
self.node_lock.release()
Expand Down Expand Up @@ -992,7 +1033,7 @@ def _post_start_init(self, event: _StartOpenSearch): # noqa: C901
try:
nodes = self._get_nodes(use_localhost=self.opensearch.is_node_up())
except OpenSearchHttpError:
logger.debug("Failed to get online nodes")
logger.info("Failed to get online nodes")
event.defer()
return

Expand Down Expand Up @@ -1043,6 +1084,7 @@ def _post_start_init(self, event: _StartOpenSearch): # noqa: C901

# clear waiting to start status
self.status.clear(WaitingToStart)
self.status.clear(ServiceStartError)
self.status.clear(PClusterNoDataNode)

if event.after_upgrade:
Expand Down Expand Up @@ -1100,6 +1142,23 @@ def _post_start_init(self, event: _StartOpenSearch): # noqa: C901
if self.opensearch_peer_cm.is_provider():
self.peer_cluster_provider.refresh_relation_data(event, can_defer=False)

# update the peer relation data for TLS CA rotation routine
self.tls.reset_ca_rotation_state()
if self.is_tls_full_configured_in_cluster():
self.status.clear(TLSCaRotation)
self.status.clear(TLSNotFullyConfigured)

# request new certificates after rotating the CA
if self.peers_data.get(Scope.UNIT, "tls_ca_renewing", False) and self.peers_data.get(
Scope.UNIT, "tls_ca_renewed", False
):
self.status.set(MaintenanceStatus(TLSNotFullyConfigured))
self.tls.request_new_unit_certificates()
if self.unit.is_leader():
self.tls.request_new_admin_certificate()
else:
self.tls.store_admin_tls_secrets_if_applies()

def _stop_opensearch(self, *, restart=False) -> None:
"""Stop OpenSearch if possible."""
self.status.set(WaitingStatus(ServiceIsStopping))
Expand Down Expand Up @@ -1144,7 +1203,9 @@ def _restart_opensearch(self, event: _RestartOpenSearch) -> None:

try:
self._stop_opensearch(restart=True)
logger.info("Restarting OpenSearch.")
except OpenSearchStopError as e:
logger.info(f"Error while Restarting Opensearch: {e}")
logger.exception(e)
self.node_lock.release()
event.defer()
Expand Down
22 changes: 16 additions & 6 deletions lib/charms/opensearch/v0/opensearch_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ def set_client_auth(self):
True,
)

self._opensearch.config.append(
self.JVM_OPTIONS,
"-Djdk.tls.client.protocols=TLSv1.2",
)

def set_admin_tls_conf(self, secrets: Dict[str, any]):
"""Configures the admin certificate."""
self._opensearch.config.put(
Expand All @@ -89,12 +94,11 @@ def set_node_tls_conf(self, cert_type: CertType, truststore_pwd: str, keystore_p
f"{self._opensearch.paths.certs_relative}/{cert if cert == 'ca' else cert_type}.p12",
)

for store_type, certificate_type in [("keystore", cert_type.val), ("truststore", "ca")]:
self._opensearch.config.put(
self.CONFIG_YML,
f"plugins.security.ssl.{target_conf_layer}.{store_type}_alias",
certificate_type,
)
self._opensearch.config.put(
self.CONFIG_YML,
f"plugins.security.ssl.{target_conf_layer}.keystore_alias",
cert_type.val,
)

for store_type, pwd in [("keystore", keystore_pwd), ("truststore", truststore_pwd)]:
self._opensearch.config.put(
Expand All @@ -103,6 +107,12 @@ def set_node_tls_conf(self, cert_type: CertType, truststore_pwd: str, keystore_p
pwd,
)

self._opensearch.config.put(
self.CONFIG_YML,
f"plugins.security.ssl.{target_conf_layer}.enabled_protocols",
"TLSv1.2",
)

def append_transport_node(self, ip_pattern_entries: List[str], append: bool = True):
"""Set the IP address of the new unit in nodes_dn."""
if not append:
Expand Down
3 changes: 2 additions & 1 deletion lib/charms/opensearch/v0/opensearch_distro.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,8 @@ def is_node_up(self, host: Optional[str] = None) -> bool:
timeout=1,
)
return resp_code < 400
except (OpenSearchHttpError, Exception):
except (OpenSearchHttpError, Exception) as e:
logger.debug(f"Error when checking if host {host} is up: {e}")
return False

def run_bin(self, bin_script_name: str, args: str = None, stdin: str = None) -> str:
Expand Down
6 changes: 6 additions & 0 deletions lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,6 +634,7 @@ def _set_security_conf(self, data: PeerClusterRelData) -> None:

# store the app admin TLS resources if not stored
self.charm.tls.store_new_tls_resources(CertType.APP_ADMIN, data.credentials.admin_tls)
self.charm.tls.update_request_ca_bundle()

# take over the internal users from the main orchestrator
self.charm.user_manager.put_internal_user(AdminUser, data.credentials.admin_password_hash)
Expand Down Expand Up @@ -916,6 +917,11 @@ def _error_set_from_tls(self, peer_cluster_rel_data: PeerClusterRelData) -> bool
blocked_msg = "CA certificate mismatch between clusters."
should_sever_relation = True

if not peer_cluster_rel_data.credentials.admin_tls["truststore-password"]:
logger.info("Relation data for TLS is missing.")
blocked_msg = "CA truststore-password not available."
should_sever_relation = True

if not blocked_msg:
self._clear_errors("error_from_tls")
return False
Expand Down
8 changes: 1 addition & 7 deletions lib/charms/opensearch/v0/opensearch_secrets.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,13 +114,7 @@ def _on_secret_changed(self, event: SecretChangedEvent): # noqa: C901

logger.debug("Secret change for %s", str(label_key))

# Leader has to maintain TLS and Dashboards relation credentials
if not is_leader and label_key == CertType.APP_ADMIN.val:
self._charm.tls.store_new_tls_resources(CertType.APP_ADMIN, event.secret.get_content())
if self._charm.tls.is_fully_configured():
self._charm.peers_data.put(Scope.UNIT, "tls_configured", True)

elif is_leader and label_key == self._charm.secrets.password_key(KibanaserverUser):
if is_leader and label_key == self._charm.secrets.password_key(KibanaserverUser):
self._charm.opensearch_provider.update_dashboards_password()

# Non-leader units need to maintain local users in internal_users.yml
Expand Down
Loading

0 comments on commit 7614a18

Please sign in to comment.