Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release-4.18] Enable NROP metrics to be scraped securely by Prometheus #1140

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ Assuming you can push container images to a container registry and you are in th
1. build and upload the manifest bundle container image: `make bundle bundle-build bundle-push`
1. leverage `operator-sdk` to deploy the container: `operator-sdk run bundle ${REPO}/numaresources-operator-bundle:${VERSION}`. Note the build procedure typically downloads a local copy of `operator-sdk` in `bin/` which you can reuse

Note that when installing the operator using this method, you must add the `openshift.io/cluster-monitoring: "true"` label to the operator namespace to enable Prometheus cluster monitoring.

For further details, please refer to the [operator-sdk documentation](https://sdk.operatorframework.io/docs/olm-integration/tutorial-bundle/)

## roadmap
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
apiVersion: v1
kind: Service
metadata:
annotations:
service.beta.openshift.io/serving-cert-secret-name: metrics-service-cert
creationTimestamp: null
labels:
control-plane: controller-manager
name: numaresources-controller-manager-metrics-service
spec:
ports:
- name: https
port: 8080
protocol: TCP
targetPort: https
selector:
control-plane: controller-manager
status:
loadBalancer: {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
creationTimestamp: null
name: numaresources-manager-role
rules:
- apiGroups:
- ""
resources:
- services
verbs:
- '*'
19 changes: 19 additions & 0 deletions bundle/manifests/numaresources-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ metadata:
capabilities: Basic Install
createdAt: "2025-01-07T08:34:09Z"
olm.skipRange: '>=4.17.0 <4.18.0'
operatorframework.io/cluster-monitoring: "true"
operators.operatorframework.io/builder: operator-sdk-v1.36.1
operators.operatorframework.io/project_layout: go.kubebuilder.io/v3
name: numaresources-operator.v4.18.999-snapshot
Expand Down Expand Up @@ -533,6 +534,10 @@ spec:
initialDelaySeconds: 15
periodSeconds: 20
name: manager
ports:
- containerPort: 8080
name: https
protocol: TCP
readinessProbe:
httpGet:
path: /readyz
Expand All @@ -545,6 +550,10 @@ spec:
memory: 20Mi
securityContext:
allowPrivilegeEscalation: false
volumeMounts:
- mountPath: /certs
name: metrics-tls
readOnly: true
securityContext:
runAsNonRoot: true
serviceAccountName: numaresources-controller-manager
Expand All @@ -554,6 +563,10 @@ spec:
key: node-role.kubernetes.io/control-plane
- effect: NoSchedule
key: node-role.kubernetes.io/master
volumes:
- name: metrics-tls
secret:
secretName: metrics-service-cert
permissions:
- rules:
- apiGroups:
Expand Down Expand Up @@ -587,6 +600,12 @@ spec:
verbs:
- create
- patch
- apiGroups:
- ""
resources:
- services
verbs:
- '*'
serviceAccountName: numaresources-controller-manager
strategy: deployment
installModes:
Expand Down
13 changes: 13 additions & 0 deletions config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ metadata:
annotations:
workload.openshift.io/allowed: management
labels:
openshift.io/cluster-monitoring: "true"
control-plane: controller-manager
name: system
---
Expand Down Expand Up @@ -43,15 +44,27 @@ spec:
matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: Exists
volumes:
- name : metrics-tls
secret:
secretName: metrics-service-cert
containers:
- command:
- /bin/numaresources-operator
args:
- -v=4
- --leader-elect
- --enable-scheduler
ports:
- containerPort: 8080
protocol: TCP
name: https
image: controller:latest
name: manager
volumeMounts:
- name: metrics-tls
mountPath: /certs
readOnly: true
securityContext:
allowPrivilegeEscalation: false
env:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ metadata:
alm-examples: '[]'
capabilities: Basic Install
olm.skipRange: '>=4.17.0 <4.18.0'
operatorframework.io/cluster-monitoring: "true"
name: numaresources-operator.v0.0.0
namespace: placeholder
spec:
Expand Down
1 change: 1 addition & 0 deletions config/prometheus/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
resources:
- rbac.yaml
- monitor.yaml
23 changes: 13 additions & 10 deletions config/prometheus/monitor.yaml
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@

# Prometheus Monitor Service (Metrics)
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
control-plane: controller-manager
name: controller-manager-metrics-monitor
name: controller-manager
namespace: system
spec:
endpoints:
- path: /metrics
port: https
scheme: https
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token"
interval: 30s
targetPort: 8080
path: /metrics
scheme: https
tlsConfig:
# The CA file used by Prometheus to verify the server's certificate.
# It's the cluster's CA bundle from the service CA operator.
caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
# The name of the server (CN) in the server's certificate.
serverName: numaresources-controller-manager-metrics-service.numaresources.svc
insecureSkipVerify: false
selector:
matchLabels:
control-plane: controller-manager
31 changes: 31 additions & 0 deletions config/prometheus/rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
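# Creates a Role and RoleBinding that allow the prometheus-k8s service account (from the openshift-monitoring namespace) to get, list and watch services, endpoints and pods in our namespace.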
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: prometheus-k8s
namespace: system
rules:
- apiGroups:
- ""
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: openshift-monitoring
1 change: 1 addition & 0 deletions config/rbac/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ resources:
# runtime. Be sure to update RoleBinding and ClusterRoleBinding
# subjects if changing service account names.
- service_account.yaml
- service.yaml
- role.yaml
- role_binding.yaml
- leader_election_role.yaml
Expand Down
13 changes: 13 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,16 @@ rules:
- get
- list
- update
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: manager-role
namespace: numaresources
rules:
- apiGroups:
- ""
resources:
- services
verbs:
- '*'
13 changes: 13 additions & 0 deletions config/rbac/role_binding.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,16 @@ subjects:
- kind: ServiceAccount
name: controller-manager
namespace: system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: manager-rolebinding
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: manager-role
subjects:
- kind: ServiceAccount
name: controller-manager
namespace: system
16 changes: 16 additions & 0 deletions config/rbac/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
apiVersion: v1
kind: Service
metadata:
annotations:
service.beta.openshift.io/serving-cert-secret-name: metrics-service-cert
labels:
control-plane: controller-manager
name: controller-manager-metrics-service
spec:
ports:
- name: https
port: 8080
protocol: TCP
targetPort: https
selector:
control-plane: controller-manager
2 changes: 1 addition & 1 deletion controllers/numaresourcesoperator_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ type NUMAResourcesOperatorReconciler struct {
// TODO: narrow down

// Namespace Scoped
// TODO
//+kubebuilder:rbac:groups="",resources=services,verbs=*,namespace="numaresources"

// Cluster Scoped
//+kubebuilder:rbac:groups=topology.node.k8s.io,resources=noderesourcetopologies,verbs=get;list;create;update
Expand Down
23 changes: 16 additions & 7 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
securityv1 "github.com/openshift/api/security/v1"
machineconfigv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
k8sruntime "k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
Expand Down Expand Up @@ -74,10 +75,11 @@ const (
)

const (
defaultWebhookPort = 9443
defaultMetricsAddr = ":8080"
defaultProbeAddr = ":8081"
defaultNamespace = "numaresources-operator"
defaultWebhookPort = 9443
defaultMetricsAddr = ":8080"
defaultMetricsSupport = true
defaultProbeAddr = ":8081"
defaultNamespace = "numaresources-operator"
)

var (
Expand Down Expand Up @@ -132,6 +134,7 @@ func (pa *Params) SetDefaults() {
pa.probeAddr = defaultProbeAddr
pa.render.Namespace = defaultNamespace
pa.enableReplicasDetect = true
pa.enableMetrics = defaultMetricsSupport
}

func (pa *Params) FromFlags() {
Expand Down Expand Up @@ -245,11 +248,17 @@ func main() {
klog.InfoS("metrics server", "enabled", params.enableMetrics, "addr", params.metricsAddr)

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Cache: cache.Options{}, // TODO: restrict namespace here?
Cache: cache.Options{
DefaultNamespaces: map[string]cache.Config{
namespace: {},
metav1.NamespaceNone: {},
},
},
Scheme: scheme,
Metrics: metricsserver.Options{
// TODO: secureServing?
BindAddress: params.metricsAddr,
BindAddress: params.metricsAddr,
SecureServing: true,
CertDir: "/certs",
},
WebhookServer: webhook.NewServer(webhook.Options{
Port: params.webhookPort,
Expand Down
Loading