diff --git a/CHANGELOG.md b/CHANGELOG.md index 2215f2983..8cf315e65 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,6 @@ ### Changes -- Implemented `to_hex` for `AccountIdPrefix` and `epoch_block_num` for `BlockHeader` (#1039). -- Added tracing to the `miden-tx-prover` CLI (#1014). - Added health check endpoints to the prover service (#1006). - Implemented serialization for `AccountHeader` (#996). - Updated Pingora crates to 0.4 and added polling time to the configuration file (#997). @@ -21,6 +19,9 @@ - [BREAKING] Refactor error messages in `miden-lib` and `miden-tx` and use `thiserror` 2.0 (#1005). - [BREAKING] Extend `AccountId` to two `Felt`s and require block hash in derivation (#982). - Removed workers list from the proxy configuration file (#1018). +- Added tracing to the `miden-tx-prover` CLI (#1014). +- Added metrics to the `miden-tx-prover` proxy (#1017). +- Implemented `to_hex` for `AccountIdPrefix` and `epoch_block_num` for `BlockHeader` (#1039). ## 0.6.2 (2024-11-20) diff --git a/Cargo.lock b/Cargo.lock index b04af2dc0..0ca30ecc1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2037,6 +2037,7 @@ dependencies = [ "pingora-core", "pingora-limits", "pingora-proxy", + "prometheus", "prost", "prost-build", "protox", diff --git a/bin/tx-prover/Cargo.toml b/bin/tx-prover/Cargo.toml index 1a02075bb..650862eb9 100644 --- a/bin/tx-prover/Cargo.toml +++ b/bin/tx-prover/Cargo.toml @@ -54,6 +54,7 @@ figment = { version = "0.10", features = ["toml", "env"] } miden-lib = { workspace = true, default-features = false } miden-objects = { workspace = true, default-features = false } miden-tx = { workspace = true, default-features = false } +prometheus = "0.13" prost = { version = "0.13", default-features = false, features = ["derive"] } reqwest = { version = "0.11" } serde = { version = "1.0", features = ["derive"] } diff --git a/bin/tx-prover/README.md b/bin/tx-prover/README.md index 7dba23387..820b3173a 100644 --- a/bin/tx-prover/README.md +++ 
b/bin/tx-prover/README.md @@ -59,8 +59,14 @@ max_queue_items = 10 max_retries_per_request = 1 # Maximum amount of requests that a given IP address can make per second max_req_per_sec = 5 +# Time to wait before checking the availability of workers +available_workers_polling_time_ms = 20 # Interval to check the health of the workers health_check_interval_secs = 1 +# Host of the metrics server +prometheus_host = "127.0.0.1" +# Port of the metrics server +prometheus_port = 6192 ``` Then, to start the proxy service, you will need to run: @@ -120,6 +126,30 @@ Then access the Jaeger UI at `http://localhost:16686/`. If Docker is not an option, Jaeger can also be set up directly on your machine or hosted in the cloud. See the [Jaeger documentation](https://www.jaegertracing.io/docs/) for alternative installation methods. +## Metrics + +The proxy includes a service that exposes metrics to be consumed by [Prometheus](https://prometheus.io/docs/introduction/overview/). This service is always enabled and uses the host and port defined in the `miden-tx-prover.toml` file. + +The metrics architecture works by having the proxy expose metrics at an endpoint (`/metrics`) in a format Prometheus can read. Prometheus periodically scrapes this endpoint, adds timestamps to the metrics, and stores them in its time-series database. Then, we can use tools like Grafana to query Prometheus and visualize these metrics in configurable dashboards. + +The simplest way to install Prometheus and Grafana is by using Docker containers. To do so, run: + +```bash +docker run \ + -d \ + -p 9090:9090 \ + -v /path/to/prometheus.yml:/etc/prometheus/prometheus.yml \ + prom/prometheus + +docker run -d -p 3000:3000 --name grafana grafana/grafana-enterprise:latest +``` + +If Docker is not an option, Prometheus and Grafana can also be set up directly on your machine or hosted in the cloud. 
See the [Prometheus documentation](https://prometheus.io/docs/prometheus/latest/getting_started/) and [Grafana documentation](https://grafana.com/docs/grafana/latest/setup-grafana/) for alternative installation methods. + +A Prometheus configuration file (`prometheus.yml`) is provided in this repository; you will need to modify the `scrape_configs` section to include the host and port of the proxy's metrics service. + +Then, to add the new Prometheus collector as a datasource for Grafana, you can [follow this tutorial](https://grafana.com/docs/grafana-cloud/connect-externally-hosted/existing-datasource/). A Grafana dashboard named `grafana_dashboard.json` is also provided; see this [link](https://grafana.com/docs/grafana/latest/dashboards/build-dashboards/import-dashboards/) to import it. Otherwise, you can [create your own dashboard](https://grafana.com/docs/grafana/latest/getting-started/build-first-dashboard/) using the metrics provided by the proxy and export it by following this [link](https://grafana.com/docs/grafana/latest/dashboards/share-dashboards-panels/#export-a-dashboard-as-json). 
+ ## Features Description of this crate's feature: diff --git a/bin/tx-prover/grafana_dashboard.json b/bin/tx-prover/grafana_dashboard.json new file mode 100644 index 000000000..ba6791a3a --- /dev/null +++ b/bin/tx-prover/grafana_dashboard.json @@ -0,0 +1,1118 @@ +{ + "__inputs": [ + { + "name": "DS_TX_PROVER", + "label": "tx_prover", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "11.4.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 18, + "panels": [], + "title": "Requests", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "red", + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqpm" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Total requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failed requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Accepted requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 9, + "x": 0, + "y": 1 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "sum(rate(request_count[1m]))", + "hide": false, + "instant": false, + "legendFormat": "Total requests", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "sum(rate(request_count[1m])) - sum(rate(rate_limited_requests[1m])) - sum(rate(queue_drop_count[1m]))", + "hide": false, + "instant": false, + "legendFormat": "Accepted requests", + "range": true, + "refId": "C" + }, + { + "editorMode": "code", + "expr": "sum(rate(request_failure_count[1m]))", + "legendFormat": "Failed requests", + "range": true, + "refId": "A", + 
"datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } + } + ], + "title": "Requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqpm" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Rate limited requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Queue overflow requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 9, + "x": 9, + "y": 1 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "rate(rate_limited_requests[1m])", + "hide": false, + "instant": false, + "legendFormat": "Rate limited requests", + 
"range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "rate(queue_drop_count[1m])", + "hide": false, + "instant": false, + "legendFormat": "Queue overflow requests", + "range": true, + "refId": "C" + } + ], + "title": "Rejected requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlRd" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqpm" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 18, + "y": 1 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "rate(request_retries[1m])", + "legendFormat": "Retry rate", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } + } + ], + "title": "Request retry rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + 
}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 9 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "(1 - rate(request_failure_count[1m]) / rate(request_count[1m])) * 100", + "legendFormat": "Success rate over time", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } + } + ], + "title": "Success rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + 
"tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 8, + "y": 9 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "rate(request_latency_sum[1m]) / rate(request_latency_count[1m])", + "legendFormat": "Average request latency", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "rate(queue_latency_sum[1m]) / rate(queue_latency_count[1m])", + "hide": false, + "instant": false, + "legendFormat": "Average queue latency", + "range": true, + "refId": "B" + } + ], + "title": "Latency", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 19, + "panels": [], + "title": "Workers", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + 
"gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 0, + "y": 18 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "worker_count", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Total workers", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "worker_busy", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Busy workers", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Workers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "red", + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 10, + "barAlignment": 0, + "barWidthFactor": 
0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 7, + "y": 18 + }, + "id": 21, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "worker_unhealthy", + "legendFormat": "Unhealthy workers", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } + } + ], + "title": "Unhealthy workers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 14, + "y": 18 + }, + "id": 12, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 0, + "showHeader": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": 
"rate(worker_request_count[1m])", + "legendFormat": "{{worker_id}}", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } + } + ], + "title": "Requests per worker", + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 20, + "panels": [], + "title": "Queue", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 15 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 27 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "queue_size", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "Queue size", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Queue size", 
+ "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-2d", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "tx_prover", + "uid": "be7bobzl5fr40f", + "version": 40, + "weekStart": "" +} \ No newline at end of file diff --git a/bin/tx-prover/prometheus.yml b/bin/tx-prover/prometheus.yml new file mode 100644 index 000000000..321418278 --- /dev/null +++ b/bin/tx-prover/prometheus.yml @@ -0,0 +1,16 @@ +global: + scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. + # scrape_timeout is set to the global default (10s). + +# A scrape configuration containing exactly one endpoint to scrape: +scrape_configs: + # The job name is a label that is used to group targets in the Prometheus UI. + # It can be any string. + - job_name: "tx_prover" + # Here you need to specify the address of the Prometheus metrics endpoint exposed by the proxy. + # We use the proxy's default metrics host and port, but they need to be changed if you use a + # different host or port. When running Prometheus in a Docker container, you can use the + # `host.docker.internal` address to access the host machine. + static_configs: + - targets: ["127.0.0.1:6192"] diff --git a/bin/tx-prover/src/commands/mod.rs b/bin/tx-prover/src/commands/mod.rs index 8d9f494f5..84954ed59 100644 --- a/bin/tx-prover/src/commands/mod.rs +++ b/bin/tx-prover/src/commands/mod.rs @@ -42,6 +42,10 @@ pub struct ProxyConfig { pub available_workers_polling_time_ms: u64, /// Health check interval in seconds. pub health_check_interval_secs: u64, + /// Prometheus metrics host. + pub prometheus_host: String, + /// Prometheus metrics port. 
+ pub prometheus_port: u16, } impl Default for ProxyConfig { @@ -56,6 +60,8 @@ impl Default for ProxyConfig { max_req_per_sec: 5, available_workers_polling_time_ms: 20, health_check_interval_secs: 1, + prometheus_host: "127.0.0.1".into(), + prometheus_port: 6192, } } } diff --git a/bin/tx-prover/src/commands/proxy.rs b/bin/tx-prover/src/commands/proxy.rs index 12c06c87a..e172543fb 100644 --- a/bin/tx-prover/src/commands/proxy.rs +++ b/bin/tx-prover/src/commands/proxy.rs @@ -75,6 +75,14 @@ impl StartProxy { http_server_options.h2c = true; logic.server_options = Some(http_server_options); + // Enable Prometheus metrics + let mut prometheus_service_http = + pingora::services::listening::Service::prometheus_http_service(); + prometheus_service_http.add_tcp( + format!("{}:{}", proxy_config.prometheus_host, proxy_config.prometheus_port).as_str(), + ); + + server.add_service(prometheus_service_http); server.add_service(health_check_service); server.add_service(lb); tokio::task::spawn_blocking(|| server.run_forever()) diff --git a/bin/tx-prover/src/main.rs b/bin/tx-prover/src/main.rs index 9f560d62e..2c3c68714 100644 --- a/bin/tx-prover/src/main.rs +++ b/bin/tx-prover/src/main.rs @@ -1,7 +1,7 @@ pub mod api; pub mod commands; pub mod error; -mod proxy; +pub mod proxy; mod utils; use commands::Cli; use utils::setup_tracing; diff --git a/bin/tx-prover/src/proxy/metrics.rs b/bin/tx-prover/src/proxy/metrics.rs new file mode 100644 index 000000000..8a02e6509 --- /dev/null +++ b/bin/tx-prover/src/proxy/metrics.rs @@ -0,0 +1,90 @@ +use std::sync::LazyLock; + +use prometheus::{ + register_histogram, register_int_counter, register_int_counter_vec, register_int_gauge, + Histogram, IntCounter, IntCounterVec, IntGauge, +}; + +// SAFETY: The `unwrap` calls here are safe because: +// 1. The metrics being registered (gauges, counters, histograms) use hardcoded names and +// descriptions, which are guaranteed not to conflict within the application. +// 2. 
Registration errors occur only if there is a naming conflict, which is not possible in this +// context due to controlled metric definitions. +// 3. Any changes to metric names or types should be carefully reviewed to avoid conflicts. + +// QUEUE METRICS +// ================================================================================================ + +pub static QUEUE_SIZE: LazyLock = + LazyLock::new(|| register_int_gauge!("queue_size", "Number of requests in the queue").unwrap()); +pub static QUEUE_LATENCY: LazyLock = LazyLock::new(|| { + register_histogram!( + "queue_latency", + "Time (in seconds) requests spend in the queue", + vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0] + ) + .unwrap() +}); +pub static QUEUE_DROP_COUNT: LazyLock = LazyLock::new(|| { + register_int_counter!("queue_drop_count", "Number of requests dropped due to a full queue") + .unwrap() +}); + +// WORKER METRICS +// ================================================================================================ + +pub static WORKER_COUNT: LazyLock = + LazyLock::new(|| register_int_gauge!("worker_count", "Total number of workers").unwrap()); +pub static WORKER_UNHEALTHY: LazyLock = LazyLock::new(|| { + register_int_counter!( + "worker_unhealthy", + "Number of times that workers were registered as unhealthy" + ) + .unwrap() +}); +pub static WORKER_BUSY: LazyLock = + LazyLock::new(|| register_int_gauge!("worker_busy", "Number of busy workers").unwrap()); +pub static WORKER_REQUEST_COUNT: LazyLock = LazyLock::new(|| { + register_int_counter_vec!( + "worker_request_count", + "Number of requests processed by each worker", + &["worker_id"] + ) + .unwrap() +}); + +// REQUEST METRICS +// ================================================================================================ + +pub static REQUEST_FAILURE_COUNT: LazyLock = LazyLock::new(|| { + register_int_counter!("request_failure_count", "Number of failed requests").unwrap() +}); +pub static REQUEST_RETRIES: LazyLock = LazyLock::new(|| { + 
register_int_counter!("request_retries", "Number of request retries").unwrap() +}); +pub static REQUEST_COUNT: LazyLock = LazyLock::new(|| { + register_int_counter!("request_count", "Number of requests processed").unwrap() +}); +pub static REQUEST_LATENCY: LazyLock = LazyLock::new(|| { + register_histogram!( + "request_latency", + "Time (in seconds) requests take to process", + vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0] + ) + .unwrap() +}); + +// RATE LIMITING METRICS +// ================================================================================================ + +pub static RATE_LIMITED_REQUESTS: LazyLock = LazyLock::new(|| { + register_int_counter!( + "rate_limited_requests", + "Number of requests blocked due to rate limiting" + ) + .unwrap() +}); +pub static RATE_LIMIT_VIOLATIONS: LazyLock = LazyLock::new(|| { + register_int_counter!("rate_limit_violations", "Number of rate limit violations by clients") + .unwrap() +}); diff --git a/bin/tx-prover/src/proxy/mod.rs b/bin/tx-prover/src/proxy/mod.rs index 73b654588..090798a12 100644 --- a/bin/tx-prover/src/proxy/mod.rs +++ b/bin/tx-prover/src/proxy/mod.rs @@ -1,7 +1,18 @@ -use std::{collections::VecDeque, future::Future, pin::Pin, sync::Arc, time::Duration}; +use std::{ + collections::VecDeque, + future::Future, + pin::Pin, + sync::Arc, + time::{Duration, Instant}, +}; use async_trait::async_trait; use bytes::Bytes; +use metrics::{ + QUEUE_LATENCY, QUEUE_SIZE, RATE_LIMITED_REQUESTS, RATE_LIMIT_VIOLATIONS, REQUEST_COUNT, + REQUEST_FAILURE_COUNT, REQUEST_LATENCY, REQUEST_RETRIES, WORKER_BUSY, WORKER_COUNT, + WORKER_REQUEST_COUNT, WORKER_UNHEALTHY, +}; use once_cell::sync::Lazy; use pingora::{ http::ResponseHeader, @@ -32,6 +43,7 @@ use crate::{ }, }; +pub mod metrics; mod worker; /// Localhost address @@ -73,6 +85,11 @@ impl LoadBalancerState { workers.push(Worker::new(worker, connection_timeout, total_timeout).await?); } + WORKER_COUNT.set(workers.len() as i64); + RATE_LIMIT_VIOLATIONS.reset(); + 
RATE_LIMITED_REQUESTS.reset(); + REQUEST_RETRIES.reset(); + Ok(Self { workers: Arc::new(RwLock::new(workers)), timeout_secs: total_timeout, @@ -94,6 +111,7 @@ impl LoadBalancerState { let mut available_workers = self.workers.write().await; available_workers.iter_mut().find(|w| w.is_available()).map(|w| { w.set_availability(false); + WORKER_BUSY.inc(); w.clone() }) } @@ -159,6 +177,7 @@ impl LoadBalancerState { } info!("Workers updated: {:?}", workers); + WORKER_COUNT.set(workers.len() as i64); Ok(()) } @@ -168,6 +187,11 @@ impl LoadBalancerState { self.workers.read().await.len() } + /// Get the number of busy workers. + pub async fn num_busy_workers(&self) -> usize { + self.workers.read().await.iter().filter(|w| !w.is_available()).count() + } + /// Handles the update workers request. /// /// # Behavior @@ -256,39 +280,51 @@ static RATE_LIMITER: Lazy = Lazy::new(|| Rate::new(Duration::from_secs(1)) // REQUEST QUEUE // ================================================================================================ -/// Request queue holds the list of requests that are waiting to be processed by the workers. +/// Request queue holds the list of requests that are waiting to be processed by the workers and +/// the time they were enqueued. /// It is used to keep track of the order of the requests to then assign them to the workers. 
pub struct RequestQueue { - queue: RwLock>, + queue: RwLock>, } impl RequestQueue { /// Create a new empty request queue + #[allow(clippy::new_without_default)] pub fn new() -> Self { + QUEUE_SIZE.set(0); Self { queue: RwLock::new(VecDeque::new()) } } /// Get the length of the queue + #[allow(clippy::len_without_is_empty)] pub async fn len(&self) -> usize { self.queue.read().await.len() } /// Enqueue a request pub async fn enqueue(&self, request_id: Uuid) { + QUEUE_SIZE.inc(); let mut queue = self.queue.write().await; - queue.push_back(request_id); + queue.push_back((request_id, Instant::now())); } /// Dequeue a request pub async fn dequeue(&self) -> Option { let mut queue = self.queue.write().await; - queue.pop_front() + // If the queue was empty, the queue size does not change + if let Some((request_id, queued_time)) = queue.pop_front() { + QUEUE_SIZE.dec(); + QUEUE_LATENCY.observe(queued_time.elapsed().as_secs_f64()); + Some(request_id) + } else { + None + } } /// Peek at the first request in the queue pub async fn peek(&self) -> Option { let queue = self.queue.read().await; - queue.front().copied() + queue.front().copied().map(|(request_id, _)| request_id) } } @@ -299,8 +335,10 @@ static QUEUE: Lazy = Lazy::new(RequestQueue::new); // ================================================================================================ /// Custom context for the request/response lifecycle +/// /// We use this context to keep track of the number of tries for a request, the unique ID for the -/// request, and the worker that will process the request. +/// request, the worker that will process the request, a span that will be used for traces along +/// the transaction execution, and a timer to track how long the request took. 
#[derive(Debug)] pub struct RequestContext { /// Number of tries for the request @@ -311,6 +349,8 @@ pub struct RequestContext { worker: Option, /// Parent span for the request parent_span: Span, + /// Time when the request was created + created_at: Instant, } impl RequestContext { @@ -322,11 +362,13 @@ impl RequestContext { request_id, worker: None, parent_span: info_span!(target: MIDEN_TX_PROVER, "proxy:new_request", request_id = request_id.to_string()), + created_at: Instant::now(), } } /// Set the worker that will process the request fn set_worker(&mut self, worker: Worker) { + WORKER_REQUEST_COUNT.with_label_values(&[&worker.address()]).inc(); self.worker = Some(worker); } } @@ -400,6 +442,9 @@ impl ProxyHttp for LoadBalancer { } } + // Increment the request count + REQUEST_COUNT.inc(); + let user_id = Some(client_addr); // Retrieve the current window requests @@ -407,6 +452,13 @@ impl ProxyHttp for LoadBalancer { // Rate limit the request if curr_window_requests > self.0.max_req_per_sec { + RATE_LIMITED_REQUESTS.inc(); + + // Only count a violation the first time in a given window + if curr_window_requests == self.0.max_req_per_sec + 1 { + RATE_LIMIT_VIOLATIONS.inc(); + } + return create_too_many_requests_response(session, self.0.max_req_per_sec).await; }; @@ -524,6 +576,7 @@ impl ProxyHttp for LoadBalancer { if ctx.tries > self.0.max_retries_per_request { return e; } + REQUEST_RETRIES.inc(); ctx.tries += 1; e.set_retry(true); e @@ -539,6 +592,7 @@ impl ProxyHttp for LoadBalancer { Self::CTX: Send + Sync, { if let Some(e) = e { + REQUEST_FAILURE_COUNT.inc(); error!("Error: {:?}", e); } @@ -546,6 +600,11 @@ impl ProxyHttp for LoadBalancer { if let Some(worker) = ctx.worker.take() { self.0.add_available_worker(worker).await; } + + REQUEST_LATENCY.observe(ctx.created_at.elapsed().as_secs_f64()); + + // Update the number of busy workers + WORKER_BUSY.set(self.0.num_busy_workers().await as i64); } // The following methods are a copy of the default implementation 
defined in the trait, but @@ -723,6 +782,7 @@ impl BackgroundService for LoadBalancerState { let _guard = span.enter(); let mut workers = self.workers.write().await; + let initial_workers_len = workers.len(); // Perform health checks on workers and retain healthy ones let healthy_workers = self.check_workers_health(workers.iter_mut()).await; @@ -730,6 +790,11 @@ impl BackgroundService for LoadBalancerState { // Update the worker list with healthy workers *workers = healthy_workers; + // Update the worker count and worker unhealthy count metrics + WORKER_COUNT.set(workers.len() as i64); + let unhealthy_workers = initial_workers_len - workers.len(); + WORKER_UNHEALTHY.inc_by(unhealthy_workers as u64); + // Sleep for the defined interval before the next health check sleep(self.health_check_frequency).await; } diff --git a/bin/tx-prover/src/utils.rs b/bin/tx-prover/src/utils.rs index 6cfdc2ac8..9d8718d99 100644 --- a/bin/tx-prover/src/utils.rs +++ b/bin/tx-prover/src/utils.rs @@ -16,7 +16,7 @@ use tonic::transport::Channel; use tonic_health::pb::health_client::HealthClient; use tracing_subscriber::{layer::SubscriberExt, Registry}; -use crate::error::TxProverServiceError; +use crate::{error::TxProverServiceError, proxy::metrics::QUEUE_DROP_COUNT}; pub const MIDEN_TX_PROVER: &str = "miden-tx-prover"; @@ -112,6 +112,10 @@ pub(crate) async fn create_queue_full_response( error.set_cause("Too many requests in the queue"); session.write_response_header(Box::new(header), false).await?; + + // Increment the queue drop count metric + QUEUE_DROP_COUNT.inc(); + Err(error) }