From f0b411ef926c76660a44cb47bf78751223ce567e Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Thu, 12 Dec 2024 21:26:22 +0000 Subject: [PATCH 01/27] feat(database monitor): handles multi-region and connection pooling - rewritten in typescript --- .circleci/config.yml | 3 +- packages/database-monitor/.env.example | 7 + packages/database-monitor/Dockerfile | 51 +++ packages/database-monitor/README.md | 17 + packages/database-monitor/eslint.config.mjs | 61 ++++ .../database-monitor/multiregion.example.json | 32 ++ packages/database-monitor/package.json | 65 ++++ packages/database-monitor/src/aliasLoader.ts | 8 + packages/database-monitor/src/bin.ts | 9 + packages/database-monitor/src/bootstrap.ts | 2 + packages/database-monitor/src/clients/knex.ts | 53 +++ packages/database-monitor/src/domain/const.ts | 1 + .../src/observability/expressLogging.ts | 96 +++++ .../src/observability/logging.ts | 14 + .../src/observability/metricsApp.ts | 26 ++ .../src/observability/metricsRoute.ts | 15 + .../src/observability/prometheusMetrics.ts | 333 ++++++++++++++++++ packages/database-monitor/src/root.ts | 21 ++ .../src/server/routes/index.ts | 13 + .../database-monitor/src/server/server.ts | 88 +++++ packages/database-monitor/src/utils/env.ts | 22 ++ .../src/utils/errorHandler.ts | 25 ++ .../database-monitor/tests/helpers/helpers.ts | 45 +++ .../tests/helpers/testExtensions.ts | 65 ++++ .../tests/helpers/testKnexClient.ts | 20 ++ .../tests/hooks/globalSetup.ts | 50 +++ packages/database-monitor/tsconfig.build.json | 5 + packages/database-monitor/tsconfig.json | 109 ++++++ packages/database-monitor/vitest.config.ts | 20 ++ packages/preview-service/src/clients/knex.ts | 2 +- .../highfrequencyMonitoring.ts | 6 +- packages/server/logging/index.ts | 3 +- utils/monitor-deployment/Dockerfile | 33 -- utils/monitor-deployment/dev_run.sh | 10 - utils/monitor-deployment/requirements.txt | 3 - utils/monitor-deployment/src/run.py | 211 ----------- 
yarn.lock | 39 +- 37 files changed, 1318 insertions(+), 265 deletions(-) create mode 100644 packages/database-monitor/.env.example create mode 100644 packages/database-monitor/Dockerfile create mode 100644 packages/database-monitor/README.md create mode 100644 packages/database-monitor/eslint.config.mjs create mode 100644 packages/database-monitor/multiregion.example.json create mode 100644 packages/database-monitor/package.json create mode 100644 packages/database-monitor/src/aliasLoader.ts create mode 100644 packages/database-monitor/src/bin.ts create mode 100644 packages/database-monitor/src/bootstrap.ts create mode 100644 packages/database-monitor/src/clients/knex.ts create mode 100644 packages/database-monitor/src/domain/const.ts create mode 100644 packages/database-monitor/src/observability/expressLogging.ts create mode 100644 packages/database-monitor/src/observability/logging.ts create mode 100644 packages/database-monitor/src/observability/metricsApp.ts create mode 100644 packages/database-monitor/src/observability/metricsRoute.ts create mode 100644 packages/database-monitor/src/observability/prometheusMetrics.ts create mode 100644 packages/database-monitor/src/root.ts create mode 100644 packages/database-monitor/src/server/routes/index.ts create mode 100644 packages/database-monitor/src/server/server.ts create mode 100644 packages/database-monitor/src/utils/env.ts create mode 100644 packages/database-monitor/src/utils/errorHandler.ts create mode 100644 packages/database-monitor/tests/helpers/helpers.ts create mode 100644 packages/database-monitor/tests/helpers/testExtensions.ts create mode 100644 packages/database-monitor/tests/helpers/testKnexClient.ts create mode 100644 packages/database-monitor/tests/hooks/globalSetup.ts create mode 100644 packages/database-monitor/tsconfig.build.json create mode 100644 packages/database-monitor/tsconfig.json create mode 100644 packages/database-monitor/vitest.config.ts delete mode 100644 
utils/monitor-deployment/Dockerfile delete mode 100755 utils/monitor-deployment/dev_run.sh delete mode 100644 utils/monitor-deployment/requirements.txt delete mode 100644 utils/monitor-deployment/src/run.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 9a7eda562c..c3588c135f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1049,8 +1049,7 @@ jobs: docker-build-monitor-container: <<: *build-job environment: - FOLDER: utils - SPECKLE_SERVER_PACKAGE: monitor-deployment + SPECKLE_SERVER_PACKAGE: database-monitor docker-build-docker-compose-ingress: <<: *build-job diff --git a/packages/database-monitor/.env.example b/packages/database-monitor/.env.example new file mode 100644 index 0000000000..3ac8c2c9ac --- /dev/null +++ b/packages/database-monitor/.env.example @@ -0,0 +1,7 @@ +PG_CONNECTION_STRING='postgres://speckle:speckle@127.0.0.1/speckle' +POSTGRES_MAX_CONNECTIONS='2' +PROMETHEUS_METRICS_PORT='9092' +LOG_LEVEL='info' +LOG_PRETTY='true' +METRICS_COLLECTION_PERIOD_SECONDS='120' +FF_WORKSPACES_MULTI_REGION_ENABLED='false' diff --git a/packages/database-monitor/Dockerfile b/packages/database-monitor/Dockerfile new file mode 100644 index 0000000000..48e25ead95 --- /dev/null +++ b/packages/database-monitor/Dockerfile @@ -0,0 +1,51 @@ +# NOTE: Docker context must be the git root directory, to include the shared directory +ARG NODE_ENV=production + +FROM node:18-bookworm-slim@sha256:408f8cbbb7b33a5bb94bdb8862795a94d2b64c2d516856824fd86c4a5594a443 AS build-stage + +WORKDIR /speckle-server + +# Download tini +ARG TINI_VERSION=v0.19.0 +ENV TINI_VERSION=${TINI_VERSION} +ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini ./tini +RUN chmod +x ./tini + +ARG NODE_ENV +ENV NODE_ENV=${NODE_ENV} + +WORKDIR /speckle-server + +COPY .yarnrc.yml . 
+COPY .yarn ./.yarn +COPY package.json yarn.lock ./ + +# Only copy in the relevant package.json files for the dependencies +COPY packages/shared/package.json ./packages/shared/ +COPY packages/database-monitor/package.json ./packages/database-monitor/ + +RUN yarn workspaces focus -A && yarn install + +# Only copy in the relevant source files for the dependencies +COPY packages/shared ./packages/shared/ +COPY packages/database-monitor ./packages/database-monitor/ + +RUN yarn workspaces foreach -W run build + +WORKDIR /speckle-server/packages/database-monitor +RUN yarn workspaces focus --production + +FROM gcr.io/distroless/nodejs18-debian12:nonroot@sha256:afdea027580f7afcaf1f316b2b3806690c297cb3ce6ddc5cf6a15804dc1c790f AS production-stage + +ARG NODE_ENV +ENV NODE_ENV=${NODE_ENV} + +WORKDIR /speckle-server +COPY --from=build-stage /speckle-server/tini /usr/bin/tini +COPY --from=build-stage /speckle-server/packages/shared ./packages/shared +COPY --from=build-stage /speckle-server/packages/database-monitor ./packages/database-monitor +COPY --from=build-stage /speckle-server/node_modules ./node_modules + +WORKDIR /speckle-server/packages/database-monitor + +ENTRYPOINT [ "tini", "--", "/nodejs/bin/node", "--loader=./dist/src/aliasLoader.js", "bin/www.js" ] diff --git a/packages/database-monitor/README.md b/packages/database-monitor/README.md new file mode 100644 index 0000000000..043b811e04 --- /dev/null +++ b/packages/database-monitor/README.md @@ -0,0 +1,17 @@ +# Database Monitor + +Responsible for querying all databases and generating metrics. + +Metrics are available at `/metrics` endpoint and are in Prometheus format. 
+ +## Development + +```bash +yarn dev +``` + +## Production + +```bash +yarn start +``` diff --git a/packages/database-monitor/eslint.config.mjs b/packages/database-monitor/eslint.config.mjs new file mode 100644 index 0000000000..884f851ba5 --- /dev/null +++ b/packages/database-monitor/eslint.config.mjs @@ -0,0 +1,61 @@ +import tseslint from 'typescript-eslint' +import { + baseConfigs, + getESMDirname, + globals, + prettierConfig +} from '../../eslint.config.mjs' + +const configs = [ + ...baseConfigs, + { + ignores: ['dist', 'public', 'docs'] + }, + { + files: ['**/*.js'], + ignores: ['**/*.mjs'], + languageOptions: { + sourceType: 'module', + globals: { + ...globals.node + } + } + }, + { + files: ['bin/www'], + languageOptions: { + sourceType: 'module', + globals: { + ...globals.node + } + } + }, + ...tseslint.configs.recommendedTypeChecked.map((c) => ({ + ...c, + files: [...(c.files || []), '**/*.ts', '**/*.d.ts'] + })), + { + files: ['**/*.ts', '**/*.d.ts'], + languageOptions: { + parserOptions: { + tsconfigRootDir: getESMDirname(import.meta.url), + project: './tsconfig.json' + } + }, + rules: { + '@typescript-eslint/no-explicit-any': 'error', + '@typescript-eslint/no-unsafe-return': 'error' + } + }, + { + files: ['**/*.spec.{js,ts}'], + languageOptions: { + globals: { + ...globals.node + } + } + }, + prettierConfig +] + +export default configs diff --git a/packages/database-monitor/multiregion.example.json b/packages/database-monitor/multiregion.example.json new file mode 100644 index 0000000000..77e82f94ef --- /dev/null +++ b/packages/database-monitor/multiregion.example.json @@ -0,0 +1,32 @@ +{ + "main": { + "postgres": { + "connectionUri": "postgresql://speckle:speckle@127.0.0.1:5432/speckle", + "privateConnectionUri": "postgresql://speckle:speckle@postgres:5432/speckle" + }, + "blobStorage": { + "accessKey": "minioadmin", + "secretKey": "minioadmin", + "bucket": "speckle-server", + "createBucketIfNotExists": true, + "endpoint": "http://127.0.0.1:9000", + 
"s3Region": "us-east-1" + } + }, + "regions": { + "region1": { + "postgres": { + "connectionUri": "postgresql://speckle:speckle@127.0.0.1:5401/speckle", + "privateConnectionUri": "postgresql://speckle:speckle@postgres-region1:5432/speckle" + }, + "blobStorage": { + "accessKey": "minioadmin", + "secretKey": "minioadmin", + "bucket": "speckle-server", + "createBucketIfNotExists": true, + "endpoint": "http://127.0.0.1:9020", + "s3Region": "us-east-1" + } + } + } +} diff --git a/packages/database-monitor/package.json b/packages/database-monitor/package.json new file mode 100644 index 0000000000..4ff826e119 --- /dev/null +++ b/packages/database-monitor/package.json @@ -0,0 +1,65 @@ +{ + "name": "@speckle/database-monitor", + "private": true, + "version": "2.5.4", + "description": "Query connected databases and generate metrics.", + "main": "bin/www", + "homepage": "https://speckle.systems", + "repository": { + "type": "git", + "url": "https://github.com/specklesystems/speckle-server.git", + "directory": "packages/database-monitor" + }, + "type": "module", + "engines": { + "node": "^18.19.0" + }, + "scripts": { + "build:tsc:watch": "tsc -p ./tsconfig.build.json --watch", + "run:watch": "NODE_ENV=development LOG_PRETTY=true LOG_LEVEL=debug nodemon --exec \"yarn start\" --trace-deprecation --watch ./bin/www.js --watch ./dist", + "dev": "concurrently \"npm:build:tsc:watch\" \"npm:run:watch\"", + "dev:headed": "yarn dev", + "build:tsc": "rimraf ./dist/src && tsc -p ./tsconfig.build.json", + "build": "yarn build:tsc", + "lint": "yarn lint:tsc && yarn lint:eslint", + "lint:ci": "yarn lint:tsc", + "lint:tsc": "tsc --noEmit", + "lint:eslint": "eslint .", + "start": "node --loader=./dist/src/aliasLoader.js ./bin/www.js", + "test": "NODE_ENV=test LOG_LEVEL=silent LOG_PRETTY=true vitest run --sequence.shuffle" + }, + "dependencies": { + "@speckle/shared": "workspace:^", + "crypto": "^1.0.1", + "dotenv": "^16.4.5", + "esm-module-alias": "^2.2.0", + "express": "^4.19.2", + 
"http-errors": "~1.6.3", + "knex": "^2.4.1", + "lodash": "^4.17.21", + "lodash-es": "^4.17.21", + "pg": "^8.7.3", + "pino": "^8.7.0", + "pino-http": "^8.2.1", + "pino-pretty": "^9.1.1", + "prom-client": "^14.0.1" + }, + "devDependencies": { + "@types/express": "^4.17.13", + "@types/http-errors": "^2.0.4", + "@types/lodash-es": "^4.17.6", + "@types/node": "^18.19.38", + "@vitest/coverage-istanbul": "^1.6.0", + "concurrently": "^8.2.2", + "crypto-random-string": "^5.0.0", + "eslint": "^9.4.0", + "eslint-config-prettier": "^9.1.0", + "eslint-plugin-vitest": "^0.5.4", + "nodemon": "^2.0.20", + "prettier": "^2.5.1", + "rimraf": "^5.0.7", + "typescript": "^4.6.4", + "typescript-eslint": "^7.12.0", + "vitest": "^1.6.0" + } +} diff --git a/packages/database-monitor/src/aliasLoader.ts b/packages/database-monitor/src/aliasLoader.ts new file mode 100644 index 0000000000..8deeda0895 --- /dev/null +++ b/packages/database-monitor/src/aliasLoader.ts @@ -0,0 +1,8 @@ +import generateAliasesResolver from 'esm-module-alias' +import { packageRoot, srcRoot } from './root.js' +import path from 'node:path' + +export const resolve = generateAliasesResolver({ + '@': srcRoot, + '#': path.resolve(packageRoot, './tests') +}) diff --git a/packages/database-monitor/src/bin.ts b/packages/database-monitor/src/bin.ts new file mode 100644 index 0000000000..2ec8a8f77b --- /dev/null +++ b/packages/database-monitor/src/bin.ts @@ -0,0 +1,9 @@ +import '@/bootstrap.js' // This has side-effects and has to be imported first + +import { startServer } from '@/server/server.js' + +const start = () => { + startServer() +} + +start() diff --git a/packages/database-monitor/src/bootstrap.ts b/packages/database-monitor/src/bootstrap.ts new file mode 100644 index 0000000000..50c8721e6a --- /dev/null +++ b/packages/database-monitor/src/bootstrap.ts @@ -0,0 +1,2 @@ +import dotenv from 'dotenv' +dotenv.config() diff --git a/packages/database-monitor/src/clients/knex.ts b/packages/database-monitor/src/clients/knex.ts 
new file mode 100644 index 0000000000..d47a234990 --- /dev/null +++ b/packages/database-monitor/src/clients/knex.ts @@ -0,0 +1,53 @@ +import { knexLogger as logger } from '@/observability/logging.js' +import { + getPostgresConnectionString, + getPostgresMaxConnections, + isDevOrTestEnv, + isTest +} from '@/utils/env.js' +import Environment from '@speckle/shared/dist/commonjs/environment/index.js' +import { + loadMultiRegionsConfig, + configureKnexClient +} from '@speckle/shared/dist/commonjs/environment/multiRegionConfig.js' + +const { FF_WORKSPACES_MULTI_REGION_ENABLED } = Environment.getFeatureFlags() + +type ConfiguredKnexClient = ReturnType +export type DbClients = Record<'main', ConfiguredKnexClient> & + Record +let dbClients: DbClients + +export const getDbClients = async () => { + if (dbClients) return dbClients + const maxConnections = getPostgresMaxConnections() + + const configArgs = { + migrationDirs: [], + isTestEnv: isTest(), + isDevOrTestEnv: isDevOrTestEnv(), + logger, + maxConnections, + applicationName: 'speckle_database_monitor' + } + if (!FF_WORKSPACES_MULTI_REGION_ENABLED) { + const mainClient = configureKnexClient( + { + postgres: { + connectionUri: getPostgresConnectionString() + } + }, + configArgs + ) + dbClients = { main: mainClient } + } else { + const configPath = process.env.MULTI_REGION_CONFIG_PATH || 'multiregion.json' + const config = await loadMultiRegionsConfig({ path: configPath }) + const clients = [['main', configureKnexClient(config.main, configArgs)]] + Object.entries(config.regions).map(([key, config]) => { + clients.push([key, configureKnexClient(config, configArgs)]) + }) + dbClients = Object.fromEntries(clients) as DbClients + } + return dbClients +} diff --git a/packages/database-monitor/src/domain/const.ts b/packages/database-monitor/src/domain/const.ts new file mode 100644 index 0000000000..898869b284 --- /dev/null +++ b/packages/database-monitor/src/domain/const.ts @@ -0,0 +1 @@ +export const REQUEST_ID_HEADER = 
'x-request-id' diff --git a/packages/database-monitor/src/observability/expressLogging.ts b/packages/database-monitor/src/observability/expressLogging.ts new file mode 100644 index 0000000000..8beb5a6437 --- /dev/null +++ b/packages/database-monitor/src/observability/expressLogging.ts @@ -0,0 +1,96 @@ +import { REQUEST_ID_HEADER } from '@/domain/const.js' +import { logger } from '@/observability/logging.js' +import { randomUUID } from 'crypto' +import type { Request } from 'express' +import type { IncomingHttpHeaders, IncomingMessage } from 'http' +import { get } from 'lodash-es' +import { pinoHttp } from 'pino-http' +import pino from 'pino' +import { parse } from 'url' + +function determineRequestId(headers: IncomingHttpHeaders, uuidGenerator = randomUUID) { + const idHeader = headers[REQUEST_ID_HEADER] + if (!idHeader) return uuidGenerator() + if (Array.isArray(idHeader)) return idHeader[0] ?? uuidGenerator() + return idHeader +} + +const generateReqId = (req: IncomingMessage) => determineRequestId(req.headers) + +export const loggingExpressMiddleware = pinoHttp({ + genReqId: generateReqId, + logger, + autoLogging: true, + // this is here, to force logging 500 responses as errors in the final log + // and we don't really care about 3xx stuff + // all the user related 4xx responses are treated as info + customLogLevel: (req, res, error) => { + const path = getRequestPath(req) + const shouldBeDebug = ['/metrics'].includes(path || '') ?? false + + if (res.statusCode >= 400 && res.statusCode < 500) { + return 'info' + } else if (res.statusCode >= 500 || error) { + return 'error' + } else if (res.statusCode >= 300 && res.statusCode < 400) { + return 'silent' + } + + return shouldBeDebug ? 'debug' : 'info' + }, + + // we need to redact any potential sensitive data from being logged. 
+ // as we do not know what headers may be sent in a request by a user or client + // we have to allow list selected headers + serializers: { + req: pino.stdSerializers.wrapRequestSerializer((req) => { + return { + id: req.raw.id, + method: req.raw.method, + path: getRequestPath(req.raw), + // Denylist potentially sensitive query parameters + pathParameters: sanitizeQueryParams(getRequestParameters(req.raw)), + // Denylist potentially sensitive headers + headers: sanitizeHeaders(req.raw.headers) + } + }) + } +}) + +const getRequestPath = (req: IncomingMessage | Request) => { + const path = ((get(req, 'originalUrl') || get(req, 'url') || '') as string).split( + '?' + )[0] + return path?.length ? path : null +} + +const getRequestParameters = (req: IncomingMessage | Request) => { + const maybeUrl = (get(req, 'originalUrl') as string) || get(req, 'url') || '' + const url = parse(maybeUrl, true) + return url.query || {} +} + +const sanitizeHeaders = (headers: Record) => + Object.fromEntries( + Object.entries(headers).filter( + ([key]) => + ![ + 'cookie', + 'authorization', + 'cf-connecting-ip', + 'true-client-ip', + 'x-real-ip', + 'x-forwarded-for', + 'x-original-forwarded-for' + ].includes(key.toLocaleLowerCase()) + ) + ) + +const sanitizeQueryParams = (query: Record) => { + Object.keys(query).forEach(function (key) { + if (['code', 'state'].includes(key.toLocaleLowerCase())) { + query[key] = '******' + } + }) + return query +} diff --git a/packages/database-monitor/src/observability/logging.ts b/packages/database-monitor/src/observability/logging.ts new file mode 100644 index 0000000000..8fef22a6c9 --- /dev/null +++ b/packages/database-monitor/src/observability/logging.ts @@ -0,0 +1,14 @@ +import { getLogLevel, isLogPretty } from '@/utils/env.js' +import { + extendLoggerComponent as elc, + getLogger +} from '@speckle/shared/dist/commonjs/observability/index.js' +export const extendLoggerComponent = elc + +export const logger = extendLoggerComponent( + 
getLogger(getLogLevel(), isLogPretty()), + 'database-monitor' +) +export const serverLogger = extendLoggerComponent(logger, 'server') +export const testLogger = getLogger(getLogLevel(), isLogPretty()) +export const knexLogger = extendLoggerComponent(logger, 'knex') diff --git a/packages/database-monitor/src/observability/metricsApp.ts b/packages/database-monitor/src/observability/metricsApp.ts new file mode 100644 index 0000000000..6d43a94046 --- /dev/null +++ b/packages/database-monitor/src/observability/metricsApp.ts @@ -0,0 +1,26 @@ +import { loggingExpressMiddleware } from '@/observability/expressLogging.js' +import { metricsRouterFactory } from '@/observability/metricsRoute.js' +import { initPrometheusMetrics } from '@/observability/prometheusMetrics.js' +import { errorHandler } from '@/utils/errorHandler.js' +import express from 'express' +import createError from 'http-errors' + +export const appFactory = () => { + initPrometheusMetrics() + const app = express() + + app.use(loggingExpressMiddleware) + app.use(express.json({ limit: '100mb' })) + app.use(express.urlencoded({ limit: '100mb', extended: false })) + + app.use('/metrics', metricsRouterFactory()) + + // catch 404 and forward to error handler + app.use(function (req, _res, next) { + next(createError(404, `Not Found: ${req.url}`)) + }) + app.set('json spaces', 2) // pretty print json + + app.use(errorHandler) + return app +} diff --git a/packages/database-monitor/src/observability/metricsRoute.ts b/packages/database-monitor/src/observability/metricsRoute.ts new file mode 100644 index 0000000000..3b661adaef --- /dev/null +++ b/packages/database-monitor/src/observability/metricsRoute.ts @@ -0,0 +1,15 @@ +import express, { RequestHandler } from 'express' +import prometheusClient from 'prom-client' + +export const metricsRouterFactory = () => { + const metricsRouter = express.Router() + + metricsRouter.get( + '/', //root path of the sub-path to which this router is attached (should be `/metrics`) + (async 
(_req, res) => { + res.setHeader('Content-Type', prometheusClient.register.contentType) + res.end(await prometheusClient.register.metrics()) + }) as RequestHandler //FIXME: this works around a type error with async, which is resolved in express 5 + ) + return metricsRouter +} diff --git a/packages/database-monitor/src/observability/prometheusMetrics.ts b/packages/database-monitor/src/observability/prometheusMetrics.ts new file mode 100644 index 0000000000..d143830c38 --- /dev/null +++ b/packages/database-monitor/src/observability/prometheusMetrics.ts @@ -0,0 +1,333 @@ +import { DbClients, getDbClients } from '@/clients/knex.js' +import { logger } from '@/observability/logging.js' +import { databaseMonitorCollectionPeriodSeconds } from '@/utils/env.js' +import { get, join } from 'lodash-es' +import { Histogram, Registry } from 'prom-client' +import prometheusClient from 'prom-client' + +let prometheusInitialized = false + +function isPrometheusInitialized() { + return prometheusInitialized +} + +type MetricConfig = { + prefix?: string + labels?: Record + buckets?: Record + getDbClients: () => Promise +} + +type MetricsMonitor = { + start: () => () => void +} + +function initMonitoringMetrics(params: { + register: Registry + collectionPeriodMilliseconds: number + config: MetricConfig +}): MetricsMonitor { + logger.info('Initializing monitoring metrics...') + const { register, collectionPeriodMilliseconds, config } = params + const registers = register ? [register] : undefined + const namePrefix = config.prefix ?? '' + const labels = config.labels ?? 
{} + const labelNames = Object.keys(labels) + const getDbClients = config.getDbClients + + const dbSize = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_size'], '_'), + help: 'Size of the entire database (in bytes)', + labelNames: ['region', ...labelNames] + }) + const objects = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_objects'], '_'), + help: 'Number of objects', + labelNames + }) + const streams = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_streams'], '_'), + help: 'Number of streams/projects', + labelNames + }) + const commits = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_commits'], '_'), + help: 'Number of commits/versions', + labelNames + }) + const users = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_users'], '_'), + help: 'Number of users', + labelNames + }) + const fileimports = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_fileimports'], '_'), + help: 'Number of imported files, by type and status', + labelNames: ['filetype', 'status', ...labelNames] + }) + const filesize = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_filesize'], '_'), + help: 'Size of imported files, by type (in bytes)', + labelNames: ['filetype', ...labelNames] + }) + const webhooks = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_webhooks'], '_'), + help: 'Number of webhook calls, by status', + labelNames: ['status', ...labelNames] + }) + const previews = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_previews'], '_'), + help: 'Number of previews, by status', + labelNames: ['status', ...labelNames] + }) + const tablesize = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_tablesize'], '_'), + help: 'Size of tables in the database, by table (in bytes)', + labelNames: ['table', 'region', ...labelNames] + }) + + const selfMonitor = new Histogram({ + name: join([namePrefix, 'self_monitor_time_monitoring_metrics'], '_'), + help: 'The time taken to collect all 
of the database monitoring metrics, seconds.', + registers, + buckets: [0, 0.1, 0.25, 0.5, 1, 2, 5, 10], + labelNames + }) + + const collect = async () => { + const dbClientsRecord = await getDbClients() + const dbClients = [ + ...Object.entries(dbClientsRecord).map(([regionKey, client]) => ({ + client: client.private, //this has to be the private client, as we need to get the database name from the connection string. The public client, if connected via a connection pool, would report the connection pool name rather than the database name. + isMain: regionKey === 'main', + regionKey + })) + ] + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + if (!client) { + logger.error({ regionKey }, 'Could not get private client for region') + return + } + logger.info({ regionKey }, 'Collecting monitoring metrics for region') + const connectionString: string = String( + get(client.client, ['config', 'connection', 'connectionString'], '') + ) + if (!connectionString) { + logger.warn( + { regionKey }, + 'Could not get connection string from client config' + ) + } + const databaseName = new URL(connectionString).pathname?.split('/').pop() + if (databaseName) { + const dbSizeResult = await client.raw<{ + rows: [{ pg_database_size: string }] //bigints are returned as strings + }>('SELECT pg_database_size(?) LIMIT 1', [databaseName]) + dbSize.set( + { ...labels, region: regionKey }, + parseInt(dbSizeResult.rows[0].pg_database_size) //FIXME risk this bigint being too big for JS, but that would be a very large database!
+ ) + } else { + logger.warn({ regionKey }, 'Could not get database name from client config') + } + + const tableSizeResults = await client.raw<{ + rows: [{ table_name: string; table_size: string }] //bigints are returned as strings + }>( + ` + SELECT + table_name, + table_size + + FROM ( + SELECT + pg_catalog.pg_namespace.nspname AS schema_name, + relname AS table_name, + pg_relation_size(pg_catalog.pg_class.oid) AS table_size + + FROM pg_catalog.pg_class + JOIN pg_catalog.pg_namespace ON relnamespace = pg_catalog.pg_namespace.oid + ) t + WHERE schema_name = 'public' + ORDER BY table_size DESC; + ` + ) + for (const row of tableSizeResults.rows) { + tablesize.set( + { ...labels, table: row.table_name, region: regionKey }, + parseInt(row.table_size) //FIXME risk this bigint being too big for JS + ) + } + }) + ) + + const mainDbClient = dbClients.find((c) => c.isMain)?.client + if (!mainDbClient) { + logger.warn('Could not find main database client') + return + } + + // Counts for users, streams, commits, objects + const objectsEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( + "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'objects' LIMIT 1;" + ) + objects.set({ ...labels }, objectsEstimate.rows[0].estimate) + const streamsEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( + "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'streams' LIMIT 1;" + ) + streams.set({ ...labels }, streamsEstimate.rows[0].estimate) + const commitsEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( + "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'commits' LIMIT 1;" + ) + commits.set({ ...labels }, commitsEstimate.rows[0].estimate) + const usersEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( + "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'users' LIMIT 1;" + ) + users.set({ ...labels }, usersEstimate.rows[0].estimate) + + const importedFiles = await 
mainDbClient.raw<{ + rows: [{ fileType: string; convertedStatus: number; count: number }] + }>( + ` + SELECT LOWER("fileType") AS "fileType", "convertedStatus", count(*) + FROM file_uploads + GROUP BY (LOWER("fileType"), "convertedStatus"); + ` + ) + + // Create zero-values for all possible combinations of file types and statuses + const allFileImportConvertedStatusAndFileTypes = importedFiles.rows.reduce( + (acc, row) => { + acc.convertedStatus.add(row.convertedStatus) + acc.fileType.add(row.fileType) + return acc + }, + { convertedStatus: new Set(), fileType: new Set() } + ) + const remainingConvertedStatusAndFileTypes = new Set<{ + fileType: string + status: number + }>() + allFileImportConvertedStatusAndFileTypes.convertedStatus.forEach((status) => { + allFileImportConvertedStatusAndFileTypes.fileType.forEach((fileType) => { + remainingConvertedStatusAndFileTypes.add({ fileType, status }) + }) + }) + + //it's a gauge, so the updated actual values will override the zero-values + for (const row of importedFiles.rows) { + remainingConvertedStatusAndFileTypes.delete({ + fileType: row.fileType, + status: row.convertedStatus + }) + fileimports.set( + { ...labels, filetype: row.fileType, status: row.convertedStatus.toString() }, + row.count + ) + } + // zero-values for all remaining file types and statuses + remainingConvertedStatusAndFileTypes.forEach(({ fileType, status }) => { + fileimports.set({ ...labels, filetype: fileType, status: status.toString() }, 0) + }) + + const fileSizeResults = await mainDbClient.raw<{ + rows: [{ fileType: string; fileSize: number }] + }>( + ` + SELECT LOWER("fileType") AS fileType, SUM("fileSize") AS fileSize + FROM file_uploads + GROUP BY LOWER("fileType"); + ` + ) + for (const row of fileSizeResults.rows) { + filesize.set({ ...labels, filetype: row.fileType }, row.fileSize) + } + + const webhookResults = await mainDbClient.raw<{ + rows: [{ status: number; count: number }] + }>( + ` + SELECT status, count(*) + FROM webhooks_events + 
GROUP BY status; + ` + ) + const remainingWebhookStatus = new Set(Array(4).keys()) + for (const row of webhookResults.rows) { + remainingWebhookStatus.delete(row.status) + webhooks.set({ ...labels, status: row.status.toString() }, row.count) + } + // zero-values for all remaining webhook statuses + remainingWebhookStatus.forEach((status) => { + webhooks.set({ ...labels, status: status.toString() }, 0) + }) + + const previewStatusResults = await mainDbClient.raw<{ + rows: [{ previewStatus: number; count: number }] + }>(` + SELECT "previewStatus", count(*) + FROM object_preview + GROUP BY "previewStatus"; + `) + + const remainingPreviewStatus = new Set(Array(4).keys()) + for (const row of previewStatusResults.rows) { + remainingPreviewStatus.delete(row.previewStatus) + previews.set({ ...labels, status: row.previewStatus.toString() }, row.count) + } + // zero-values for all remaining preview statuses + remainingPreviewStatus.forEach((status) => { + previews.set({ ...labels, status: status.toString() }, 0) + }) + } + + return { + start: () => { + const intervalId = setInterval(() => { + void (async () => { + const end = selfMonitor.startTimer() + await collect() + const duration = end() + logger.info( + { metricsCollectionDurationSeconds: duration }, + 'Collected monitoring metrics in {metricsCollectionDurationSeconds} seconds' + ) + })() + }, collectionPeriodMilliseconds) + return () => clearInterval(intervalId) // returns a handle which can be called to stop the monitoring + } + } +} + +export function initPrometheusMetrics() { + logger.info('Initializing Prometheus metrics...') + if (isPrometheusInitialized()) { + logger.info('Prometheus metrics already initialized') + return + } + + prometheusInitialized = true + + prometheusClient.register.clear() + prometheusClient.register.setDefaultLabels({ + project: 'speckle-server', + app: 'database-monitor' + }) + + try { + prometheusClient.collectDefaultMetrics() + const monitoringMetrics = initMonitoringMetrics({ + 
register: prometheusClient.register, + collectionPeriodMilliseconds: databaseMonitorCollectionPeriodSeconds() * 1000, + config: { + getDbClients, + prefix: 'speckle' + } + }) + monitoringMetrics.start() + } catch (e) { + logger.error(e, 'Failed to initialize Prometheus metrics.') + prometheusInitialized = false + } +} diff --git a/packages/database-monitor/src/root.ts b/packages/database-monitor/src/root.ts new file mode 100644 index 0000000000..13b51d800f --- /dev/null +++ b/packages/database-monitor/src/root.ts @@ -0,0 +1,21 @@ +import path from 'node:path' +import fs from 'node:fs' +import { fileURLToPath } from 'url' + +/** + * Singleton module for src root and package root directory resolution + */ + +const __filename = fileURLToPath(import.meta.url) +const srcRoot = path.dirname(__filename) + +// Recursively walk back from __dirname till we find our package.json +let packageRoot = srcRoot +while (packageRoot !== '/') { + if (fs.readdirSync(packageRoot).includes('package.json')) { + break + } + packageRoot = path.resolve(packageRoot, '..') +} + +export { srcRoot, packageRoot } diff --git a/packages/database-monitor/src/server/routes/index.ts b/packages/database-monitor/src/server/routes/index.ts new file mode 100644 index 0000000000..4faee34c04 --- /dev/null +++ b/packages/database-monitor/src/server/routes/index.ts @@ -0,0 +1,13 @@ +import express from 'express' + +const indexRouterFactory = () => { + const indexRouter = express.Router() + + indexRouter.get('/', (_req, res) => { + res.send('Speckle database monitoring, at your service.') + }) + + return indexRouter +} + +export default indexRouterFactory diff --git a/packages/database-monitor/src/server/server.ts b/packages/database-monitor/src/server/server.ts new file mode 100644 index 0000000000..4737e2f1da --- /dev/null +++ b/packages/database-monitor/src/server/server.ts @@ -0,0 +1,88 @@ +import { serverLogger } from '@/observability/logging.js' +import { appFactory as metricsAppFactory } from 
'@/observability/metricsApp.js' +import { getMetricsHost, getMetricsPort } from '@/utils/env.js' +import http from 'http' +import { isNaN, isString, toNumber } from 'lodash-es' + +export const startServer = (params?: { serveOnRandomPort?: boolean }) => { + // we place the metrics on a separate port as we wish to expose it to external monitoring tools, but do not wish to expose other routes (for now) + const inputMetricsPort = params?.serveOnRandomPort + ? 0 + : normalizePort(getMetricsPort()) + const metricsApp = metricsAppFactory() + metricsApp.set('port', inputMetricsPort) + + /** + * Create HTTP server. + */ + const metricsServer = http.createServer(metricsApp) + + const metricsHost = getMetricsHost() + metricsServer.on('error', onErrorFactory(inputMetricsPort)) + metricsServer.on('listening', () => { + serverLogger.info('📊 Started Database monitor server') + onListening(metricsServer) + }) + metricsServer.listen(inputMetricsPort, metricsHost) + + return { metricsServer } +} + +export const stopServer = (params: { server: http.Server }) => { + const { server } = params + server.close() +} + +/** + * Normalize a port into a number, string, or false. + */ +function normalizePort(val: string | number) { + const port = toNumber(val) + if (!isNaN(port) && port >= 0) return port + + throw new Error('Invalid port; port must be a positive integer.') +} + +/** + * Event listener for HTTP server "error" event. + */ + +const onErrorFactory = (port: string | number | false) => (error: Error) => { + if ('syscall' in error && error.syscall !== 'listen') { + throw error + } + + const bind = isString(port) ? 
'Pipe ' + port : 'Port ' + port + + if (!('code' in error)) throw error + + // handle specific listen errors with friendly messages + switch (error.code) { + case 'EACCES': + serverLogger.error(error, bind + ' requires elevated privileges') + process.exit(1) + case 'EADDRINUSE': + serverLogger.error(error, bind + ' is already in use') + process.exit(1) + default: + throw error + } +} + +/** + * Event listener for HTTP server "listening" event. + */ + +function onListening(referenceServer: http.Server) { + const addr = referenceServer.address() + if (!addr) throw new Error('Server address is not defined') + + switch (typeof addr) { + case 'string': + serverLogger.info(`Listening on pipe ${addr}`) + return addr + default: + serverLogger.info(`Listening on port ${addr.port}`) + return addr.port + } +} diff --git a/packages/database-monitor/src/utils/env.ts b/packages/database-monitor/src/utils/env.ts new file mode 100644 index 0000000000..d406421d85 --- /dev/null +++ b/packages/database-monitor/src/utils/env.ts @@ -0,0 +1,22 @@ +export function getIntFromEnv(envVarKey: string, aDefault = '0'): number { + return parseInt(process.env[envVarKey] || aDefault) +} + +export const getMetricsHost = () => process.env.METRICS_HOST || '127.0.0.1' +export const getLogLevel = () => process.env.LOG_LEVEL || 'info' +export const getMetricsPort = () => process.env.PROMETHEUS_METRICS_PORT || '9092' +export const getNodeEnv = () => process.env.NODE_ENV || 'production' +export const getPostgresConnectionString = () => + process.env.PG_CONNECTION_STRING || 'postgres://speckle:speckle@127.0.0.1/speckle' +export const getPostgresMaxConnections = () => + parseInt(process.env.POSTGRES_MAX_CONNECTIONS || '2') +export function databaseMonitorCollectionPeriodSeconds() { + return getIntFromEnv('METRICS_COLLECTION_PERIOD_SECONDS', '120') +} + +export const isDevelopment = () => + getNodeEnv() === 'development' || getNodeEnv() === 'dev' +export const isLogPretty = () => 
process.env.LOG_PRETTY?.toLocaleLowerCase() === 'true' +export const isProduction = () => getNodeEnv() === 'production' +export const isTest = () => getNodeEnv() === 'test' +export const isDevOrTestEnv = () => isDevelopment() || isTest() diff --git a/packages/database-monitor/src/utils/errorHandler.ts b/packages/database-monitor/src/utils/errorHandler.ts new file mode 100644 index 0000000000..626c7b1106 --- /dev/null +++ b/packages/database-monitor/src/utils/errorHandler.ts @@ -0,0 +1,25 @@ +import { ErrorRequestHandler } from 'express' +import { isNaN, isObject, isString } from 'lodash-es' + +export const errorHandler: ErrorRequestHandler = (err, req, res) => { + if ( + isObject(err) && + 'status' in err && + typeof err.status === 'number' && + !isNaN(err.status) + ) { + res.status(err?.status) + } else { + res.status(500) + } + + res.setHeader('Content-Type', 'application/json') + + if (req.app.get('env') === 'development') { + res.send(JSON.stringify(err, undefined, 2)) + } else if (isObject(err) && 'message' in err && isString(err.message)) { + res.send(JSON.stringify({ message: err.message })) + } else { + res.send(JSON.stringify({ message: 'Internal Server Error' })) + } +} diff --git a/packages/database-monitor/tests/helpers/helpers.ts b/packages/database-monitor/tests/helpers/helpers.ts new file mode 100644 index 0000000000..5ddaa5c683 --- /dev/null +++ b/packages/database-monitor/tests/helpers/helpers.ts @@ -0,0 +1,45 @@ +import { startServer } from '@/server/server.js' +import http from 'http' +import type { AddressInfo } from 'net' +import { getPostgresConnectionString } from '@/utils/env.js' + +export const startAndWaitOnServers = async () => { + let metricsServerAddress: string | AddressInfo | null = null + + const { metricsServer } = startServer({ + serveOnRandomPort: true + }) + metricsServer.on('listening', () => { + metricsServerAddress = metricsServer.address() + }) + + //HACK wait until both servers are available + while (!metricsServerAddress) { 
+ // wait for the servers to start + await new Promise((resolve) => setTimeout(resolve, 100)) + } + + return { metricsServer } +} + +export const getServerPort = (server: http.Server) => { + const address = server.address() + if (address && typeof address !== 'string') { + return address.port + } + throw new Error('Server port is not available') +} + +export const customizePostgresConnectionString = (databaseName?: string) => { + const originalPostgresConnectionString = getPostgresConnectionString() + if (!databaseName) return originalPostgresConnectionString + + const originalPostgresUrl = new URL(originalPostgresConnectionString) + const protocol = originalPostgresUrl.protocol + const user = originalPostgresUrl.username + const pass = originalPostgresUrl.password + const host = originalPostgresUrl.hostname + const port = originalPostgresUrl.port + const origin = `${protocol}//${user}:${pass}@${host}:${port}` + return new URL(databaseName, origin).toString() +} diff --git a/packages/database-monitor/tests/helpers/testExtensions.ts b/packages/database-monitor/tests/helpers/testExtensions.ts new file mode 100644 index 0000000000..36f570e294 --- /dev/null +++ b/packages/database-monitor/tests/helpers/testExtensions.ts @@ -0,0 +1,65 @@ +import { stopServer } from '@/server/server.js' +import { inject, test } from 'vitest' +import { getTestDb } from '#/helpers/testKnexClient.js' +import { startAndWaitOnServers } from '#/helpers/helpers.js' +import type { Knex } from 'knex' +import { Server } from 'http' + +export interface DatabaseIntegrationTestContext { + context: { + db: Knex.Transaction + } +} + +// vitest reference: https://vitest.dev/guide/test-context#fixture-initialization +export const databaseIntegrationTest = test.extend({ + // this key has to match the top level key in the interface (i.e. `context`). Some vitest typing magic at work here. 
+ context: [ + // eslint-disable-next-line @typescript-eslint/no-unused-vars + async ({ task, onTestFinished }, use) => { + const dbName = inject('dbName') + // equivalent of beforeEach + const db = await getTestDb(dbName).transaction() + + // schedule the cleanup. Runs regardless of test status, and runs after afterEach. + onTestFinished(async () => { + await db.rollback() + }) + + // now run the test + await use({ db }) + }, + { auto: true } // we want to run this for each databaseIntegrationTest, even if the context is not explicitly requested by the test + ] +}) + +export interface E2ETestContext extends DatabaseIntegrationTestContext { + context: { + db: Knex.Transaction + metricsServer: Server + } +} + +// vitest reference: https://vitest.dev/guide/test-context#fixture-initialization +export const e2eTest = test.extend({ + // this key has to match the top level key in the interface (i.e. `context`). Some vitest typing magic at work here. + context: [ + // eslint-disable-next-line @typescript-eslint/no-unused-vars + async ({ task, onTestFinished }, use) => { + const dbName = inject('dbName') + // equivalent of beforeEach + const db = await getTestDb(dbName).transaction() + const { metricsServer } = await startAndWaitOnServers() + + // schedule the cleanup. Runs regardless of test status, and runs after afterEach. 
+ onTestFinished(async () => { + if (metricsServer) stopServer({ server: metricsServer }) + if (db) await db.rollback() + }) + + // now run the test + await use({ db, metricsServer }) + }, + { auto: true } // we want to run this for each e2eTest, even if the context is not explicitly requested by the test + ] +}) diff --git a/packages/database-monitor/tests/helpers/testKnexClient.ts b/packages/database-monitor/tests/helpers/testKnexClient.ts new file mode 100644 index 0000000000..008605bb96 --- /dev/null +++ b/packages/database-monitor/tests/helpers/testKnexClient.ts @@ -0,0 +1,20 @@ +/* eslint-disable camelcase */ +import { knex } from 'knex' +import { customizePostgresConnectionString } from '#/helpers/helpers.js' + +export const getTestDb = (databaseName?: string) => + knex({ + client: 'pg', + connection: { + application_name: 'speckle_database_monitor', + connectionString: customizePostgresConnectionString(databaseName) + }, + pool: { min: 0, max: 2 } + // migrations are managed in the server package for production + // for tests, we are creating a new database for each test run so we can't use this default migration functionality + // migrations: { + // extension: '.ts', + // directory: path.resolve(__dirname, '../migrations'), + // loadExtensions: ['js', 'ts'] + // } + }) diff --git a/packages/database-monitor/tests/hooks/globalSetup.ts b/packages/database-monitor/tests/hooks/globalSetup.ts new file mode 100644 index 0000000000..ce050da6fa --- /dev/null +++ b/packages/database-monitor/tests/hooks/globalSetup.ts @@ -0,0 +1,50 @@ +/** + * These hooks are run once, before and after the test suite. + * It is configured via the vitest.config.ts file. 
+ */ +import '@/bootstrap.js' // This has side-effects and has to be imported first +import { testLogger as logger } from '@/observability/logging.js' +import cryptoRandomString from 'crypto-random-string' +import type { GlobalSetupContext } from 'vitest/node' + +declare module 'vitest' { + export interface ProvidedContext { + dbName: string + } +} + +const dbName = + process.env.TEST_DB || + `preview_service_${cryptoRandomString({ + length: 10, + type: 'alphanumeric' + })}`.toLocaleLowerCase() //postgres will automatically lower case new db names + +/** + * Global setup hook + * This hook is run once before any tests are run + * Defined in vitest.config.ts under test.globalSetup + */ +export function setup({ provide }: GlobalSetupContext) { + logger.info('🏃🏻‍♀️‍➡️ Running vitest setup global hook') + + // this provides the dbName to all tests, and can be accessed via inject('dbName'). NB: The test extensions already implement this, so use a test extension. + provide('dbName', dbName) + + logger.info( + `💁🏽‍♀️ Completed the vitest setup global hook. Database created at ${dbName}` + ) +} + +/** + * Global teardown hook + * This hook is run once after all tests are run + * Defined in vitest.config.ts under test.globalTeardown + */ +export function teardown() { + logger.info('🏃🏻‍♀️ Running vitest teardown global hook') + + logger.info( + `✅ Completed the vitest teardown global hook. 
Database ${dbName} down migrated.` + ) +} diff --git a/packages/database-monitor/tsconfig.build.json b/packages/database-monitor/tsconfig.build.json new file mode 100644 index 0000000000..f2d7b55d71 --- /dev/null +++ b/packages/database-monitor/tsconfig.build.json @@ -0,0 +1,5 @@ +{ + "extends": "./tsconfig.json", + "include": ["src/**/*"], + "exclude": ["**/*.spec.js", "**/*.spec.ts", "tests/**/*"] +} diff --git a/packages/database-monitor/tsconfig.json b/packages/database-monitor/tsconfig.json new file mode 100644 index 0000000000..0eebc69565 --- /dev/null +++ b/packages/database-monitor/tsconfig.json @@ -0,0 +1,109 @@ +{ + "compilerOptions": { + /* Visit https://aka.ms/tsconfig.json to read more about this file */ + + /* Projects */ + // "incremental": true, /* Enable incremental compilation */ + // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */ + // "tsBuildInfoFile": "./", /* Specify the folder for .tsbuildinfo incremental compilation files. */ + // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects */ + // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */ + // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ + + /* Language and Environment */ + "target": "ES2022" /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */, + // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ + // "jsx": "preserve", /* Specify what JSX code is generated. */ + // "experimentalDecorators": true, /* Enable experimental support for TC39 stage 2 draft decorators. */ + // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. 
*/ + // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h' */ + // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */ + // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using `jsx: react-jsx*`.` */ + // "reactNamespace": "", /* Specify the object invoked for `createElement`. This only applies when targeting `react` JSX emit. */ + // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */ + // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */ + + /* Modules */ + "module": "node16" /* Specify what module code is generated. */, + "rootDir": "./" /* Specify the root folder within your source files. */, + "moduleResolution": "node16" /* Specify how TypeScript looks up a file from a given module specifier. */, + "baseUrl": "./" /* Specify the base directory to resolve non-relative module names. */, + "paths": { + "@/*": ["./src/*"], + "#/*": ["./tests/*"] + }, + // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ + // "typeRoots": [], /* Specify multiple folders that act like `./node_modules/@types`. */ + // "types": [], /* Specify type package names to be included without being referenced in a source file. */ + // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ + // "resolveJsonModule": true, /* Enable importing .json files */ + // "noResolve": true, /* Disallow `import`s, `require`s or ``s from expanding the number of files TypeScript should add to a project. */ + + /* JavaScript Support */ + "allowJs": true /* Allow JavaScript files to be a part of your program. Use the `checkJS` option to get errors from these files. */, + "checkJs": false /* Enable error reporting in type-checked JavaScript files. 
*/, + // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from `node_modules`. Only applicable with `allowJs`. */ + + /* Emit */ + // "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ + // "declarationMap": true, /* Create sourcemaps for d.ts files. */ + // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */ + "sourceMap": true /* Create source map files for emitted JavaScript files. */, + // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If `declaration` is true, also designates a file that bundles all .d.ts output. */ + "outDir": "./dist" /* Specify an output folder for all emitted files. */, + // "removeComments": true, /* Disable emitting comments. */ + // "noEmit": true, /* Disable emitting files from a compilation. */ + // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */ + // "importsNotUsedAsValues": "remove", /* Specify emit/checking behavior for imports that are only used for types */ + // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */ + // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */ + // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ + // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ + // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */ + // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */ + // "newLine": "crlf", /* Set the newline character for emitting files. */ + // "stripInternal": true, /* Disable emitting declarations that have `@internal` in their JSDoc comments. 
*/ + // "noEmitHelpers": true, /* Disable generating custom helper functions like `__extends` in compiled output. */ + // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */ + // "preserveConstEnums": true, /* Disable erasing `const enum` declarations in generated code. */ + // "declarationDir": "./", /* Specify the output directory for generated declaration files. */ + // "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */ + + /* Interop Constraints */ + // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */ + "allowSyntheticDefaultImports": true /* Allow 'import x from y' when a module doesn't have a default export. */, + "esModuleInterop": true /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables `allowSyntheticDefaultImports` for type compatibility. */, + // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */ + "forceConsistentCasingInFileNames": true /* Ensure that casing is correct in imports. */, + + /* Type Checking */ + "strict": true /* Enable all strict type-checking options. */, + // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied `any` type.. */ + // "strictNullChecks": true, /* When type checking, take into account `null` and `undefined`. */ + // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */ + // "strictBindCallApply": true, /* Check that the arguments for `bind`, `call`, and `apply` methods match the original function. */ + // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */ + // "noImplicitThis": true, /* Enable error reporting when `this` is given the type `any`. 
*/ + // "useUnknownInCatchVariables": true, /* Type catch clause variables as 'unknown' instead of 'any'. */ + // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */ + // "noUnusedLocals": true, /* Enable error reporting when a local variables aren't read. */ + // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read */ + // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */ + // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */ + // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */ + // "noUncheckedIndexedAccess": true, /* Include 'undefined' in index signature results */ + // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */ + // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type */ + // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */ + // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ + + /* Completeness */ + // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ + "skipLibCheck": true /* Skip type checking all .d.ts files. 
*/ + }, + "ts-node": { + "swc": true + }, + "include": ["src/**/*", "tests/**/*", "vitest.config.ts"], + "exclude": ["node_modules", "coverage", "reports"] +} diff --git a/packages/database-monitor/vitest.config.ts b/packages/database-monitor/vitest.config.ts new file mode 100644 index 0000000000..a751fa9b1b --- /dev/null +++ b/packages/database-monitor/vitest.config.ts @@ -0,0 +1,20 @@ +import path from 'path' +import { configDefaults, defineConfig } from 'vitest/config' + +export default defineConfig({ + test: { + exclude: [...configDefaults.exclude], + globalSetup: ['./tests/hooks/globalSetup.ts'], + // reporters: ['verbose', 'hanging-process'] //uncomment to debug hanging processes etc. + sequence: { + shuffle: true, + concurrent: true + } + }, + resolve: { + alias: { + '@': path.resolve(__dirname, './src'), + '#': path.resolve(__dirname, './tests') + } + } +}) diff --git a/packages/preview-service/src/clients/knex.ts b/packages/preview-service/src/clients/knex.ts index 33dc042362..9f86bd139a 100644 --- a/packages/preview-service/src/clients/knex.ts +++ b/packages/preview-service/src/clients/knex.ts @@ -26,7 +26,7 @@ export const getDbClients = async () => { isDevOrTestEnv: isDevEnv, logger, maxConnections, - applicationName: 'speckle_fileimport_service' + applicationName: 'speckle_preview_service' } if (!FF_WORKSPACES_MULTI_REGION_ENABLED) { const mainClient = configureKnexClient( diff --git a/packages/server/logging/highFrequencyMetrics/highfrequencyMonitoring.ts b/packages/server/logging/highFrequencyMetrics/highfrequencyMonitoring.ts index b11bfe6cdc..e4d35be891 100644 --- a/packages/server/logging/highFrequencyMetrics/highfrequencyMonitoring.ts +++ b/packages/server/logging/highFrequencyMetrics/highfrequencyMonitoring.ts @@ -9,6 +9,7 @@ import { processCpuTotal } from '@/logging/highFrequencyMetrics/processCPUTotal' import { heapSizeAndUsed } from '@/logging/highFrequencyMetrics/heapSizeAndUsed' import { knexConnections } from 
'@/logging/highFrequencyMetrics/knexConnectionPool' import { type Knex } from 'knex' +import { join } from 'lodash' type MetricConfig = { prefix?: string @@ -28,8 +29,7 @@ export const initHighFrequencyMonitoring = (params: { collectionPeriodMilliseconds: number config: MetricConfig }): HighFrequencyMonitor => { - const { register, collectionPeriodMilliseconds } = params - const config = params.config + const { register, collectionPeriodMilliseconds, config } = params const registers = register ? [register] : undefined const namePrefix = config.prefix ?? '' const labels = config.labels ?? {} @@ -42,7 +42,7 @@ export const initHighFrequencyMonitoring = (params: { ] const selfMonitor = new Histogram({ - name: namePrefix + 'self_monitor_time_high_frequency', + name: join([namePrefix, 'self_monitor_time_high_frequency'], '_'), help: 'The time taken to collect all of the high frequency metrics, seconds.', registers, buckets: [0, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2], diff --git a/packages/server/logging/index.ts b/packages/server/logging/index.ts index c1020813ef..eb25062e32 100644 --- a/packages/server/logging/index.ts +++ b/packages/server/logging/index.ts @@ -24,7 +24,8 @@ export default async function (app: express.Express) { register: prometheusClient.register, collectionPeriodMilliseconds: highFrequencyMetricsCollectionPeriodMs(), config: { - getDbClients: getAllRegisteredDbClients + getDbClients: getAllRegisteredDbClients, + prefix: 'speckle' } }) highfrequencyMonitoring.start() diff --git a/utils/monitor-deployment/Dockerfile b/utils/monitor-deployment/Dockerfile deleted file mode 100644 index 361fd25be3..0000000000 --- a/utils/monitor-deployment/Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -FROM debian:12-slim@sha256:67f3931ad8cb1967beec602d8c0506af1e37e8d73c2a0b38b181ec5d8560d395 AS build-stage - -WORKDIR /build - -# install tini -ARG TINI_VERSION=v0.19.0 -ENV TINI_VERSION=${TINI_VERSION} -ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini 
./tini -RUN chmod +x ./tini - -# Add python virtual env -WORKDIR /venv -RUN apt-get update && \ - DEBIAN_FRONTEND=noninteractive apt-get install \ - --no-install-suggests --no-install-recommends --yes \ - python3-venv=3.11.2-1+b1 && \ - python3 -m venv /venv - -COPY utils/monitor-deployment/requirements.txt /requirements.txt -RUN /venv/bin/pip install --disable-pip-version-check --requirement /requirements.txt - -FROM gcr.io/distroless/python3-debian12:nonroot@sha256:14c62b8925d3bb30319de2f346bde203fe18103a68898284a62db9d4aa54c794 as production-stage -ARG PG_CONNECTION_STRING -ARG NODE_EXTRA_CA_CERTS -ENV PG_CONNECTION_STRING=${PG_CONNECTION_STRING} \ - NODE_EXTRA_CA_CERTS=${NODE_EXTRA_CA_CERTS} - -COPY --from=build-stage /venv /venv -COPY --from=build-stage /build/tini /usr/bin/tini -WORKDIR /app -COPY utils/monitor-deployment . - -ENTRYPOINT [ "tini", "--", "/venv/bin/python3", "-u", "src/run.py"] diff --git a/utils/monitor-deployment/dev_run.sh b/utils/monitor-deployment/dev_run.sh deleted file mode 100755 index 1cfafe16e1..0000000000 --- a/utils/monitor-deployment/dev_run.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash -set -eo pipefail - -GIT_ROOT="$(git rev-parse --show-toplevel)" - -export PG_CONNECTION_STRING=postgres://speckle:speckle@localhost/speckle -pushd "${GIT_ROOT}/utils/monitor-deployment" -trap popd EXIT -pip install --disable-pip-version-check --requirement ./requirements.txt -python3 -u src/run.py diff --git a/utils/monitor-deployment/requirements.txt b/utils/monitor-deployment/requirements.txt deleted file mode 100644 index a6ea698bd5..0000000000 --- a/utils/monitor-deployment/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -psycopg2-binary==2.9.9 -prometheus-client==0.19.0 -structlog==23.3.0 diff --git a/utils/monitor-deployment/src/run.py b/utils/monitor-deployment/src/run.py deleted file mode 100644 index c7c34a73ec..0000000000 --- a/utils/monitor-deployment/src/run.py +++ /dev/null @@ -1,211 +0,0 @@ -#!/usr/bin/env python -import os 
-import sys - -import psycopg2 -from prometheus_client import start_http_server, Gauge -import time -import structlog -from logging import INFO, basicConfig - -basicConfig(format="%(message)s", stream=sys.stdout, level=INFO) - -structlog.configure( - processors=[ - structlog.stdlib.filter_by_level, - structlog.contextvars.merge_contextvars, - structlog.processors.add_log_level, - structlog.processors.StackInfoRenderer(), - structlog.processors.format_exc_info, - structlog.processors.TimeStamper(fmt="iso"), - structlog.stdlib.PositionalArgumentsFormatter(), - structlog.processors.UnicodeDecoder(), - structlog.processors.CallsiteParameterAdder( - { - structlog.processors.CallsiteParameter.FILENAME, - structlog.processors.CallsiteParameter.FUNC_NAME, - structlog.processors.CallsiteParameter.LINENO, - } - ), - structlog.processors.EventRenamer("msg"), - structlog.processors.JSONRenderer(), - ], - wrapper_class=structlog.make_filtering_bound_logger(INFO), - logger_factory=structlog.stdlib.LoggerFactory(), - cache_logger_on_first_use=True, -) -LOG = structlog.get_logger() -PG_CONNECTION_STRING = os.environ["PG_CONNECTION_STRING"] - -PROM = { - "db_size": Gauge("speckle_db_size", "Size of the entire database (in bytes)"), - "objects": Gauge("speckle_db_objects", "Number of objects"), - "streams": Gauge("speckle_db_streams", "Number of streams"), - "commits": Gauge("speckle_db_commits", "Number of commits"), - "users": Gauge("speckle_db_users", "Number of users"), - "fileimports": Gauge( - "speckle_db_fileimports", - "Number of imported files, by type and status", - labelnames=("filetype", "status"), - ), - "webhooks": Gauge( - "speckle_db_webhooks", - "Number of webhook calls, by status", - labelnames=("status",), - ), - "previews": Gauge( - "speckle_db_previews", "Number of previews, by status", labelnames=("status",) - ), - "filesize": Gauge( - "speckle_db_filesize", - "Size of imported files, by type (in bytes)", - labelnames=("filetype",), - ), - "tablesize": Gauge( - 
"speckle_db_tablesize", - "Size of tables in the database, by table (in bytes)", - labelnames=("table",), - ), -} - - -def tick(cur): - # Total DB size - cur.execute("SELECT pg_database_size(%s)", (cur.connection.info.dbname,)) - PROM["db_size"].set(cur.fetchone()[0]) - - # Counts for users, streams, commits, objects - cur.execute("SELECT reltuples AS estimate FROM pg_class WHERE relname = 'objects';") - PROM["objects"].set(cur.fetchone()[0]) - cur.execute("SELECT reltuples AS estimate FROM pg_class WHERE relname = 'streams';") - PROM["streams"].set(cur.fetchone()[0]) - cur.execute("SELECT reltuples AS estimate FROM pg_class WHERE relname = 'commits';") - PROM["commits"].set(cur.fetchone()[0]) - cur.execute("SELECT reltuples AS estimate FROM pg_class WHERE relname = 'users';") - PROM["users"].set(cur.fetchone()[0]) - - # File Imports - cur.execute( - """ - SELECT LOWER("fileType"), "convertedStatus", count(*) - FROM file_uploads - GROUP BY (LOWER("fileType"), "convertedStatus"); - """ - ) - # put in a dictionary so we fill non-existing statuses with zeroes - # (query can return PENDING files, then the next query will not return any PENDING rows. 
-> need to reset the metric to 0) - used_labels = {} - for row in cur: - if row[0] not in used_labels: - used_labels[row[0]] = {} - used_labels[row[0]][str(row[1])] = row[2] - for file_type in used_labels: - for status in range(4): - if str(status) in used_labels[file_type]: - PROM["fileimports"].labels(file_type, str(status)).set( - used_labels[file_type][str(status)] - ) - else: - PROM["fileimports"].labels(file_type, str(status)).set(0) - - cur.execute( - """ - SELECT LOWER("fileType") AS fileType, SUM("fileSize") AS fileSize - FROM file_uploads - GROUP BY LOWER("fileType"); - """ - ) - for row in cur: - PROM["filesize"].labels(row[0]).set(row[1]) - - # Webhooks - cur.execute( - """ - SELECT status, count(*) - FROM webhooks_events - GROUP BY status - """ - ) - values = {} - for row in cur: - values[str(row[0])] = row[1] - for status in range(4): - if str(status) in values: - PROM["webhooks"].labels(str(status)).set(values[str(status)]) - else: - PROM["webhooks"].labels(str(status)).set(0) - - # Previews - cur.execute( - """ - SELECT "previewStatus", count(*) - FROM object_preview - GROUP BY "previewStatus" - """ - ) - values = {} - for row in cur: - values[str(row[0])] = row[1] - for status in range(4): - if str(status) in values: - PROM["previews"].labels(str(status)).set(values[str(status)]) - else: - PROM["previews"].labels(str(status)).set(0) - - # Table sizes - cur.execute( - """ - SELECT - relname, - table_size - - FROM ( - SELECT - pg_catalog.pg_namespace.nspname AS schema_name, - relname, - pg_relation_size(pg_catalog.pg_class.oid) AS table_size - - FROM pg_catalog.pg_class - JOIN pg_catalog.pg_namespace ON relnamespace = pg_catalog.pg_namespace.oid - ) t - WHERE schema_name = 'public' - ORDER BY table_size DESC; - """ - ) - values = {} - for row in cur: - PROM["tablesize"].labels(row[0]).set(row[1]) - - -def main(): - start_http_server(9092) - - while True: - conn = None - cur = None - try: - t0 = time.time() - conn = psycopg2.connect( - 
PG_CONNECTION_STRING, - application_name="speckle_monitor_deployment", - ) - cur = conn.cursor() - t1 = time.time() - tick(cur) - t2 = time.time() - LOG.info( - "Updated metrics.", connection_period=(t1 - t0), query_period=(t2 - t1) - ) - except Exception as ex: - LOG.exception(ex) - finally: - if cur: - cur.close() - if conn: - conn.close() - - time.sleep(120) - - -if __name__ == "__main__": - main() diff --git a/yarn.lock b/yarn.lock index 6b9a839592..f516b0098a 100644 --- a/yarn.lock +++ b/yarn.lock @@ -16613,6 +16613,43 @@ __metadata: languageName: node linkType: hard +"@speckle/database-monitor@workspace:packages/database-monitor": + version: 0.0.0-use.local + resolution: "@speckle/database-monitor@workspace:packages/database-monitor" + dependencies: + "@speckle/shared": "workspace:^" + "@types/express": "npm:^4.17.13" + "@types/http-errors": "npm:^2.0.4" + "@types/lodash-es": "npm:^4.17.6" + "@types/node": "npm:^18.19.38" + "@vitest/coverage-istanbul": "npm:^1.6.0" + concurrently: "npm:^8.2.2" + crypto: "npm:^1.0.1" + crypto-random-string: "npm:^5.0.0" + dotenv: "npm:^16.4.5" + eslint: "npm:^9.4.0" + eslint-config-prettier: "npm:^9.1.0" + eslint-plugin-vitest: "npm:^0.5.4" + esm-module-alias: "npm:^2.2.0" + express: "npm:^4.19.2" + http-errors: "npm:~1.6.3" + knex: "npm:^2.4.1" + lodash: "npm:^4.17.21" + lodash-es: "npm:^4.17.21" + nodemon: "npm:^2.0.20" + pg: "npm:^8.7.3" + pino: "npm:^8.7.0" + pino-http: "npm:^8.2.1" + pino-pretty: "npm:^9.1.1" + prettier: "npm:^2.5.1" + prom-client: "npm:^14.0.1" + rimraf: "npm:^5.0.7" + typescript: "npm:^4.6.4" + typescript-eslint: "npm:^7.12.0" + vitest: "npm:^1.6.0" + languageName: unknown + linkType: soft + "@speckle/dui3@workspace:packages/dui3": version: 0.0.0-use.local resolution: "@speckle/dui3@workspace:packages/dui3" @@ -19961,7 +19998,7 @@ __metadata: languageName: node linkType: hard -"@types/http-errors@npm:*": +"@types/http-errors@npm:*, @types/http-errors@npm:^2.0.4": version: 2.0.4 resolution: 
"@types/http-errors@npm:2.0.4" checksum: 10/1f3d7c3b32c7524811a45690881736b3ef741bf9849ae03d32ad1ab7062608454b150a4e7f1351f83d26a418b2d65af9bdc06198f1c079d75578282884c4e8e3 From 264c4f22bf0c82467bb13884a4d61b0f5003364b Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Thu, 12 Dec 2024 21:34:55 +0000 Subject: [PATCH 02/27] Restore changes to unrelated components --- .../logging/highFrequencyMetrics/highfrequencyMonitoring.ts | 6 +++--- packages/server/logging/index.ts | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/packages/server/logging/highFrequencyMetrics/highfrequencyMonitoring.ts b/packages/server/logging/highFrequencyMetrics/highfrequencyMonitoring.ts index e4d35be891..b11bfe6cdc 100644 --- a/packages/server/logging/highFrequencyMetrics/highfrequencyMonitoring.ts +++ b/packages/server/logging/highFrequencyMetrics/highfrequencyMonitoring.ts @@ -9,7 +9,6 @@ import { processCpuTotal } from '@/logging/highFrequencyMetrics/processCPUTotal' import { heapSizeAndUsed } from '@/logging/highFrequencyMetrics/heapSizeAndUsed' import { knexConnections } from '@/logging/highFrequencyMetrics/knexConnectionPool' import { type Knex } from 'knex' -import { join } from 'lodash' type MetricConfig = { prefix?: string @@ -29,7 +28,8 @@ export const initHighFrequencyMonitoring = (params: { collectionPeriodMilliseconds: number config: MetricConfig }): HighFrequencyMonitor => { - const { register, collectionPeriodMilliseconds, config } = params + const { register, collectionPeriodMilliseconds } = params + const config = params.config const registers = register ? [register] : undefined const namePrefix = config.prefix ?? '' const labels = config.labels ?? 
{} @@ -42,7 +42,7 @@ export const initHighFrequencyMonitoring = (params: { ] const selfMonitor = new Histogram({ - name: join([namePrefix, 'self_monitor_time_high_frequency'], '_'), + name: namePrefix + 'self_monitor_time_high_frequency', help: 'The time taken to collect all of the high frequency metrics, seconds.', registers, buckets: [0, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2], diff --git a/packages/server/logging/index.ts b/packages/server/logging/index.ts index eb25062e32..c1020813ef 100644 --- a/packages/server/logging/index.ts +++ b/packages/server/logging/index.ts @@ -24,8 +24,7 @@ export default async function (app: express.Express) { register: prometheusClient.register, collectionPeriodMilliseconds: highFrequencyMetricsCollectionPeriodMs(), config: { - getDbClients: getAllRegisteredDbClients, - prefix: 'speckle' + getDbClients: getAllRegisteredDbClients } }) highfrequencyMonitoring.start() From c9e5758d095e1d81189006f653dba53a330ea755 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Thu, 12 Dec 2024 21:52:19 +0000 Subject: [PATCH 03/27] Fix for testing --- packages/database-monitor/src/observability/metricsApp.ts | 2 ++ .../database-monitor/src/observability/prometheusMetrics.ts | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/packages/database-monitor/src/observability/metricsApp.ts b/packages/database-monitor/src/observability/metricsApp.ts index 6d43a94046..8754c979e3 100644 --- a/packages/database-monitor/src/observability/metricsApp.ts +++ b/packages/database-monitor/src/observability/metricsApp.ts @@ -1,6 +1,7 @@ import { loggingExpressMiddleware } from '@/observability/expressLogging.js' import { metricsRouterFactory } from '@/observability/metricsRoute.js' import { initPrometheusMetrics } from '@/observability/prometheusMetrics.js' +import indexRouterFactory from '@/server/routes/index.js' import { errorHandler } from '@/utils/errorHandler.js' import express from 'express' import 
createError from 'http-errors' @@ -13,6 +14,7 @@ export const appFactory = () => { app.use(express.json({ limit: '100mb' })) app.use(express.urlencoded({ limit: '100mb', extended: false })) + app.use('/', indexRouterFactory()) app.use('/metrics', metricsRouterFactory()) // catch 404 and forward to error handler diff --git a/packages/database-monitor/src/observability/prometheusMetrics.ts b/packages/database-monitor/src/observability/prometheusMetrics.ts index d143830c38..df3c5cfedc 100644 --- a/packages/database-monitor/src/observability/prometheusMetrics.ts +++ b/packages/database-monitor/src/observability/prometheusMetrics.ts @@ -1,6 +1,6 @@ import { DbClients, getDbClients } from '@/clients/knex.js' import { logger } from '@/observability/logging.js' -import { databaseMonitorCollectionPeriodSeconds } from '@/utils/env.js' +import { databaseMonitorCollectionPeriodSeconds, isDevOrTestEnv } from '@/utils/env.js' import { get, join } from 'lodash-es' import { Histogram, Registry } from 'prom-client' import prometheusClient from 'prom-client' @@ -98,7 +98,7 @@ function initMonitoringMetrics(params: { const dbClientsRecord = await getDbClients() const dbClients = [ ...Object.entries(dbClientsRecord).map(([regionKey, client]) => ({ - client: client.private, //this has to be the private client, as we need to get the database name from the connection string. The public client, if via a connection pool, does not has the connection pool name not the database name. + client: isDevOrTestEnv() ? client.public : client.private, //this has to be the private client in production, as we need to get the database name from the connection string. The public client, if via a connection pool, does not has the connection pool name not the database name. 
isMain: regionKey === 'main', regionKey })) From 3825b8ecaf60bdf481f2babb313a02d74b510735 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:43:10 +0000 Subject: [PATCH 04/27] refactor prometheus metrics to keep the logic for collecting for a metric with the metric definition --- .../src/observability/prometheusMetrics.ts | 300 ++++++++++++------ 1 file changed, 199 insertions(+), 101 deletions(-) diff --git a/packages/database-monitor/src/observability/prometheusMetrics.ts b/packages/database-monitor/src/observability/prometheusMetrics.ts index df3c5cfedc..4e2933625d 100644 --- a/packages/database-monitor/src/observability/prometheusMetrics.ts +++ b/packages/database-monitor/src/observability/prometheusMetrics.ts @@ -1,8 +1,9 @@ import { DbClients, getDbClients } from '@/clients/knex.js' import { logger } from '@/observability/logging.js' import { databaseMonitorCollectionPeriodSeconds, isDevOrTestEnv } from '@/utils/env.js' +import { Knex } from 'knex' import { get, join } from 'lodash-es' -import { Histogram, Registry } from 'prom-client' +import { Gauge, Histogram, Registry } from 'prom-client' import prometheusClient from 'prom-client' let prometheusInitialized = false @@ -22,6 +23,18 @@ type MetricsMonitor = { start: () => () => void } +type WithOnDemandCollector = T & { + triggerCollect?: (params: { + dbClients: { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + client: Knex | undefined + isMain: boolean + regionKey: string + }[] + mainDbClient: Knex + }) => Promise +} + function initMonitoringMetrics(params: { register: Registry collectionPeriodMilliseconds: number @@ -35,74 +48,13 @@ function initMonitoringMetrics(params: { const labelNames = Object.keys(labels) const getDbClients = config.getDbClients - const dbSize = new prometheusClient.Gauge({ + const dbSize: WithOnDemandCollector> = new prometheusClient.Gauge({ name: join([namePrefix, 'db_size'], '_'), help: 'Size of the 
entire database (in bytes)', labelNames: ['region', ...labelNames] }) - const objects = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_objects'], '_'), - help: 'Number of objects', - labelNames - }) - const streams = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_streams'], '_'), - help: 'Number of streams/projects', - labelNames - }) - const commits = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_commits'], '_'), - help: 'Number of commits/versions', - labelNames - }) - const users = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_users'], '_'), - help: 'Number of users', - labelNames - }) - const fileimports = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_fileimports'], '_'), - help: 'Number of imported files, by type and status', - labelNames: ['filetype', 'status', ...labelNames] - }) - const filesize = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_filesize'], '_'), - help: 'Size of imported files, by type (in bytes)', - labelNames: ['filetype', ...labelNames] - }) - const webhooks = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_webhooks'], '_'), - help: 'Number of webhook calls, by status', - labelNames: ['status', ...labelNames] - }) - const previews = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_previews'], '_'), - help: 'Number of previews, by status', - labelNames: ['status', ...labelNames] - }) - const tablesize = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_tablesize'], '_'), - help: 'Size of tables in the database, by table (in bytes)', - labelNames: ['table', 'region', ...labelNames] - }) - - const selfMonitor = new Histogram({ - name: join([namePrefix, 'self_monitor_time_monitoring_metrics'], '_'), - help: 'The time taken to collect all of the database monitoring metrics, seconds.', - registers, - buckets: [0, 0.1, 0.25, 0.5, 1, 2, 5, 10], - labelNames - }) - - const collect = async () => { - const dbClientsRecord = await getDbClients() - 
const dbClients = [ - ...Object.entries(dbClientsRecord).map(([regionKey, client]) => ({ - client: isDevOrTestEnv() ? client.public : client.private, //this has to be the private client in production, as we need to get the database name from the connection string. The public client, if via a connection pool, does not has the connection pool name not the database name. - isMain: regionKey === 'main', - regionKey - })) - ] + dbSize.triggerCollect = async (params) => { + const { dbClients } = params await Promise.all( dbClients.map(async ({ client, regionKey }) => { if (!client) { @@ -131,61 +83,70 @@ function initMonitoringMetrics(params: { } else { logger.warn({ regionKey }, 'Could not get database name from client config') } - - const tableSizeResults = await client.raw<{ - rows: [{ table_name: string; table_size: string }] //bigints are returned as strings - }>( - ` - SELECT - table_name, - table_size - - FROM ( - SELECT - pg_catalog.pg_namespace.nspname AS schema_name, - relname AS table_name, - pg_relation_size(pg_catalog.pg_class.oid) AS table_size - - FROM pg_catalog.pg_class - JOIN pg_catalog.pg_namespace ON relnamespace = pg_catalog.pg_namespace.oid - ) t - WHERE schema_name = 'public' - ORDER BY table_size DESC; - ` - ) - for (const row of tableSizeResults.rows) { - tablesize.set( - { ...labels, table: row.table_name, region: regionKey }, - parseInt(row.table_size) //FIXME risk this bigint being too big for JS - ) - } }) ) + } - const mainDbClient = dbClients.find((c) => c.isMain)?.client - if (!mainDbClient) { - logger.warn('Could not find main database client') - return - } + const objects: WithOnDemandCollector> = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_objects'], '_'), + help: 'Number of objects', + labelNames + }) + objects.triggerCollect = async (params) => { + const { mainDbClient } = params - // Counts for users, streams, commits, objects const objectsEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( "SELECT 
reltuples AS estimate FROM pg_class WHERE relname = 'objects' LIMIT 1;" ) objects.set({ ...labels }, objectsEstimate.rows[0].estimate) + } + + const streams: WithOnDemandCollector> = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_streams'], '_'), + help: 'Number of streams/projects', + labelNames + }) + streams.triggerCollect = async (params) => { + const { mainDbClient } = params const streamsEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'streams' LIMIT 1;" ) streams.set({ ...labels }, streamsEstimate.rows[0].estimate) + } + + const commits: WithOnDemandCollector> = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_commits'], '_'), + help: 'Number of commits/versions', + labelNames + }) + commits.triggerCollect = async (params) => { + const { mainDbClient } = params const commitsEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'commits' LIMIT 1;" ) commits.set({ ...labels }, commitsEstimate.rows[0].estimate) + } + + const users: WithOnDemandCollector> = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_users'], '_'), + help: 'Number of users', + labelNames + }) + users.triggerCollect = async (params) => { + const { mainDbClient } = params const usersEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'users' LIMIT 1;" ) users.set({ ...labels }, usersEstimate.rows[0].estimate) + } + const fileimports: WithOnDemandCollector> = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_fileimports'], '_'), + help: 'Number of imported files, by type and status', + labelNames: ['filetype', 'status', ...labelNames] + }) + fileimports.triggerCollect = async (params) => { + const { mainDbClient } = params const importedFiles = await mainDbClient.raw<{ rows: [{ fileType: string; convertedStatus: number; 
count: number }] }>( @@ -196,7 +157,7 @@ function initMonitoringMetrics(params: { ` ) - // Create zero-values for all possible combinations of file types and statuses + // Get the set of all unique file types and converted statuses in the database const allFileImportConvertedStatusAndFileTypes = importedFiles.rows.reduce( (acc, row) => { acc.convertedStatus.add(row.convertedStatus) @@ -205,6 +166,8 @@ function initMonitoringMetrics(params: { }, { convertedStatus: new Set(), fileType: new Set() } ) + + // now calculate the combinatorial set of all possible file types and statuses const remainingConvertedStatusAndFileTypes = new Set<{ fileType: string status: number @@ -215,7 +178,7 @@ function initMonitoringMetrics(params: { }) }) - //it's a gauge, so the updated actual values will override the zero-values + // now set the counts for the file types and statuses that are in the database for (const row of importedFiles.rows) { remainingConvertedStatusAndFileTypes.delete({ fileType: row.fileType, @@ -230,7 +193,15 @@ function initMonitoringMetrics(params: { remainingConvertedStatusAndFileTypes.forEach(({ fileType, status }) => { fileimports.set({ ...labels, filetype: fileType, status: status.toString() }, 0) }) + } + const filesize: WithOnDemandCollector> = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_filesize'], '_'), + help: 'Size of imported files, by type (in bytes)', + labelNames: ['filetype', ...labelNames] + }) + filesize.triggerCollect = async (params) => { + const { mainDbClient } = params const fileSizeResults = await mainDbClient.raw<{ rows: [{ fileType: string; fileSize: number }] }>( @@ -243,7 +214,15 @@ function initMonitoringMetrics(params: { for (const row of fileSizeResults.rows) { filesize.set({ ...labels, filetype: row.fileType }, row.fileSize) } + } + const webhooks: WithOnDemandCollector> = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_webhooks'], '_'), + help: 'Number of webhook calls, by status', + labelNames: 
['status', ...labelNames] + }) + webhooks.triggerCollect = async (params) => { + const { mainDbClient } = params const webhookResults = await mainDbClient.raw<{ rows: [{ status: number; count: number }] }>( @@ -262,7 +241,15 @@ function initMonitoringMetrics(params: { remainingWebhookStatus.forEach((status) => { webhooks.set({ ...labels, status: status.toString() }, 0) }) + } + const previews: WithOnDemandCollector> = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_previews'], '_'), + help: 'Number of previews, by status', + labelNames: ['status', ...labelNames] + }) + previews.triggerCollect = async (params) => { + const { mainDbClient } = params const previewStatusResults = await mainDbClient.raw<{ rows: [{ previewStatus: number; count: number }] }>(` @@ -282,6 +269,117 @@ function initMonitoringMetrics(params: { }) } + const tablesize: WithOnDemandCollector> = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_tablesize'], '_'), + help: 'Size of tables in the database, by table (in bytes)', + labelNames: ['table', 'region', ...labelNames] + }) + tablesize.triggerCollect = async (params) => { + const { dbClients } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + if (!client) { + logger.error({ regionKey }, 'Could not get private client for region') + return + } + logger.info({ regionKey }, 'Collecting monitoring metrics for region') + const connectionString: string = String( + get(client.client, ['config', 'connection', 'connectionString'], '') + ) + if (!connectionString) { + logger.warn( + { regionKey }, + 'Could not get connection string from client config' + ) + } + const databaseName = new URL(connectionString).pathname?.split('/').pop() + if (databaseName) { + const dbSizeResult = await client.raw<{ + rows: [{ pg_database_size: string }] //bigints are returned as strings + }>('SELECT pg_database_size(?) 
LIMIT 1', [databaseName]) + dbSize.set( + { ...labels, region: regionKey }, + parseInt(dbSizeResult.rows[0].pg_database_size) //FIXME risk this bigint being too big for JS, but that would be a very large database! + ) + } else { + logger.warn({ regionKey }, 'Could not get database name from client config') + } + + const tableSizeResults = await client.raw<{ + rows: [{ table_name: string; table_size: string }] //bigints are returned as strings + }>( + ` + SELECT + table_name, + table_size + + FROM ( + SELECT + pg_catalog.pg_namespace.nspname AS schema_name, + relname AS table_name, + pg_relation_size(pg_catalog.pg_class.oid) AS table_size + + FROM pg_catalog.pg_class + JOIN pg_catalog.pg_namespace ON relnamespace = pg_catalog.pg_namespace.oid + ) t + WHERE schema_name = 'public' + ORDER BY table_size DESC; + ` + ) + for (const row of tableSizeResults.rows) { + tablesize.set( + { ...labels, table: row.table_name, region: regionKey }, + parseInt(row.table_size) //FIXME risk this bigint being too big for JS + ) + } + }) + ) + } + + const metricsToCollect = [ + dbSize, + tablesize, + objects, + streams, + commits, + users, + fileimports, + filesize, + webhooks, + previews + ] + + const selfMonitor = new Histogram({ + name: join([namePrefix, 'self_monitor_time_monitoring_metrics'], '_'), + help: 'The time taken to collect all of the database monitoring metrics, seconds.', + registers, + buckets: [0, 0.1, 0.25, 0.5, 1, 2, 5, 10], + labelNames + }) + + const collect = async () => { + const dbClientsRecord = await getDbClients() + const dbClients = [ + ...Object.entries(dbClientsRecord).map(([regionKey, client]) => ({ + client: isDevOrTestEnv() ? client.public : client.private, //this has to be the private client in production, as we need to get the database name from the connection string. The public client, if via a connection pool, does not has the connection pool name not the database name. 
+ isMain: regionKey === 'main', + regionKey + })) + ] + + const mainDbClient = dbClients.find((c) => c.isMain)?.client + if (!mainDbClient) { + logger.warn('Could not find main database client') + return + } + + await Promise.all( + metricsToCollect.map(async (metric) => { + await metric.triggerCollect?.({ dbClients, mainDbClient }) + }) + ) + } + return { start: () => { const intervalId = setInterval(() => { From 0a4173038307d4fdb876552a1f6a09d70357e514 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:31:40 +0000 Subject: [PATCH 05/27] Reintroduce zod and znv as they are peer dependencies of Environment from shared --- packages/database-monitor/package.json | 4 ++- packages/database-monitor/src/clients/knex.ts | 26 +++++++++++----- .../src/observability/prometheusMetrics.ts | 30 +++++++++---------- yarn.lock | 9 ++++++ 4 files changed, 46 insertions(+), 23 deletions(-) diff --git a/packages/database-monitor/package.json b/packages/database-monitor/package.json index 4ff826e119..7d1d743dfb 100644 --- a/packages/database-monitor/package.json +++ b/packages/database-monitor/package.json @@ -42,7 +42,9 @@ "pino": "^8.7.0", "pino-http": "^8.2.1", "pino-pretty": "^9.1.1", - "prom-client": "^14.0.1" + "prom-client": "^14.0.1", + "znv": "^0.4.0", + "zod": "^3.24.1" }, "devDependencies": { "@types/express": "^4.17.13", diff --git a/packages/database-monitor/src/clients/knex.ts b/packages/database-monitor/src/clients/knex.ts index d47a234990..593f1efa25 100644 --- a/packages/database-monitor/src/clients/knex.ts +++ b/packages/database-monitor/src/clients/knex.ts @@ -10,13 +10,16 @@ import { loadMultiRegionsConfig, configureKnexClient } from '@speckle/shared/dist/commonjs/environment/multiRegionConfig.js' +import { Knex } from 'knex' const { FF_WORKSPACES_MULTI_REGION_ENABLED } = Environment.getFeatureFlags() -type ConfiguredKnexClient = ReturnType -export type DbClients = Record<'main', ConfiguredKnexClient> & 
- Record -let dbClients: DbClients +let dbClients: { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + client: Knex | undefined + isMain: boolean + regionKey: string +}[] export const getDbClients = async () => { if (dbClients) return dbClients @@ -39,15 +42,24 @@ export const getDbClients = async () => { }, configArgs ) - dbClients = { main: mainClient } + dbClients = [{ client: mainClient.private, regionKey: 'main', isMain: true }] } else { const configPath = process.env.MULTI_REGION_CONFIG_PATH || 'multiregion.json' const config = await loadMultiRegionsConfig({ path: configPath }) - const clients = [['main', configureKnexClient(config.main, configArgs)]] + const clients: [string, { public: Knex; private?: Knex }][] = [ + ['main', configureKnexClient(config.main, configArgs)] + ] Object.entries(config.regions).map(([key, config]) => { clients.push([key, configureKnexClient(config, configArgs)]) }) - dbClients = Object.fromEntries(clients) as DbClients + + dbClients = [ + ...clients.map(([regionKey, c]) => ({ + client: isDevOrTestEnv() ? c.public : c.private, //this has to be the private client in production, as we need to get the database name from the connection string. The public client, if via a connection pool, does not has the connection pool name not the database name. 
+ isMain: regionKey === 'main', + regionKey + })) + ] } return dbClients } diff --git a/packages/database-monitor/src/observability/prometheusMetrics.ts b/packages/database-monitor/src/observability/prometheusMetrics.ts index 4e2933625d..9cd22f61cc 100644 --- a/packages/database-monitor/src/observability/prometheusMetrics.ts +++ b/packages/database-monitor/src/observability/prometheusMetrics.ts @@ -1,6 +1,6 @@ -import { DbClients, getDbClients } from '@/clients/knex.js' +import { getDbClients } from '@/clients/knex.js' import { logger } from '@/observability/logging.js' -import { databaseMonitorCollectionPeriodSeconds, isDevOrTestEnv } from '@/utils/env.js' +import { databaseMonitorCollectionPeriodSeconds } from '@/utils/env.js' import { Knex } from 'knex' import { get, join } from 'lodash-es' import { Gauge, Histogram, Registry } from 'prom-client' @@ -16,7 +16,14 @@ type MetricConfig = { prefix?: string labels?: Record buckets?: Record - getDbClients: () => Promise + getDbClients: () => Promise< + { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + client: Knex | undefined + isMain: boolean + regionKey: string + }[] + > } type MetricsMonitor = { @@ -98,7 +105,7 @@ function initMonitoringMetrics(params: { const objectsEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'objects' LIMIT 1;" ) - objects.set({ ...labels }, objectsEstimate.rows[0].estimate) + objects.set({ ...labels }, Math.max(objectsEstimate.rows[0].estimate, 0)) } const streams: WithOnDemandCollector> = new prometheusClient.Gauge({ @@ -111,7 +118,7 @@ function initMonitoringMetrics(params: { const streamsEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'streams' LIMIT 1;" ) - streams.set({ ...labels }, streamsEstimate.rows[0].estimate) + streams.set({ ...labels }, Math.max(streamsEstimate.rows[0].estimate)) } const commits: 
WithOnDemandCollector> = new prometheusClient.Gauge({ @@ -124,7 +131,7 @@ function initMonitoringMetrics(params: { const commitsEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'commits' LIMIT 1;" ) - commits.set({ ...labels }, commitsEstimate.rows[0].estimate) + commits.set({ ...labels }, Math.max(commitsEstimate.rows[0].estimate)) } const users: WithOnDemandCollector> = new prometheusClient.Gauge({ @@ -137,7 +144,7 @@ function initMonitoringMetrics(params: { const usersEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'users' LIMIT 1;" ) - users.set({ ...labels }, usersEstimate.rows[0].estimate) + users.set({ ...labels }, Math.max(usersEstimate.rows[0].estimate)) } const fileimports: WithOnDemandCollector> = new prometheusClient.Gauge({ @@ -358,14 +365,7 @@ function initMonitoringMetrics(params: { }) const collect = async () => { - const dbClientsRecord = await getDbClients() - const dbClients = [ - ...Object.entries(dbClientsRecord).map(([regionKey, client]) => ({ - client: isDevOrTestEnv() ? client.public : client.private, //this has to be the private client in production, as we need to get the database name from the connection string. The public client, if via a connection pool, does not has the connection pool name not the database name. 
- isMain: regionKey === 'main', - regionKey - })) - ] + const dbClients = await getDbClients() const mainDbClient = dbClients.find((c) => c.isMain)?.client if (!mainDbClient) { diff --git a/yarn.lock b/yarn.lock index f516b0098a..b402fbadd3 100644 --- a/yarn.lock +++ b/yarn.lock @@ -16647,6 +16647,8 @@ __metadata: typescript: "npm:^4.6.4" typescript-eslint: "npm:^7.12.0" vitest: "npm:^1.6.0" + znv: "npm:^0.4.0" + zod: "npm:^3.24.1" languageName: unknown linkType: soft @@ -54912,6 +54914,13 @@ __metadata: languageName: node linkType: hard +"zod@npm:^3.24.1": + version: 3.24.1 + resolution: "zod@npm:3.24.1" + checksum: 10/54e25956495dec22acb9399c168c6ba657ff279801a7fcd0530c414d867f1dcca279335e160af9b138dd70c332e17d548be4bc4d2f7eaf627dead50d914fec27 + languageName: node + linkType: hard + "zx@npm:^8.1.2": version: 8.1.2 resolution: "zx@npm:8.1.2" From 417ca632d20c2c971655e32ac638e6200132f890 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 12:11:11 +0000 Subject: [PATCH 06/27] pass in databasename from config, simplify types --- .../database-monitor/multiregion.example.json | 6 +- packages/database-monitor/src/clients/knex.ts | 42 +++++++-- .../src/observability/prometheusMetrics.ts | 91 +++++-------------- .../src/environment/multiRegionConfig.ts | 6 ++ 4 files changed, 65 insertions(+), 80 deletions(-) diff --git a/packages/database-monitor/multiregion.example.json b/packages/database-monitor/multiregion.example.json index 77e82f94ef..61bd76bf9a 100644 --- a/packages/database-monitor/multiregion.example.json +++ b/packages/database-monitor/multiregion.example.json @@ -2,7 +2,8 @@ "main": { "postgres": { "connectionUri": "postgresql://speckle:speckle@127.0.0.1:5432/speckle", - "privateConnectionUri": "postgresql://speckle:speckle@postgres:5432/speckle" + "privateConnectionUri": "postgresql://speckle:speckle@postgres:5432/speckle", + "databaseName": "speckle" }, "blobStorage": { "accessKey": "minioadmin", @@ 
-17,7 +18,8 @@ "region1": { "postgres": { "connectionUri": "postgresql://speckle:speckle@127.0.0.1:5401/speckle", - "privateConnectionUri": "postgresql://speckle:speckle@postgres-region1:5432/speckle" + "privateConnectionUri": "postgresql://speckle:speckle@postgres-region1:5432/speckle", + "databaseName": "speckle" }, "blobStorage": { "accessKey": "minioadmin", diff --git a/packages/database-monitor/src/clients/knex.ts b/packages/database-monitor/src/clients/knex.ts index 593f1efa25..878ad7c6a9 100644 --- a/packages/database-monitor/src/clients/knex.ts +++ b/packages/database-monitor/src/clients/knex.ts @@ -14,12 +14,15 @@ import { Knex } from 'knex' const { FF_WORKSPACES_MULTI_REGION_ENABLED } = Environment.getFeatureFlags() -let dbClients: { +export type DbClient = { // eslint-disable-next-line @typescript-eslint/no-explicit-any - client: Knex | undefined + client: Knex isMain: boolean regionKey: string -}[] + databaseName?: string +} + +let dbClients: DbClient[] export const getDbClients = async () => { if (dbClients) return dbClients @@ -42,22 +45,41 @@ export const getDbClients = async () => { }, configArgs ) - dbClients = [{ client: mainClient.private, regionKey: 'main', isMain: true }] + const databaseName = new URL(getPostgresConnectionString()).pathname + .split('/') + .pop() + dbClients = [ + { client: mainClient.public, regionKey: 'main', isMain: true, databaseName } + ] } else { const configPath = process.env.MULTI_REGION_CONFIG_PATH || 'multiregion.json' const config = await loadMultiRegionsConfig({ path: configPath }) - const clients: [string, { public: Knex; private?: Knex }][] = [ - ['main', configureKnexClient(config.main, configArgs)] - ] + const clients: [string, { databaseName?: string; public: Knex; private?: Knex }][] = + [ + [ + 'main', + { + ...configureKnexClient(config.main, configArgs), + databaseName: config.main.postgres.databaseName + } + ] + ] Object.entries(config.regions).map(([key, config]) => { - clients.push([key, 
configureKnexClient(config, configArgs)]) + clients.push([ + key, + { + ...configureKnexClient(config, configArgs), + databaseName: config.postgres.databaseName + } + ]) }) dbClients = [ ...clients.map(([regionKey, c]) => ({ - client: isDevOrTestEnv() ? c.public : c.private, //this has to be the private client in production, as we need to get the database name from the connection string. The public client, if via a connection pool, does not has the connection pool name not the database name. + client: c.public, isMain: regionKey === 'main', - regionKey + regionKey, + databaseName: c.databaseName })) ] } diff --git a/packages/database-monitor/src/observability/prometheusMetrics.ts b/packages/database-monitor/src/observability/prometheusMetrics.ts index 9cd22f61cc..324b2f8837 100644 --- a/packages/database-monitor/src/observability/prometheusMetrics.ts +++ b/packages/database-monitor/src/observability/prometheusMetrics.ts @@ -1,8 +1,8 @@ -import { getDbClients } from '@/clients/knex.js' +import { DbClient, getDbClients } from '@/clients/knex.js' import { logger } from '@/observability/logging.js' import { databaseMonitorCollectionPeriodSeconds } from '@/utils/env.js' import { Knex } from 'knex' -import { get, join } from 'lodash-es' +import { join } from 'lodash-es' import { Gauge, Histogram, Registry } from 'prom-client' import prometheusClient from 'prom-client' @@ -16,14 +16,7 @@ type MetricConfig = { prefix?: string labels?: Record buckets?: Record - getDbClients: () => Promise< - { - // eslint-disable-next-line @typescript-eslint/no-explicit-any - client: Knex | undefined - isMain: boolean - regionKey: string - }[] - > + getDbClients: () => Promise } type MetricsMonitor = { @@ -32,12 +25,7 @@ type MetricsMonitor = { type WithOnDemandCollector = T & { triggerCollect?: (params: { - dbClients: { - // eslint-disable-next-line @typescript-eslint/no-explicit-any - client: Knex | undefined - isMain: boolean - regionKey: string - }[] + dbClients: DbClient[] 
mainDbClient: Knex }) => Promise } @@ -63,33 +51,27 @@ function initMonitoringMetrics(params: { dbSize.triggerCollect = async (params) => { const { dbClients } = params await Promise.all( - dbClients.map(async ({ client, regionKey }) => { - if (!client) { - logger.error({ regionKey }, 'Could not get private client for region') - return - } - logger.info({ regionKey }, 'Collecting monitoring metrics for region') - const connectionString: string = String( - get(client.client, ['config', 'connection', 'connectionString'], '') - ) - if (!connectionString) { + dbClients.map(async ({ client, regionKey, databaseName }) => { + if (!databaseName) { logger.warn( - { regionKey }, - 'Could not get connection string from client config' - ) - } - const databaseName = new URL(connectionString).pathname?.split('/').pop() - if (databaseName) { - const dbSizeResult = await client.raw<{ - rows: [{ pg_database_size: string }] //bigints are returned as strings - }>('SELECT pg_database_size(?) LIMIT 1', [databaseName]) - dbSize.set( - { ...labels, region: regionKey }, - parseInt(dbSizeResult.rows[0].pg_database_size) //FIXME risk this bigint being too big for JS, but that would be a very large database! + { region: regionKey }, + "Could not get database name from client config for region '{region}'" ) - } else { - logger.warn({ regionKey }, 'Could not get database name from client config') + return } + + logger.info( + { region: regionKey, databaseName }, + "Collecting database size for region '{region}' from database '{databaseName}'" + ) + + const dbSizeResult = await client.raw<{ + rows: [{ pg_database_size: string }] //bigints are returned as strings + }>('SELECT pg_database_size(?) LIMIT 1', [databaseName]) + dbSize.set( + { ...labels, region: regionKey }, + parseInt(dbSizeResult.rows[0].pg_database_size) //NOTE risk this bigint being too big for JS, but that would be a very large database! 
+ ) }) ) } @@ -285,33 +267,6 @@ function initMonitoringMetrics(params: { const { dbClients } = params await Promise.all( dbClients.map(async ({ client, regionKey }) => { - if (!client) { - logger.error({ regionKey }, 'Could not get private client for region') - return - } - logger.info({ regionKey }, 'Collecting monitoring metrics for region') - const connectionString: string = String( - get(client.client, ['config', 'connection', 'connectionString'], '') - ) - if (!connectionString) { - logger.warn( - { regionKey }, - 'Could not get connection string from client config' - ) - } - const databaseName = new URL(connectionString).pathname?.split('/').pop() - if (databaseName) { - const dbSizeResult = await client.raw<{ - rows: [{ pg_database_size: string }] //bigints are returned as strings - }>('SELECT pg_database_size(?) LIMIT 1', [databaseName]) - dbSize.set( - { ...labels, region: regionKey }, - parseInt(dbSizeResult.rows[0].pg_database_size) //FIXME risk this bigint being too big for JS, but that would be a very large database! - ) - } else { - logger.warn({ regionKey }, 'Could not get database name from client config') - } - const tableSizeResults = await client.raw<{ rows: [{ table_name: string; table_size: string }] //bigints are returned as strings }>( @@ -336,7 +291,7 @@ function initMonitoringMetrics(params: { for (const row of tableSizeResults.rows) { tablesize.set( { ...labels, table: row.table_name, region: regionKey }, - parseInt(row.table_size) //FIXME risk this bigint being too big for JS + parseInt(row.table_size) //NOTE risk this bigint being too big for JS, but that would be a very large table! 
) } }) diff --git a/packages/shared/src/environment/multiRegionConfig.ts b/packages/shared/src/environment/multiRegionConfig.ts index 27c504165a..b308d85615 100644 --- a/packages/shared/src/environment/multiRegionConfig.ts +++ b/packages/shared/src/environment/multiRegionConfig.ts @@ -13,6 +13,12 @@ const regionConfigSchemaV1 = z.object({ .describe( 'Full Postgres connection URI (e.g. "postgres://user:password@host:port/dbname")' ), + databaseName: z + .string() + .describe( + 'Name of the database to connect to. Used where the connection string is to a connection pool, and does not include the database name.' + ) + .optional(), privateConnectionUri: z .string() .describe( From 64f606e00ce789cd10cbea00eafe4723a77a1fae Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 12:24:33 +0000 Subject: [PATCH 07/27] We don't have any tests, simplify the PR by removing the unused files for now --- packages/database-monitor/src/aliasLoader.ts | 6 +- .../database-monitor/tests/helpers/helpers.ts | 45 ------------- .../tests/helpers/testExtensions.ts | 65 ------------------- .../tests/helpers/testKnexClient.ts | 20 ------ .../tests/hooks/globalSetup.ts | 50 -------------- packages/database-monitor/tsconfig.build.json | 2 +- packages/database-monitor/tsconfig.json | 5 +- packages/database-monitor/vitest.config.ts | 4 +- 8 files changed, 6 insertions(+), 191 deletions(-) delete mode 100644 packages/database-monitor/tests/helpers/helpers.ts delete mode 100644 packages/database-monitor/tests/helpers/testExtensions.ts delete mode 100644 packages/database-monitor/tests/helpers/testKnexClient.ts delete mode 100644 packages/database-monitor/tests/hooks/globalSetup.ts diff --git a/packages/database-monitor/src/aliasLoader.ts b/packages/database-monitor/src/aliasLoader.ts index 8deeda0895..d77980f8da 100644 --- a/packages/database-monitor/src/aliasLoader.ts +++ b/packages/database-monitor/src/aliasLoader.ts @@ -1,8 +1,6 @@ import 
generateAliasesResolver from 'esm-module-alias' -import { packageRoot, srcRoot } from './root.js' -import path from 'node:path' +import { srcRoot } from './root.js' export const resolve = generateAliasesResolver({ - '@': srcRoot, - '#': path.resolve(packageRoot, './tests') + '@': srcRoot }) diff --git a/packages/database-monitor/tests/helpers/helpers.ts b/packages/database-monitor/tests/helpers/helpers.ts deleted file mode 100644 index 5ddaa5c683..0000000000 --- a/packages/database-monitor/tests/helpers/helpers.ts +++ /dev/null @@ -1,45 +0,0 @@ -import { startServer } from '@/server/server.js' -import http from 'http' -import type { AddressInfo } from 'net' -import { getPostgresConnectionString } from '@/utils/env.js' - -export const startAndWaitOnServers = async () => { - let metricsServerAddress: string | AddressInfo | null = null - - const { metricsServer } = startServer({ - serveOnRandomPort: true - }) - metricsServer.on('listening', () => { - metricsServerAddress = metricsServer.address() - }) - - //HACK wait until both servers are available - while (!metricsServerAddress) { - // wait for the servers to start - await new Promise((resolve) => setTimeout(resolve, 100)) - } - - return { metricsServer } -} - -export const getServerPort = (server: http.Server) => { - const address = server.address() - if (address && typeof address !== 'string') { - return address.port - } - throw new Error('Server port is not available') -} - -export const customizePostgresConnectionString = (databaseName?: string) => { - const originalPostgresConnectionString = getPostgresConnectionString() - if (!databaseName) return originalPostgresConnectionString - - const originalPostgresUrl = new URL(originalPostgresConnectionString) - const protocol = originalPostgresUrl.protocol - const user = originalPostgresUrl.username - const pass = originalPostgresUrl.password - const host = originalPostgresUrl.hostname - const port = originalPostgresUrl.port - const origin = 
`${protocol}//${user}:${pass}@${host}:${port}` - return new URL(databaseName, origin).toString() -} diff --git a/packages/database-monitor/tests/helpers/testExtensions.ts b/packages/database-monitor/tests/helpers/testExtensions.ts deleted file mode 100644 index 36f570e294..0000000000 --- a/packages/database-monitor/tests/helpers/testExtensions.ts +++ /dev/null @@ -1,65 +0,0 @@ -import { stopServer } from '@/server/server.js' -import { inject, test } from 'vitest' -import { getTestDb } from '#/helpers/testKnexClient.js' -import { startAndWaitOnServers } from '#/helpers/helpers.js' -import type { Knex } from 'knex' -import { Server } from 'http' - -export interface DatabaseIntegrationTestContext { - context: { - db: Knex.Transaction - } -} - -// vitest reference: https://vitest.dev/guide/test-context#fixture-initialization -export const databaseIntegrationTest = test.extend({ - // this key has to match the top level key in the interface (i.e. `context`). Some vitest typing magic at work here. - context: [ - // eslint-disable-next-line @typescript-eslint/no-unused-vars - async ({ task, onTestFinished }, use) => { - const dbName = inject('dbName') - // equivalent of beforeEach - const db = await getTestDb(dbName).transaction() - - // schedule the cleanup. Runs regardless of test status, and runs after afterEach. - onTestFinished(async () => { - await db.rollback() - }) - - // now run the test - await use({ db }) - }, - { auto: true } // we want to run this for each databaseIntegrationTest, even if the context is not explicitly requested by the test - ] -}) - -export interface E2ETestContext extends DatabaseIntegrationTestContext { - context: { - db: Knex.Transaction - metricsServer: Server - } -} - -// vitest reference: https://vitest.dev/guide/test-context#fixture-initialization -export const e2eTest = test.extend({ - // this key has to match the top level key in the interface (i.e. `context`). Some vitest typing magic at work here. 
- context: [ - // eslint-disable-next-line @typescript-eslint/no-unused-vars - async ({ task, onTestFinished }, use) => { - const dbName = inject('dbName') - // equivalent of beforeEach - const db = await getTestDb(dbName).transaction() - const { metricsServer } = await startAndWaitOnServers() - - // schedule the cleanup. Runs regardless of test status, and runs after afterEach. - onTestFinished(async () => { - if (metricsServer) stopServer({ server: metricsServer }) - if (db) await db.rollback() - }) - - // now run the test - await use({ db, metricsServer }) - }, - { auto: true } // we want to run this for each e2eTest, even if the context is not explicitly requested by the test - ] -}) diff --git a/packages/database-monitor/tests/helpers/testKnexClient.ts b/packages/database-monitor/tests/helpers/testKnexClient.ts deleted file mode 100644 index 008605bb96..0000000000 --- a/packages/database-monitor/tests/helpers/testKnexClient.ts +++ /dev/null @@ -1,20 +0,0 @@ -/* eslint-disable camelcase */ -import { knex } from 'knex' -import { customizePostgresConnectionString } from '#/helpers/helpers.js' - -export const getTestDb = (databaseName?: string) => - knex({ - client: 'pg', - connection: { - application_name: 'speckle_database_monitor', - connectionString: customizePostgresConnectionString(databaseName) - }, - pool: { min: 0, max: 2 } - // migrations are managed in the server package for production - // for tests, we are creating a new database for each test run so we can't use this default migration functionality - // migrations: { - // extension: '.ts', - // directory: path.resolve(__dirname, '../migrations'), - // loadExtensions: ['js', 'ts'] - // } - }) diff --git a/packages/database-monitor/tests/hooks/globalSetup.ts b/packages/database-monitor/tests/hooks/globalSetup.ts deleted file mode 100644 index ce050da6fa..0000000000 --- a/packages/database-monitor/tests/hooks/globalSetup.ts +++ /dev/null @@ -1,50 +0,0 @@ -/** - * These hooks are run once, before and 
after the test suite. - * It is configured via the vitest.config.ts file. - */ -import '@/bootstrap.js' // This has side-effects and has to be imported first -import { testLogger as logger } from '@/observability/logging.js' -import cryptoRandomString from 'crypto-random-string' -import type { GlobalSetupContext } from 'vitest/node' - -declare module 'vitest' { - export interface ProvidedContext { - dbName: string - } -} - -const dbName = - process.env.TEST_DB || - `preview_service_${cryptoRandomString({ - length: 10, - type: 'alphanumeric' - })}`.toLocaleLowerCase() //postgres will automatically lower case new db names - -/** - * Global setup hook - * This hook is run once before any tests are run - * Defined in vitest.config.ts under test.globalSetup - */ -export function setup({ provide }: GlobalSetupContext) { - logger.info('🏃🏻‍♀️‍➡️ Running vitest setup global hook') - - // this provides the dbName to all tests, and can be accessed via inject('dbName'). NB: The test extensions already implement this, so use a test extension. - provide('dbName', dbName) - - logger.info( - `💁🏽‍♀️ Completed the vitest setup global hook. Database created at ${dbName}` - ) -} - -/** - * Global teardown hook - * This hook is run once after all tests are run - * Defined in vitest.config.ts under test.globalTeardown - */ -export function teardown() { - logger.info('🏃🏻‍♀️ Running vitest teardown global hook') - - logger.info( - `✅ Completed the vitest teardown global hook. 
Database ${dbName} down migrated.` - ) -} diff --git a/packages/database-monitor/tsconfig.build.json b/packages/database-monitor/tsconfig.build.json index f2d7b55d71..8e1bc28f9b 100644 --- a/packages/database-monitor/tsconfig.build.json +++ b/packages/database-monitor/tsconfig.build.json @@ -1,5 +1,5 @@ { "extends": "./tsconfig.json", "include": ["src/**/*"], - "exclude": ["**/*.spec.js", "**/*.spec.ts", "tests/**/*"] + "exclude": ["**/*.spec.js", "**/*.spec.ts"] } diff --git a/packages/database-monitor/tsconfig.json b/packages/database-monitor/tsconfig.json index 0eebc69565..4351d814b7 100644 --- a/packages/database-monitor/tsconfig.json +++ b/packages/database-monitor/tsconfig.json @@ -29,8 +29,7 @@ "moduleResolution": "node16" /* Specify how TypeScript looks up a file from a given module specifier. */, "baseUrl": "./" /* Specify the base directory to resolve non-relative module names. */, "paths": { - "@/*": ["./src/*"], - "#/*": ["./tests/*"] + "@/*": ["./src/*"] }, // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ // "typeRoots": [], /* Specify multiple folders that act like `./node_modules/@types`. */ @@ -104,6 +103,6 @@ "ts-node": { "swc": true }, - "include": ["src/**/*", "tests/**/*", "vitest.config.ts"], + "include": ["src/**/*", "vitest.config.ts"], "exclude": ["node_modules", "coverage", "reports"] } diff --git a/packages/database-monitor/vitest.config.ts b/packages/database-monitor/vitest.config.ts index a751fa9b1b..dbf4dedd5b 100644 --- a/packages/database-monitor/vitest.config.ts +++ b/packages/database-monitor/vitest.config.ts @@ -4,7 +4,6 @@ import { configDefaults, defineConfig } from 'vitest/config' export default defineConfig({ test: { exclude: [...configDefaults.exclude], - globalSetup: ['./tests/hooks/globalSetup.ts'], // reporters: ['verbose', 'hanging-process'] //uncomment to debug hanging processes etc. 
sequence: { shuffle: true, @@ -13,8 +12,7 @@ export default defineConfig({ }, resolve: { alias: { - '@': path.resolve(__dirname, './src'), - '#': path.resolve(__dirname, './tests') + '@': path.resolve(__dirname, './src') } } }) From 8bbfde547734b3206966a7a0e58cd345dc60eb8b Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:07:14 +0000 Subject: [PATCH 08/27] fix(server): correct type in notifications helper --- packages/server/test/notificationsHelper.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/server/test/notificationsHelper.ts b/packages/server/test/notificationsHelper.ts index 2d70d10aee..fb2d7d6518 100644 --- a/packages/server/test/notificationsHelper.ts +++ b/packages/server/test/notificationsHelper.ts @@ -89,7 +89,7 @@ export function buildNotificationsStateTracker() { * otherwise it might get processed so fast that you miss it */ waitForAck: async (predicate?: (e: AckEvent) => boolean, timeout = 3000) => { - let timeoutRef: NodeJS.Timer + let timeoutRef: NodeJS.Timeout let promiseAckTracker: (e: AckEvent) => void // We start tracking even before promise is created so that we can't possibly miss it From ef5921ed81e60f683e1a98f347e0b009a5959591 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:31:12 +0000 Subject: [PATCH 09/27] ci(publish): update publish job --- .circleci/config.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c3588c135f..4a79d3cc2c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1118,8 +1118,7 @@ jobs: docker-publish-monitor-container: <<: *publish-job environment: - FOLDER: utils - SPECKLE_SERVER_PACKAGE: monitor-deployment + SPECKLE_SERVER_PACKAGE: database-monitor docker-publish-docker-compose-ingress: <<: *publish-job From bb8355d8de608b5868d5bcac976c7a57781de353 Mon Sep 17 00:00:00 2001 From: 
Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:35:05 +0000 Subject: [PATCH 10/27] rename to ensure consistent image name with existing database monitor --- .circleci/config.yml | 4 +- .../.env.example | 0 .../Dockerfile | 10 +-- .../README.md | 0 .../eslint.config.mjs | 0 .../multiregion.example.json | 0 .../package.json | 4 +- .../src/aliasLoader.ts | 0 .../src/bin.ts | 0 .../src/bootstrap.ts | 0 .../src/clients/knex.ts | 0 .../src/domain/const.ts | 0 .../src/observability/expressLogging.ts | 0 .../src/observability/logging.ts | 2 +- .../src/observability/metricsApp.ts | 0 .../src/observability/metricsRoute.ts | 0 .../src/observability/prometheusMetrics.ts | 2 +- .../src/root.ts | 0 .../src/server/routes/index.ts | 0 .../src/server/server.ts | 0 .../src/utils/env.ts | 0 .../src/utils/errorHandler.ts | 0 .../tsconfig.build.json | 0 .../tsconfig.json | 0 .../vitest.config.ts | 0 yarn.lock | 78 +++++++++---------- 26 files changed, 50 insertions(+), 50 deletions(-) rename packages/{database-monitor => monitor-deployment}/.env.example (100%) rename packages/{database-monitor => monitor-deployment}/Dockerfile (80%) rename packages/{database-monitor => monitor-deployment}/README.md (100%) rename packages/{database-monitor => monitor-deployment}/eslint.config.mjs (100%) rename packages/{database-monitor => monitor-deployment}/multiregion.example.json (100%) rename packages/{database-monitor => monitor-deployment}/package.json (95%) rename packages/{database-monitor => monitor-deployment}/src/aliasLoader.ts (100%) rename packages/{database-monitor => monitor-deployment}/src/bin.ts (100%) rename packages/{database-monitor => monitor-deployment}/src/bootstrap.ts (100%) rename packages/{database-monitor => monitor-deployment}/src/clients/knex.ts (100%) rename packages/{database-monitor => monitor-deployment}/src/domain/const.ts (100%) rename packages/{database-monitor => monitor-deployment}/src/observability/expressLogging.ts (100%) rename 
packages/{database-monitor => monitor-deployment}/src/observability/logging.ts (95%) rename packages/{database-monitor => monitor-deployment}/src/observability/metricsApp.ts (100%) rename packages/{database-monitor => monitor-deployment}/src/observability/metricsRoute.ts (100%) rename packages/{database-monitor => monitor-deployment}/src/observability/prometheusMetrics.ts (99%) rename packages/{database-monitor => monitor-deployment}/src/root.ts (100%) rename packages/{database-monitor => monitor-deployment}/src/server/routes/index.ts (100%) rename packages/{database-monitor => monitor-deployment}/src/server/server.ts (100%) rename packages/{database-monitor => monitor-deployment}/src/utils/env.ts (100%) rename packages/{database-monitor => monitor-deployment}/src/utils/errorHandler.ts (100%) rename packages/{database-monitor => monitor-deployment}/tsconfig.build.json (100%) rename packages/{database-monitor => monitor-deployment}/tsconfig.json (100%) rename packages/{database-monitor => monitor-deployment}/vitest.config.ts (100%) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4a79d3cc2c..78c50c415c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1049,7 +1049,7 @@ jobs: docker-build-monitor-container: <<: *build-job environment: - SPECKLE_SERVER_PACKAGE: database-monitor + SPECKLE_SERVER_PACKAGE: monitor-deployment docker-build-docker-compose-ingress: <<: *build-job @@ -1118,7 +1118,7 @@ jobs: docker-publish-monitor-container: <<: *publish-job environment: - SPECKLE_SERVER_PACKAGE: database-monitor + SPECKLE_SERVER_PACKAGE: monitor-deployment docker-publish-docker-compose-ingress: <<: *publish-job diff --git a/packages/database-monitor/.env.example b/packages/monitor-deployment/.env.example similarity index 100% rename from packages/database-monitor/.env.example rename to packages/monitor-deployment/.env.example diff --git a/packages/database-monitor/Dockerfile b/packages/monitor-deployment/Dockerfile similarity index 80% rename 
from packages/database-monitor/Dockerfile rename to packages/monitor-deployment/Dockerfile index 48e25ead95..19e296d22d 100644 --- a/packages/database-monitor/Dockerfile +++ b/packages/monitor-deployment/Dockerfile @@ -22,17 +22,17 @@ COPY package.json yarn.lock ./ # Only copy in the relevant package.json files for the dependencies COPY packages/shared/package.json ./packages/shared/ -COPY packages/database-monitor/package.json ./packages/database-monitor/ +COPY packages/monitor-deployment/package.json ./packages/monitor-deployment/ RUN yarn workspaces focus -A && yarn install # Only copy in the relevant source files for the dependencies COPY packages/shared ./packages/shared/ -COPY packages/database-monitor ./packages/database-monitor/ +COPY packages/monitor-deployment ./packages/monitor-deployment/ RUN yarn workspaces foreach -W run build -WORKDIR /speckle-server/packages/database-monitor +WORKDIR /speckle-server/packages/monitor-deployment RUN yarn workspaces focus --production FROM gcr.io/distroless/nodejs18-debian12:nonroot@sha256:afdea027580f7afcaf1f316b2b3806690c297cb3ce6ddc5cf6a15804dc1c790f AS production-stage @@ -43,9 +43,9 @@ ENV NODE_ENV=${NODE_ENV} WORKDIR /speckle-server COPY --from=build-stage /speckle-server/tini /usr/bin/tini COPY --from=build-stage /speckle-server/packages/shared ./packages/shared -COPY --from=build-stage /speckle-server/packages/database-monitor ./packages/database-monitor +COPY --from=build-stage /speckle-server/packages/monitor-deployment ./packages/monitor-deployment COPY --from=build-stage /speckle-server/node_modules ./node_modules -WORKDIR /speckle-server/packages/database-monitor +WORKDIR /speckle-server/packages/monitor-deployment ENTRYPOINT [ "tini", "--", "/nodejs/bin/node", "--loader=./dist/src/aliasLoader.js", "bin/www.js" ] diff --git a/packages/database-monitor/README.md b/packages/monitor-deployment/README.md similarity index 100% rename from packages/database-monitor/README.md rename to 
packages/monitor-deployment/README.md diff --git a/packages/database-monitor/eslint.config.mjs b/packages/monitor-deployment/eslint.config.mjs similarity index 100% rename from packages/database-monitor/eslint.config.mjs rename to packages/monitor-deployment/eslint.config.mjs diff --git a/packages/database-monitor/multiregion.example.json b/packages/monitor-deployment/multiregion.example.json similarity index 100% rename from packages/database-monitor/multiregion.example.json rename to packages/monitor-deployment/multiregion.example.json diff --git a/packages/database-monitor/package.json b/packages/monitor-deployment/package.json similarity index 95% rename from packages/database-monitor/package.json rename to packages/monitor-deployment/package.json index 7d1d743dfb..ae6fbc60bc 100644 --- a/packages/database-monitor/package.json +++ b/packages/monitor-deployment/package.json @@ -1,5 +1,5 @@ { - "name": "@speckle/database-monitor", + "name": "@speckle/monitor-deployment", "private": true, "version": "2.5.4", "description": "Query connected databases and generate metrics.", @@ -8,7 +8,7 @@ "repository": { "type": "git", "url": "https://github.com/specklesystems/speckle-server.git", - "directory": "packages/database-monitor" + "directory": "packages/monitor-deployment" }, "type": "module", "engines": { diff --git a/packages/database-monitor/src/aliasLoader.ts b/packages/monitor-deployment/src/aliasLoader.ts similarity index 100% rename from packages/database-monitor/src/aliasLoader.ts rename to packages/monitor-deployment/src/aliasLoader.ts diff --git a/packages/database-monitor/src/bin.ts b/packages/monitor-deployment/src/bin.ts similarity index 100% rename from packages/database-monitor/src/bin.ts rename to packages/monitor-deployment/src/bin.ts diff --git a/packages/database-monitor/src/bootstrap.ts b/packages/monitor-deployment/src/bootstrap.ts similarity index 100% rename from packages/database-monitor/src/bootstrap.ts rename to 
packages/monitor-deployment/src/bootstrap.ts diff --git a/packages/database-monitor/src/clients/knex.ts b/packages/monitor-deployment/src/clients/knex.ts similarity index 100% rename from packages/database-monitor/src/clients/knex.ts rename to packages/monitor-deployment/src/clients/knex.ts diff --git a/packages/database-monitor/src/domain/const.ts b/packages/monitor-deployment/src/domain/const.ts similarity index 100% rename from packages/database-monitor/src/domain/const.ts rename to packages/monitor-deployment/src/domain/const.ts diff --git a/packages/database-monitor/src/observability/expressLogging.ts b/packages/monitor-deployment/src/observability/expressLogging.ts similarity index 100% rename from packages/database-monitor/src/observability/expressLogging.ts rename to packages/monitor-deployment/src/observability/expressLogging.ts diff --git a/packages/database-monitor/src/observability/logging.ts b/packages/monitor-deployment/src/observability/logging.ts similarity index 95% rename from packages/database-monitor/src/observability/logging.ts rename to packages/monitor-deployment/src/observability/logging.ts index 8fef22a6c9..b947653829 100644 --- a/packages/database-monitor/src/observability/logging.ts +++ b/packages/monitor-deployment/src/observability/logging.ts @@ -7,7 +7,7 @@ export const extendLoggerComponent = elc export const logger = extendLoggerComponent( getLogger(getLogLevel(), isLogPretty()), - 'database-monitor' + 'monitor-deployment' ) export const serverLogger = extendLoggerComponent(logger, 'server') export const testLogger = getLogger(getLogLevel(), isLogPretty()) diff --git a/packages/database-monitor/src/observability/metricsApp.ts b/packages/monitor-deployment/src/observability/metricsApp.ts similarity index 100% rename from packages/database-monitor/src/observability/metricsApp.ts rename to packages/monitor-deployment/src/observability/metricsApp.ts diff --git a/packages/database-monitor/src/observability/metricsRoute.ts 
b/packages/monitor-deployment/src/observability/metricsRoute.ts similarity index 100% rename from packages/database-monitor/src/observability/metricsRoute.ts rename to packages/monitor-deployment/src/observability/metricsRoute.ts diff --git a/packages/database-monitor/src/observability/prometheusMetrics.ts b/packages/monitor-deployment/src/observability/prometheusMetrics.ts similarity index 99% rename from packages/database-monitor/src/observability/prometheusMetrics.ts rename to packages/monitor-deployment/src/observability/prometheusMetrics.ts index 324b2f8837..aa25568c21 100644 --- a/packages/database-monitor/src/observability/prometheusMetrics.ts +++ b/packages/monitor-deployment/src/observability/prometheusMetrics.ts @@ -365,7 +365,7 @@ export function initPrometheusMetrics() { prometheusClient.register.clear() prometheusClient.register.setDefaultLabels({ project: 'speckle-server', - app: 'database-monitor' + app: 'monitor-deployment' }) try { diff --git a/packages/database-monitor/src/root.ts b/packages/monitor-deployment/src/root.ts similarity index 100% rename from packages/database-monitor/src/root.ts rename to packages/monitor-deployment/src/root.ts diff --git a/packages/database-monitor/src/server/routes/index.ts b/packages/monitor-deployment/src/server/routes/index.ts similarity index 100% rename from packages/database-monitor/src/server/routes/index.ts rename to packages/monitor-deployment/src/server/routes/index.ts diff --git a/packages/database-monitor/src/server/server.ts b/packages/monitor-deployment/src/server/server.ts similarity index 100% rename from packages/database-monitor/src/server/server.ts rename to packages/monitor-deployment/src/server/server.ts diff --git a/packages/database-monitor/src/utils/env.ts b/packages/monitor-deployment/src/utils/env.ts similarity index 100% rename from packages/database-monitor/src/utils/env.ts rename to packages/monitor-deployment/src/utils/env.ts diff --git 
a/packages/database-monitor/src/utils/errorHandler.ts b/packages/monitor-deployment/src/utils/errorHandler.ts similarity index 100% rename from packages/database-monitor/src/utils/errorHandler.ts rename to packages/monitor-deployment/src/utils/errorHandler.ts diff --git a/packages/database-monitor/tsconfig.build.json b/packages/monitor-deployment/tsconfig.build.json similarity index 100% rename from packages/database-monitor/tsconfig.build.json rename to packages/monitor-deployment/tsconfig.build.json diff --git a/packages/database-monitor/tsconfig.json b/packages/monitor-deployment/tsconfig.json similarity index 100% rename from packages/database-monitor/tsconfig.json rename to packages/monitor-deployment/tsconfig.json diff --git a/packages/database-monitor/vitest.config.ts b/packages/monitor-deployment/vitest.config.ts similarity index 100% rename from packages/database-monitor/vitest.config.ts rename to packages/monitor-deployment/vitest.config.ts diff --git a/yarn.lock b/yarn.lock index b402fbadd3..1726c53f86 100644 --- a/yarn.lock +++ b/yarn.lock @@ -16613,45 +16613,6 @@ __metadata: languageName: node linkType: hard -"@speckle/database-monitor@workspace:packages/database-monitor": - version: 0.0.0-use.local - resolution: "@speckle/database-monitor@workspace:packages/database-monitor" - dependencies: - "@speckle/shared": "workspace:^" - "@types/express": "npm:^4.17.13" - "@types/http-errors": "npm:^2.0.4" - "@types/lodash-es": "npm:^4.17.6" - "@types/node": "npm:^18.19.38" - "@vitest/coverage-istanbul": "npm:^1.6.0" - concurrently: "npm:^8.2.2" - crypto: "npm:^1.0.1" - crypto-random-string: "npm:^5.0.0" - dotenv: "npm:^16.4.5" - eslint: "npm:^9.4.0" - eslint-config-prettier: "npm:^9.1.0" - eslint-plugin-vitest: "npm:^0.5.4" - esm-module-alias: "npm:^2.2.0" - express: "npm:^4.19.2" - http-errors: "npm:~1.6.3" - knex: "npm:^2.4.1" - lodash: "npm:^4.17.21" - lodash-es: "npm:^4.17.21" - nodemon: "npm:^2.0.20" - pg: "npm:^8.7.3" - pino: "npm:^8.7.0" - pino-http: 
"npm:^8.2.1" - pino-pretty: "npm:^9.1.1" - prettier: "npm:^2.5.1" - prom-client: "npm:^14.0.1" - rimraf: "npm:^5.0.7" - typescript: "npm:^4.6.4" - typescript-eslint: "npm:^7.12.0" - vitest: "npm:^1.6.0" - znv: "npm:^0.4.0" - zod: "npm:^3.24.1" - languageName: unknown - linkType: soft - "@speckle/dui3@workspace:packages/dui3": version: 0.0.0-use.local resolution: "@speckle/dui3@workspace:packages/dui3" @@ -16949,6 +16910,45 @@ __metadata: languageName: unknown linkType: soft +"@speckle/monitor-deployment@workspace:packages/monitor-deployment": + version: 0.0.0-use.local + resolution: "@speckle/monitor-deployment@workspace:packages/monitor-deployment" + dependencies: + "@speckle/shared": "workspace:^" + "@types/express": "npm:^4.17.13" + "@types/http-errors": "npm:^2.0.4" + "@types/lodash-es": "npm:^4.17.6" + "@types/node": "npm:^18.19.38" + "@vitest/coverage-istanbul": "npm:^1.6.0" + concurrently: "npm:^8.2.2" + crypto: "npm:^1.0.1" + crypto-random-string: "npm:^5.0.0" + dotenv: "npm:^16.4.5" + eslint: "npm:^9.4.0" + eslint-config-prettier: "npm:^9.1.0" + eslint-plugin-vitest: "npm:^0.5.4" + esm-module-alias: "npm:^2.2.0" + express: "npm:^4.19.2" + http-errors: "npm:~1.6.3" + knex: "npm:^2.4.1" + lodash: "npm:^4.17.21" + lodash-es: "npm:^4.17.21" + nodemon: "npm:^2.0.20" + pg: "npm:^8.7.3" + pino: "npm:^8.7.0" + pino-http: "npm:^8.2.1" + pino-pretty: "npm:^9.1.1" + prettier: "npm:^2.5.1" + prom-client: "npm:^14.0.1" + rimraf: "npm:^5.0.7" + typescript: "npm:^4.6.4" + typescript-eslint: "npm:^7.12.0" + vitest: "npm:^1.6.0" + znv: "npm:^0.4.0" + zod: "npm:^3.24.1" + languageName: unknown + linkType: soft + "@speckle/objectloader@npm:^2.17.8": version: 2.17.9 resolution: "@speckle/objectloader@npm:2.17.9" From b24c23b4506351eba8cbce3d47bd8c023b695874 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:05:54 +0000 Subject: [PATCH 11/27] Do not gitignore some bin directories --- .gitignore | 4 +++- 1 file 
changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8c373e06f6..449785fc3b 100644 --- a/.gitignore +++ b/.gitignore @@ -74,7 +74,9 @@ redis-data/ .tshy-build obj/ bin/ - +!packages/monitor-deployment/bin +!packages/preview-service/bin +!packages/server/bin # Server multiregion.json From e99316f2f57e52d37626c22289ee4441b0211ac4 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:07:14 +0000 Subject: [PATCH 12/27] Include bin/www.js file --- packages/monitor-deployment/bin/www.js | 2 ++ 1 file changed, 2 insertions(+) create mode 100755 packages/monitor-deployment/bin/www.js diff --git a/packages/monitor-deployment/bin/www.js b/packages/monitor-deployment/bin/www.js new file mode 100755 index 0000000000..5bbe596274 --- /dev/null +++ b/packages/monitor-deployment/bin/www.js @@ -0,0 +1,2 @@ +#!/usr/bin/env node +import '../dist/src/bin.js' From e3c298c3281a5e8224f72b5e75d612f4b8b44310 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:14:21 +0000 Subject: [PATCH 13/27] All env vars are configurable via helm chart --- .../src/observability/prometheusMetrics.ts | 18 ++++++---- .../templates/monitoring/deployment.yml | 33 ++++++++++++++++++- .../templates/monitoring/service.yml | 2 +- utils/helm/speckle-server/values.schema.json | 15 +++++++++ utils/helm/speckle-server/values.yaml | 11 +++++++ 5 files changed, 71 insertions(+), 8 deletions(-) diff --git a/packages/monitor-deployment/src/observability/prometheusMetrics.ts b/packages/monitor-deployment/src/observability/prometheusMetrics.ts index aa25568c21..696437461c 100644 --- a/packages/monitor-deployment/src/observability/prometheusMetrics.ts +++ b/packages/monitor-deployment/src/observability/prometheusMetrics.ts @@ -137,7 +137,7 @@ function initMonitoringMetrics(params: { fileimports.triggerCollect = async (params) => { const { mainDbClient } = params const 
importedFiles = await mainDbClient.raw<{ - rows: [{ fileType: string; convertedStatus: number; count: number }] + rows: [{ fileType: string; convertedStatus: number; count: string }] }>( ` SELECT LOWER("fileType") AS "fileType", "convertedStatus", count(*) @@ -175,7 +175,7 @@ function initMonitoringMetrics(params: { }) fileimports.set( { ...labels, filetype: row.fileType, status: row.convertedStatus.toString() }, - row.count + parseInt(row.count) ) } // zero-values for all remaining file types and statuses @@ -213,7 +213,7 @@ function initMonitoringMetrics(params: { webhooks.triggerCollect = async (params) => { const { mainDbClient } = params const webhookResults = await mainDbClient.raw<{ - rows: [{ status: number; count: number }] + rows: [{ status: number; count: string }] }>( ` SELECT status, count(*) @@ -224,7 +224,10 @@ function initMonitoringMetrics(params: { const remainingWebhookStatus = new Set(Array(4).keys()) for (const row of webhookResults.rows) { remainingWebhookStatus.delete(row.status) - webhooks.set({ ...labels, status: row.status.toString() }, row.count) + webhooks.set( + { ...labels, status: row.status.toString() }, + parseInt(row.count) //NOTE risk this bigint being too big for JS, but that would be a very large number of webhooks + ) } // zero-values for all remaining webhook statuses remainingWebhookStatus.forEach((status) => { @@ -240,7 +243,7 @@ function initMonitoringMetrics(params: { previews.triggerCollect = async (params) => { const { mainDbClient } = params const previewStatusResults = await mainDbClient.raw<{ - rows: [{ previewStatus: number; count: number }] + rows: [{ previewStatus: number; count: string }] }>(` SELECT "previewStatus", count(*) FROM object_preview @@ -250,7 +253,10 @@ function initMonitoringMetrics(params: { const remainingPreviewStatus = new Set(Array(4).keys()) for (const row of previewStatusResults.rows) { remainingPreviewStatus.delete(row.previewStatus) - previews.set({ ...labels, status: 
row.previewStatus.toString() }, row.count) + previews.set( + { ...labels, status: row.previewStatus.toString() }, + parseInt(row.count) + ) } // zero-values for all remaining preview statuses remainingPreviewStatus.forEach((status) => { diff --git a/utils/helm/speckle-server/templates/monitoring/deployment.yml b/utils/helm/speckle-server/templates/monitoring/deployment.yml index 725edf3772..3a922967fa 100644 --- a/utils/helm/speckle-server/templates/monitoring/deployment.yml +++ b/utils/helm/speckle-server/templates/monitoring/deployment.yml @@ -25,7 +25,7 @@ spec: ports: - name: metrics - containerPort: 9092 + containerPort: {{ .Values.monitoring.port }} protocol: TCP resources: @@ -63,6 +63,12 @@ spec: secretKeyRef: name: {{ default .Values.secretName .Values.db.connectionString.secretName }} key: {{ default "postgres_url" .Values.db.connectionString.secretKey }} + - name: POSTGRES_MAX_CONNECTIONS + value: {{ .Values.monitoring.maximumPostgresConnections | quote }} + - name: PROMETHEUS_METRICS_PORT + value: {{ .Values.monitoring.port | quote }} + - name: METRICS_COLLECTION_PERIOD_SECONDS + value: {{ .Values.monitoring.metricsCollectionPeriodSeconds | quote }} {{- if .Values.db.useCertificate }} - name: NODE_EXTRA_CA_CERTS @@ -94,6 +100,31 @@ spec: seccompProfile: type: RuntimeDefault + startupProbe: + periodSeconds: 10 + failureThreshold: 60 # 10*60 = 600s; allows for long-running db migrations + timeoutSeconds: 3 + httpGet: + path: / + port: {{ .Values.monitoring.port }} + + livenessProbe: + periodSeconds: 60 + timeoutSeconds: 3 + failureThreshold: 3 + httpGet: + path: / + port: {{ .Values.monitoring.port }} + + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 3 + httpGet: + path: /metrics + port: {{ .Values.monitoring.port }} + volumes: {{- if .Values.db.useCertificate }} - name: postgres-certificate diff --git a/utils/helm/speckle-server/templates/monitoring/service.yml 
b/utils/helm/speckle-server/templates/monitoring/service.yml index 0beec6abb0..b27e0ac33c 100644 --- a/utils/helm/speckle-server/templates/monitoring/service.yml +++ b/utils/helm/speckle-server/templates/monitoring/service.yml @@ -12,5 +12,5 @@ spec: ports: - protocol: TCP name: web - port: 9092 + port: {{ .Values.monitoring.port }} targetPort: metrics diff --git a/utils/helm/speckle-server/values.schema.json b/utils/helm/speckle-server/values.schema.json index dafd836a48..e970b2244e 100644 --- a/utils/helm/speckle-server/values.schema.json +++ b/utils/helm/speckle-server/values.schema.json @@ -2187,6 +2187,21 @@ "description": "If enabled, will output logs in a human-readable format. Otherwise, logs will be output in JSON format.", "default": false }, + "port": { + "type": "number", + "description": "The port on which the Monitoring Service will run.", + "default": 9092 + }, + "maximumPostgresConnections": { + "type": "number", + "description": "The maximum number of connections that the Monitoring Service will allow to the Postgres database. A connection pool exists to manage access to the connections.", + "default": 2 + }, + "metricsCollectionPeriodSeconds": { + "type": "number", + "description": "The period in seconds at which the Monitoring Service will query the Postgres database for metrics. Unlike typical Prometheus metrics, the data from the database is not collected in real-time when /metrics is accessed, and is instead done out-of-band on a timed interval.", + "default": 120 + }, "image": { "type": "string", "description": "The Docker image to be used for the Speckle Monitoring component. If blank, defaults to speckle/speckle-monitoring-deployment:{{ .Values.docker_image_tag }}. If provided, this value should be the full path including tag. 
The docker_image_tag value will be ignored.", diff --git a/utils/helm/speckle-server/values.yaml b/utils/helm/speckle-server/values.yaml index 2076ed2e38..a6e50ef8b4 100644 --- a/utils/helm/speckle-server/values.yaml +++ b/utils/helm/speckle-server/values.yaml @@ -1356,6 +1356,17 @@ monitoring: ## logPretty: false + ## @param monitoring.port The port on which the Monitoring Service will run. + ## + port: 9092 + + ## @param monitoring.maximumPostgresConnections The maximum number of connections that the Monitoring Service will allow to the Postgres database. A connection pool exists to manage access to the connections. + ## + maximumPostgresConnections: 2 + + ## @param monitoring.metricsCollectionPeriodSeconds The period in seconds at which the Monitoring Service will query the Postgres database for metrics. Unlike typical Prometheus metrics, the data from the database is not collected in real-time when /metrics is accessed, and is instead done out-of-band on a timed interval. + metricsCollectionPeriodSeconds: 120 + ## @param monitoring.image The Docker image to be used for the Speckle Monitoring component. If blank, defaults to speckle/speckle-monitoring-deployment:{{ .Values.docker_image_tag }}. If provided, this value should be the full path including tag. The docker_image_tag value will be ignored. 
## image: '' From 439acf1bd383f579b37232b2feafb61e3c80e506 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:49:07 +0000 Subject: [PATCH 14/27] Allows configuration of database name via env var - fixes helm chart indentation --- .../monitor-deployment/src/clients/knex.ts | 8 +-- packages/monitor-deployment/src/utils/env.ts | 1 + .../templates/monitoring/deployment.yml | 49 ++++++++++--------- utils/helm/speckle-server/values.schema.json | 5 ++ utils/helm/speckle-server/values.yaml | 3 ++ 5 files changed, 41 insertions(+), 25 deletions(-) diff --git a/packages/monitor-deployment/src/clients/knex.ts b/packages/monitor-deployment/src/clients/knex.ts index 878ad7c6a9..5aa0d6c4d9 100644 --- a/packages/monitor-deployment/src/clients/knex.ts +++ b/packages/monitor-deployment/src/clients/knex.ts @@ -1,5 +1,6 @@ import { knexLogger as logger } from '@/observability/logging.js' import { + getDatabaseName, getPostgresConnectionString, getPostgresMaxConnections, isDevOrTestEnv, @@ -45,9 +46,10 @@ export const getDbClients = async () => { }, configArgs ) - const databaseName = new URL(getPostgresConnectionString()).pathname - .split('/') - .pop() + const databaseName = + // try to get the database name from the environment variable, if not default to parsing the connection string + getDatabaseName() || + new URL(getPostgresConnectionString()).pathname.split('/').pop() dbClients = [ { client: mainClient.public, regionKey: 'main', isMain: true, databaseName } ] diff --git a/packages/monitor-deployment/src/utils/env.ts b/packages/monitor-deployment/src/utils/env.ts index d406421d85..6e27e961e2 100644 --- a/packages/monitor-deployment/src/utils/env.ts +++ b/packages/monitor-deployment/src/utils/env.ts @@ -13,6 +13,7 @@ export const getPostgresMaxConnections = () => export function databaseMonitorCollectionPeriodSeconds() { return getIntFromEnv('METRICS_COLLECTION_PERIOD_SECONDS', '120') } +export const getDatabaseName = 
() => process.env.POSTGRES_DATABASE export const isDevelopment = () => getNodeEnv() === 'development' || getNodeEnv() === 'dev' diff --git a/utils/helm/speckle-server/templates/monitoring/deployment.yml b/utils/helm/speckle-server/templates/monitoring/deployment.yml index 3a922967fa..d86d026ad5 100644 --- a/utils/helm/speckle-server/templates/monitoring/deployment.yml +++ b/utils/helm/speckle-server/templates/monitoring/deployment.yml @@ -70,6 +70,11 @@ spec: - name: METRICS_COLLECTION_PERIOD_SECONDS value: {{ .Values.monitoring.metricsCollectionPeriodSeconds | quote }} + {{- if .Values.db.databaseName }} + - name: POSTGRES_DATABASE + value: {{ .Values.db.databaseName | quote }} + {{- end }} + {{- if .Values.db.useCertificate }} - name: NODE_EXTRA_CA_CERTS value: "/postgres-certificate/ca-certificate.crt" @@ -100,30 +105,30 @@ spec: seccompProfile: type: RuntimeDefault - startupProbe: - periodSeconds: 10 - failureThreshold: 60 # 10*60 = 600s; allows for long-running db migrations - timeoutSeconds: 3 - httpGet: - path: / - port: {{ .Values.monitoring.port }} + startupProbe: + periodSeconds: 10 + failureThreshold: 60 # 10*60 = 600s; allows for long-running db migrations + timeoutSeconds: 3 + httpGet: + path: / + port: {{ .Values.monitoring.port }} - livenessProbe: - periodSeconds: 60 - timeoutSeconds: 3 - failureThreshold: 3 - httpGet: - path: / - port: {{ .Values.monitoring.port }} + livenessProbe: + periodSeconds: 60 + timeoutSeconds: 3 + failureThreshold: 3 + httpGet: + path: / + port: {{ .Values.monitoring.port }} - readinessProbe: - initialDelaySeconds: 5 - periodSeconds: 30 - timeoutSeconds: 10 - failureThreshold: 3 - httpGet: - path: /metrics - port: {{ .Values.monitoring.port }} + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 3 + httpGet: + path: /metrics + port: {{ .Values.monitoring.port }} volumes: {{- if .Values.db.useCertificate }} diff --git a/utils/helm/speckle-server/values.schema.json 
b/utils/helm/speckle-server/values.schema.json index e970b2244e..6dc6bfb0ae 100644 --- a/utils/helm/speckle-server/values.schema.json +++ b/utils/helm/speckle-server/values.schema.json @@ -236,6 +236,11 @@ "description": "This defines the level of security used when connecting to the Postgres database", "default": "require" }, + "databaseName": { + "type": "string", + "description": "(Optional) The name of the Postgres database to which Speckle will connect. Only required for the Database Monitoring utility when the connection string is to a database connection pool and multi-region is disabled, otherwise this value is ignored.", + "default": "" + }, "connectionString": { "type": "object", "properties": { diff --git a/utils/helm/speckle-server/values.yaml b/utils/helm/speckle-server/values.yaml index a6e50ef8b4..f52458ede6 100644 --- a/utils/helm/speckle-server/values.yaml +++ b/utils/helm/speckle-server/values.yaml @@ -190,6 +190,9 @@ db: ## ref: https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-PROTECTION ## PGSSLMODE: require + ## @param db.databaseName (Optional) The name of the Postgres database to which Speckle will connect. Only required for the Database Monitoring utility when the connection string is to a database connection pool and multi-region is disabled, otherwise this value is ignored. + databaseName: '' + connectionString: ## @param db.connectionString.secretName Required. A secret containing the full connection string to the Postgres database (e.g. in format of `protocol://username:password@host:port/database`) stored within the Kubernetes cluster as an opaque Kubernetes Secret. 
Ref: https://kubernetes.io/docs/concepts/configuration/secret/#opaque-secrets ## From 3059df4964e71d9ad3d8bd212f5cb7838c5cc375 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 17:19:39 +0000 Subject: [PATCH 15/27] fix helm chart indentation --- .../templates/monitoring/deployment.yml | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/utils/helm/speckle-server/templates/monitoring/deployment.yml b/utils/helm/speckle-server/templates/monitoring/deployment.yml index d86d026ad5..4eb02d141f 100644 --- a/utils/helm/speckle-server/templates/monitoring/deployment.yml +++ b/utils/helm/speckle-server/templates/monitoring/deployment.yml @@ -91,6 +91,31 @@ spec: - name: LOG_PRETTY value: {{ .Values.monitoring.logPretty | quote }} + startupProbe: + periodSeconds: 10 + failureThreshold: 60 # 10*60 = 600s; allows for long-running db migrations + timeoutSeconds: 3 + httpGet: + path: / + port: {{ .Values.monitoring.port }} + + livenessProbe: + periodSeconds: 60 + timeoutSeconds: 3 + failureThreshold: 3 + httpGet: + path: / + port: {{ .Values.monitoring.port }} + + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 3 + httpGet: + path: /metrics + port: {{ .Values.monitoring.port }} + priorityClassName: low-priority {{- if .Values.monitoring.serviceAccount.create }} serviceAccountName: {{ include "monitoring.name" $ }} @@ -105,31 +130,6 @@ spec: seccompProfile: type: RuntimeDefault - startupProbe: - periodSeconds: 10 - failureThreshold: 60 # 10*60 = 600s; allows for long-running db migrations - timeoutSeconds: 3 - httpGet: - path: / - port: {{ .Values.monitoring.port }} - - livenessProbe: - periodSeconds: 60 - timeoutSeconds: 3 - failureThreshold: 3 - httpGet: - path: / - port: {{ .Values.monitoring.port }} - - readinessProbe: - initialDelaySeconds: 5 - periodSeconds: 30 - timeoutSeconds: 10 - failureThreshold: 3 - httpGet: - path: 
/metrics - port: {{ .Values.monitoring.port }} - volumes: {{- if .Values.db.useCertificate }} - name: postgres-certificate From 3506d2f2ea894c0fbcb93cf8967a064acab51ed8 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 17:59:56 +0000 Subject: [PATCH 16/27] Metrics host must be externally exposed --- .../monitor-deployment/src/observability/prometheusMetrics.ts | 4 ++-- utils/helm/speckle-server/templates/monitoring/deployment.yml | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/packages/monitor-deployment/src/observability/prometheusMetrics.ts b/packages/monitor-deployment/src/observability/prometheusMetrics.ts index 696437461c..378df5e4f3 100644 --- a/packages/monitor-deployment/src/observability/prometheusMetrics.ts +++ b/packages/monitor-deployment/src/observability/prometheusMetrics.ts @@ -192,7 +192,7 @@ function initMonitoringMetrics(params: { filesize.triggerCollect = async (params) => { const { mainDbClient } = params const fileSizeResults = await mainDbClient.raw<{ - rows: [{ fileType: string; fileSize: number }] + rows: [{ fileType: string; fileSize: string }] }>( ` SELECT LOWER("fileType") AS fileType, SUM("fileSize") AS fileSize @@ -201,7 +201,7 @@ function initMonitoringMetrics(params: { ` ) for (const row of fileSizeResults.rows) { - filesize.set({ ...labels, filetype: row.fileType }, row.fileSize) + filesize.set({ ...labels, filetype: row.fileType }, parseInt(row.fileSize)) } } diff --git a/utils/helm/speckle-server/templates/monitoring/deployment.yml b/utils/helm/speckle-server/templates/monitoring/deployment.yml index 4eb02d141f..ecc628820b 100644 --- a/utils/helm/speckle-server/templates/monitoring/deployment.yml +++ b/utils/helm/speckle-server/templates/monitoring/deployment.yml @@ -65,6 +65,8 @@ spec: key: {{ default "postgres_url" .Values.db.connectionString.secretKey }} - name: POSTGRES_MAX_CONNECTIONS value: {{ .Values.monitoring.maximumPostgresConnections | quote 
}} + - name: METRICS_HOST + value: '0.0.0.0' # bind to all interfaces, not just localhost. Required to allow prometheus to scrape metrics, and healthchecks to work. - name: PROMETHEUS_METRICS_PORT value: {{ .Values.monitoring.port | quote }} - name: METRICS_COLLECTION_PERIOD_SECONDS From 89f924fb55692aacce967a28a664059be2c63bce Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 19:38:35 +0000 Subject: [PATCH 17/27] Express logging should be debug --- .../monitor-deployment/src/observability/expressLogging.ts | 5 +---- packages/monitor-deployment/src/observability/metricsApp.ts | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/packages/monitor-deployment/src/observability/expressLogging.ts b/packages/monitor-deployment/src/observability/expressLogging.ts index 8beb5a6437..0060defb88 100644 --- a/packages/monitor-deployment/src/observability/expressLogging.ts +++ b/packages/monitor-deployment/src/observability/expressLogging.ts @@ -25,9 +25,6 @@ export const loggingExpressMiddleware = pinoHttp({ // and we don't really care about 3xx stuff // all the user related 4xx responses are treated as info customLogLevel: (req, res, error) => { - const path = getRequestPath(req) - const shouldBeDebug = ['/metrics'].includes(path || '') ?? false - if (res.statusCode >= 400 && res.statusCode < 500) { return 'info' } else if (res.statusCode >= 500 || error) { @@ -36,7 +33,7 @@ export const loggingExpressMiddleware = pinoHttp({ return 'silent' } - return shouldBeDebug ? 'debug' : 'info' + return 'debug' }, // we need to redact any potential sensitive data from being logged. 
diff --git a/packages/monitor-deployment/src/observability/metricsApp.ts b/packages/monitor-deployment/src/observability/metricsApp.ts index 8754c979e3..8f6b30f1fb 100644 --- a/packages/monitor-deployment/src/observability/metricsApp.ts +++ b/packages/monitor-deployment/src/observability/metricsApp.ts @@ -10,6 +10,7 @@ export const appFactory = () => { initPrometheusMetrics() const app = express() + app.disable('x-powered-by') app.use(loggingExpressMiddleware) app.use(express.json({ limit: '100mb' })) app.use(express.urlencoded({ limit: '100mb', extended: false })) From e2ad346c7fa20f49919681ecd326961c001d606f Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 20:52:19 +0000 Subject: [PATCH 18/27] Fix issue with NaN in metrics, and allow self-signed CA certs --- packages/monitor-deployment/.env.example | 9 +++++++++ packages/monitor-deployment/README.md | 6 ++++++ packages/monitor-deployment/src/clients/knex.ts | 4 +++- .../src/observability/prometheusMetrics.ts | 12 +++++++----- packages/monitor-deployment/src/utils/env.ts | 1 + 5 files changed, 26 insertions(+), 6 deletions(-) diff --git a/packages/monitor-deployment/.env.example b/packages/monitor-deployment/.env.example index 3ac8c2c9ac..920c795dee 100644 --- a/packages/monitor-deployment/.env.example +++ b/packages/monitor-deployment/.env.example @@ -5,3 +5,12 @@ LOG_LEVEL='info' LOG_PRETTY='true' METRICS_COLLECTION_PERIOD_SECONDS='120' FF_WORKSPACES_MULTI_REGION_ENABLED='false' + +# Enable this if you want to use a custom CA certificate for the connection to the database +# You also have to save it in a file and set NODE_EXTRA_CA_CERTS when running `yarn start` or `yarn dev`` +# POSTGRES_CA_CERTIFICATE='-----BEGIN CERTIFICATE----- +# XXXX +# -----END CERTIFICATE-----' + +# Used if the database name to be queried is different from the path in the connection string, which is the case for connection pools +# POSTGRES_DATABASE='speckle' diff --git 
a/packages/monitor-deployment/README.md b/packages/monitor-deployment/README.md index 043b811e04..7cb8e3b861 100644 --- a/packages/monitor-deployment/README.md +++ b/packages/monitor-deployment/README.md @@ -10,6 +10,12 @@ Metrics are available at `/metrics` endpoint and are in Prometheus format. yarn dev ``` +## Databases with self-signed certificates + +Add the self-signed CA certificate to a file at `packages/monitor-deployment/ca-certificate.crt` + +Run `NODE_EXTRA_CA_CERTS=./ca-certificate.crt yarn dev` or `NODE_EXTRA_CA_CERTS=./ca-certificate.crt yarn start` + ## Production ```bash diff --git a/packages/monitor-deployment/src/clients/knex.ts b/packages/monitor-deployment/src/clients/knex.ts index 5aa0d6c4d9..6b617bcc6c 100644 --- a/packages/monitor-deployment/src/clients/knex.ts +++ b/packages/monitor-deployment/src/clients/knex.ts @@ -1,6 +1,7 @@ import { knexLogger as logger } from '@/observability/logging.js' import { getDatabaseName, + getPostgresCACertificate, getPostgresConnectionString, getPostgresMaxConnections, isDevOrTestEnv, @@ -41,7 +42,8 @@ export const getDbClients = async () => { const mainClient = configureKnexClient( { postgres: { - connectionUri: getPostgresConnectionString() + connectionUri: getPostgresConnectionString(), + publicTlsCertificate: getPostgresCACertificate() } }, configArgs diff --git a/packages/monitor-deployment/src/observability/prometheusMetrics.ts b/packages/monitor-deployment/src/observability/prometheusMetrics.ts index 378df5e4f3..78fbaf1ca2 100644 --- a/packages/monitor-deployment/src/observability/prometheusMetrics.ts +++ b/packages/monitor-deployment/src/observability/prometheusMetrics.ts @@ -169,9 +169,11 @@ function initMonitoringMetrics(params: { // now set the counts for the file types and statuses that are in the database for (const row of importedFiles.rows) { - remainingConvertedStatusAndFileTypes.delete({ - fileType: row.fileType, - status: row.convertedStatus + // objects are stored by reference, so we have 
to search for the original item in the set + remainingConvertedStatusAndFileTypes.forEach((item) => { + if (item.fileType === row.fileType && item.status === row.convertedStatus) { + remainingConvertedStatusAndFileTypes.delete(item) + } }) fileimports.set( { ...labels, filetype: row.fileType, status: row.convertedStatus.toString() }, @@ -192,7 +194,7 @@ function initMonitoringMetrics(params: { filesize.triggerCollect = async (params) => { const { mainDbClient } = params const fileSizeResults = await mainDbClient.raw<{ - rows: [{ fileType: string; fileSize: string }] + rows: [{ filetype: string; filesize: string }] }>( ` SELECT LOWER("fileType") AS fileType, SUM("fileSize") AS fileSize @@ -201,7 +203,7 @@ function initMonitoringMetrics(params: { ` ) for (const row of fileSizeResults.rows) { - filesize.set({ ...labels, filetype: row.fileType }, parseInt(row.fileSize)) + filesize.set({ ...labels, filetype: row.filetype }, parseInt(row.filesize)) } } diff --git a/packages/monitor-deployment/src/utils/env.ts b/packages/monitor-deployment/src/utils/env.ts index 6e27e961e2..e8fc6c597b 100644 --- a/packages/monitor-deployment/src/utils/env.ts +++ b/packages/monitor-deployment/src/utils/env.ts @@ -8,6 +8,7 @@ export const getMetricsPort = () => process.env.PROMETHEUS_METRICS_PORT || '9092 export const getNodeEnv = () => process.env.NODE_ENV || 'production' export const getPostgresConnectionString = () => process.env.PG_CONNECTION_STRING || 'postgres://speckle:speckle@127.0.0.1/speckle' +export const getPostgresCACertificate = () => process.env.POSTGRES_CA_CERTIFICATE export const getPostgresMaxConnections = () => parseInt(process.env.POSTGRES_MAX_CONNECTIONS || '2') export function databaseMonitorCollectionPeriodSeconds() { From cc464d8e57510abba3151efead107aae91727dd9 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Fri, 13 Dec 2024 21:08:39 +0000 Subject: [PATCH 19/27] Provide details about connections --- 
.../src/observability/prometheusMetrics.ts | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/packages/monitor-deployment/src/observability/prometheusMetrics.ts b/packages/monitor-deployment/src/observability/prometheusMetrics.ts index 78fbaf1ca2..50b50334f7 100644 --- a/packages/monitor-deployment/src/observability/prometheusMetrics.ts +++ b/packages/monitor-deployment/src/observability/prometheusMetrics.ts @@ -306,6 +306,49 @@ function initMonitoringMetrics(params: { ) } + const connections: WithOnDemandCollector> = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_used_connections'], '_'), + help: 'Number of active (used) database connections', + labelNames: ['region', ...labelNames] + }) + connections.triggerCollect = async (params) => { + const { dbClients } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + const connectionResults = await client.raw<{ + rows: [{ used_connections: string }] + }>(`SELECT COUNT(*) AS used_connections FROM pg_stat_activity;`) + connections.set( + { ...labels, region: regionKey }, + parseInt(connectionResults.rows[0].used_connections) + ) + }) + ) + } + + const totalConnections: WithOnDemandCollector> = + new prometheusClient.Gauge({ + name: join([namePrefix, 'db_total_connections'], '_'), + help: 'Total number of database connections', + labelNames: ['region', ...labelNames] + }) + totalConnections.triggerCollect = async (params) => { + const { dbClients } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + const connectionResults = await client.raw<{ + rows: [{ maximum_connections: number }] + }>( + `SELECT setting::int AS maximum_connections FROM pg_settings WHERE name=$$max_connections$$;` + ) + totalConnections.set( + { ...labels, region: regionKey }, + connectionResults.rows[0].maximum_connections + ) + }) + ) + } + const metricsToCollect = [ dbSize, tablesize, From 694dd261c3e649938ceb6497ffaf3aaacab00c3b Mon Sep 17 00:00:00 2001 
From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Sat, 14 Dec 2024 15:01:30 +0000 Subject: [PATCH 20/27] Improve error handling and add more logging of edge cases - as database monitor tries to monitor all configured databases, there may be unregistered databases which are not yet migrated and do not have the required tables - many of the queries are multi-region and now query all database clients --- .../src/observability/prometheusMetrics.ts | 347 ++++++++++++------ 1 file changed, 242 insertions(+), 105 deletions(-) diff --git a/packages/monitor-deployment/src/observability/prometheusMetrics.ts b/packages/monitor-deployment/src/observability/prometheusMetrics.ts index 50b50334f7..57ffd8a0ee 100644 --- a/packages/monitor-deployment/src/observability/prometheusMetrics.ts +++ b/packages/monitor-deployment/src/observability/prometheusMetrics.ts @@ -68,6 +68,13 @@ function initMonitoringMetrics(params: { const dbSizeResult = await client.raw<{ rows: [{ pg_database_size: string }] //bigints are returned as strings }>('SELECT pg_database_size(?) LIMIT 1', [databaseName]) + if (!dbSizeResult.rows.length) { + logger.error( + { region: regionKey }, + "No database size found for region '{region}'. This is odd." + ) + return + } dbSize.set( { ...labels, region: regionKey }, parseInt(dbSizeResult.rows[0].pg_database_size) //NOTE risk this bigint being too big for JS, but that would be a very large database! 
@@ -79,15 +86,32 @@ function initMonitoringMetrics(params: { const objects: WithOnDemandCollector> = new prometheusClient.Gauge({ name: join([namePrefix, 'db_objects'], '_'), help: 'Number of objects', - labelNames + labelNames: [...labelNames, 'region'] }) objects.triggerCollect = async (params) => { - const { mainDbClient } = params - - const objectsEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( - "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'objects' LIMIT 1;" + const { dbClients } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + try { + const objectsEstimate = await client.raw<{ + rows: [{ estimate: number }] + }>( + "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'objects' LIMIT 1;" + ) + if (objectsEstimate.rows.length) { + objects.set( + { ...labels, region: regionKey }, + Math.max(objectsEstimate.rows[0]?.estimate, 0) + ) + } + } catch (err) { + logger.warn( + { err, region: regionKey }, + "Failed to collect objects from region '{region}'. This may be because the region is not yet registered and has no 'objects' table." 
+ ) + } + }) ) - objects.set({ ...labels }, Math.max(objectsEstimate.rows[0].estimate, 0)) } const streams: WithOnDemandCollector> = new prometheusClient.Gauge({ @@ -97,23 +121,50 @@ function initMonitoringMetrics(params: { }) streams.triggerCollect = async (params) => { const { mainDbClient } = params - const streamsEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( - "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'streams' LIMIT 1;" - ) - streams.set({ ...labels }, Math.max(streamsEstimate.rows[0].estimate)) + try { + const streamsEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( + "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'streams' LIMIT 1;" + ) + if (streamsEstimate.rows.length) { + streams.set({ ...labels }, Math.max(streamsEstimate.rows[0]?.estimate)) + } + } catch (err) { + logger.warn( + err, + 'Failed to collect streams metrics. This may be because the main database is not yet migrated.' + ) + } } const commits: WithOnDemandCollector> = new prometheusClient.Gauge({ name: join([namePrefix, 'db_commits'], '_'), help: 'Number of commits/versions', - labelNames + labelNames: [...labelNames, 'region'] }) commits.triggerCollect = async (params) => { - const { mainDbClient } = params - const commitsEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( - "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'commits' LIMIT 1;" + const { dbClients } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + try { + const commitsEstimate = await client.raw<{ + rows: [{ estimate: number }] + }>( + "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'commits' LIMIT 1;" + ) + if (commitsEstimate.rows.length) { + commits.set( + { ...labels, region: regionKey }, + Math.max(commitsEstimate.rows[0]?.estimate) + ) + } + } catch (err) { + logger.warn( + { err, region: regionKey }, + "Failed to collect commits metrics from region '{region}'. 
This may be because the region is not yet registered and has no 'commits' table." + ) + } + }) ) - commits.set({ ...labels }, Math.max(commitsEstimate.rows[0].estimate)) } const users: WithOnDemandCollector> = new prometheusClient.Gauge({ @@ -123,147 +174,219 @@ function initMonitoringMetrics(params: { }) users.triggerCollect = async (params) => { const { mainDbClient } = params - const usersEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( - "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'users' LIMIT 1;" - ) - users.set({ ...labels }, Math.max(usersEstimate.rows[0].estimate)) + try { + const usersEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( + "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'users' LIMIT 1;" + ) + if (usersEstimate.rows.length) { + users.set({ ...labels }, Math.max(usersEstimate.rows[0]?.estimate)) + } + } catch (err) { + logger.warn( + err, + "Failed to collect users metrics. This may be because the migrations have not yet occcurred and has no 'users' table." 
+ ) + } } const fileimports: WithOnDemandCollector> = new prometheusClient.Gauge({ name: join([namePrefix, 'db_fileimports'], '_'), help: 'Number of imported files, by type and status', - labelNames: ['filetype', 'status', ...labelNames] + labelNames: ['filetype', 'status', 'region', ...labelNames] }) fileimports.triggerCollect = async (params) => { - const { mainDbClient } = params - const importedFiles = await mainDbClient.raw<{ - rows: [{ fileType: string; convertedStatus: number; count: string }] - }>( - ` + const { dbClients } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + try { + const importedFiles = await client.raw<{ + rows: [{ fileType: string; convertedStatus: number; count: string }] + }>( + ` SELECT LOWER("fileType") AS "fileType", "convertedStatus", count(*) FROM file_uploads GROUP BY (LOWER("fileType"), "convertedStatus"); ` - ) - - // Get the set of all unique file types and converted statuses in the database - const allFileImportConvertedStatusAndFileTypes = importedFiles.rows.reduce( - (acc, row) => { - acc.convertedStatus.add(row.convertedStatus) - acc.fileType.add(row.fileType) - return acc - }, - { convertedStatus: new Set(), fileType: new Set() } - ) + ) - // now calculate the combinatorial set of all possible file types and statuses - const remainingConvertedStatusAndFileTypes = new Set<{ - fileType: string - status: number - }>() - allFileImportConvertedStatusAndFileTypes.convertedStatus.forEach((status) => { - allFileImportConvertedStatusAndFileTypes.fileType.forEach((fileType) => { - remainingConvertedStatusAndFileTypes.add({ fileType, status }) - }) - }) + // Get the set of all unique file types and converted statuses in the database + const allFileImportConvertedStatusAndFileTypes = importedFiles.rows.reduce( + (acc, row) => { + acc.convertedStatus.add(row.convertedStatus) + acc.fileType.add(row.fileType) + return acc + }, + { convertedStatus: new Set(), fileType: new Set() } + ) - // now set the 
counts for the file types and statuses that are in the database - for (const row of importedFiles.rows) { - // objects are stored by reference, so we have to search for the original item in the set - remainingConvertedStatusAndFileTypes.forEach((item) => { - if (item.fileType === row.fileType && item.status === row.convertedStatus) { - remainingConvertedStatusAndFileTypes.delete(item) + // now calculate the combinatorial set of all possible file types and statuses + const remainingConvertedStatusAndFileTypes = new Set<{ + fileType: string + status: number + }>() + allFileImportConvertedStatusAndFileTypes.convertedStatus.forEach((status) => { + allFileImportConvertedStatusAndFileTypes.fileType.forEach((fileType) => { + remainingConvertedStatusAndFileTypes.add({ fileType, status }) + }) + }) + + // now set the counts for the file types and statuses that are in the database + for (const row of importedFiles.rows) { + // objects are stored by reference, so we have to search for the original item in the set + remainingConvertedStatusAndFileTypes.forEach((item) => { + if ( + item.fileType === row.fileType && + item.status === row.convertedStatus + ) { + remainingConvertedStatusAndFileTypes.delete(item) + } + }) + fileimports.set( + { + ...labels, + filetype: row.fileType, + status: row.convertedStatus.toString(), + region: regionKey + }, + parseInt(row.count) + ) + } + // zero-values for all remaining file types and statuses + remainingConvertedStatusAndFileTypes.forEach(({ fileType, status }) => { + fileimports.set( + { + ...labels, + filetype: fileType, + status: status.toString(), + region: regionKey + }, + 0 + ) + }) + } catch (err) { + logger.warn( + { err, region: regionKey }, + "Failed to collect file import status metrics from region '{region}'. This may be because the region is not yet registered and has no file_uploads table." 
+ ) } }) - fileimports.set( - { ...labels, filetype: row.fileType, status: row.convertedStatus.toString() }, - parseInt(row.count) - ) - } - // zero-values for all remaining file types and statuses - remainingConvertedStatusAndFileTypes.forEach(({ fileType, status }) => { - fileimports.set({ ...labels, filetype: fileType, status: status.toString() }, 0) - }) + ) } const filesize: WithOnDemandCollector> = new prometheusClient.Gauge({ name: join([namePrefix, 'db_filesize'], '_'), help: 'Size of imported files, by type (in bytes)', - labelNames: ['filetype', ...labelNames] + labelNames: ['filetype', 'region', ...labelNames] }) filesize.triggerCollect = async (params) => { - const { mainDbClient } = params - const fileSizeResults = await mainDbClient.raw<{ - rows: [{ filetype: string; filesize: string }] - }>( - ` + const { dbClients } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + try { + const fileSizeResults = await client.raw<{ + rows: [{ filetype: string; filesize: string }] + }>( + ` SELECT LOWER("fileType") AS fileType, SUM("fileSize") AS fileSize FROM file_uploads GROUP BY LOWER("fileType"); ` + ) + for (const row of fileSizeResults.rows) { + filesize.set( + { ...labels, filetype: row.filetype, region: regionKey }, + parseInt(row.filesize) + ) + } + } catch (err) { + logger.warn( + { err, region: regionKey }, + "Failed to collect file upload metrics from region '{region}'. This may be because the region is not yet registered and has no 'file_uploads' table." 
+ ) + } + }) ) - for (const row of fileSizeResults.rows) { - filesize.set({ ...labels, filetype: row.filetype }, parseInt(row.filesize)) - } } const webhooks: WithOnDemandCollector> = new prometheusClient.Gauge({ name: join([namePrefix, 'db_webhooks'], '_'), help: 'Number of webhook calls, by status', - labelNames: ['status', ...labelNames] + labelNames: ['status', 'region', ...labelNames] }) webhooks.triggerCollect = async (params) => { - const { mainDbClient } = params - const webhookResults = await mainDbClient.raw<{ - rows: [{ status: number; count: string }] - }>( - ` + const { dbClients } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + try { + const webhookResults = await client.raw<{ + rows: [{ status: number; count: string }] + }>( + ` SELECT status, count(*) FROM webhooks_events GROUP BY status; ` + ) + const remainingWebhookStatus = new Set(Array(4).keys()) + for (const row of webhookResults.rows) { + remainingWebhookStatus.delete(row.status) + webhooks.set( + { ...labels, status: row.status.toString(), region: regionKey }, + parseInt(row.count) //NOTE risk this bigint being too big for JS, but that would be a very large number of webhooks + ) + } + // zero-values for all remaining webhook statuses + remainingWebhookStatus.forEach((status) => { + webhooks.set({ ...labels, status: status.toString(), region: regionKey }, 0) + }) + } catch (err) { + logger.warn( + { err, region: regionKey }, + "Failed to collect webhook metrics from region '{region}'. This may be because the region is not yet registered and has no webhooks_events table." 
+ ) + } + }) ) - const remainingWebhookStatus = new Set(Array(4).keys()) - for (const row of webhookResults.rows) { - remainingWebhookStatus.delete(row.status) - webhooks.set( - { ...labels, status: row.status.toString() }, - parseInt(row.count) //NOTE risk this bigint being too big for JS, but that would be a very large number of webhooks - ) - } - // zero-values for all remaining webhook statuses - remainingWebhookStatus.forEach((status) => { - webhooks.set({ ...labels, status: status.toString() }, 0) - }) } const previews: WithOnDemandCollector> = new prometheusClient.Gauge({ name: join([namePrefix, 'db_previews'], '_'), help: 'Number of previews, by status', - labelNames: ['status', ...labelNames] + labelNames: ['status', 'region', ...labelNames] }) previews.triggerCollect = async (params) => { - const { mainDbClient } = params - const previewStatusResults = await mainDbClient.raw<{ - rows: [{ previewStatus: number; count: string }] - }>(` + const { dbClients } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + try { + const previewStatusResults = await client.raw<{ + rows: [{ previewStatus: number; count: string }] + }>(` SELECT "previewStatus", count(*) FROM object_preview GROUP BY "previewStatus"; `) - const remainingPreviewStatus = new Set(Array(4).keys()) - for (const row of previewStatusResults.rows) { - remainingPreviewStatus.delete(row.previewStatus) - previews.set( - { ...labels, status: row.previewStatus.toString() }, - parseInt(row.count) - ) - } - // zero-values for all remaining preview statuses - remainingPreviewStatus.forEach((status) => { - previews.set({ ...labels, status: status.toString() }, 0) - }) + const remainingPreviewStatus = new Set(Array(4).keys()) + for (const row of previewStatusResults.rows) { + remainingPreviewStatus.delete(row.previewStatus) + previews.set( + { ...labels, region: regionKey, status: row.previewStatus.toString() }, + parseInt(row.count) + ) + } + // zero-values for all remaining 
preview statuses + remainingPreviewStatus.forEach((status) => { + previews.set({ ...labels, region: regionKey, status: status.toString() }, 0) + }) + } catch (err) { + logger.warn( + { err, region: regionKey }, + "Failed to collect private status metrics from region '{region}'. This may be because the region is not yet registered and has no 'object_preview' table." + ) + } + }) + ) } const tablesize: WithOnDemandCollector> = new prometheusClient.Gauge({ @@ -318,6 +441,13 @@ function initMonitoringMetrics(params: { const connectionResults = await client.raw<{ rows: [{ used_connections: string }] }>(`SELECT COUNT(*) AS used_connections FROM pg_stat_activity;`) + if (!connectionResults.rows.length) { + logger.error( + { region: regionKey }, + "No active connections found for region '{region}'. This is odd." + ) + return + } connections.set( { ...labels, region: regionKey }, parseInt(connectionResults.rows[0].used_connections) @@ -341,6 +471,13 @@ function initMonitoringMetrics(params: { }>( `SELECT setting::int AS maximum_connections FROM pg_settings WHERE name=$$max_connections$$;` ) + if (!connectionResults.rows.length) { + logger.error( + { region: regionKey }, + "No maximum connections found for region '{region}'. This is odd." 
+ ) + return + } totalConnections.set( { ...labels, region: regionKey }, connectionResults.rows[0].maximum_connections From efe2416aa29c5d14b028926d32d09ab118e8d902 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Mon, 16 Dec 2024 15:05:46 +0000 Subject: [PATCH 21/27] Refactor to place metrics in separate files --- .../src/observability/metrics/commits.ts | 37 ++ .../src/observability/metrics/connections.ts | 33 ++ .../observability/metrics/connectionsTotal.ts | 35 ++ .../src/observability/metrics/dbSize.ts | 47 ++ .../src/observability/metrics/fileImports.ts | 90 ++++ .../src/observability/metrics/fileSize.ts | 41 ++ .../src/observability/metrics/objects.ts | 38 ++ .../src/observability/metrics/previews.ts | 46 ++ .../src/observability/metrics/streams.ts | 29 ++ .../src/observability/metrics/tableSize.ts | 46 ++ .../src/observability/metrics/users.ts | 28 + .../src/observability/metrics/webhooks.ts | 47 ++ .../src/observability/prometheusMetrics.ts | 487 +----------------- .../src/observability/types.ts | 13 + 14 files changed, 557 insertions(+), 460 deletions(-) create mode 100644 packages/monitor-deployment/src/observability/metrics/commits.ts create mode 100644 packages/monitor-deployment/src/observability/metrics/connections.ts create mode 100644 packages/monitor-deployment/src/observability/metrics/connectionsTotal.ts create mode 100644 packages/monitor-deployment/src/observability/metrics/dbSize.ts create mode 100644 packages/monitor-deployment/src/observability/metrics/fileImports.ts create mode 100644 packages/monitor-deployment/src/observability/metrics/fileSize.ts create mode 100644 packages/monitor-deployment/src/observability/metrics/objects.ts create mode 100644 packages/monitor-deployment/src/observability/metrics/previews.ts create mode 100644 packages/monitor-deployment/src/observability/metrics/streams.ts create mode 100644 packages/monitor-deployment/src/observability/metrics/tableSize.ts create mode 
100644 packages/monitor-deployment/src/observability/metrics/users.ts create mode 100644 packages/monitor-deployment/src/observability/metrics/webhooks.ts create mode 100644 packages/monitor-deployment/src/observability/types.ts diff --git a/packages/monitor-deployment/src/observability/metrics/commits.ts b/packages/monitor-deployment/src/observability/metrics/commits.ts new file mode 100644 index 0000000000..7e05c96cd9 --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/commits.ts @@ -0,0 +1,37 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricConfig, MetricInitializer } from '@/observability/types.js' + +export const init: MetricInitializer = (config: MetricConfig) => { + const { labelNames, namePrefix, logger } = config + const commits = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_commits'], '_'), + help: 'Number of commits/versions', + labelNames: [...labelNames, 'region'] + }) + return async (params) => { + const { dbClients, labels } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + try { + const commitsEstimate = await client.raw<{ + rows: [{ estimate: number }] + }>( + "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'commits' LIMIT 1;" + ) + if (commitsEstimate.rows.length) { + commits.set( + { ...labels, region: regionKey }, + Math.max(commitsEstimate.rows[0]?.estimate) + ) + } + } catch (err) { + logger.warn( + { err, region: regionKey }, + "Failed to collect commits metrics from region '{region}'. This may be because the region is not yet registered and has no 'commits' table." 
+ ) + } + }) + ) + } +} diff --git a/packages/monitor-deployment/src/observability/metrics/connections.ts b/packages/monitor-deployment/src/observability/metrics/connections.ts new file mode 100644 index 0000000000..e22a72ab9a --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/connections.ts @@ -0,0 +1,33 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricInitializer } from '@/observability/types.js' + +export const init: MetricInitializer = (config) => { + const { labelNames, namePrefix, logger } = config + const connections = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_used_connections'], '_'), + help: 'Number of active (used) database connections', + labelNames: ['region', ...labelNames] + }) + return async (params) => { + const { dbClients, labels } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + const connectionResults = await client.raw<{ + rows: [{ used_connections: string }] + }>(`SELECT COUNT(*) AS used_connections FROM pg_stat_activity;`) + if (!connectionResults.rows.length) { + logger.error( + { region: regionKey }, + "No active connections found for region '{region}'. This is odd." 
+ ) + return + } + connections.set( + { ...labels, region: regionKey }, + parseInt(connectionResults.rows[0].used_connections) + ) + }) + ) + } +} diff --git a/packages/monitor-deployment/src/observability/metrics/connectionsTotal.ts b/packages/monitor-deployment/src/observability/metrics/connectionsTotal.ts new file mode 100644 index 0000000000..a534d2d37b --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/connectionsTotal.ts @@ -0,0 +1,35 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricInitializer } from '@/observability/types.js' + +export const init: MetricInitializer = (config) => { + const { labelNames, namePrefix, logger } = config + const totalConnections = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_total_connections'], '_'), + help: 'Total number of database connections', + labelNames: ['region', ...labelNames] + }) + return async (params) => { + const { dbClients, labels } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + const connectionResults = await client.raw<{ + rows: [{ maximum_connections: number }] + }>( + `SELECT setting::int AS maximum_connections FROM pg_settings WHERE name=$$max_connections$$;` + ) + if (!connectionResults.rows.length) { + logger.error( + { region: regionKey }, + "No maximum connections found for region '{region}'. This is odd." 
+ ) + return + } + totalConnections.set( + { ...labels, region: regionKey }, + connectionResults.rows[0].maximum_connections + ) + }) + ) + } +} diff --git a/packages/monitor-deployment/src/observability/metrics/dbSize.ts b/packages/monitor-deployment/src/observability/metrics/dbSize.ts new file mode 100644 index 0000000000..bcbe7a16dd --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/dbSize.ts @@ -0,0 +1,47 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricInitializer } from '@/observability/types.js' + +export const init: MetricInitializer = (config) => { + const { labelNames, namePrefix, logger } = config + const dbSize = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_size'], '_'), + help: 'Size of the entire database (in bytes)', + labelNames: ['region', ...labelNames] + }) + + return async (params) => { + const { dbClients, labels } = params + await Promise.all( + dbClients.map(async ({ client, regionKey, databaseName }) => { + if (!databaseName) { + logger.warn( + { region: regionKey }, + "Could not get database name from client config for region '{region}'" + ) + return + } + + logger.info( + { region: regionKey, databaseName }, + "Collecting database size for region '{region}' from database '{databaseName}'" + ) + + const dbSizeResult = await client.raw<{ + rows: [{ pg_database_size: string }] //bigints are returned as strings + }>('SELECT pg_database_size(?) LIMIT 1', [databaseName]) + if (!dbSizeResult.rows.length) { + logger.error( + { region: regionKey }, + "No database size found for region '{region}'. This is odd." + ) + return + } + dbSize.set( + { ...labels, region: regionKey }, + parseInt(dbSizeResult.rows[0].pg_database_size) //NOTE risk this bigint being too big for JS, but that would be a very large database! 
+ ) + }) + ) + } +} diff --git a/packages/monitor-deployment/src/observability/metrics/fileImports.ts b/packages/monitor-deployment/src/observability/metrics/fileImports.ts new file mode 100644 index 0000000000..28a10b0175 --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/fileImports.ts @@ -0,0 +1,90 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricInitializer } from '@/observability/types.js' + +export const init: MetricInitializer = (config) => { + const { labelNames, namePrefix, logger } = config + const fileimports = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_fileimports'], '_'), + help: 'Number of imported files, by type and status', + labelNames: ['filetype', 'status', 'region', ...labelNames] + }) + return async (params) => { + const { dbClients, labels } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + try { + const importedFiles = await client.raw<{ + rows: [{ fileType: string; convertedStatus: number; count: string }] + }>( + ` + SELECT LOWER("fileType") AS "fileType", "convertedStatus", count(*) + FROM file_uploads + GROUP BY (LOWER("fileType"), "convertedStatus"); + ` + ) + + // Get the set of all unique file types and converted statuses in the database + const allFileImportConvertedStatusAndFileTypes = importedFiles.rows.reduce( + (acc, row) => { + acc.convertedStatus.add(row.convertedStatus) + acc.fileType.add(row.fileType) + acc.presentConvertedStatusAndFileType.add( + `${row.convertedStatus}:::${row.fileType}` + ) + return acc + }, + { + convertedStatus: new Set(), + fileType: new Set(), + presentConvertedStatusAndFileType: new Set() + } + ) + + // now calculate the combinatorial set of all possible file types and statuses + const remainingConvertedStatusAndFileTypes = new Set() + allFileImportConvertedStatusAndFileTypes.convertedStatus.forEach((status) => { + 
allFileImportConvertedStatusAndFileTypes.fileType.forEach((fileType) => { + remainingConvertedStatusAndFileTypes.add(`${status}:::${fileType}`) + }) + }) + + // now set the counts for the file types and statuses that are in the database + for (const row of importedFiles.rows) { + remainingConvertedStatusAndFileTypes.delete( + `${row.convertedStatus}:::${row.fileType}` + ) + + fileimports.set( + { + ...labels, + filetype: row.fileType, + status: row.convertedStatus.toString(), + region: regionKey + }, + parseInt(row.count) + ) + } + // zero-values for all remaining file types and statuses + remainingConvertedStatusAndFileTypes.forEach((formattedStatusAndFileType) => { + const [status, fileType] = formattedStatusAndFileType.split(':::') + fileimports.set( + { + ...labels, + filetype: fileType, + status, + region: regionKey + }, + 0 + ) + }) + } catch (err) { + logger.warn( + { err, region: regionKey }, + "Failed to collect file import status metrics from region '{region}'. This may be because the region is not yet registered and has no file_uploads table." 
+ ) + } + }) + ) + } +} diff --git a/packages/monitor-deployment/src/observability/metrics/fileSize.ts b/packages/monitor-deployment/src/observability/metrics/fileSize.ts new file mode 100644 index 0000000000..ac76cc8782 --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/fileSize.ts @@ -0,0 +1,41 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricInitializer } from '@/observability/types.js' + +export const init: MetricInitializer = (config) => { + const { labelNames, namePrefix, logger } = config + const filesize = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_filesize'], '_'), + help: 'Size of imported files, by type (in bytes)', + labelNames: ['filetype', 'region', ...labelNames] + }) + return async (params) => { + const { dbClients, labels } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + try { + const fileSizeResults = await client.raw<{ + rows: [{ filetype: string; filesize: string }] + }>( + ` + SELECT LOWER("fileType") AS fileType, SUM("fileSize") AS fileSize + FROM file_uploads + GROUP BY LOWER("fileType"); + ` + ) + for (const row of fileSizeResults.rows) { + filesize.set( + { ...labels, filetype: row.filetype, region: regionKey }, + parseInt(row.filesize) + ) + } + } catch (err) { + logger.warn( + { err, region: regionKey }, + "Failed to collect file upload metrics from region '{region}'. This may be because the region is not yet registered and has no 'file_uploads' table." 
+ ) + } + }) + ) + } +} diff --git a/packages/monitor-deployment/src/observability/metrics/objects.ts b/packages/monitor-deployment/src/observability/metrics/objects.ts new file mode 100644 index 0000000000..a05eaed63d --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/objects.ts @@ -0,0 +1,38 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricInitializer } from '@/observability/types.js' + +export const init: MetricInitializer = (config) => { + const { labelNames, namePrefix, logger } = config + const objects = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_objects'], '_'), + help: 'Number of objects', + labelNames: [...labelNames, 'region'] + }) + + return async (params) => { + const { dbClients, labels } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + try { + const objectsEstimate = await client.raw<{ + rows: [{ estimate: number }] + }>( + "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'objects' LIMIT 1;" + ) + if (objectsEstimate.rows.length) { + objects.set( + { ...labels, region: regionKey }, + Math.max(objectsEstimate.rows[0]?.estimate, 0) + ) + } + } catch (err) { + logger.warn( + { err, region: regionKey }, + "Failed to collect objects from region '{region}'. This may be because the region is not yet registered and has no 'objects' table." 
+          )
+        }
+      })
+    )
+  }
+}
diff --git a/packages/monitor-deployment/src/observability/metrics/previews.ts b/packages/monitor-deployment/src/observability/metrics/previews.ts
new file mode 100644
index 0000000000..dfb07afb96
--- /dev/null
+++ b/packages/monitor-deployment/src/observability/metrics/previews.ts
@@ -0,0 +1,46 @@
+import prometheusClient from 'prom-client'
+import { join } from 'lodash-es'
+import type { MetricInitializer } from '@/observability/types.js'
+
+export const init: MetricInitializer = (config) => {
+  const { labelNames, namePrefix, logger } = config
+  const previews = new prometheusClient.Gauge({
+    name: join([namePrefix, 'db_previews'], '_'),
+    help: 'Number of previews, by status',
+    labelNames: ['status', 'region', ...labelNames]
+  })
+  return async (params) => {
+    const { dbClients, labels } = params
+    await Promise.all(
+      dbClients.map(async ({ client, regionKey }) => {
+        try {
+          const previewStatusResults = await client.raw<{
+            rows: [{ previewStatus: number; count: string }]
+          }>(`
+            SELECT "previewStatus", count(*)
+            FROM object_preview
+            GROUP BY "previewStatus";
+          `)
+
+          const remainingPreviewStatus = new Set(Array(4).keys())
+          for (const row of previewStatusResults.rows) {
+            remainingPreviewStatus.delete(row.previewStatus)
+            previews.set(
+              { ...labels, region: regionKey, status: row.previewStatus.toString() },
+              parseInt(row.count)
+            )
+          }
+          // zero-values for all remaining preview statuses
+          remainingPreviewStatus.forEach((status) => {
+            previews.set({ ...labels, region: regionKey, status: status.toString() }, 0)
+          })
+        } catch (err) {
+          logger.warn(
+            { err, region: regionKey },
+            "Failed to collect preview status metrics from region '{region}'. This may be because the region is not yet registered and has no 'object_preview' table."
+ ) + } + }) + ) + } +} diff --git a/packages/monitor-deployment/src/observability/metrics/streams.ts b/packages/monitor-deployment/src/observability/metrics/streams.ts new file mode 100644 index 0000000000..811c586d04 --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/streams.ts @@ -0,0 +1,29 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricInitializer } from '@/observability/types.js' + +export const init: MetricInitializer = (config) => { + const { labelNames, namePrefix, logger } = config + const streams = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_streams'], '_'), + help: 'Number of streams/projects', + labelNames + }) + + return async (params) => { + const { mainDbClient, labels } = params + try { + const streamsEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( + "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'streams' LIMIT 1;" + ) + if (streamsEstimate.rows.length) { + streams.set({ ...labels }, Math.max(streamsEstimate.rows[0]?.estimate)) + } + } catch (err) { + logger.warn( + err, + 'Failed to collect streams metrics. This may be because the main database is not yet migrated.' 
+ ) + } + } +} diff --git a/packages/monitor-deployment/src/observability/metrics/tableSize.ts b/packages/monitor-deployment/src/observability/metrics/tableSize.ts new file mode 100644 index 0000000000..2aadf6f884 --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/tableSize.ts @@ -0,0 +1,46 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricInitializer } from '@/observability/types.js' + +export const init: MetricInitializer = (config) => { + const { labelNames, namePrefix } = config + const tablesize = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_tablesize'], '_'), + help: 'Size of tables in the database, by table (in bytes)', + labelNames: ['table', 'region', ...labelNames] + }) + return async (params) => { + const { dbClients, labels } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + const tableSizeResults = await client.raw<{ + rows: [{ table_name: string; table_size: string }] //bigints are returned as strings + }>( + ` + SELECT + table_name, + table_size + + FROM ( + SELECT + pg_catalog.pg_namespace.nspname AS schema_name, + relname AS table_name, + pg_relation_size(pg_catalog.pg_class.oid) AS table_size + + FROM pg_catalog.pg_class + JOIN pg_catalog.pg_namespace ON relnamespace = pg_catalog.pg_namespace.oid + ) t + WHERE schema_name = 'public' + ORDER BY table_size DESC; + ` + ) + for (const row of tableSizeResults.rows) { + tablesize.set( + { ...labels, table: row.table_name, region: regionKey }, + parseInt(row.table_size) //NOTE risk this bigint being too big for JS, but that would be a very large table! 
+ ) + } + }) + ) + } +} diff --git a/packages/monitor-deployment/src/observability/metrics/users.ts b/packages/monitor-deployment/src/observability/metrics/users.ts new file mode 100644 index 0000000000..a49ac237c4 --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/users.ts @@ -0,0 +1,28 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricInitializer } from '@/observability/types.js' + +export const init: MetricInitializer = (config) => { + const { labelNames, namePrefix, logger } = config + const users = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_users'], '_'), + help: 'Number of users', + labelNames + }) + return async (params) => { + const { mainDbClient, labels } = params + try { + const usersEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( + "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'users' LIMIT 1;" + ) + if (usersEstimate.rows.length) { + users.set({ ...labels }, Math.max(usersEstimate.rows[0]?.estimate)) + } + } catch (err) { + logger.warn( + err, + "Failed to collect users metrics. This may be because the migrations have not yet occcurred and has no 'users' table." 
+ ) + } + } +} diff --git a/packages/monitor-deployment/src/observability/metrics/webhooks.ts b/packages/monitor-deployment/src/observability/metrics/webhooks.ts new file mode 100644 index 0000000000..b3d496a184 --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/webhooks.ts @@ -0,0 +1,47 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricInitializer } from '@/observability/types.js' + +export const init: MetricInitializer = (config) => { + const { labelNames, namePrefix, logger } = config + const webhooks = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_webhooks'], '_'), + help: 'Number of webhook calls, by status', + labelNames: ['status', 'region', ...labelNames] + }) + return async (params) => { + const { dbClients, labels } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + try { + const webhookResults = await client.raw<{ + rows: [{ status: number; count: string }] + }>( + ` + SELECT status, count(*) + FROM webhooks_events + GROUP BY status; + ` + ) + const remainingWebhookStatus = new Set(Array(4).keys()) + for (const row of webhookResults.rows) { + remainingWebhookStatus.delete(row.status) + webhooks.set( + { ...labels, status: row.status.toString(), region: regionKey }, + parseInt(row.count) //NOTE risk this bigint being too big for JS, but that would be a very large number of webhooks + ) + } + // zero-values for all remaining webhook statuses + remainingWebhookStatus.forEach((status) => { + webhooks.set({ ...labels, status: status.toString(), region: regionKey }, 0) + }) + } catch (err) { + logger.warn( + { err, region: regionKey }, + "Failed to collect webhook metrics from region '{region}'. This may be because the region is not yet registered and has no webhooks_events table." 
+ ) + } + }) + ) + } +} diff --git a/packages/monitor-deployment/src/observability/prometheusMetrics.ts b/packages/monitor-deployment/src/observability/prometheusMetrics.ts index 57ffd8a0ee..68c6341646 100644 --- a/packages/monitor-deployment/src/observability/prometheusMetrics.ts +++ b/packages/monitor-deployment/src/observability/prometheusMetrics.ts @@ -1,10 +1,21 @@ import { DbClient, getDbClients } from '@/clients/knex.js' import { logger } from '@/observability/logging.js' import { databaseMonitorCollectionPeriodSeconds } from '@/utils/env.js' -import { Knex } from 'knex' import { join } from 'lodash-es' -import { Gauge, Histogram, Registry } from 'prom-client' +import { Histogram, Registry } from 'prom-client' import prometheusClient from 'prom-client' +import { init as commits } from '@/observability/metrics/commits.js' +import { init as connections } from '@/observability/metrics/connections.js' +import { init as connectionsTotal } from '@/observability/metrics/connectionsTotal.js' +import { init as dbSize } from '@/observability/metrics/dbSize.js' +import { init as fileImports } from '@/observability/metrics/fileImports.js' +import { init as fileSize } from '@/observability/metrics/fileSize.js' +import { init as objects } from '@/observability/metrics/objects.js' +import { init as previews } from '@/observability/metrics/previews.js' +import { init as streams } from '@/observability/metrics/streams.js' +import { init as tablesize } from '@/observability/metrics/tableSize.js' +import { init as users } from '@/observability/metrics/users.js' +import { init as webhooks } from '@/observability/metrics/webhooks.js' let prometheusInitialized = false @@ -23,13 +34,6 @@ type MetricsMonitor = { start: () => () => void } -type WithOnDemandCollector = T & { - triggerCollect?: (params: { - dbClients: DbClient[] - mainDbClient: Knex - }) => Promise -} - function initMonitoringMetrics(params: { register: Registry collectionPeriodMilliseconds: number @@ -43,462 +47,25 
@@ function initMonitoringMetrics(params: { const labelNames = Object.keys(labels) const getDbClients = config.getDbClients - const dbSize: WithOnDemandCollector> = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_size'], '_'), - help: 'Size of the entire database (in bytes)', - labelNames: ['region', ...labelNames] - }) - dbSize.triggerCollect = async (params) => { - const { dbClients } = params - await Promise.all( - dbClients.map(async ({ client, regionKey, databaseName }) => { - if (!databaseName) { - logger.warn( - { region: regionKey }, - "Could not get database name from client config for region '{region}'" - ) - return - } - - logger.info( - { region: regionKey, databaseName }, - "Collecting database size for region '{region}' from database '{databaseName}'" - ) - - const dbSizeResult = await client.raw<{ - rows: [{ pg_database_size: string }] //bigints are returned as strings - }>('SELECT pg_database_size(?) LIMIT 1', [databaseName]) - if (!dbSizeResult.rows.length) { - logger.error( - { region: regionKey }, - "No database size found for region '{region}'. This is odd." - ) - return - } - dbSize.set( - { ...labels, region: regionKey }, - parseInt(dbSizeResult.rows[0].pg_database_size) //NOTE risk this bigint being too big for JS, but that would be a very large database! 
- ) - }) - ) - } - - const objects: WithOnDemandCollector> = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_objects'], '_'), - help: 'Number of objects', - labelNames: [...labelNames, 'region'] - }) - objects.triggerCollect = async (params) => { - const { dbClients } = params - await Promise.all( - dbClients.map(async ({ client, regionKey }) => { - try { - const objectsEstimate = await client.raw<{ - rows: [{ estimate: number }] - }>( - "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'objects' LIMIT 1;" - ) - if (objectsEstimate.rows.length) { - objects.set( - { ...labels, region: regionKey }, - Math.max(objectsEstimate.rows[0]?.estimate, 0) - ) - } - } catch (err) { - logger.warn( - { err, region: regionKey }, - "Failed to collect objects from region '{region}'. This may be because the region is not yet registered and has no 'objects' table." - ) - } - }) - ) - } - - const streams: WithOnDemandCollector> = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_streams'], '_'), - help: 'Number of streams/projects', - labelNames - }) - streams.triggerCollect = async (params) => { - const { mainDbClient } = params - try { - const streamsEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( - "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'streams' LIMIT 1;" - ) - if (streamsEstimate.rows.length) { - streams.set({ ...labels }, Math.max(streamsEstimate.rows[0]?.estimate)) - } - } catch (err) { - logger.warn( - err, - 'Failed to collect streams metrics. This may be because the main database is not yet migrated.' 
- ) - } - } - - const commits: WithOnDemandCollector> = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_commits'], '_'), - help: 'Number of commits/versions', - labelNames: [...labelNames, 'region'] - }) - commits.triggerCollect = async (params) => { - const { dbClients } = params - await Promise.all( - dbClients.map(async ({ client, regionKey }) => { - try { - const commitsEstimate = await client.raw<{ - rows: [{ estimate: number }] - }>( - "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'commits' LIMIT 1;" - ) - if (commitsEstimate.rows.length) { - commits.set( - { ...labels, region: regionKey }, - Math.max(commitsEstimate.rows[0]?.estimate) - ) - } - } catch (err) { - logger.warn( - { err, region: regionKey }, - "Failed to collect commits metrics from region '{region}'. This may be because the region is not yet registered and has no 'commits' table." - ) - } - }) - ) - } - - const users: WithOnDemandCollector> = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_users'], '_'), - help: 'Number of users', - labelNames - }) - users.triggerCollect = async (params) => { - const { mainDbClient } = params - try { - const usersEstimate = await mainDbClient.raw<{ rows: [{ estimate: number }] }>( - "SELECT reltuples AS estimate FROM pg_class WHERE relname = 'users' LIMIT 1;" - ) - if (usersEstimate.rows.length) { - users.set({ ...labels }, Math.max(usersEstimate.rows[0]?.estimate)) - } - } catch (err) { - logger.warn( - err, - "Failed to collect users metrics. This may be because the migrations have not yet occcurred and has no 'users' table." 
- ) - } - } - - const fileimports: WithOnDemandCollector> = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_fileimports'], '_'), - help: 'Number of imported files, by type and status', - labelNames: ['filetype', 'status', 'region', ...labelNames] - }) - fileimports.triggerCollect = async (params) => { - const { dbClients } = params - await Promise.all( - dbClients.map(async ({ client, regionKey }) => { - try { - const importedFiles = await client.raw<{ - rows: [{ fileType: string; convertedStatus: number; count: string }] - }>( - ` - SELECT LOWER("fileType") AS "fileType", "convertedStatus", count(*) - FROM file_uploads - GROUP BY (LOWER("fileType"), "convertedStatus"); - ` - ) - - // Get the set of all unique file types and converted statuses in the database - const allFileImportConvertedStatusAndFileTypes = importedFiles.rows.reduce( - (acc, row) => { - acc.convertedStatus.add(row.convertedStatus) - acc.fileType.add(row.fileType) - return acc - }, - { convertedStatus: new Set(), fileType: new Set() } - ) - - // now calculate the combinatorial set of all possible file types and statuses - const remainingConvertedStatusAndFileTypes = new Set<{ - fileType: string - status: number - }>() - allFileImportConvertedStatusAndFileTypes.convertedStatus.forEach((status) => { - allFileImportConvertedStatusAndFileTypes.fileType.forEach((fileType) => { - remainingConvertedStatusAndFileTypes.add({ fileType, status }) - }) - }) - - // now set the counts for the file types and statuses that are in the database - for (const row of importedFiles.rows) { - // objects are stored by reference, so we have to search for the original item in the set - remainingConvertedStatusAndFileTypes.forEach((item) => { - if ( - item.fileType === row.fileType && - item.status === row.convertedStatus - ) { - remainingConvertedStatusAndFileTypes.delete(item) - } - }) - fileimports.set( - { - ...labels, - filetype: row.fileType, - status: row.convertedStatus.toString(), - region: regionKey - 
}, - parseInt(row.count) - ) - } - // zero-values for all remaining file types and statuses - remainingConvertedStatusAndFileTypes.forEach(({ fileType, status }) => { - fileimports.set( - { - ...labels, - filetype: fileType, - status: status.toString(), - region: regionKey - }, - 0 - ) - }) - } catch (err) { - logger.warn( - { err, region: regionKey }, - "Failed to collect file import status metrics from region '{region}'. This may be because the region is not yet registered and has no file_uploads table." - ) - } - }) - ) - } - - const filesize: WithOnDemandCollector> = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_filesize'], '_'), - help: 'Size of imported files, by type (in bytes)', - labelNames: ['filetype', 'region', ...labelNames] - }) - filesize.triggerCollect = async (params) => { - const { dbClients } = params - await Promise.all( - dbClients.map(async ({ client, regionKey }) => { - try { - const fileSizeResults = await client.raw<{ - rows: [{ filetype: string; filesize: string }] - }>( - ` - SELECT LOWER("fileType") AS fileType, SUM("fileSize") AS fileSize - FROM file_uploads - GROUP BY LOWER("fileType"); - ` - ) - for (const row of fileSizeResults.rows) { - filesize.set( - { ...labels, filetype: row.filetype, region: regionKey }, - parseInt(row.filesize) - ) - } - } catch (err) { - logger.warn( - { err, region: regionKey }, - "Failed to collect file upload metrics from region '{region}'. This may be because the region is not yet registered and has no 'file_uploads' table." 
- ) - } - }) - ) - } - - const webhooks: WithOnDemandCollector> = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_webhooks'], '_'), - help: 'Number of webhook calls, by status', - labelNames: ['status', 'region', ...labelNames] - }) - webhooks.triggerCollect = async (params) => { - const { dbClients } = params - await Promise.all( - dbClients.map(async ({ client, regionKey }) => { - try { - const webhookResults = await client.raw<{ - rows: [{ status: number; count: string }] - }>( - ` - SELECT status, count(*) - FROM webhooks_events - GROUP BY status; - ` - ) - const remainingWebhookStatus = new Set(Array(4).keys()) - for (const row of webhookResults.rows) { - remainingWebhookStatus.delete(row.status) - webhooks.set( - { ...labels, status: row.status.toString(), region: regionKey }, - parseInt(row.count) //NOTE risk this bigint being too big for JS, but that would be a very large number of webhooks - ) - } - // zero-values for all remaining webhook statuses - remainingWebhookStatus.forEach((status) => { - webhooks.set({ ...labels, status: status.toString(), region: regionKey }, 0) - }) - } catch (err) { - logger.warn( - { err, region: regionKey }, - "Failed to collect webhook metrics from region '{region}'. This may be because the region is not yet registered and has no webhooks_events table." 
- ) - } - }) - ) - } - - const previews: WithOnDemandCollector> = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_previews'], '_'), - help: 'Number of previews, by status', - labelNames: ['status', 'region', ...labelNames] - }) - previews.triggerCollect = async (params) => { - const { dbClients } = params - await Promise.all( - dbClients.map(async ({ client, regionKey }) => { - try { - const previewStatusResults = await client.raw<{ - rows: [{ previewStatus: number; count: string }] - }>(` - SELECT "previewStatus", count(*) - FROM object_preview - GROUP BY "previewStatus"; - `) - - const remainingPreviewStatus = new Set(Array(4).keys()) - for (const row of previewStatusResults.rows) { - remainingPreviewStatus.delete(row.previewStatus) - previews.set( - { ...labels, region: regionKey, status: row.previewStatus.toString() }, - parseInt(row.count) - ) - } - // zero-values for all remaining preview statuses - remainingPreviewStatus.forEach((status) => { - previews.set({ ...labels, region: regionKey, status: status.toString() }, 0) - }) - } catch (err) { - logger.warn( - { err, region: regionKey }, - "Failed to collect private status metrics from region '{region}'. This may be because the region is not yet registered and has no 'object_preview' table." 
- ) - } - }) - ) - } - - const tablesize: WithOnDemandCollector> = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_tablesize'], '_'), - help: 'Size of tables in the database, by table (in bytes)', - labelNames: ['table', 'region', ...labelNames] - }) - tablesize.triggerCollect = async (params) => { - const { dbClients } = params - await Promise.all( - dbClients.map(async ({ client, regionKey }) => { - const tableSizeResults = await client.raw<{ - rows: [{ table_name: string; table_size: string }] //bigints are returned as strings - }>( - ` - SELECT - table_name, - table_size - - FROM ( - SELECT - pg_catalog.pg_namespace.nspname AS schema_name, - relname AS table_name, - pg_relation_size(pg_catalog.pg_class.oid) AS table_size - - FROM pg_catalog.pg_class - JOIN pg_catalog.pg_namespace ON relnamespace = pg_catalog.pg_namespace.oid - ) t - WHERE schema_name = 'public' - ORDER BY table_size DESC; - ` - ) - for (const row of tableSizeResults.rows) { - tablesize.set( - { ...labels, table: row.table_name, region: regionKey }, - parseInt(row.table_size) //NOTE risk this bigint being too big for JS, but that would be a very large table! - ) - } - }) - ) - } - - const connections: WithOnDemandCollector> = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_used_connections'], '_'), - help: 'Number of active (used) database connections', - labelNames: ['region', ...labelNames] - }) - connections.triggerCollect = async (params) => { - const { dbClients } = params - await Promise.all( - dbClients.map(async ({ client, regionKey }) => { - const connectionResults = await client.raw<{ - rows: [{ used_connections: string }] - }>(`SELECT COUNT(*) AS used_connections FROM pg_stat_activity;`) - if (!connectionResults.rows.length) { - logger.error( - { region: regionKey }, - "No active connections found for region '{region}'. This is odd." 
- ) - return - } - connections.set( - { ...labels, region: regionKey }, - parseInt(connectionResults.rows[0].used_connections) - ) - }) - ) - } - - const totalConnections: WithOnDemandCollector> = - new prometheusClient.Gauge({ - name: join([namePrefix, 'db_total_connections'], '_'), - help: 'Total number of database connections', - labelNames: ['region', ...labelNames] - }) - totalConnections.triggerCollect = async (params) => { - const { dbClients } = params - await Promise.all( - dbClients.map(async ({ client, regionKey }) => { - const connectionResults = await client.raw<{ - rows: [{ maximum_connections: number }] - }>( - `SELECT setting::int AS maximum_connections FROM pg_settings WHERE name=$$max_connections$$;` - ) - if (!connectionResults.rows.length) { - logger.error( - { region: regionKey }, - "No maximum connections found for region '{region}'. This is odd." - ) - return - } - totalConnections.set( - { ...labels, region: regionKey }, - connectionResults.rows[0].maximum_connections - ) - }) - ) - } - - const metricsToCollect = [ + const metricsToInitialize = [ + commits, + connections, + connectionsTotal, dbSize, - tablesize, + fileImports, + fileSize, objects, + previews, streams, - commits, + tablesize, users, - fileimports, - filesize, - webhooks, - previews + webhooks ] + const metricsToCollect = metricsToInitialize.map((metricToInitialize) => + metricToInitialize({ labelNames, namePrefix, logger }) + ) + const selfMonitor = new Histogram({ name: join([namePrefix, 'self_monitor_time_monitoring_metrics'], '_'), help: 'The time taken to collect all of the database monitoring metrics, seconds.', @@ -518,7 +85,7 @@ function initMonitoringMetrics(params: { await Promise.all( metricsToCollect.map(async (metric) => { - await metric.triggerCollect?.({ dbClients, mainDbClient }) + await metric({ dbClients, mainDbClient, labels }) }) ) } diff --git a/packages/monitor-deployment/src/observability/types.ts b/packages/monitor-deployment/src/observability/types.ts 
new file mode 100644 index 0000000000..031155476c --- /dev/null +++ b/packages/monitor-deployment/src/observability/types.ts @@ -0,0 +1,13 @@ +import type { DbClient } from '@/clients/knex.js' +import type { Knex } from 'knex' +import type { Logger } from 'pino' + +export type MetricConfig = { labelNames: string[]; namePrefix: string; logger: Logger } +export type MetricCollectionParameters = { + dbClients: DbClient[] + mainDbClient: Knex + labels: Record +} +export type MetricInitializer = ( + config: MetricConfig +) => (params: MetricCollectionParameters) => Promise From 4261e26f83c28b4561be48ff664bbed60f46f095 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Mon, 16 Dec 2024 15:52:34 +0000 Subject: [PATCH 22/27] feat(metrics): more metrics related to replications --- .../src/observability/metrics/dbWorkers.ts | 33 +++++++++++++ .../metrics/dbWorkersAwaitingLocks.ts | 33 +++++++++++++ ...ections.ts => inactiveReplicationSlots.ts} | 14 +++--- ...{connectionsTotal.ts => maxConnections.ts} | 4 +- .../metrics/replicationSlotLag.ts | 36 +++++++++++++++ .../metrics/replicationWorkerLag.ts | 46 +++++++++++++++++++ .../src/observability/prometheusMetrics.ts | 16 +++++-- 7 files changed, 170 insertions(+), 12 deletions(-) create mode 100644 packages/monitor-deployment/src/observability/metrics/dbWorkers.ts create mode 100644 packages/monitor-deployment/src/observability/metrics/dbWorkersAwaitingLocks.ts rename packages/monitor-deployment/src/observability/metrics/{connections.ts => inactiveReplicationSlots.ts} (62%) rename packages/monitor-deployment/src/observability/metrics/{connectionsTotal.ts => maxConnections.ts} (89%) create mode 100644 packages/monitor-deployment/src/observability/metrics/replicationSlotLag.ts create mode 100644 packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts diff --git a/packages/monitor-deployment/src/observability/metrics/dbWorkers.ts 
b/packages/monitor-deployment/src/observability/metrics/dbWorkers.ts new file mode 100644 index 0000000000..f62770ef56 --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/dbWorkers.ts @@ -0,0 +1,33 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricInitializer } from '@/observability/types.js' + +export const init: MetricInitializer = (config) => { + const { labelNames, namePrefix, logger } = config + const dbWorkers = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_workers'], '_'), + help: 'Number of database workers', + labelNames: ['region', ...labelNames] + }) + return async (params) => { + const { dbClients, labels } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + const connectionResults = await client.raw<{ + rows: [{ worker_count: string }] + }>(`SELECT COUNT(*) AS worker_count FROM pg_stat_activity;`) + if (!connectionResults.rows.length) { + logger.error( + { region: regionKey }, + "No database workers found for region '{region}'. This is odd." 
+ ) + return + } + dbWorkers.set( + { ...labels, region: regionKey }, + parseInt(connectionResults.rows[0].worker_count) + ) + }) + ) + } +} diff --git a/packages/monitor-deployment/src/observability/metrics/dbWorkersAwaitingLocks.ts b/packages/monitor-deployment/src/observability/metrics/dbWorkersAwaitingLocks.ts new file mode 100644 index 0000000000..c15809b2cf --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/dbWorkersAwaitingLocks.ts @@ -0,0 +1,33 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricInitializer } from '@/observability/types.js' + +export const init: MetricInitializer = (config) => { + const { labelNames, namePrefix, logger } = config + const promMetric = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_workers_awaiting_locks'], '_'), + help: 'Number of database workers awaiting locks', + labelNames: ['region', ...labelNames] + }) + return async (params) => { + const { dbClients, labels } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + const queryResults = await client.raw<{ + rows: [{ count: string }] + }>(`SELECT COUNT(*) FROM pg_stat_activity WHERE wait_event = 'Lock';`) + if (!queryResults.rows.length) { + logger.error( + { region: regionKey }, + "No database workers found for region '{region}'. This is odd." 
+ ) + return + } + promMetric.set( + { ...labels, region: regionKey }, + parseInt(queryResults.rows[0].count) + ) + }) + ) + } +} diff --git a/packages/monitor-deployment/src/observability/metrics/connections.ts b/packages/monitor-deployment/src/observability/metrics/inactiveReplicationSlots.ts similarity index 62% rename from packages/monitor-deployment/src/observability/metrics/connections.ts rename to packages/monitor-deployment/src/observability/metrics/inactiveReplicationSlots.ts index e22a72ab9a..1a849948fc 100644 --- a/packages/monitor-deployment/src/observability/metrics/connections.ts +++ b/packages/monitor-deployment/src/observability/metrics/inactiveReplicationSlots.ts @@ -5,8 +5,8 @@ import type { MetricInitializer } from '@/observability/types.js' export const init: MetricInitializer = (config) => { const { labelNames, namePrefix, logger } = config const connections = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_used_connections'], '_'), - help: 'Number of active (used) database connections', + name: join([namePrefix, 'db_inactive_replication_slots'], '_'), + help: 'Number of inactive database replication slots', labelNames: ['region', ...labelNames] }) return async (params) => { @@ -14,18 +14,20 @@ export const init: MetricInitializer = (config) => { await Promise.all( dbClients.map(async ({ client, regionKey }) => { const connectionResults = await client.raw<{ - rows: [{ used_connections: string }] - }>(`SELECT COUNT(*) AS used_connections FROM pg_stat_activity;`) + rows: [{ inactive_replication_slots: string }] + }>( + `SELECT count(*) AS inactive_replication_slots FROM pg_replication_slots WHERE NOT active;` + ) if (!connectionResults.rows.length) { logger.error( { region: regionKey }, - "No active connections found for region '{region}'. This is odd." + "No data related to replication slots found for region '{region}'. This is odd." 
) return } connections.set( { ...labels, region: regionKey }, - parseInt(connectionResults.rows[0].used_connections) + parseInt(connectionResults.rows[0].inactive_replication_slots) ) }) ) diff --git a/packages/monitor-deployment/src/observability/metrics/connectionsTotal.ts b/packages/monitor-deployment/src/observability/metrics/maxConnections.ts similarity index 89% rename from packages/monitor-deployment/src/observability/metrics/connectionsTotal.ts rename to packages/monitor-deployment/src/observability/metrics/maxConnections.ts index a534d2d37b..b74142a359 100644 --- a/packages/monitor-deployment/src/observability/metrics/connectionsTotal.ts +++ b/packages/monitor-deployment/src/observability/metrics/maxConnections.ts @@ -5,8 +5,8 @@ import type { MetricInitializer } from '@/observability/types.js' export const init: MetricInitializer = (config) => { const { labelNames, namePrefix, logger } = config const totalConnections = new prometheusClient.Gauge({ - name: join([namePrefix, 'db_total_connections'], '_'), - help: 'Total number of database connections', + name: join([namePrefix, 'db_max_connections'], '_'), + help: 'Maximum number of database connections allowed by the server', labelNames: ['region', ...labelNames] }) return async (params) => { diff --git a/packages/monitor-deployment/src/observability/metrics/replicationSlotLag.ts b/packages/monitor-deployment/src/observability/metrics/replicationSlotLag.ts new file mode 100644 index 0000000000..8e1461e896 --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/replicationSlotLag.ts @@ -0,0 +1,36 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricInitializer } from '@/observability/types.js' + +export const init: MetricInitializer = (config) => { + const { labelNames, namePrefix, logger } = config + const promMetric = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_replication_slot_lag'], '_'), + help: 'Lag of replication slots 
in bytes', + labelNames: ['region', ...labelNames] + }) + return async (params) => { + const { dbClients, labels } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + const queryResults = await client.raw<{ + rows: [{ slot_lag_bytes: number }] + }>(` + SELECT pg_current_wal_lsn() - confirmed_flush_lsn AS slot_lag_bytes + FROM pg_replication_slots; + `) + if (!queryResults.rows.length) { + logger.error( + { region: regionKey }, + "No database replication slots found for region '{region}'. This is odd." + ) + return + } + promMetric.set( + { ...labels, region: regionKey }, + queryResults.rows[0].slot_lag_bytes + ) + }) + ) + } +} diff --git a/packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts b/packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts new file mode 100644 index 0000000000..f5ae263501 --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts @@ -0,0 +1,46 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricInitializer } from '@/observability/types.js' + +export const init: MetricInitializer = (config) => { + const { labelNames, namePrefix, logger } = config + const promMetric = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_replication_worker_lag'], '_'), + help: 'Lag of replication workers, by type of lag', + labelNames: ['region', 'lagtype', ...labelNames] + }) + return async (params) => { + const { dbClients, labels } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + const queryResults = await client.raw<{ + rows: [{ write_lag: number; flush_lag: number; replay_lag: number }] + }>(` + SELECT write_lsn - sent_lsn AS write_lag, + flush_lsn - write_lsn AS flush_lag, + replay_lsn - flush_lsn AS replay_lag + FROM pg_stat_replication; + `) + if (!queryResults.rows.length) { + logger.error( + { region: regionKey }, + "No database workers found 
for region '{region}'. This is odd." + ) + return + } + promMetric.set( + { ...labels, region: regionKey, lagtype: 'write' }, + queryResults.rows[0].write_lag + ) + promMetric.set( + { ...labels, region: regionKey, lagtype: 'flush' }, + queryResults.rows[0].flush_lag + ) + promMetric.set( + { ...labels, region: regionKey, lagtype: 'replay' }, + queryResults.rows[0].replay_lag + ) + }) + ) + } +} diff --git a/packages/monitor-deployment/src/observability/prometheusMetrics.ts b/packages/monitor-deployment/src/observability/prometheusMetrics.ts index 68c6341646..5ce3f38ccc 100644 --- a/packages/monitor-deployment/src/observability/prometheusMetrics.ts +++ b/packages/monitor-deployment/src/observability/prometheusMetrics.ts @@ -5,13 +5,17 @@ import { join } from 'lodash-es' import { Histogram, Registry } from 'prom-client' import prometheusClient from 'prom-client' import { init as commits } from '@/observability/metrics/commits.js' -import { init as connections } from '@/observability/metrics/connections.js' -import { init as connectionsTotal } from '@/observability/metrics/connectionsTotal.js' import { init as dbSize } from '@/observability/metrics/dbSize.js' +import { init as dbWorkers } from '@/observability/metrics/dbWorkers.js' +import { init as dbWorkersAwaitingLocks } from '@/observability/metrics/dbWorkersAwaitingLocks.js' import { init as fileImports } from '@/observability/metrics/fileImports.js' import { init as fileSize } from '@/observability/metrics/fileSize.js' +import { init as inactiveReplicationSlots } from '@/observability/metrics/inactiveReplicationSlots.js' +import { init as maxConnections } from '@/observability/metrics/maxConnections.js' import { init as objects } from '@/observability/metrics/objects.js' import { init as previews } from '@/observability/metrics/previews.js' +import { init as replicationSlotLag } from '@/observability/metrics/replicationSlotLag.js' +import { init as replicationWorkerLag } from 
'@/observability/metrics/replicationWorkerLag.js' import { init as streams } from '@/observability/metrics/streams.js' import { init as tablesize } from '@/observability/metrics/tableSize.js' import { init as users } from '@/observability/metrics/users.js' @@ -49,13 +53,17 @@ function initMonitoringMetrics(params: { const metricsToInitialize = [ commits, - connections, - connectionsTotal, + dbWorkers, + dbWorkersAwaitingLocks, dbSize, fileImports, fileSize, + inactiveReplicationSlots, + maxConnections, objects, previews, + replicationSlotLag, + replicationWorkerLag, streams, tablesize, users, From ad1813575ffd9e767ea84bd0353f0bef8f8448b4 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Mon, 16 Dec 2024 16:11:02 +0000 Subject: [PATCH 23/27] tidying up new metrics --- .../metrics/replicationSlotLag.ts | 16 +++--- .../metrics/replicationWorkerLag.ts | 55 ++++++++++++++----- 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/packages/monitor-deployment/src/observability/metrics/replicationSlotLag.ts b/packages/monitor-deployment/src/observability/metrics/replicationSlotLag.ts index 8e1461e896..ef19cd3003 100644 --- a/packages/monitor-deployment/src/observability/metrics/replicationSlotLag.ts +++ b/packages/monitor-deployment/src/observability/metrics/replicationSlotLag.ts @@ -7,16 +7,16 @@ export const init: MetricInitializer = (config) => { const promMetric = new prometheusClient.Gauge({ name: join([namePrefix, 'db_replication_slot_lag'], '_'), help: 'Lag of replication slots in bytes', - labelNames: ['region', ...labelNames] + labelNames: ['region', 'slotname', ...labelNames] }) return async (params) => { const { dbClients, labels } = params await Promise.all( dbClients.map(async ({ client, regionKey }) => { const queryResults = await client.raw<{ - rows: [{ slot_lag_bytes: number }] + rows: [{ slot_name: string; slot_lag_bytes: string }] }>(` - SELECT pg_current_wal_lsn() - confirmed_flush_lsn AS 
slot_lag_bytes + SELECT slot_name, pg_current_wal_lsn() - confirmed_flush_lsn AS slot_lag_bytes FROM pg_replication_slots; `) if (!queryResults.rows.length) { @@ -26,10 +26,12 @@ export const init: MetricInitializer = (config) => { ) return } - promMetric.set( - { ...labels, region: regionKey }, - queryResults.rows[0].slot_lag_bytes - ) + for (const row of queryResults.rows) { + promMetric.set( + { ...labels, region: regionKey, slotname: row.slot_name }, + parseInt(row.slot_lag_bytes) + ) + } }) ) } diff --git a/packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts b/packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts index f5ae263501..84b3b3f0b6 100644 --- a/packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts +++ b/packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts @@ -7,18 +7,26 @@ export const init: MetricInitializer = (config) => { const promMetric = new prometheusClient.Gauge({ name: join([namePrefix, 'db_replication_worker_lag'], '_'), help: 'Lag of replication workers, by type of lag', - labelNames: ['region', 'lagtype', ...labelNames] + labelNames: ['region', 'lagtype', 'name', ...labelNames] }) return async (params) => { const { dbClients, labels } = params await Promise.all( dbClients.map(async ({ client, regionKey }) => { const queryResults = await client.raw<{ - rows: [{ write_lag: number; flush_lag: number; replay_lag: number }] + rows: [ + { + write_lag: string + flush_lag: string + replay_lag: string + application_name: string + } + ] }>(` SELECT write_lsn - sent_lsn AS write_lag, flush_lsn - write_lsn AS flush_lag, - replay_lsn - flush_lsn AS replay_lag + replay_lsn - flush_lsn AS replay_lag, + application_name FROM pg_stat_replication; `) if (!queryResults.rows.length) { @@ -28,18 +36,35 @@ export const init: MetricInitializer = (config) => { ) return } - promMetric.set( - { ...labels, region: regionKey, lagtype: 'write' }, - 
queryResults.rows[0].write_lag - ) - promMetric.set( - { ...labels, region: regionKey, lagtype: 'flush' }, - queryResults.rows[0].flush_lag - ) - promMetric.set( - { ...labels, region: regionKey, lagtype: 'replay' }, - queryResults.rows[0].replay_lag - ) + for (const row of queryResults.rows) { + promMetric.set( + { + ...labels, + region: regionKey, + lagtype: 'write', + name: row.application_name + }, + parseInt(row.write_lag) + ) + promMetric.set( + { + ...labels, + region: regionKey, + lagtype: 'flush', + name: row.application_name + }, + parseInt(row.flush_lag) + ) + promMetric.set( + { + ...labels, + region: regionKey, + lagtype: 'replay', + name: row.application_name + }, + parseInt(row.replay_lag) + ) + } }) ) } From a69175b564883b373af1a0554aef2cb5bbdf424b Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Mon, 16 Dec 2024 16:28:11 +0000 Subject: [PATCH 24/27] Allow database monitor to configure connection timeouts --- packages/fileimport-service/Dockerfile | 2 +- .../monitor-deployment/src/clients/knex.ts | 6 ++++- packages/monitor-deployment/src/utils/env.ts | 27 ++++++++++++------- .../templates/monitoring/deployment.yml | 4 +++ 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/packages/fileimport-service/Dockerfile b/packages/fileimport-service/Dockerfile index 088f024c2e..2a893ef635 100644 --- a/packages/fileimport-service/Dockerfile +++ b/packages/fileimport-service/Dockerfile @@ -20,7 +20,7 @@ RUN chmod +x /usr/bin/tini RUN apt-get update -y \ && DEBIAN_FRONTEND=noninteractive apt-get install -y \ --no-install-recommends \ - curl=8.5.0-2ubuntu10.5 \ + curl=8.5.0-2ubuntu10.6 \ && curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \ && DEBIAN_FRONTEND=noninteractive apt-get install -y \ --no-install-recommends \ diff --git a/packages/monitor-deployment/src/clients/knex.ts b/packages/monitor-deployment/src/clients/knex.ts index 6b617bcc6c..45430f330f 100644 --- 
a/packages/monitor-deployment/src/clients/knex.ts +++ b/packages/monitor-deployment/src/clients/knex.ts @@ -1,5 +1,7 @@ import { knexLogger as logger } from '@/observability/logging.js' import { + getConnectionAcquireTimeoutMillis, + getConnectionCreateTimeoutMillis, getDatabaseName, getPostgresCACertificate, getPostgresConnectionString, @@ -36,7 +38,9 @@ export const getDbClients = async () => { isDevOrTestEnv: isDevOrTestEnv(), logger, maxConnections, - applicationName: 'speckle_database_monitor' + applicationName: 'speckle_database_monitor', + connectionAcquireTimeoutMillis: getConnectionAcquireTimeoutMillis(), + connectionCreateTimeoutMillis: getConnectionCreateTimeoutMillis() } if (!FF_WORKSPACES_MULTI_REGION_ENABLED) { const mainClient = configureKnexClient( diff --git a/packages/monitor-deployment/src/utils/env.ts b/packages/monitor-deployment/src/utils/env.ts index e8fc6c597b..2450162bac 100644 --- a/packages/monitor-deployment/src/utils/env.ts +++ b/packages/monitor-deployment/src/utils/env.ts @@ -1,24 +1,33 @@ export function getIntFromEnv(envVarKey: string, aDefault = '0'): number { return parseInt(process.env[envVarKey] || aDefault) } +function getBooleanFromEnv(envVarKey: string, aDefault = false): boolean { + return ['1', 'true', true].includes( + process.env[envVarKey]?.toLocaleLowerCase() || aDefault.toString() + ) +} -export const getMetricsHost = () => process.env.METRICS_HOST || '127.0.0.1' -export const getLogLevel = () => process.env.LOG_LEVEL || 'info' -export const getMetricsPort = () => process.env.PROMETHEUS_METRICS_PORT || '9092' -export const getNodeEnv = () => process.env.NODE_ENV || 'production' +export const getMetricsHost = () => process.env['METRICS_HOST'] || '127.0.0.1' +export const getLogLevel = () => process.env['LOG_LEVEL'] || 'info' +export const getMetricsPort = () => process.env['PROMETHEUS_METRICS_PORT'] || '9092' +export const getNodeEnv = () => process.env['NODE_ENV'] || 'production' export const 
getPostgresConnectionString = () => - process.env.PG_CONNECTION_STRING || 'postgres://speckle:speckle@127.0.0.1/speckle' -export const getPostgresCACertificate = () => process.env.POSTGRES_CA_CERTIFICATE + process.env['PG_CONNECTION_STRING'] || 'postgres://speckle:speckle@127.0.0.1/speckle' +export const getPostgresCACertificate = () => process.env['POSTGRES_CA_CERTIFICATE'] export const getPostgresMaxConnections = () => - parseInt(process.env.POSTGRES_MAX_CONNECTIONS || '2') + getIntFromEnv('POSTGRES_MAX_CONNECTIONS_DATABASE_MONITOR', '2') +export const getConnectionAcquireTimeoutMillis = () => + getIntFromEnv('POSTGRES_CONNECTION_ACQUIRE_TIMEOUT_MILLIS', '16000') +export const getConnectionCreateTimeoutMillis = () => + getIntFromEnv('POSTGRES_CONNECTION_CREATE_TIMEOUT_MILLIS', '5000') export function databaseMonitorCollectionPeriodSeconds() { return getIntFromEnv('METRICS_COLLECTION_PERIOD_SECONDS', '120') } -export const getDatabaseName = () => process.env.POSTGRES_DATABASE +export const getDatabaseName = () => process.env['POSTGRES_DATABASE'] export const isDevelopment = () => getNodeEnv() === 'development' || getNodeEnv() === 'dev' -export const isLogPretty = () => process.env.LOG_PRETTY?.toLocaleLowerCase() === 'true' +export const isLogPretty = () => getBooleanFromEnv('LOG_PRETTY', false) export const isProduction = () => getNodeEnv() === 'production' export const isTest = () => getNodeEnv() === 'test' export const isDevOrTestEnv = () => isDevelopment() || isTest() diff --git a/utils/helm/speckle-server/templates/monitoring/deployment.yml b/utils/helm/speckle-server/templates/monitoring/deployment.yml index ecc628820b..c635241879 100644 --- a/utils/helm/speckle-server/templates/monitoring/deployment.yml +++ b/utils/helm/speckle-server/templates/monitoring/deployment.yml @@ -65,6 +65,10 @@ spec: key: {{ default "postgres_url" .Values.db.connectionString.secretKey }} - name: POSTGRES_MAX_CONNECTIONS value: {{ .Values.monitoring.maximumPostgresConnections | 
quote }} + - name: POSTGRES_CONNECTION_CREATE_TIMEOUT_MILLIS + value: {{ .Values.db.connectionCreateTimeoutMillis | quote }} + - name: POSTGRES_CONNECTION_ACQUIRE_TIMEOUT_MILLIS + value: {{ .Values.db.connectionAcquireTimeoutMillis | quote }} - name: METRICS_HOST value: '0.0.0.0' # bind to all interfaces, not just localhost. Required to allow prometheus to scrape metrics, and healthchecks to work. - name: PROMETHEUS_METRICS_PORT From 32ecc3614a4486fba409f60a0bafe9d64fbde6d8 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Mon, 16 Dec 2024 17:35:44 +0000 Subject: [PATCH 25/27] Use aiven_extras to get replication & subscription, and protect via feature flag --- .../metrics/replicationSlotLag.ts | 2 +- .../metrics/replicationWorkerLag.ts | 74 ++++++++++++------- .../metrics/subscriptionsEnabled.ts | 46 ++++++++++++ .../src/observability/prometheusMetrics.ts | 11 ++- 4 files changed, 102 insertions(+), 31 deletions(-) create mode 100644 packages/monitor-deployment/src/observability/metrics/subscriptionsEnabled.ts diff --git a/packages/monitor-deployment/src/observability/metrics/replicationSlotLag.ts b/packages/monitor-deployment/src/observability/metrics/replicationSlotLag.ts index ef19cd3003..9d70fb4953 100644 --- a/packages/monitor-deployment/src/observability/metrics/replicationSlotLag.ts +++ b/packages/monitor-deployment/src/observability/metrics/replicationSlotLag.ts @@ -17,7 +17,7 @@ export const init: MetricInitializer = (config) => { rows: [{ slot_name: string; slot_lag_bytes: string }] }>(` SELECT slot_name, pg_current_wal_lsn() - confirmed_flush_lsn AS slot_lag_bytes - FROM pg_replication_slots; + FROM pg_replication_slots WHERE slot_type='logical'; `) if (!queryResults.rows.length) { logger.error( diff --git a/packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts b/packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts index 84b3b3f0b6..9413f22ed2 100644 --- 
a/packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts +++ b/packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts @@ -1,8 +1,17 @@ import prometheusClient from 'prom-client' import { join } from 'lodash-es' import type { MetricInitializer } from '@/observability/types.js' +import Environment from '@speckle/shared/dist/commonjs/environment/index.js' + +const { FF_WORKSPACES_MULTI_REGION_ENABLED } = Environment.getFeatureFlags() export const init: MetricInitializer = (config) => { + if (!FF_WORKSPACES_MULTI_REGION_ENABLED) { + return async () => { + // Do nothing + } + } + const { labelNames, namePrefix, logger } = config const promMetric = new prometheusClient.Gauge({ name: join([namePrefix, 'db_replication_worker_lag'], '_'), @@ -27,7 +36,7 @@ export const init: MetricInitializer = (config) => { flush_lsn - write_lsn AS flush_lag, replay_lsn - flush_lsn AS replay_lag, application_name - FROM pg_stat_replication; + FROM aiven_extras.pg_stat_replication_list(); `) if (!queryResults.rows.length) { logger.error( @@ -37,33 +46,42 @@ export const init: MetricInitializer = (config) => { return } for (const row of queryResults.rows) { - promMetric.set( - { - ...labels, - region: regionKey, - lagtype: 'write', - name: row.application_name - }, - parseInt(row.write_lag) - ) - promMetric.set( - { - ...labels, - region: regionKey, - lagtype: 'flush', - name: row.application_name - }, - parseInt(row.flush_lag) - ) - promMetric.set( - { - ...labels, - region: regionKey, - lagtype: 'replay', - name: row.application_name - }, - parseInt(row.replay_lag) - ) + const writeLag = parseInt(row.write_lag) + if (!isNaN(writeLag)) { + promMetric.set( + { + ...labels, + region: regionKey, + lagtype: 'write', + name: row.application_name + }, + parseInt(row.write_lag) + ) + } + const flushLag = parseInt(row.flush_lag) + if (!isNaN(flushLag)) { + promMetric.set( + { + ...labels, + region: regionKey, + lagtype: 'flush', + name: 
row.application_name + }, + parseInt(row.flush_lag) + ) + } + const replayLag = parseInt(row.replay_lag) + if (!isNaN(replayLag)) { + promMetric.set( + { + ...labels, + region: regionKey, + lagtype: 'replay', + name: row.application_name + }, + parseInt(row.replay_lag) + ) + } } }) ) diff --git a/packages/monitor-deployment/src/observability/metrics/subscriptionsEnabled.ts b/packages/monitor-deployment/src/observability/metrics/subscriptionsEnabled.ts new file mode 100644 index 0000000000..a527e90f64 --- /dev/null +++ b/packages/monitor-deployment/src/observability/metrics/subscriptionsEnabled.ts @@ -0,0 +1,46 @@ +import prometheusClient from 'prom-client' +import { join } from 'lodash-es' +import type { MetricInitializer } from '@/observability/types.js' +import Environment from '@speckle/shared/dist/commonjs/environment/index.js' + +const { FF_WORKSPACES_MULTI_REGION_ENABLED } = Environment.getFeatureFlags() + +export const init: MetricInitializer = (config) => { + if (!FF_WORKSPACES_MULTI_REGION_ENABLED) { + return async () => { + // Do nothing + } + } + + const { labelNames, namePrefix, logger } = config + const promMetric = new prometheusClient.Gauge({ + name: join([namePrefix, 'db_subscriptions_enabled'], '_'), + help: 'Enabled subscriptions to other databases', + labelNames: ['region', 'subscriptionname', ...labelNames] + }) + return async (params) => { + const { dbClients, labels } = params + await Promise.all( + dbClients.map(async ({ client, regionKey }) => { + const queryResults = await client.raw<{ + rows: [{ subname: string; subenabled: boolean }] + }>(` + SELECT subname, subenabled FROM aiven_extras.pg_list_all_subscriptions(); + `) + if (!queryResults.rows.length) { + logger.error( + { region: regionKey }, + "No database replication slots found for region '{region}'. This is odd." + ) + return + } + for (const row of queryResults.rows) { + promMetric.set( + { ...labels, region: regionKey, subscriptionname: row.subname }, + row.subenabled ? 
1 : 0 + ) + } + }) + ) + } +} diff --git a/packages/monitor-deployment/src/observability/prometheusMetrics.ts b/packages/monitor-deployment/src/observability/prometheusMetrics.ts index 5ce3f38ccc..57db1eaa13 100644 --- a/packages/monitor-deployment/src/observability/prometheusMetrics.ts +++ b/packages/monitor-deployment/src/observability/prometheusMetrics.ts @@ -17,6 +17,7 @@ import { init as previews } from '@/observability/metrics/previews.js' import { init as replicationSlotLag } from '@/observability/metrics/replicationSlotLag.js' import { init as replicationWorkerLag } from '@/observability/metrics/replicationWorkerLag.js' import { init as streams } from '@/observability/metrics/streams.js' +import { init as subscriptionsEnabled } from '@/observability/metrics/subscriptionsEnabled.js' import { init as tablesize } from '@/observability/metrics/tableSize.js' import { init as users } from '@/observability/metrics/users.js' import { init as webhooks } from '@/observability/metrics/webhooks.js' @@ -65,6 +66,7 @@ function initMonitoringMetrics(params: { replicationSlotLag, replicationWorkerLag, streams, + subscriptionsEnabled, tablesize, users, webhooks @@ -92,8 +94,13 @@ function initMonitoringMetrics(params: { } await Promise.all( - metricsToCollect.map(async (metric) => { - await metric({ dbClients, mainDbClient, labels }) + metricsToCollect.map(async (collectMetric) => { + try { + await collectMetric({ dbClients, mainDbClient, labels }) + } catch (err) { + logger.error({ err }, 'Failed to collect a metric') + // Continue collecting other metrics + } }) ) } From f647bedea5a5d9b00aed3cf1a9cfe45a053bb073 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Mon, 16 Dec 2024 17:37:54 +0000 Subject: [PATCH 26/27] track errors --- .../src/observability/prometheusMetrics.ts | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/packages/monitor-deployment/src/observability/prometheusMetrics.ts 
b/packages/monitor-deployment/src/observability/prometheusMetrics.ts index 57db1eaa13..01045d7fa9 100644 --- a/packages/monitor-deployment/src/observability/prometheusMetrics.ts +++ b/packages/monitor-deployment/src/observability/prometheusMetrics.ts @@ -2,7 +2,7 @@ import { DbClient, getDbClients } from '@/clients/knex.js' import { logger } from '@/observability/logging.js' import { databaseMonitorCollectionPeriodSeconds } from '@/utils/env.js' import { join } from 'lodash-es' -import { Histogram, Registry } from 'prom-client' +import { Counter, Histogram, Registry } from 'prom-client' import prometheusClient from 'prom-client' import { init as commits } from '@/observability/metrics/commits.js' import { init as dbSize } from '@/observability/metrics/dbSize.js' @@ -84,6 +84,13 @@ function initMonitoringMetrics(params: { labelNames }) + const selfMonitorErrors = new Counter({ + name: join([namePrefix, 'self_monitor_errors_monitoring_metrics'], '_'), + help: 'The number of errors encountered while collecting monitoring metrics.', + registers, + labelNames + }) + const collect = async () => { const dbClients = await getDbClients() @@ -98,6 +105,7 @@ function initMonitoringMetrics(params: { try { await collectMetric({ dbClients, mainDbClient, labels }) } catch (err) { + selfMonitorErrors.inc(labels) logger.error({ err }, 'Failed to collect a metric') // Continue collecting other metrics } From ccd6e3026dc175ac74a7cede4807a92ca6df76ab Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Mon, 16 Dec 2024 18:09:58 +0000 Subject: [PATCH 27/27] Improve error handling where aiven_extras is not yet enabled --- .../metrics/replicationWorkerLag.ts | 42 ++++++++++++++----- .../metrics/subscriptionsEnabled.ts | 27 ++++++++++-- .../src/observability/prometheusMetrics.ts | 2 +- 3 files changed, 55 insertions(+), 16 deletions(-) diff --git a/packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts 
b/packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts index 9413f22ed2..e3feb62e06 100644 --- a/packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts +++ b/packages/monitor-deployment/src/observability/metrics/replicationWorkerLag.ts @@ -5,6 +5,17 @@ import Environment from '@speckle/shared/dist/commonjs/environment/index.js' const { FF_WORKSPACES_MULTI_REGION_ENABLED } = Environment.getFeatureFlags() +type QueryResponseSchema = { + rows: [ + { + write_lag: string + flush_lag: string + replay_lag: string + application_name: string + } + ] +} + export const init: MetricInitializer = (config) => { if (!FF_WORKSPACES_MULTI_REGION_ENABLED) { return async () => { @@ -22,23 +33,32 @@ export const init: MetricInitializer = (config) => { const { dbClients, labels } = params await Promise.all( dbClients.map(async ({ client, regionKey }) => { - const queryResults = await client.raw<{ - rows: [ - { - write_lag: string - flush_lag: string - replay_lag: string - application_name: string - } - ] - }>(` + let queryResults: QueryResponseSchema | undefined = undefined + try { + queryResults = await client.raw(` SELECT write_lsn - sent_lsn AS write_lag, flush_lsn - write_lsn AS flush_lag, replay_lsn - flush_lsn AS replay_lag, application_name FROM aiven_extras.pg_stat_replication_list(); `) - if (!queryResults.rows.length) { + } catch (err) { + if ( + err instanceof Error && + err.message.includes('schema "aiven_extras" does not exist') + ) { + logger.warn( + { err, region: regionKey }, + "'aiven_extras' extension is not yet enabled for region '{region}'." + ) + return // continue to next region + } + + //else rethrow + throw err + } + + if (!queryResults?.rows.length) { logger.error( { region: regionKey }, "No database workers found for region '{region}'. This is odd." 
diff --git a/packages/monitor-deployment/src/observability/metrics/subscriptionsEnabled.ts b/packages/monitor-deployment/src/observability/metrics/subscriptionsEnabled.ts index a527e90f64..fb8c195fc5 100644 --- a/packages/monitor-deployment/src/observability/metrics/subscriptionsEnabled.ts +++ b/packages/monitor-deployment/src/observability/metrics/subscriptionsEnabled.ts @@ -3,6 +3,10 @@ import { join } from 'lodash-es' import type { MetricInitializer } from '@/observability/types.js' import Environment from '@speckle/shared/dist/commonjs/environment/index.js' +type QueryResponseSchema = { + rows: [{ subname: string; subenabled: boolean }] +} + const { FF_WORKSPACES_MULTI_REGION_ENABLED } = Environment.getFeatureFlags() export const init: MetricInitializer = (config) => { @@ -22,12 +26,27 @@ export const init: MetricInitializer = (config) => { const { dbClients, labels } = params await Promise.all( dbClients.map(async ({ client, regionKey }) => { - const queryResults = await client.raw<{ - rows: [{ subname: string; subenabled: boolean }] - }>(` + let queryResults: QueryResponseSchema | undefined = undefined + try { + queryResults = await client.raw(` SELECT subname, subenabled FROM aiven_extras.pg_list_all_subscriptions(); `) - if (!queryResults.rows.length) { + } catch (err) { + if ( + err instanceof Error && + err.message.includes('schema "aiven_extras" does not exist') + ) { + logger.warn( + { err, region: regionKey }, + "'aiven_extras' extension is not yet enabled for region '{region}'." + ) + return // continue to next region + } + + //else rethrow + throw err + } + if (!queryResults?.rows.length) { logger.error( { region: regionKey }, "No database replication slots found for region '{region}'. This is odd." 
diff --git a/packages/monitor-deployment/src/observability/prometheusMetrics.ts b/packages/monitor-deployment/src/observability/prometheusMetrics.ts index 01045d7fa9..4023e6d5ed 100644 --- a/packages/monitor-deployment/src/observability/prometheusMetrics.ts +++ b/packages/monitor-deployment/src/observability/prometheusMetrics.ts @@ -106,7 +106,7 @@ function initMonitoringMetrics(params: { await collectMetric({ dbClients, mainDbClient, labels }) } catch (err) { selfMonitorErrors.inc(labels) - logger.error({ err }, 'Failed to collect a metric') + logger.error({ err }, 'Error encountered while collecting a metric') // Continue collecting other metrics } })