From ea7177be1b0e8ab5016f7af396baaa68481535ca Mon Sep 17 00:00:00 2001
From: Zain Rizvi
Date: Tue, 19 Nov 2024 17:40:56 -0600
Subject: [PATCH] Add query to see jobs that need runners scaled up (#5944)

This will be used by the autoscaler lambdas to figure out which instances
may not have had the scale-up command properly propagated to them (which
happens fairly regularly).

The query is based on the "queued_jobs" query, with a few changes:
- Queries across all jobs in the pytorch and pytorch-labs organizations,
  which our runners support
- Checks only a specific time window for queued jobs, looking for jobs that
  have been queued long enough to warrant intervention, but not so long that
  GitHub has already cancelled them
---
 .../queued_jobs_aggregate/params.json         |  1 +
 .../queued_jobs_aggregate/query.sql           | 71 +++++++++++++++++++
 2 files changed, 72 insertions(+)
 create mode 100644 torchci/clickhouse_queries/queued_jobs_aggregate/params.json
 create mode 100644 torchci/clickhouse_queries/queued_jobs_aggregate/query.sql

diff --git a/torchci/clickhouse_queries/queued_jobs_aggregate/params.json b/torchci/clickhouse_queries/queued_jobs_aggregate/params.json
new file mode 100644
index 0000000000..9e26dfeeb6
--- /dev/null
+++ b/torchci/clickhouse_queries/queued_jobs_aggregate/params.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/torchci/clickhouse_queries/queued_jobs_aggregate/query.sql b/torchci/clickhouse_queries/queued_jobs_aggregate/query.sql
new file mode 100644
index 0000000000..fe4645d7f5
--- /dev/null
+++ b/torchci/clickhouse_queries/queued_jobs_aggregate/query.sql
@@ -0,0 +1,71 @@
+--- This query is used by the AWS autoscalers to scale up runner types that
+--- have had jobs waiting for them for a significant period of time.
+---
+--- This query returns the number of jobs per runner type that have been
+--- queued for too long, which the autoscalers use to determine how many
+--- additional runners to spin up.
+
+with possible_queued_jobs as (
+    select id, run_id
+    from default.workflow_job
+    where
+        status = 'queued'
+        AND created_at < (
+            -- Only consider jobs that have been queued for a significant period of time
+            CURRENT_TIMESTAMP() - INTERVAL 30 MINUTE
+        )
+        AND created_at > (
+            -- Queued jobs are automatically cancelled after this long. Any allegedly pending
+            -- jobs older than this are actually bad data
+            CURRENT_TIMESTAMP() - INTERVAL 3 DAY
+        )
+),
+queued_jobs as (
+    SELECT
+        DATE_DIFF(
+            'minute',
+            job.created_at,
+            CURRENT_TIMESTAMP()
+        ) AS queue_m,
+        workflow.repository.owner.login as org,
+        workflow.repository.full_name as full_repo,
+        CONCAT(workflow.name, ' / ', job.name) AS name,
+        job.html_url,
+        IF(
+            LENGTH(job.labels) = 0,
+            'N/A',
+            IF(
+                LENGTH(job.labels) > 1,
+                job.labels[2],
+                job.labels[1]
+            )
+        ) AS runner_label
+    FROM
+        default.workflow_job job final
+        JOIN default.workflow_run workflow final ON workflow.id = job.run_id
+    WHERE
+        job.id in (select id from possible_queued_jobs)
+        and workflow.id in (select run_id from possible_queued_jobs)
+        and workflow.repository.owner.login in ('pytorch', 'pytorch-labs')
+        AND job.status = 'queued'
+        /* These two conditions are workarounds for GitHub's broken API. Sometimes */
+        /* jobs get stuck in a permanently "queued" state but definitely ran. We can */
+        /* detect this by looking at whether any steps executed (if there were, */
+        /* obviously the job started running), and whether the workflow was marked as */
+        /* complete (somehow more reliable than the job-level API) */
+        AND LENGTH(job.steps) = 0
+        AND workflow.status != 'completed'
+    ORDER BY
+        queue_m DESC
+)
+select
+    runner_label,
+    org,
+    full_repo,
+    count(*) as num_queued_jobs,
+    min(queue_m) as min_queue_time_min,
+    max(queue_m) as max_queue_time_min
+from queued_jobs
+group by runner_label, org, full_repo
+order by max_queue_time_min desc
+settings allow_experimental_analyzer = 1;
\ No newline at end of file
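
Note (reviewer sketch, not part of the patch): the last two WHERE conditions work around jobs that GitHub keeps reporting as "queued" even though they actually ran. The snippet below inverts those conditions to list such stuck jobs directly. It reuses only names from query.sql (default.workflow_job, default.workflow_run, job.steps, workflow.status) and its 3-day window; the selected output columns and the OR of the two signals are illustrative assumptions, not something shipped in this PR.

    -- Illustrative only: jobs GitHub still reports as "queued" even though
    -- they clearly ran (steps were recorded, or the workflow already completed).
    select
        job.id,
        job.html_url,
        LENGTH(job.steps) as steps_recorded,
        workflow.status as workflow_status
    from default.workflow_job job final
    join default.workflow_run workflow final on workflow.id = job.run_id
    where
        job.status = 'queued'
        and job.created_at > (CURRENT_TIMESTAMP() - INTERVAL 3 DAY)
        and (
            LENGTH(job.steps) > 0          -- steps executed, so the job really started
            or workflow.status = 'completed'  -- workflow finished, so the job cannot still be queued
        )
    settings allow_experimental_analyzer = 1;

Running something like this periodically gives a rough sense of how often the "stuck queued" API bug fires; the main query excludes exactly these jobs so the autoscaler only counts work that genuinely still needs a runner.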