Skip to content

Commit

Permalink
[CH] Migrate failure annotation page (#5803)
Browse files Browse the repository at this point in the history
This is more of a refactoring than a migration, but it's needed to
make the query work.

* `failed_workflow_jobs` runs OOM on ClickHouse when trying to load all
failed jobs from commits that have more than N failures. I rewrote
the logic to get rid of N because we only ever set it to 0 anyway. The
simplified logic can then be run on ClickHouse.
* https://hud.pytorch.org/failedjobs/pytorch/pytorch/main has duplicate
entries. This is caused by a subtle bug in
`torchci/components/JobAnnotationToggle.tsx` that took me way too long
to figure out.
* There are several React warnings on the page, so I fixed them too.

### Testing


https://torchci-git-fork-huydhn-ch-migrate-job-anno-45acb4-fbopensource.vercel.app/failedjobs/pytorch/pytorch/main
  • Loading branch information
huydhn authored Oct 23, 2024
1 parent 866f05d commit c07dbe2
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 114 deletions.
3 changes: 1 addition & 2 deletions torchci/clickhouse_queries/failed_workflow_jobs/params.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
{
"branch": "String",
"count": "Int64",
"repo": "String",
"startTime": "DateTime64(3)",
"stopTime": "DateTime64(3)"
}
}
82 changes: 24 additions & 58 deletions torchci/clickhouse_queries/failed_workflow_jobs/query.sql
Original file line number Diff line number Diff line change
@@ -1,60 +1,26 @@
-- !!! Query is not converted to CH syntax yet. Delete this line when it gets converted
WITH repeats AS (
SELECT
array_agg(j.id) AS ids
FROM
workflow_run w
JOIN workflow_job j ON w.id = j.run_id HINT(join_strategy = lookup)
WHERE
j._event_time >= PARSE_DATETIME_ISO8601(: startTime)
AND j._event_time < PARSE_DATETIME_ISO8601(: stopTime)
AND w.head_repository.full_name = : repo
AND w.head_branch = : branch
AND w.event != 'workflow_run'
AND w.event != 'repository_dispatch'
GROUP BY
j.head_sha,
j.name,
w.name
HAVING
count(*) > : count
AND bool_or(
j.conclusion IN (
'failure', 'cancelled', 'time_out'
)
)
),
ids AS (
SELECT
ids.id
FROM
repeats,
UNNEST(repeats.ids AS id) AS ids
)
SELECT
job.head_sha AS sha,
CONCAT(w.name, ' / ', job.name) AS jobName,
job.id,
job.conclusion,
job.html_url AS htmlUrl,
CONCAT(
'https://ossci-raw-job-status.s3.amazonaws.com/log/',
CAST(job.id AS string)
) AS logUrl,
DATE_DIFF(
'SECOND',
PARSE_TIMESTAMP_ISO8601(job.started_at),
PARSE_TIMESTAMP_ISO8601(job.completed_at)
) AS durationS,
w.repository.full_name AS repo,
ARRAY_CREATE(job.torchci_classification.line) AS failureLines,
job.torchci_classification.captures AS failureCaptures,
ARRAY_CREATE(job.torchci_classification.line_num) AS failureLineNumbers,
-- This query is used to annotate jobs on HUD
SELECT DISTINCT
j.head_sha AS sha,
CONCAT(w.name, ' / ', j.name) AS jobName,
j.id,
j.conclusion,
j.html_url AS htmlUrl,
CONCAT(
'https://ossci-raw-job-status.s3.amazonaws.com/log/',
j.id
) AS logUrl,
DATE_DIFF('SECOND', j.started_at, j.completed_at) AS durationS,
array(j.torchci_classification. 'line') AS failureLines,
j.torchci_classification. 'captures' AS failureCaptures,
array(j.torchci_classification. 'line_num') AS failureLineNumbers
FROM
ids
JOIN workflow_job job on job.id = ids.id
INNER JOIN workflow_run w on w.id = job.run_id
workflow_job j FINAL
JOIN workflow_run w FINAL on w.id = j.run_id
WHERE
job.conclusion IN (
'failure', 'cancelled', 'time_out'
)
j.created_at >= {startTime: DateTime64(3) }
AND j.created_at < {stopTime: DateTime64(3) }
AND w.head_repository. 'full_name' = {repo: String }
AND w.head_branch = {branch: String }
AND w.event != 'workflow_run'
AND w.event != 'repository_dispatch'
AND j.conclusion IN ('failure', 'cancelled', 'time_out')
6 changes: 5 additions & 1 deletion torchci/components/JobAnnotationToggle.tsx
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { ToggleButton, ToggleButtonGroup } from "@mui/material";
import _ from "lodash";
import { useSession } from "next-auth/react";
import React from "react";
import { JobAnnotation, JobData } from "../lib/types";
Expand All @@ -15,7 +16,10 @@ export default function JobAnnotationToggle({
repo?: string | null;
}) {
const allJobs = similarJobs ?? [];
allJobs.push(job);
// Double check if the job exists before adding it
if (!_.find(allJobs, (j: JobData) => j.id === job.id)) {
allJobs.push(job);
}

const [state, setState] = React.useState<JobAnnotation>(
(annotation ?? "null") as JobAnnotation
Expand Down
Original file line number Diff line number Diff line change
@@ -1,22 +1,12 @@
import { queryClickhouseSaved } from "lib/clickhouse";
import { getDynamoClient } from "lib/dynamo";
import getRocksetClient, { RocksetParam } from "lib/rockset";
import { JobData } from "lib/types";
import { NextApiRequest, NextApiResponse } from "next";
import rocksetVersions from "rockset/prodVersions.json";

async function fetchFailureJobs(
queryParams: RocksetParam[]
): Promise<JobData[]> {
const rocksetClient = getRocksetClient();
const failedJobs = await rocksetClient.queryLambdas.executeQueryLambda(
"commons",
"failed_workflow_jobs",
rocksetVersions.commons.failed_workflow_jobs,
{
parameters: queryParams,
}
);
return failedJobs.results ?? [];
/**
 * Fetch failed workflow jobs by running the saved ClickHouse query
 * `failed_workflow_jobs`.
 *
 * @param queryParams - Key/value parameters forwarded unchanged to the saved
 *   query.
 * @returns Whatever `queryClickhouseSaved` resolves to for this query.
 */
async function fetchFailureJobs(queryParams: {
  [key: string]: any;
}): Promise<JobData[]> {
  const failedJobs = await queryClickhouseSaved(
    "failed_workflow_jobs",
    queryParams
  );
  return failedJobs;
}

export default async function handler(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import LogViewer from "components/LogViewer";
import dayjs from "dayjs";
import { fetcher } from "lib/GeneralUtils";
import { isRerunDisabledTestsJob, isUnstableJob } from "lib/jobUtils";
import { RocksetParam } from "lib/rockset";
import { JobAnnotation, JobData } from "lib/types";
import _ from "lodash";
import { useRouter } from "next/router";
Expand Down Expand Up @@ -38,14 +37,17 @@ function SimilarFailedJobs({
{showDetail ? "▼ " : "▶ "}
<code>Failing {similarJobs.length} times</code>
</button>
{showDetail &&
_.map(similarJobs, (job) => (
<FailedJob
job={job}
similarJobs={[]}
classification={classification}
/>
))}
<ul>
{showDetail &&
_.map(similarJobs, (job) => (
<FailedJob
job={job}
similarJobs={[]}
classification={classification}
key={job.id}
/>
))}
</ul>
</div>
);
}
Expand Down Expand Up @@ -125,15 +127,15 @@ function FailedJobs({
repoName,
repoOwner,
}: {
queryParams: RocksetParam[];
queryParams: { [key: string]: any };
repoName: string;
repoOwner: string;
}) {
// Note: querying the list of failed jobs here and send their IDs over to get
// their annotation is not a scalable solution because the list of failures
// could be longer than the browser-dependent URL-length limit. The workaround
// here is to send the query param over to another annotation API that will then
// make a query to Rockset to get the list of failed jobs itself and return the
// make a query to the db to get the list of failed jobs itself and return the
// list to the caller here
const { data: failedJobsWithAnnotations } = useSWR(
`/api/job_annotation/${repoOwner}/${repoName}/failures/${encodeURIComponent(
Expand Down Expand Up @@ -233,33 +235,12 @@ export default function Page() {
const [stopTime, setStopTime] = useState(dayjs());
const [timeRange, setTimeRange] = useState<number>(7);

const queryParams: RocksetParam[] = [
{
name: "startTime",
type: "string",
value: startTime,
},
{
name: "stopTime",
type: "string",
value: stopTime,
},
{
name: "repo",
type: "string",
value: `${repoOwner}/${repoName}`,
},
{
name: "branch",
type: "string",
value: `${branch}`,
},
{
name: "count",
type: "int",
value: "0", // Set the count to 0 to query all failures
},
];
const queryParams: { [key: string]: any } = {
branch: branch,
repo: `${repoOwner}/${repoName}`,
startTime: dayjs(startTime).utc().format("YYYY-MM-DDTHH:mm:ss.SSS"),
stopTime: dayjs(stopTime).utc().format("YYYY-MM-DDTHH:mm:ss.SSS"),
};

return (
<div>
Expand Down

0 comments on commit c07dbe2

Please sign in to comment.