Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

landing_worker: delete patch content after landing (bug 1879536) #374

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions landoapi/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,20 @@ def run_post_deploy_sequence():
)


@cli.command(name="clean-landing-job-patches")
def clean_landing_job_patches():
"""Iterate over all landed jobs and delete their patches."""
from landoapi.models.landing_job import LandingJob, LandingJobStatus
from landoapi.storage import db, db_subsystem

db_subsystem.ensure_ready()
jobs = LandingJob.query.filter(LandingJob.status == LandingJobStatus.LANDED).all()
for job in jobs:
for revision in job.revisions:
revision.patch_bytes = b""
db.session.commit()


Comment on lines +110 to +123
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should make the query only consider patches that are older than a certain date, with a CLI flag to set some specific time.

@cli.command(context_settings={"ignore_unknown_options": True})
@click.argument("celery_arguments", nargs=-1, type=click.UNPROCESSED)
def celery(celery_arguments):
Expand Down
5 changes: 5 additions & 0 deletions landoapi/workers/landing_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,11 @@ def run_job(
LandingJobAction.LAND, commit_id=commit_id, commit=True, db=db
)

# Patches are no longer necessary, delete them.
for revision in job.revisions:
revision.patch_bytes = b""
db.session.commit()

Comment on lines +442 to +446
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we should be deleting these right away, it's useful to have the patch contents around for debugging and other archaeological purposes. It would be a pain if we needed to inspect the patch contents but we couldn't do so, because we want to save a few MB of disk space.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We discussed this previously and concluded that since the commit hash is present, then it's not going to be much added value to have the raw patch after landing (for successful landings, at least).

I agree that there could be some benefit to keeping those around, however, these are taking a bit more than a few MBs at this time (a good portion of the 2.3GB as of today) and appear to be increasing dramatically in the last few weeks. As I understand we have a cap of 10GB (or at least, in the current tier).

The time limit approach might be better though regardless. Let me think some more about this.

mots_path = Path(hgrepo.path) / "mots.yaml"
if mots_path.exists():
logger.info(f"{mots_path} found, setting reviewer data.")
Expand Down
71 changes: 39 additions & 32 deletions tests/test_try.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,23 @@ def test_try_api_success_hgexport(
worker = LandingWorker(sleep_seconds=0.01)
hgrepo = HgRepo(hg_clone.strpath)

assert len(job.revisions) == 1, "Job should be landing a single revision."
revision = job.revisions[0]
assert revision.patch_bytes == (
b"# HG changeset patch\n"
b"# User Test User <[email protected]>\n"
b"# Date 0 +0000\n"
b"# Diff Start Line 6\n"
b"add another file.\n"
b"\n"
b"diff --git a/test.txt b/test.txt\n"
b"--- a/test.txt\n"
b"+++ b/test.txt\n"
b"@@ -1,1 +1,2 @@\n"
b" TEST\n"
b"+adding another line"
), "Patch diff should be parsed from patch body."

assert worker.run_job(job, repo, hgrepo, treestatus)
assert job.status == LandingJobStatus.LANDED
assert len(job.landed_commit_id) == 40
Expand All @@ -267,20 +284,7 @@ def test_try_api_success_hgexport(
assert (
revision.patch_data["timestamp"] == "0"
), "Timestamp should be parsed from `Date` header."
assert revision.patch_bytes == (
b"# HG changeset patch\n"
b"# User Test User <[email protected]>\n"
b"# Date 0 +0000\n"
b"# Diff Start Line 6\n"
b"add another file.\n"
b"\n"
b"diff --git a/test.txt b/test.txt\n"
b"--- a/test.txt\n"
b"+++ b/test.txt\n"
b"@@ -1,1 +1,2 @@\n"
b" TEST\n"
b"+adding another line"
), "Patch diff should be parsed from patch body."
assert revision.patch_bytes == b"", "Patch diff should be cleared after landing."


def test_try_api_success_gitformatpatch(
Expand Down Expand Up @@ -334,6 +338,26 @@ def test_try_api_success_gitformatpatch(
worker = LandingWorker(sleep_seconds=0.01)
hgrepo = HgRepo(hg_clone.strpath)

assert len(job.revisions) == 1, "Job should be landing a single revision."
revision = job.revisions[0]

assert revision.patch_bytes == (
b"# HG changeset patch\n"
b"# User Connor Sheehan <[email protected]>\n"
b"# Date 1657139769 +0000\n"
b"# Diff Start Line 8\n"
b"add another file\n"
b"\n"
b"add another file to the repo.\n"
b"\n"
b"diff --git a/test.txt b/test.txt\n"
b"--- a/test.txt\n"
b"+++ b/test.txt\n"
b"@@ -1,1 +1,2 @@\n"
b" TEST\n"
b"+adding another line"
), "Patch diff should be parsed from patch body."

# Assert the job landed against the expected commit hash.
assert worker.run_job(job, repo, hgrepo, treestatus)
assert job.status == LandingJobStatus.LANDED
Expand All @@ -343,8 +367,6 @@ def test_try_api_success_gitformatpatch(
), "Target changeset should match the passed value."

# Test the revision content matches expected.
assert len(job.revisions) == 1, "Job should have landed a single revision."
revision = job.revisions[0]
assert (
revision.patch_data["author_name"] == "Connor Sheehan"
), "Patch author should be parsed from `From` header."
Expand All @@ -357,19 +379,4 @@ def test_try_api_success_gitformatpatch(
assert (
revision.patch_data["timestamp"] == "1657139769"
), "Timestamp should be parsed from `Date` header."
assert revision.patch_bytes == (
b"# HG changeset patch\n"
b"# User Connor Sheehan <[email protected]>\n"
b"# Date 1657139769 +0000\n"
b"# Diff Start Line 8\n"
b"add another file\n"
b"\n"
b"add another file to the repo.\n"
b"\n"
b"diff --git a/test.txt b/test.txt\n"
b"--- a/test.txt\n"
b"+++ b/test.txt\n"
b"@@ -1,1 +1,2 @@\n"
b" TEST\n"
b"+adding another line"
), "Patch diff should be parsed from patch body."
assert revision.patch_bytes == b"", "Patch diff should be cleared after landing."