Skip to content

Commit

Permalink
feat: Make refresh/reput script more hackable (#2935)
Browse files Browse the repository at this point in the history
This is now essentially a superset of the reput_bugs.py script (the only
missing piece is source filtering being built in as a flag); we just made
it a bit more hackable to allow easier changes for one-off runs.
  • Loading branch information
another-rex authored Dec 17, 2024
1 parent b04faa3 commit ed5ae72
Showing 1 changed file with 64 additions and 17 deletions.
81 changes: 64 additions & 17 deletions tools/datafix/refresh_ids.py → tools/datafix/reput_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
determined by the pre put hook.
Does this by deleting and reputting each Bug entry.
Before running the script, fill out the sections commented with # FILLOUT
"""
from google.cloud import ndb
import osv
Expand All @@ -15,46 +17,69 @@

# Maximum number of Bug entities fetched and reput per Datastore batch.
MAX_BATCH_SIZE = 500

# Global flags.
# These defaults are overwritten in main() from the parsed CLI arguments
# (--verbose, --full-refresh, --transform) before refresh_ids() runs.
verbose = False
fullrefresh = False
transform = True


class DryRunException(Exception):
  """Deliberately raised to abort (and thus roll back) a transaction on dry runs."""


def get_relevant_ids() -> list[str]:
  """Retrieve the IDs that require refreshing.

  1. FILLOUT this function to only return IDs that are necessary to update

  Reads the module-level `verbose` flag (set from CLI args in main()).

  Returns:
    A list of Bug db_ids to be reput by refresh_ids().
  """
  relevant_ids = []

  query = osv.Bug.query()

  # Examples:
  # - Datastore query filters
  #   query = query.filter([osv.Bug.source == "ubuntu"])
  #
  # - Apply projections to avoid loading the entire entity
  #   query.projection = ["db_id"]
  #
  # - Use a keys-only query if no python filtering logic is needed
  query.keys_only = True

  print(f"Running initial query '{query}' on {query.kind}...")

  result: typing.Iterable[osv.Bug] = query.iter()
  counter = 0

  for res in result:
    counter += 1
    # NOTE(review): with keys_only=True only the entity key is populated —
    # confirm db_id is still available here, or derive it from res.key.id().
    relevant_ids.append(res.db_id)
    if verbose:
      print(res.db_id + ' - ' + res.key.id())  # type: ignore

  print(f"Found {len(relevant_ids)} / {counter} relevant bugs to refresh.")
  return relevant_ids


def refresh_ids(dryrun: bool, verbose: bool, loadcache: str) -> None:
def transform_bug(_: osv.Bug):
  """Transform bug in place.

  2. FILLOUT this function to apply transformations before reputting the bug.
  """
  # Example: clear the key so the pre-put hook regenerates it in the new
  # key format:
  #   bug.key = None


def refresh_ids(dryrun: bool, loadcache: str) -> None:
"""Update bugs IDs to the new format"""

relevant_ids = []
if loadcache:
with open(loadcache, 'r') as f:
relevant_ids = json.load(f)
else:
relevant_ids = get_relevant_ids(verbose)
relevant_ids = get_relevant_ids()

# Store the state in case we cancel halfway, to avoid having
# to do the initial query again.
Expand All @@ -71,13 +96,15 @@ def _refresh_ids(batch: int):
osv.Bug.get_by_id(r) for r in relevant_ids[batch:batch + MAX_BATCH_SIZE]
]

# Delete the existing entries. This must be done in a transaction
# to avoid losing data if interrupted
ndb.delete_multi([r.key for r in buf])
if fullrefresh:
# Delete the existing entries. This must be done in a transaction
# to avoid losing data if interrupted
ndb.delete_multi([r.key for r in buf])

# Clear the key so the key name will be regenerated to the new key format
for elem in buf:
elem.key = None
if transform:
# Clear the key so the key name will be regenerated to the new key format
for elem in buf:
transform_bug(elem)

# Reput the bug back in
ndb.put_multi_async(buf)
Expand Down Expand Up @@ -121,6 +148,18 @@ def main() -> None:
dest="verbose",
default=False,
help="Print each ID that needs to be processed")
parser.add_argument(
"--full-refresh",
action=argparse.BooleanOptionalAction,
dest="fullrefresh",
default=False,
help="Deletes the bug before reputting, necessary for key changes")
parser.add_argument(
"--transform",
action=argparse.BooleanOptionalAction,
dest="transform",
default=True,
help="Perform transformation code")
# Add argument for loading from json cache
parser.add_argument(
"--load-cache",
Expand All @@ -134,10 +173,18 @@ def main() -> None:
help="GCP project to operate on")
args = parser.parse_args()

global verbose
global fullrefresh
global transform

verbose = args.verbose
fullrefresh = args.fullrefresh
transform = args.transform

client = ndb.Client(project=args.project)
print(f"Running on project {args.project}.")
with client.context():
refresh_ids(args.dryrun, args.verbose, args.loadcache)
refresh_ids(args.dryrun, args.loadcache)


if __name__ == "__main__":
Expand Down

0 comments on commit ed5ae72

Please sign in to comment.