Skip to content

Commit

Permalink
feat: Make refresh/reput script more hackable (#2935)
Browse files Browse the repository at this point in the history
This is now essentially a superset of the reput_bugs.py script (the only
missing piece is source filtering being built in as a flag); we just made
it a bit more hackable to allow easier changes for one-off runs.
  • Loading branch information
another-rex authored Dec 17, 2024
1 parent b04faa3 commit ed5ae72
Showing 1 changed file with 64 additions and 17 deletions.
81 changes: 64 additions & 17 deletions tools/datafix/refresh_ids.py → tools/datafix/reput_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
determined by the pre put hook.
Does this by deleting and reputting each Bug entry.
Before running the script, fill out the sections commented with # FILLOUT
"""
from google.cloud import ndb
import osv
Expand All @@ -15,46 +17,69 @@

# Maximum number of Bug entities fetched and reput per Datastore batch.
MAX_BATCH_SIZE = 500

# Global flags.
# These defaults are overwritten in main() from the parsed CLI arguments
# (--verbose, --full-refresh, --transform) before refresh_ids() runs.
verbose = False
fullrefresh = False
transform = True


class DryRunException(Exception):
  """Deliberately raised to abort (and thus roll back) a transaction on dry runs."""


def get_relevant_ids() -> list[str]:
  """Retrieve the IDs that require refreshing.

  1. FILLOUT this function to only return IDs that are necessary to update

  Reads the module-level `verbose` flag (set from CLI args in main()).

  Returns:
    A list of Bug db_ids to be reput by refresh_ids().
  """
  relevant_ids = []

  query = osv.Bug.query()

  # Examples:
  # - Datastore query filters
  #   query = query.filter([osv.Bug.source == "ubuntu"])
  #
  # - Apply projections to avoid loading the entire entity
  #   query.projection = ["db_id"]
  #
  # - Use a keys-only query if no python filtering logic is needed
  query.keys_only = True

  print(f"Running initial query '{query}' on {query.kind}...")

  result: typing.Iterable[osv.Bug] = query.iter()
  counter = 0

  for res in result:
    counter += 1
    # NOTE(review): with keys_only=True only the entity key is populated —
    # confirm db_id is still available here, or derive it from res.key.id().
    relevant_ids.append(res.db_id)
    if verbose:
      print(res.db_id + ' - ' + res.key.id())  # type: ignore

  print(f"Found {len(relevant_ids)} / {counter} relevant bugs to refresh.")
  return relevant_ids


def refresh_ids(dryrun: bool, verbose: bool, loadcache: str) -> None:
def transform_bug(_: osv.Bug):
  """Transform bug in place.

  2. FILLOUT this function to apply transformations before reputting the bug.
  """
  # Example: clear the key so the pre-put hook regenerates it in the new
  # key format:
  #   bug.key = None


def refresh_ids(dryrun: bool, loadcache: str) -> None:
"""Update bugs IDs to the new format"""

relevant_ids = []
if loadcache:
with open(loadcache, 'r') as f:
relevant_ids = json.load(f)
else:
relevant_ids = get_relevant_ids(verbose)
relevant_ids = get_relevant_ids()

# Store the state in case we cancel halfway, to avoid having
# to do the initial query again.
Expand All @@ -71,13 +96,15 @@ def _refresh_ids(batch: int):
osv.Bug.get_by_id(r) for r in relevant_ids[batch:batch + MAX_BATCH_SIZE]
]

# Delete the existing entries. This must be done in a transaction
# to avoid losing data if interrupted
ndb.delete_multi([r.key for r in buf])
if fullrefresh:
# Delete the existing entries. This must be done in a transaction
# to avoid losing data if interrupted
ndb.delete_multi([r.key for r in buf])

# Clear the key so the key name will be regenerated to the new key format
for elem in buf:
elem.key = None
if transform:
# Clear the key so the key name will be regenerated to the new key format
for elem in buf:
transform_bug(elem)

# Reput the bug back in
ndb.put_multi_async(buf)
Expand Down Expand Up @@ -121,6 +148,18 @@ def main() -> None:
dest="verbose",
default=False,
help="Print each ID that needs to be processed")
parser.add_argument(
"--full-refresh",
action=argparse.BooleanOptionalAction,
dest="fullrefresh",
default=False,
help="Deletes the bug before reputting, necessary for key changes")
parser.add_argument(
"--transform",
action=argparse.BooleanOptionalAction,
dest="transform",
default=True,
help="Perform transformation code")
# Add argument for loading from json cache
parser.add_argument(
"--load-cache",
Expand All @@ -134,10 +173,18 @@ def main() -> None:
help="GCP project to operate on")
args = parser.parse_args()

global verbose
global fullrefresh
global transform

verbose = args.verbose
fullrefresh = args.fullrefresh
transform = args.transform

client = ndb.Client(project=args.project)
print(f"Running on project {args.project}.")
with client.context():
refresh_ids(args.dryrun, args.verbose, args.loadcache)
refresh_ids(args.dryrun, args.loadcache)


if __name__ == "__main__":
Expand Down

0 comments on commit ed5ae72

Please sign in to comment.