Optimizations for write_cfg_data (#4569)

Makes progress in fixing the slowdown as proofs get large. I'll call the "sync time" the time in between receiving a step result and starting to wait for the next step result on the master thread. As proofs get more and more nodes, calling `to_dict()` on every node gets progressively harder and this was taking up a substantial portion of the sync time as proofs got to 500+ nodes. Instead of generating the entire KCFG dict with `KCFG.to_dict()` before passing this to `KCFGStore.write_cfg_data()`, which discards all of this work anyway for the final product and replaces it with just a list of the node IDs, it is significantly faster to just pass it the list of the node IDs and have it use the KCFG object, which already has a dictionary storing nodes by ID to find the vacuous and stuck nodes, and to get the newly created nodes. This way we can call `to_dict()` on each node only when it is first created. To give an idea of a benchmark, I used a `LoopsTest.test_sum_1000()` (linear and long-running) with `--max-depth 1` and `--max-iterations 1000`. Before this change it gets to 1000 iterations in 59:06, and after this change it does it in 40:31. Before the change the sync time as this proof approached 1000 nodes ranged between about 3.4 to 4.2 seconds. After the change ranged from about 1.39 to 1.54 seconds. The big remaining chunk of sync time when the proof gets large seems to be in `get_steps()`. --------- Co-authored-by: Petar Maksimović <[email protected]>
runtimeverification · Aug 8, 2024 · d55c98f · d55c98f
1 parent 394c95b
commit d55c98f
Showing 1 changed file with 25 additions and 10 deletions.
diff --git a/pyk/src/pyk/kcfg/kcfg.py b/pyk/src/pyk/kcfg/kcfg.py
@@ -73,23 +73,18 @@ def kcfg_node_path(self, node_id: int) -> Path:
         return self.kcfg_node_dir / f'{node_id}.json'
 
     def write_cfg_data(
-        self, dct: dict[str, Any], deleted_nodes: Iterable[int] = (), created_nodes: Iterable[int] = ()
+        self, kcfg: KCFG, dct: dict[str, Any], deleted_nodes: Iterable[int] = (), created_nodes: Iterable[int] = ()
     ) -> None:
-        node_dict = {node_dct['id']: node_dct for node_dct in dct['nodes']}
         vacuous_nodes = [
-            node_id for node_id in node_dict.keys() if KCFGNodeAttr.VACUOUS.value in node_dict[node_id]['attrs']
-        ]
-        stuck_nodes = [
-            node_id for node_id in node_dict.keys() if KCFGNodeAttr.STUCK.value in node_dict[node_id]['attrs']
+            node_id for node_id in kcfg._nodes.keys() if KCFGNodeAttr.VACUOUS in kcfg._nodes[node_id].attrs
         ]
+        stuck_nodes = [node_id for node_id in kcfg._nodes.keys() if KCFGNodeAttr.STUCK in kcfg._nodes[node_id].attrs]
         dct['vacuous'] = vacuous_nodes
         dct['stuck'] = stuck_nodes
         for node_id in deleted_nodes:
             self.kcfg_node_path(node_id).unlink(missing_ok=True)
         for node_id in created_nodes:
-            del node_dict[node_id]['attrs']
-            self.kcfg_node_path(node_id).write_text(json.dumps(node_dict[node_id]))
-        dct['nodes'] = list(node_dict.keys())
+            self.kcfg_node_path(node_id).write_text(json.dumps(kcfg._nodes[node_id].to_dict()))
         self.kcfg_json_path.write_text(json.dumps(dct))
 
     def read_cfg_data(self) -> dict[str, Any]:
@@ -542,6 +537,26 @@ def extend(
             case _:
                 raise AssertionError()
 
+    def to_dict_no_nodes(self) -> dict[str, Any]:
+        nodes = list(self._nodes.keys())
+        edges = [edge.to_dict() for edge in self.edges()]
+        covers = [cover.to_dict() for cover in self.covers()]
+        splits = [split.to_dict() for split in self.splits()]
+        ndbranches = [ndbranch.to_dict() for ndbranch in self.ndbranches()]
+
+        aliases = dict(sorted(self._aliases.items()))
+
+        res = {
+            'next': self._node_id,
+            'nodes': nodes,
+            'edges': edges,
+            'covers': covers,
+            'splits': splits,
+            'ndbranches': ndbranches,
+            'aliases': aliases,
+        }
+        return {k: v for k, v in res.items() if v}
+
     def to_dict(self) -> dict[str, Any]:
         nodes = [node.to_dict() for node in self.nodes]
         edges = [edge.to_dict() for edge in self.edges()]
@@ -1261,7 +1276,7 @@ def reachable_nodes(self, source_id: NodeIdLike, *, reverse: bool = False) -> se
     def write_cfg_data(self) -> None:
         assert self._kcfg_store is not None
         self._kcfg_store.write_cfg_data(
-            self.to_dict(), deleted_nodes=self._deleted_nodes, created_nodes=self._created_nodes
+            self, self.to_dict_no_nodes(), deleted_nodes=self._deleted_nodes, created_nodes=self._created_nodes
         )
         self._deleted_nodes.clear()
         self._created_nodes.clear()