Add --timing-stats flag:

This will hopefully help me address some of the slowness in Refurb. There is a lot of area for improvement, but right now it is hard/impossible to pinpoint where the slowness is. I will probably add more datapoints in the future, but for now this should be sufficient to get some preliminary timing stats. Also bump package version.
dosisod · Aug 15, 2023 · 37d7d4b · 37d7d4b
1 parent 5c22760
commit 37d7d4b
Show file tree

Hide file tree

Showing 6 changed files with 134 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -285,6 +285,23 @@ Then, to load your new check, use `refurb file.py --load your.path.here`
 > importing a normal python module. If `your.path.here` is a directory, all checks
 > in that directory will be loaded. If it is a file, only that file will be loaded.
 
+## Troubleshooting
+
+If Refurb is running slow, use the `--timing-stats` flag to diagnose why:
+
+```
+$ refurb file --timing-stats /tmp/stats.json
+```
+
+This will output a JSON file with the following information:
+
+* Total time Mypy took to parse the modules (a majority of the time usually).
+* Time Mypy spent parsing each module. Useful for finding very large/unused files.
+* Time Refurb spent checking each module. These numbers should be very small (less than 100ms).
+
+Larger files naturally take longer to check, but files that take way too long should be
+looked into, as an issue might only manifest themselves when a file reaches a certain size.
+
 ## Developing / Contributing
 
 ### Setup

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "refurb"
-version = "1.19.0"
+version = "1.20.0"
 description = "A tool for refurbish and modernize Python codebases"
 authors = ["dosisod"]
 license = "GPL-3.0-only"

diff --git a/refurb/main.py b/refurb/main.py
@@ -1,9 +1,12 @@
+import json
 import re
+import time
 from collections.abc import Sequence
 from functools import cache, partial
 from importlib import metadata
 from io import StringIO
 from pathlib import Path
+from tempfile import mkstemp
 
 from mypy.build import build
 from mypy.errors import CompileError
@@ -148,15 +151,24 @@ def run_refurb(settings: Settings) -> Sequence[Error | str]:
     opt.local_partial_types = True
     opt.python_version = settings.get_python_version()
 
+    mypy_timing_stats = Path(mkstemp()[1]) if settings.timing_stats else None
+    opt.timing_stats = str(mypy_timing_stats) if mypy_timing_stats else None
+
     try:
+        start = time.time()
+
         result = build(files, options=opt)
 
+        mypy_build_time = time.time() - start
+
     except CompileError as e:
         return [re.sub("^mypy: ", "refurb: ", msg) for msg in e.messages]
 
     errors: list[Error | str] = []
     checks = load_checks(settings)
 
+    refurb_timing_stats_in_ms: dict[str, int] = {}
+
     for file in files:
         tree = result.graph[file.module].tree
 
@@ -165,15 +177,30 @@ def run_refurb(settings: Settings) -> Sequence[Error | str]:
         if settings.debug:
             errors.append(str(tree))
 
-        visitor = RefurbVisitor(checks, settings)
+        start = time.time()
 
+        visitor = RefurbVisitor(checks, settings)
         tree.accept(visitor)
 
+        elapsed = time.time() - start
+
+        refurb_timing_stats_in_ms[file.module] = int(elapsed * 1_000)
+
         for error in visitor.errors:
             error.filename = file.path
 
         errors += visitor.errors
 
+    output_timing_stats(
+        settings,
+        mypy_build_time,
+        mypy_timing_stats,
+        refurb_timing_stats_in_ms,
+    )
+
+    if mypy_timing_stats:
+        mypy_timing_stats.unlink()
+
     return sorted(
         [
             error
@@ -239,6 +266,42 @@ def format_errors(errors: Sequence[Error | str], settings: Settings) -> str:
     return done
 
 
+def output_timing_stats(
+    settings: Settings,
+    mypy_total_time_spent: float,
+    mypy_timing_stats: Path | None,
+    refurb_timing_stats_in_ms: dict[str, int],
+) -> None:
+    if not settings.timing_stats:
+        return
+
+    assert mypy_timing_stats
+
+    mypy_stats: dict[str, int] = {}
+    lines = mypy_timing_stats.read_text().splitlines()
+
+    for line in lines:
+        module, micro_seconds = line.split()
+
+        mypy_stats[module] = int(micro_seconds) // 1_000
+
+    data = {
+        "mypy_total_time_spent_in_ms": int(mypy_total_time_spent * 1_000),
+        "mypy_time_spent_parsing_modules_in_ms": dict(
+            sorted(mypy_stats.items(), key=lambda x: x[1], reverse=True)
+        ),
+        "refurb_time_spent_checking_file_in_ms": dict(
+            sorted(
+                refurb_timing_stats_in_ms.items(),
+                key=lambda x: x[1],
+                reverse=True,
+            )
+        ),
+    }
+
+    settings.timing_stats.write_text(json.dumps(data, separators=(",", ":")))
+
+
 def main(args: list[str]) -> int:
     try:
         settings = load_settings(args)

diff --git a/refurb/settings.py b/refurb/settings.py
@@ -42,6 +42,7 @@ class Settings:
     format: Literal["text", "github", None] | None = None
     sort_by: Literal["filename", "error"] | None = None
     verbose: bool = False
+    timing_stats: Path | None = None
 
     def __post_init__(self) -> None:
         if self.enable_all and self.disable_all:
@@ -83,6 +84,7 @@ def merge(old: Settings, new: Settings) -> Settings:
             format=new.format or old.format,
             sort_by=new.sort_by or old.sort_by,
             verbose=old.verbose or new.verbose,
+            timing_stats=old.timing_stats or new.timing_stats,
         )
 
     def get_python_version(self) -> tuple[int, int]:
@@ -322,6 +324,9 @@ def get_next_arg(arg: str, args: Iterator[str]) -> str:
         elif arg == "--verbose":
             settings.verbose = True
 
+        elif arg == "--timing-stats":
+            settings.timing_stats = Path(get_next_arg(arg, iargs))
+
         elif arg == "--":
             settings.mypy_args = list(iargs)
 

diff --git a/test/test_arg_parsing.py b/test/test_arg_parsing.py
@@ -689,3 +689,16 @@ def test_generate_subcommand_is_ignored_if_other_files_are_passed() -> None:
 
 def test_parse_verbose_flag() -> None:
     assert parse_args(["--verbose"]) == Settings(verbose=True)
+
+
+def test_parse_timing_stats_flag() -> None:
+    assert parse_args(["--timing-stats", "file"]) == Settings(
+        timing_stats=Path("file")
+    )
+
+
+def test_parse_timing_stats_flag_without_arg_is_an_error() -> None:
+    with pytest.raises(
+        ValueError, match='refurb: missing argument after "--timing-stats"'
+    ):
+        parse_args(["--timing-stats"])
diff --git a/test/test_main.py b/test/test_main.py
@@ -1,8 +1,11 @@
+import json
 import os
 from dataclasses import dataclass
 from functools import partial
 from importlib import metadata
 from locale import LC_ALL, setlocale
+from pathlib import Path
+from tempfile import NamedTemporaryFile
 from unittest.mock import patch
 
 import pytest
@@ -270,3 +273,34 @@ def test_verbose_flag_prints_message_when_all_checks_disabled() -> None:
 
     assert "FURB100" not in stdout
     assert "No checks enabled" in stdout
+
+
+def test_timing_stats_outputs_stats_file() -> None:
+    with NamedTemporaryFile(mode="r") as tmp:
+        main(["test/e2e/dummy.py", "--timing-stats", tmp.name])
+
+        stats_file = Path(tmp.name)
+
+        assert stats_file.exists()
+
+        data = json.loads(stats_file.read_text())
+
+        match data:
+            case {
+                "mypy_total_time_spent_in_ms": int(_),
+                "mypy_time_spent_parsing_modules_in_ms": dict(mypy_timing),
+                "refurb_time_spent_checking_file_in_ms": dict(refurb_timing),
+            }:
+                msg = "All values must be ints"
+
+                assert all(
+                    isinstance(v, int) for v in mypy_timing.values()
+                ), msg
+
+                assert all(
+                    isinstance(v, int) for v in refurb_timing.values()
+                ), msg
+
+                return
+
+        pytest.fail("Data is not in proper format")