Skip to content

Commit

Permalink
Add --timing-stats flag:
Browse files Browse the repository at this point in the history
This will hopefully help me address some of the slowness in Refurb. There is a
lot of area for improvement, but right now it is hard/impossible to pinpoint
where the slowness is. I will probably add more datapoints in the future, but
for now this should be sufficient to get some preliminary timing stats.

Also bump package version.
  • Loading branch information
dosisod committed Aug 15, 2023
1 parent 5c22760 commit 37d7d4b
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 2 deletions.
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,23 @@ Then, to load your new check, use `refurb file.py --load your.path.here`
> importing a normal python module. If `your.path.here` is a directory, all checks
> in that directory will be loaded. If it is a file, only that file will be loaded.

## Troubleshooting

If Refurb is running slow, use the `--timing-stats` flag to diagnose why:

```
$ refurb file --timing-stats /tmp/stats.json
```

This will output a JSON file with the following information:

* Total time Mypy took to parse the modules (a majority of the time usually).
* Time Mypy spent parsing each module. Useful for finding very large/unused files.
* Time Refurb spent checking each module. These numbers should be very small (less than 100ms).

Larger files naturally take longer to check, but files that take way too long should be
looked into, as an issue might only manifest themselves when a file reaches a certain size.

## Developing / Contributing

### Setup
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "refurb"
version = "1.19.0"
version = "1.20.0"
description = "A tool for refurbish and modernize Python codebases"
authors = ["dosisod"]
license = "GPL-3.0-only"
Expand Down
65 changes: 64 additions & 1 deletion refurb/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import json
import re
import time
from collections.abc import Sequence
from functools import cache, partial
from importlib import metadata
from io import StringIO
from pathlib import Path
from tempfile import mkstemp

from mypy.build import build
from mypy.errors import CompileError
Expand Down Expand Up @@ -148,15 +151,24 @@ def run_refurb(settings: Settings) -> Sequence[Error | str]:
opt.local_partial_types = True
opt.python_version = settings.get_python_version()

mypy_timing_stats = Path(mkstemp()[1]) if settings.timing_stats else None
opt.timing_stats = str(mypy_timing_stats) if mypy_timing_stats else None

try:
start = time.time()

result = build(files, options=opt)

mypy_build_time = time.time() - start

except CompileError as e:
return [re.sub("^mypy: ", "refurb: ", msg) for msg in e.messages]

errors: list[Error | str] = []
checks = load_checks(settings)

refurb_timing_stats_in_ms: dict[str, int] = {}

for file in files:
tree = result.graph[file.module].tree

Expand All @@ -165,15 +177,30 @@ def run_refurb(settings: Settings) -> Sequence[Error | str]:
if settings.debug:
errors.append(str(tree))

visitor = RefurbVisitor(checks, settings)
start = time.time()

visitor = RefurbVisitor(checks, settings)
tree.accept(visitor)

elapsed = time.time() - start

refurb_timing_stats_in_ms[file.module] = int(elapsed * 1_000)

for error in visitor.errors:
error.filename = file.path

errors += visitor.errors

output_timing_stats(
settings,
mypy_build_time,
mypy_timing_stats,
refurb_timing_stats_in_ms,
)

if mypy_timing_stats:
mypy_timing_stats.unlink()

return sorted(
[
error
Expand Down Expand Up @@ -239,6 +266,42 @@ def format_errors(errors: Sequence[Error | str], settings: Settings) -> str:
return done


def output_timing_stats(
settings: Settings,
mypy_total_time_spent: float,
mypy_timing_stats: Path | None,
refurb_timing_stats_in_ms: dict[str, int],
) -> None:
if not settings.timing_stats:
return

assert mypy_timing_stats

mypy_stats: dict[str, int] = {}
lines = mypy_timing_stats.read_text().splitlines()

for line in lines:
module, micro_seconds = line.split()

mypy_stats[module] = int(micro_seconds) // 1_000

data = {
"mypy_total_time_spent_in_ms": int(mypy_total_time_spent * 1_000),
"mypy_time_spent_parsing_modules_in_ms": dict(
sorted(mypy_stats.items(), key=lambda x: x[1], reverse=True)
),
"refurb_time_spent_checking_file_in_ms": dict(
sorted(
refurb_timing_stats_in_ms.items(),
key=lambda x: x[1],
reverse=True,
)
),
}

settings.timing_stats.write_text(json.dumps(data, separators=(",", ":")))


def main(args: list[str]) -> int:
try:
settings = load_settings(args)
Expand Down
5 changes: 5 additions & 0 deletions refurb/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class Settings:
format: Literal["text", "github", None] | None = None
sort_by: Literal["filename", "error"] | None = None
verbose: bool = False
timing_stats: Path | None = None

def __post_init__(self) -> None:
if self.enable_all and self.disable_all:
Expand Down Expand Up @@ -83,6 +84,7 @@ def merge(old: Settings, new: Settings) -> Settings:
format=new.format or old.format,
sort_by=new.sort_by or old.sort_by,
verbose=old.verbose or new.verbose,
timing_stats=old.timing_stats or new.timing_stats,
)

def get_python_version(self) -> tuple[int, int]:
Expand Down Expand Up @@ -322,6 +324,9 @@ def get_next_arg(arg: str, args: Iterator[str]) -> str:
elif arg == "--verbose":
settings.verbose = True

elif arg == "--timing-stats":
settings.timing_stats = Path(get_next_arg(arg, iargs))

elif arg == "--":
settings.mypy_args = list(iargs)

Expand Down
13 changes: 13 additions & 0 deletions test/test_arg_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,3 +689,16 @@ def test_generate_subcommand_is_ignored_if_other_files_are_passed() -> None:

def test_parse_verbose_flag() -> None:
assert parse_args(["--verbose"]) == Settings(verbose=True)


def test_parse_timing_stats_flag() -> None:
assert parse_args(["--timing-stats", "file"]) == Settings(
timing_stats=Path("file")
)


def test_parse_timing_stats_flag_without_arg_is_an_error() -> None:
with pytest.raises(
ValueError, match='refurb: missing argument after "--timing-stats"'
):
parse_args(["--timing-stats"])
34 changes: 34 additions & 0 deletions test/test_main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import json
import os
from dataclasses import dataclass
from functools import partial
from importlib import metadata
from locale import LC_ALL, setlocale
from pathlib import Path
from tempfile import NamedTemporaryFile
from unittest.mock import patch

import pytest
Expand Down Expand Up @@ -270,3 +273,34 @@ def test_verbose_flag_prints_message_when_all_checks_disabled() -> None:

assert "FURB100" not in stdout
assert "No checks enabled" in stdout


def test_timing_stats_outputs_stats_file() -> None:
with NamedTemporaryFile(mode="r") as tmp:
main(["test/e2e/dummy.py", "--timing-stats", tmp.name])

stats_file = Path(tmp.name)

assert stats_file.exists()

data = json.loads(stats_file.read_text())

match data:
case {
"mypy_total_time_spent_in_ms": int(_),
"mypy_time_spent_parsing_modules_in_ms": dict(mypy_timing),
"refurb_time_spent_checking_file_in_ms": dict(refurb_timing),
}:
msg = "All values must be ints"

assert all(
isinstance(v, int) for v in mypy_timing.values()
), msg

assert all(
isinstance(v, int) for v in refurb_timing.values()
), msg

return

pytest.fail("Data is not in proper format")

0 comments on commit 37d7d4b

Please sign in to comment.