RagnarGrootKoerkamp · mzuenni · Nov 23, 2024 · Nov 11, 2024 · Nov 11, 2024 · Nov 12, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -21,6 +21,8 @@ repos:
         -   ruamel.yaml==0.18.6
         -   questionary==2.0.1
         -   types-colorama==0.4.15.20240311
+        -   types-Pygments==2.18.0.20240506
+        -   types-python-dateutil==2.9.0.20241003
         -   types-PyYAML==6.0.12.20240311
         -   types-requests==2.32.0.20240712
         args:

diff --git a/bin/config.py b/bin/config.py
@@ -96,7 +96,7 @@
 grep -Ev '^(h|jobs|time|verbose)$' | sed "s/^/'/;s/$/',/" | tr '\n' ' ' | sed 's/^/args_list = [/;s/, $/]\n/'
 """
 # fmt: off
-args_list = ['1', 'add', 'all', 'answer', 'api', 'author', 'check_deterministic', 'clean', 'colors', 'contest', 'contest_id', 'contestname', 'cp', 'default_solution', 'depth', 'directory', 'error', 'force', 'force_build', 'input', 'interaction', 'interactive', 'invalid', 'kattis', 'language', 'memory', 'move_to', 'no_bar', 'no_generate', 'no_solution', 'no_solutions', 'no_testcase_sanity_checks', 'no_timelimit', 'no_validators', 'no_visualizer', 'open', 'order', 'order_from_ccs', 'overview', 'password', 'post_freeze', 'problem', 'problemname', 'remove', 'reorder', 'samples', 'sanitizer', 'skel', 'skip', 'sort', 'submissions', 'table', 'testcases', 'timelimit', 'timeout', 'token', 'tree', 'username', 'validation', 'watch', 'web', 'write']
+args_list = ['1', 'add', 'all', 'answer', 'api', 'author', 'check_deterministic', 'clean', 'colors', 'contest', 'contest_id', 'contestname', 'cp', 'default_solution', 'depth', 'directory', 'error', 'force', 'force_build', 'input', 'interaction', 'interactive', 'invalid', 'kattis', 'language', 'memory', 'more', 'move_to', 'no_bar', 'no_generate', 'no_solution', 'no_solutions', 'no_testcase_sanity_checks', 'no_timelimit', 'no_validators', 'no_visualizer', 'open', 'order', 'order_from_ccs', 'overview', 'password', 'post_freeze', 'problem', 'problemname', 'remove', 'reorder', 'samples', 'sanitizer', 'skel', 'skip', 'sort', 'submissions', 'table', 'testcases', 'timelimit', 'timeout', 'token', 'tree', 'username', 'validation', 'watch', 'web', 'write']
 # fmt: on
 
 

diff --git a/bin/problem.py b/bin/problem.py
@@ -507,6 +507,18 @@ def add(s):
 
         programs = [run.Submission(problem, path) for path in paths]
 
+        # Sort key for the submissions:
+        # - a submission with exactly one expected verdict: (1, verdict, name)
+        # - any other submission: (number of verdicts, subdir, verdicts, name)
+        # so equal-sized verdict lists are ordered by subdir, then verdicts, then name.
+        def submissions_key(x):
+            verdicts = x.expected_verdicts
+            if len(verdicts) != 1:
+                return (len(verdicts), x.subdir, verdicts, x.name)
+            return (1, verdicts[0], x.name)
+
+        programs.sort(key=submissions_key)
+
         bar = ProgressBar('Build submissions', items=programs)
 
         def build_program(p):
@@ -526,19 +538,6 @@ def build_program(p):
             return False
 
         assert isinstance(problem._submissions, list)
-
-        # - first all submission with just one verdict (sorted by that verdict)
-        # - then by subdir
-        # - then by list of verdicts
-        # - then by name
-        def submissions_key(x):
-            if len(x.expected_verdicts) == 1:
-                return (1, x.expected_verdicts[0], x.name)
-            else:
-                return (len(x.expected_verdicts), x.subdir, x.expected_verdicts, x.name)
-
-        problem._submissions.sort(key=submissions_key)
-
         return problem._submissions.copy()
 
     def validators(

diff --git a/bin/stats.py b/bin/stats.py
@@ -1,15 +1,28 @@
+import shutil
+import statistics
 import sys
 from collections.abc import Callable
+from datetime import datetime, timedelta, timezone
+from dateutil import parser
 from pathlib import Path
+from typing import Literal, Any
 
-from colorama import Fore, Style
+from colorama import ansi, Fore, Style
 
+import config
 import generate
-from util import glob
+import program
+from util import error, glob, exec_command
 
 Selector = str | Callable | list[str] | list[Callable]
 
 
+def stats(problems):
+    problem_stats(problems)  # the overview table is always printed
+    if config.args.more:  # set by `stats --more`
+        more_stats(problems)
+
+
 # This prints the number belonging to the count.
 # This can be a red/white colored number, or Y/N
 def _get_stat(count, threshold=True, upper_bound=None):
@@ -28,7 +41,7 @@ def _get_stat(count, threshold=True, upper_bound=None):
     return color + str(count) + Style.RESET_ALL
 
 
-def stats(problems):
+def problem_stats(problems):
     stats: list[
         tuple[str, Selector] | tuple[str, Selector, int] | tuple[str, Selector, int, int]
     ] = [
@@ -66,15 +79,20 @@ def stats(problems):
         (' WA', 'submissions/wrong_answer/*', 2),
         ('TLE', 'submissions/time_limit_exceeded/*', 1),
         ('subs', lambda p: len(glob(p.path, 'submissions/*/*')), 6),
-        (
-            '  cpp',
-            ['submissions/accepted/*.c', 'submissions/accepted/*.cpp', 'submissions/accepted/*.cc'],
-            1,
-        ),
-        ('py', ['submissions/accepted/*.py3', 'submissions/accepted/*.py'], 1),
-        ('java', 'submissions/accepted/*.java', 1),
-        ('kt', 'submissions/accepted/*.kt', 1),
     ]
+    languages = {
+        '  c(++)': ['C', 'C++'],
+        'py': ['Python 2', 'Python 3', 'CPython 2', 'CPython 3'],
+        'java': ['Java'],
+        'kt': ['Kotlin'],
+    }
+    for column, names in languages.items():
+        paths = []
+        for language in program.languages().values():  # not `config`: that is a module here
+            if language['name'] in names:
+                patterns = language['files'].split()
+                paths += [f'submissions/accepted/{pattern}' for pattern in patterns]
+        stats.append((column, list(set(paths)), 1))
 
     headers = ['problem', *(h[0] for h in stats), '   comment']
     cumulative = [0] * (len(stats))
@@ -177,7 +195,7 @@ def value(x):
                     )
                     for i in range(len(stats))
                 ],
-                comment
+                comment,
             ),
             file=sys.stderr,
         )
@@ -188,3 +206,229 @@ def value(x):
         format_string.format('TOTAL', *(_get_stat(x, False) for x in cumulative), ''),
         file=sys.stderr,
     )
+
+
+try:
+    import pygments
+    from pygments import lexers
+
+    loc_cache: dict[Path, int | None] = {}  # lines of code per file; None if not countable
+    has_pygments = True  # checked by more_stats() before any pygments use
+except Exception:
+    has_pygments = False
+
+
+def _is_code(language, type, text):
+    """Return False for comments, strings, punctuation and filler keywords, else True."""
+    # pygments treats preprocessor statements as comments, but they count as code
+    preproc = (pygments.token.Comment.Preproc, pygments.token.Comment.PreprocFile)
+    if type in pygments.token.Comment and type not in preproc:
+        return False
+    if type in pygments.token.String:
+        return False
+    if text.rstrip(' \f\n\r\t(),:;[]{}') == '':
+        return False
+    # ignore some language specific keywords
+    text = text.strip()
+    if language == 'python':
+        return text != 'pass'
+    elif language == 'batchfile':
+        return text != '@'
+    elif language == 'sql':  # SQL keywords are case-insensitive
+        return text.lower() not in ['begin', 'end']
+    else:
+        return True
+
+
+def loc(file):
+    """Return the number of lines of code in `file`, or None if it cannot be determined."""
+    if file not in loc_cache:
+        try:
+            content = file.read_text()
+            lexer = lexers.guess_lexer_for_filename(file, content)
+            assert isinstance(lexer, pygments.lexer.Lexer)
+            language = lexer.name.lower()
+
+            count = 0
+            has_code = False  # whether the current line contains any code token so far
+            for token_type, text in lexer.get_tokens(content):
+                for line in text.splitlines(True):  # a token's text is handled line by line
+                    if _is_code(language, token_type, line):
+                        has_code = True
+                    if line.endswith('\n') and has_code:
+                        count += 1
+                        has_code = False
+            if has_code:
+                count += 1
+
+            loc_cache[file] = count
+        except Exception:
+            # Either we could not read the file (for example binaries)
+            # or we did not find a lexer
+            loc_cache[file] = None
+    return loc_cache[file]
+
+
+def more_stats(problems):
+    """Print extended statistics: shortest solutions per language and git activity.
+
+    Everything is written to stderr. The solution rows need pygments and the
+    repository rows need git; an error is reported for whichever is missing.
+    """
+    if not has_pygments:
+        error('stats --more needs pygments. Install python[3]-pygments.')
+        return
+
+    stat_name_len = 10
+    stat_len = 5
+
+    # solution stats
+    columns = [p.label for p in problems] + ['sum', 'min', 'avg', 'max']
+
+    def get_stats(values, missing='-'):
+        if not values:
+            return [missing] * 4
+        return [sum(values), min(values), statistics.mean(values), max(values)]
+
+    header_string = f'{{:<{stat_name_len}}}' + f' {{:>{stat_len}}}' * len(columns)
+    # each cell starts with one color escape sequence, which must not count as width
+    format_string = (
+        f'{{:<{stat_name_len + len(Fore.WHITE)}}}{Style.RESET_ALL}'
+        + f' {{:>{stat_len + len(Fore.WHITE)}}}{Style.RESET_ALL}' * len(columns)
+    )
+
+    print(file=sys.stderr)
+    header = header_string.format('', *columns)
+    print(Style.BRIGHT + header + Style.RESET_ALL, file=sys.stderr)
+    print('-' * len(header), file=sys.stderr)
+
+    def format_row(*values):
+        printable = []
+        for value in values:
+            if isinstance(value, float):
+                value = f'{value:.1f}'
+            elif isinstance(value, timedelta):
+                hours = int(value.total_seconds()) // (60 * 60)
+                days = int(value.total_seconds()) // (60 * 60 * 24)
+                weeks = int(value.total_seconds()) // (60 * 60 * 24 * 7)
+                if hours < 3 * 24:
+                    value = f'{hours}h'
+                elif days < 4 * 7:
+                    value = f'{days}d'
+                else:
+                    value = f'{weeks}w'
+            elif not isinstance(value, str):
+                value = str(value)
+            if not value.startswith(ansi.CSI):
+                value = f'{Fore.WHITE}{value}'
+            printable.append(value)
+        return format_string.format(*printable)
+
+    languages: dict[str, list[str] | Literal[True]] = {
+        'C(++)': ['C', 'C++'],
+        'Python': ['Python 2', 'Python 3', 'CPython 2', 'CPython 3'],
+        'Java': ['Java'],
+        'Kotlin': ['Kotlin'],
+    }
+
+    # row with the shortest accepted submission (in lines of code) of every problem
+    def get_submissions_row(display_name, names):
+        paths = []
+        if names is True:
+            paths.append('submissions/accepted/*')
+        else:
+            assert isinstance(names, list)
+            for language in program.languages().values():
+                if language['name'] in names:
+                    patterns = language['files'].split()
+                    paths += [f'submissions/accepted/{pattern}' for pattern in patterns]
+            paths = list(set(paths))
+
+        lines = [display_name]
+        values = []
+        for problem in problems:
+            files = {file for path in paths for file in glob(problem.path, path)}
+            counts = [count for count in map(loc, files) if count is not None]
+            if counts:
+                values.append(min(counts))
+                lines.append(values[-1])
+            else:
+                lines.append(f'{Fore.RED}-')
+        lines += get_stats(values)
+        return lines
+
+    best = get_submissions_row('Solution', True)
+    print(format_row(*best), file=sys.stderr)
+    for display_name, names in languages.items():
+        values = get_submissions_row(display_name, names)
+        for i in range(1, 1 + len(problems)):
+            # only real counts are highlighted, not two matching '-' placeholders
+            if isinstance(values[i], int) and values[i] == best[i]:
+                values[i] = f'{Fore.CYAN}{values[i]}'
+        print(format_row(*values), file=sys.stderr)
+
+    # TODO: analyze team submissions?
+
+    # git stats
+    if shutil.which('git') is None:
+        error('git not found!')
+        return
+
+    if not exec_command(['git', 'rev-parse', '--is-inside-work-tree']).out.startswith('true'):
+        error('not inside git')
+        return
+
+    def git(*args):
+        res = exec_command(['git', *args], crop=False, preexec_fn=False, timeout=None)
+        return res.out if res else ''
+
+    print('-' * len(header), file=sys.stderr)
+    testcases = [len(generate.testcases(p)) for p in problems]
+    testcases += get_stats(testcases)
+    print(format_row('Testcases', *testcases), file=sys.stderr)
+    # age of the last commit touching generators/ or data/; a problem without such a
+    # commit is shown as '-' (git prints nothing then, which the date parser rejects)
+    now = datetime.now(timezone.utc)
+    changed: list[Any] = []
+    ages = []
+    for p in problems:
+        dates = [
+            git('log', '--format=%cI', '-1', '--', p.path / path).strip()
+            for path in ['generators', 'data']
+        ]
+        times = [parser.parse(date) for date in dates if date]
+        if times:
+            ages.append((now - max(times)).total_seconds())
+            changed.append(timedelta(seconds=ages[-1]))
+        else:
+            changed.append(f'{Fore.RED}-')
+    changed.append('-')  # sum of last changed is meaningless...
+    changed += [s if isinstance(s, str) else timedelta(seconds=s) for s in get_stats(ages)[1:]]
+    print(format_row('└─changed', *changed), file=sys.stderr)
+
+    # this is hacky and does not handle all renames properly...
+    # for example: if A is renamed to C and B is renamed to A this will break
+    def count_commits(problem):
+        yaml_path = problem.path / 'problem.yaml'
+        paths = git(
+            'log', '--all', '--follow', '--name-only', '--relative', '--format=', '--', yaml_path
+        ).split('\n')
+        names = {Path(p).parent for p in paths if p.strip() != ''}
+        if not names:  # untracked problem: without a pathspec git would count every commit
+            return 0
+        return int(git('rev-list', '--all', '--count', '--', *names))
+
+    commits = [count_commits(p) for p in problems]
+    commits += get_stats(commits, '-')
+    commits[-4] = '-'  # one commit can change multiple problems so the sum is meaningless...
+    print(format_row('Commits', *commits), file=sys.stderr)
+    print(file=sys.stderr)
+    total_commits = int(git('rev-list', '--all', '--count'))
+    print(f'{Fore.CYAN}Total Commits{Style.RESET_ALL}:', total_commits, file=sys.stderr)
+    total_authors = git('shortlog', '--group=%ae', '-s').count('\n')
+    print(f'{Fore.CYAN}Total Authors{Style.RESET_ALL}:', total_authors, file=sys.stderr)
+    first_commit = git('log', '--reverse', '--format=%cI').partition('\n')[0].strip()
+    if first_commit:  # skipped when git printed no commit date
+        duration = datetime.now(timezone.utc) - parser.parse(first_commit)
+        elapsed = f'{duration.days}d, {duration.seconds // 3600}h'
+        print(f'{Fore.CYAN}Preparation{Style.RESET_ALL}: {elapsed}', file=sys.stderr)
diff --git a/bin/tools.py b/bin/tools.py
@@ -496,9 +496,10 @@ def build_parser():
     )
 
     # Stats
-    subparsers.add_parser(
+    statsparser = subparsers.add_parser(
         'stats', parents=[global_parser], help='show statistics for contest/problem'
     )
+    statsparser.add_argument('--more', action='store_true', help='Print more stats.')
 
     # Generate Testcases
     genparser = subparsers.add_parser(