From 83515fa545a28de9da31bacc56c78232d8f28eb2 Mon Sep 17 00:00:00 2001 From: Tom de Geus Date: Mon, 11 Mar 2024 15:03:17 +0000 Subject: [PATCH] bugfix classification labels (#118) --- .pre-commit-config.yaml | 2 +- tests/test_classify.py | 110 +++++++++++++--------------- tests/test_simple.py | 40 ++++------ texplain/__init__.py | 157 ++++++++++++++-------------------------- 4 files changed, 122 insertions(+), 187 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2db7c79..2025d21 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,7 +28,7 @@ repos: rev: 24.1.1 hooks: - id: black - args: [--safe, --quiet, --line-length=100] + args: [--safe, --quiet, --line-length=100, --preview] - repo: https://github.com/PyCQA/autoflake rev: v2.2.1 hooks: diff --git a/tests/test_classify.py b/tests/test_classify.py index 562a9ec..a857fb6 100644 --- a/tests/test_classify.py +++ b/tests/test_classify.py @@ -1,10 +1,10 @@ -import pytest - import texplain def test_equation(): - text = r""" + texts = [] + + t = r""" foo bar \begin{equation} \label{eq:foo} @@ -12,24 +12,21 @@ def test_equation(): \end{equation} baz """ + texts.append(t) - tex = texplain.TeX(text) - tex.format_labels() - assert str(tex).strip() == text.strip() - - # -- - - text = r""" + t = r""" My text \begin{equation} a = b \label{eq:qew} \end{equation} """ + texts.append(t) - tex = texplain.TeX(text) - tex.format_labels() - assert str(tex).strip() == text.strip() + for text in texts: + tex = texplain.TeX(text) + tex.format_labels() + assert str(tex).strip() == text.strip() def test_section(): @@ -39,7 +36,6 @@ def test_section(): \label{sec:foo} baz """ - tex = texplain.TeX(text) tex.format_labels() assert str(tex).strip() == text.strip() @@ -54,7 +50,6 @@ def test_figure(): \end{figure} baz """ - tex = texplain.TeX(text) tex.format_labels() assert str(tex).strip() == text.strip() @@ -69,14 +64,15 @@ def test_figure_b(): \end{figure*} baz """ - tex = texplain.TeX(text) tex.format_labels() assert str(tex).strip() == text.strip() def test_custom(): - text = r""" + texts = [] + + t0 = r""" \begin{example}[H] \begin{oframed} \caption{Self-explanatory vs documentation intensive} @@ -84,32 +80,35 @@ def test_custom(): \end{oframed} \end{example} """ + texts.append(t0) - tex = texplain.TeX(text) - tex.format_labels() - assert str(tex).strip() == text.strip() + texts.append("\n\n".join([r"\section{My section}"]) + t0) + texts.append("\n\n".join([r"\section{My section}", r"\label{sec:mysec}"]) + t0) + for text in texts: + tex = texplain.TeX(text) + tex.format_labels() + assert str(tex).strip() == text.strip() -@pytest.mark.skip(reason="TODO: find solution") -def test_custom_nested(): - text = r""" -\section{My section} -\begin{example}[H] - \begin{oframed} - \caption{Self-explanatory vs documentation intensive} - \label{misc:self-explenatory} - \end{oframed} -\end{example} -""" +def test_nested(): + texts = [] - tex = texplain.TeX(text) - tex.format_labels() - assert str(tex).strip() == text.strip() + t = r""" +\begin{appendices} + \section{My section} -def test_nested(): - text = r""" + \begin{figure}[H] + \caption{Self-explanatory vs documentation intensive} + \label{fig:self-explenatory} + \end{figure} + +\end{appendices} +""" + texts.append(t) + + t = r""" \begin{appendices} \section{My section} @@ -122,15 +121,9 @@ def test_nested(): \end{appendices} """ + texts.append(t) - tex = texplain.TeX(text) - tex.format_labels() - assert str(tex).strip() == text.strip() - - -@pytest.mark.skip(reason="TODO: find solution") -def test_nested_custom(): - text = r""" + t = r""" \begin{appendices} \section{My section} @@ -145,14 +138,18 @@ def test_nested_custom(): \end{appendices} """ + texts.append(t) - tex = texplain.TeX(text) - tex.format_labels() - assert str(tex).strip() == text.strip() + for text in texts: + tex = texplain.TeX(text) + tex.format_labels() + assert str(tex).strip() == text.strip() def test_hybrid(): - text = r""" + texts = [] + + t = r""" \begin{itemize} \item \begin{referee} @@ -171,14 +168,9 @@ def test_hybrid(): \end{figure} \end{itemize} """ + texts.append(t) - tex = texplain.TeX(text) - tex.format_labels() - assert str(tex).strip() == text.strip() - - # --- - - text = r""" + t = r""" Foo bar \section{My section} @@ -196,7 +188,9 @@ def test_hybrid(): \label{fig:1} \end{figure} """ + texts.append(t) - tex = texplain.TeX(text) - tex.format_labels() - assert str(tex).strip() == text.strip() + for text in texts: + tex = texplain.TeX(text) + tex.format_labels() + assert str(tex).strip() == text.strip() diff --git a/tests/test_simple.py b/tests/test_simple.py index 19a677a..f53952b 100644 --- a/tests/test_simple.py +++ b/tests/test_simple.py @@ -453,31 +453,23 @@ def test_remove_command_d(): def test_fix_quote(): test = [] test.append(['This is text "with quotes".', "This is text ``with quotes''."]) - test.append( - [ - 'This is text "with quotes" but not matching".', - 'This is text "with quotes" but not matching".', - ] - ) + test.append([ + 'This is text "with quotes" but not matching".', + 'This is text "with quotes" but not matching".', + ]) test.append(["This is text 'with quotes'.", "This is text `with quotes'."]) - test.append( - [ - "This is text 'with quotes' but not matching'.", - "This is text 'with quotes' but not matching'.", - ] - ) - test.append( - [ - 'A text "with one quote", and "another one".', - "A text ``with one quote'', and ``another one''.", - ] - ) - test.append( - [ - 'A text "with one quote", and "another one"', - "A text ``with one quote'', and ``another one''", - ] - ) + test.append([ + "This is text 'with quotes' but not matching'.", + "This is text 'with quotes' but not matching'.", + ]) + test.append([ + 'A text "with one quote", and "another one".', + "A text ``with one quote'', and ``another one''.", + ]) + test.append([ + 'A text "with one quote", and "another one"', + "A text ``with one quote'', and ``another one''", + ]) test.append(['Foo bar "bar foo"', "Foo bar ``bar foo''"]) for source, expect in test: diff --git a/texplain/__init__.py b/texplain/__init__.py index 7702425..13a9cc5 100644 --- a/texplain/__init__.py +++ b/texplain/__init__.py @@ -6,6 +6,7 @@ import re import sys import textwrap +from collections import defaultdict from copy import deepcopy from shutil import copyfile @@ -386,24 +387,25 @@ def remove_comments(text: str) -> str: return "\n".join(text) -def environments(text: str) -> list[str]: - r""" - Return list with present environments. - This corresponds to the text between ``\begin{...}`` and ``\end{...}``. - """ - +def _environments_impl(text: str, curly_braces: dict) -> list[str]: ret = [] - curly_braces = find_matching(text, "{", "}", ignore_escaped=True) - for i in re.finditer(r"\\begin{.*}", text): opening = i.span(0)[0] + 6 closing = curly_braces[opening] i = opening + 1 ret += [text[i:closing]] - return list(set(ret)) +def environments(text: str) -> list[str]: + r""" + Return list with present environments. + This corresponds to the text between ``\begin{...}`` and ``\end{...}``. + """ + braces = find_matching(text, "{", "}", ignore_escaped=True) + return _environments_impl(text=text, curly_braces=braces) + + class Placeholder: """ Placeholder for text. @@ -1926,91 +1928,32 @@ def _classify_for_label(text: str) -> tuple[list[str], NDArray[np.int_]]: starting = -1 * np.ones((len(text), len(categories)), dtype=int) braces = find_matching(text, "{", "}", ignore_escaped=True) - # "eq" - - r = categories.index("eq") - - index = find_matching( - text, - r"\\begin\{equation\*?\}", - r"\\end\{equation\*?\}", - escape=False, - closing_match=1, - ) - for i, j in index.items(): - starting[i:j, r] = i - - index = find_matching( - text, - r"\\begin\{align\*?\}", - r"\\end\{align\*?\}", - escape=False, - closing_match=1, - ) - for i, j in index.items(): - starting[i:j, r] = i - - index = find_matching( - text, - r"\\begin\{eqnarray\*?\}", - r"\\end\{eqnarray\*?\}", - escape=False, - closing_match=1, - ) - for i, j in index.items(): - starting[i:j, r] = i - - # "fig" - - r = categories.index("fig") - - index = find_matching( - text, - r"\\begin\{figure\*?\}", - r"\\end\{figure\*?\}", - escape=False, - closing_match=1, - ) - for i, j in index.items(): - starting[i:j, r] = i - - # "tab" - - r = categories.index("tab") - - index = find_matching( - text, - r"\\begin\{table\*?\}", - r"\\end\{table\*?\}", - escape=False, - closing_match=1, - ) - for i, j in index.items(): - starting[i:j, r] = i - - # "item" - - r = categories.index("item") - - index = find_matching( - text, - r"\\begin\{itemize\*?\}", - r"\\end\{itemize\*?\}", - escape=False, - closing_match=1, - ) - for i, j in index.items(): - starting[i:j, r] = i + envs = defaultdict(list) + for env in _environments_impl(text=text, curly_braces=braces): + name = re.split(r"(\w*)(\*?)", env)[1] + if name in ["equation", "align", "eqnarray"]: + envs["eq"].append(env) + elif name in ["figure"]: + envs["fig"].append(env) + elif name in ["table"]: + envs["tab"].append(env) + elif name in ["itemize", "enumerate"]: + envs["tab"].append(env) + else: + envs["misc"].append(env) - index = find_matching( - text, - r"\\begin\{enumerate\*?\}", - r"\\end\{enumerate\*?\}", - escape=False, - closing_match=1, - ) - for i, j in index.items(): - starting[i:j, r] = i + for category, names in envs.items(): + r = categories.index(category) + for name in names: + index = find_matching( + text, + r"\\begin\{" + re.escape(name) + r"\}", + r"\\end\{" + re.escape(name) + r"\}", + escape=False, + closing_match=1, + ) + for i, j in index.items(): + starting[i:j, r] = i # "note" @@ -2021,21 +1964,27 @@ def _classify_for_label(text: str) -> tuple[list[str], NDArray[np.int_]]: j = braces[match.span()[1] - 1] starting[i:j, r] = i - # "sec" + # "sec" / "ch" - r = categories.index("sec") - - for match in re.finditer(r"(\\)(sub)*(section\s*\{)", text): - i = match.span()[0] - starting[i:, r] = i + patterns = { + "sec": r"(\\)(sub)*(section\s*\{)", + "ch": r"(\\)(chapter\s*\{)", + } - # "ch" + for category, pattern in patterns.items(): - r = categories.index("ch") + r = categories.index(category) - for match in re.finditer(r"(\\)(chapter\s*\{)", text): - i = match.span()[0] - starting[i:, r] = i + for match in re.finditer(pattern, text): + i = match.span()[0] + j = braces[match.span()[1] - 1] + for label in re.finditer(r"(\\)(label\{)", text[j + 1 :]): + s = label.span()[1] + j + 1 + e = braces[s - 1] + between = remove_comments(text[j + 1 : s - 7]) + if len(between.strip()) == 0: + starting[i:e, r] = i + break return categories, np.argmax(starting, axis=1)