From 83515fa545a28de9da31bacc56c78232d8f28eb2 Mon Sep 17 00:00:00 2001
From: Tom de Geus <tdegeus@users.noreply.github.com>
Date: Mon, 11 Mar 2024 15:03:17 +0000
Subject: [PATCH] bugfix classification labels (#118)

---
 .pre-commit-config.yaml |   2 +-
 tests/test_classify.py  | 110 +++++++++++++---------------
 tests/test_simple.py    |  40 ++++------
 texplain/__init__.py    | 157 ++++++++++++++--------------------------
 4 files changed, 122 insertions(+), 187 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2db7c79..2025d21 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -28,7 +28,7 @@ repos:
   rev: 24.1.1
   hooks:
   - id: black
-    args: [--safe, --quiet, --line-length=100]
+    args: [--safe, --quiet, --line-length=100, --preview]
 - repo: https://github.com/PyCQA/autoflake
   rev: v2.2.1
   hooks:
diff --git a/tests/test_classify.py b/tests/test_classify.py
index 562a9ec..a857fb6 100644
--- a/tests/test_classify.py
+++ b/tests/test_classify.py
@@ -1,10 +1,10 @@
-import pytest
-
 import texplain
 
 
 def test_equation():
-    text = r"""
+    texts = []
+
+    t = r"""
 foo bar
 \begin{equation}
     \label{eq:foo}
@@ -12,24 +12,21 @@ def test_equation():
 \end{equation}
 baz
 """
+    texts.append(t)
 
-    tex = texplain.TeX(text)
-    tex.format_labels()
-    assert str(tex).strip() == text.strip()
-
-    # --
-
-    text = r"""
+    t = r"""
 My text
 \begin{equation}
     a = b
     \label{eq:qew}
 \end{equation}
 """
+    texts.append(t)
 
-    tex = texplain.TeX(text)
-    tex.format_labels()
-    assert str(tex).strip() == text.strip()
+    for text in texts:
+        tex = texplain.TeX(text)
+        tex.format_labels()
+        assert str(tex).strip() == text.strip()
 
 
 def test_section():
@@ -39,7 +36,6 @@ def test_section():
 \label{sec:foo}
 baz
 """
-
     tex = texplain.TeX(text)
     tex.format_labels()
     assert str(tex).strip() == text.strip()
@@ -54,7 +50,6 @@ def test_figure():
 \end{figure}
 baz
 """
-
     tex = texplain.TeX(text)
     tex.format_labels()
     assert str(tex).strip() == text.strip()
@@ -69,14 +64,15 @@ def test_figure_b():
 \end{figure*}
 baz
 """
-
     tex = texplain.TeX(text)
     tex.format_labels()
     assert str(tex).strip() == text.strip()
 
 
 def test_custom():
-    text = r"""
+    texts = []
+
+    t0 = r"""
 \begin{example}[H]
     \begin{oframed}
         \caption{Self-explanatory vs documentation intensive}
@@ -84,32 +80,35 @@ def test_custom():
     \end{oframed}
 \end{example}
 """
+    texts.append(t0)
 
-    tex = texplain.TeX(text)
-    tex.format_labels()
-    assert str(tex).strip() == text.strip()
+    texts.append("\n\n".join([r"\section{My section}"]) + t0)
+    texts.append("\n\n".join([r"\section{My section}", r"\label{sec:mysec}"]) + t0)
 
+    for text in texts:
+        tex = texplain.TeX(text)
+        tex.format_labels()
+        assert str(tex).strip() == text.strip()
 
-@pytest.mark.skip(reason="TODO: find solution")
-def test_custom_nested():
-    text = r"""
-\section{My section}
 
-\begin{example}[H]
-    \begin{oframed}
-        \caption{Self-explanatory vs documentation intensive}
-        \label{misc:self-explenatory}
-    \end{oframed}
-\end{example}
-"""
+def test_nested():
+    texts = []
 
-    tex = texplain.TeX(text)
-    tex.format_labels()
-    assert str(tex).strip() == text.strip()
+    t = r"""
+\begin{appendices}
 
+    \section{My section}
 
-def test_nested():
-    text = r"""
+    \begin{figure}[H]
+        \caption{Self-explanatory vs documentation intensive}
+        \label{fig:self-explenatory}
+    \end{figure}
+
+\end{appendices}
+"""
+    texts.append(t)
+
+    t = r"""
 \begin{appendices}
 
     \section{My section}
@@ -122,15 +121,9 @@ def test_nested():
 
 \end{appendices}
 """
+    texts.append(t)
 
-    tex = texplain.TeX(text)
-    tex.format_labels()
-    assert str(tex).strip() == text.strip()
-
-
-@pytest.mark.skip(reason="TODO: find solution")
-def test_nested_custom():
-    text = r"""
+    t = r"""
 \begin{appendices}
 
     \section{My section}
@@ -145,14 +138,18 @@ def test_nested_custom():
 
 \end{appendices}
 """
+    texts.append(t)
 
-    tex = texplain.TeX(text)
-    tex.format_labels()
-    assert str(tex).strip() == text.strip()
+    for text in texts:
+        tex = texplain.TeX(text)
+        tex.format_labels()
+        assert str(tex).strip() == text.strip()
 
 
 def test_hybrid():
-    text = r"""
+    texts = []
+
+    t = r"""
 \begin{itemize}
     \item
     \begin{referee}
@@ -171,14 +168,9 @@ def test_hybrid():
     \end{figure}
 \end{itemize}
 """
+    texts.append(t)
 
-    tex = texplain.TeX(text)
-    tex.format_labels()
-    assert str(tex).strip() == text.strip()
-
-    # ---
-
-    text = r"""
+    t = r"""
 Foo bar
 
 \section{My section}
@@ -196,7 +188,9 @@ def test_hybrid():
     \label{fig:1}
 \end{figure}
 """
+    texts.append(t)
 
-    tex = texplain.TeX(text)
-    tex.format_labels()
-    assert str(tex).strip() == text.strip()
+    for text in texts:
+        tex = texplain.TeX(text)
+        tex.format_labels()
+        assert str(tex).strip() == text.strip()
diff --git a/tests/test_simple.py b/tests/test_simple.py
index 19a677a..f53952b 100644
--- a/tests/test_simple.py
+++ b/tests/test_simple.py
@@ -453,31 +453,23 @@ def test_remove_command_d():
 def test_fix_quote():
     test = []
     test.append(['This is text "with quotes".', "This is text ``with quotes''."])
-    test.append(
-        [
-            'This is text "with quotes" but not matching".',
-            'This is text "with quotes" but not matching".',
-        ]
-    )
+    test.append([
+        'This is text "with quotes" but not matching".',
+        'This is text "with quotes" but not matching".',
+    ])
     test.append(["This is text 'with quotes'.", "This is text `with quotes'."])
-    test.append(
-        [
-            "This is text 'with quotes' but not matching'.",
-            "This is text 'with quotes' but not matching'.",
-        ]
-    )
-    test.append(
-        [
-            'A text "with one quote", and "another one".',
-            "A text ``with one quote'', and ``another one''.",
-        ]
-    )
-    test.append(
-        [
-            'A text "with one quote", and "another one"',
-            "A text ``with one quote'', and ``another one''",
-        ]
-    )
+    test.append([
+        "This is text 'with quotes' but not matching'.",
+        "This is text 'with quotes' but not matching'.",
+    ])
+    test.append([
+        'A text "with one quote", and "another one".',
+        "A text ``with one quote'', and ``another one''.",
+    ])
+    test.append([
+        'A text "with one quote", and "another one"',
+        "A text ``with one quote'', and ``another one''",
+    ])
     test.append(['Foo bar "bar foo"', "Foo bar ``bar foo''"])
 
     for source, expect in test:
diff --git a/texplain/__init__.py b/texplain/__init__.py
index 7702425..13a9cc5 100644
--- a/texplain/__init__.py
+++ b/texplain/__init__.py
@@ -6,6 +6,7 @@
 import re
 import sys
 import textwrap
+from collections import defaultdict
 from copy import deepcopy
 from shutil import copyfile
 
@@ -386,24 +387,25 @@ def remove_comments(text: str) -> str:
     return "\n".join(text)
 
 
-def environments(text: str) -> list[str]:
-    r"""
-    Return list with present environments.
-    This corresponds to the text between ``\begin{...}`` and ``\end{...}``.
-    """
-
+def _environments_impl(text: str, curly_braces: dict) -> list[str]:
     ret = []
-    curly_braces = find_matching(text, "{", "}", ignore_escaped=True)
-
     for i in re.finditer(r"\\begin{.*}", text):
         opening = i.span(0)[0] + 6
         closing = curly_braces[opening]
         i = opening + 1
         ret += [text[i:closing]]
-
     return list(set(ret))
 
 
+def environments(text: str) -> list[str]:
+    r"""
+    Return list with present environments.
+    This corresponds to the text between ``\begin{...}`` and ``\end{...}``.
+    """
+    braces = find_matching(text, "{", "}", ignore_escaped=True)
+    return _environments_impl(text=text, curly_braces=braces)
+
+
 class Placeholder:
     """
     Placeholder for text.
@@ -1926,91 +1928,32 @@ def _classify_for_label(text: str) -> tuple[list[str], NDArray[np.int_]]:
     starting = -1 * np.ones((len(text), len(categories)), dtype=int)
     braces = find_matching(text, "{", "}", ignore_escaped=True)
 
-    # "eq"
-
-    r = categories.index("eq")
-
-    index = find_matching(
-        text,
-        r"\\begin\{equation\*?\}",
-        r"\\end\{equation\*?\}",
-        escape=False,
-        closing_match=1,
-    )
-    for i, j in index.items():
-        starting[i:j, r] = i
-
-    index = find_matching(
-        text,
-        r"\\begin\{align\*?\}",
-        r"\\end\{align\*?\}",
-        escape=False,
-        closing_match=1,
-    )
-    for i, j in index.items():
-        starting[i:j, r] = i
-
-    index = find_matching(
-        text,
-        r"\\begin\{eqnarray\*?\}",
-        r"\\end\{eqnarray\*?\}",
-        escape=False,
-        closing_match=1,
-    )
-    for i, j in index.items():
-        starting[i:j, r] = i
-
-    # "fig"
-
-    r = categories.index("fig")
-
-    index = find_matching(
-        text,
-        r"\\begin\{figure\*?\}",
-        r"\\end\{figure\*?\}",
-        escape=False,
-        closing_match=1,
-    )
-    for i, j in index.items():
-        starting[i:j, r] = i
-
-    # "tab"
-
-    r = categories.index("tab")
-
-    index = find_matching(
-        text,
-        r"\\begin\{table\*?\}",
-        r"\\end\{table\*?\}",
-        escape=False,
-        closing_match=1,
-    )
-    for i, j in index.items():
-        starting[i:j, r] = i
-
-    # "item"
-
-    r = categories.index("item")
-
-    index = find_matching(
-        text,
-        r"\\begin\{itemize\*?\}",
-        r"\\end\{itemize\*?\}",
-        escape=False,
-        closing_match=1,
-    )
-    for i, j in index.items():
-        starting[i:j, r] = i
+    envs = defaultdict(list)
+    for env in _environments_impl(text=text, curly_braces=braces):
+        name = re.split(r"(\w*)(\*?)", env)[1]  # name without trailing "*"
+        if name in ["equation", "align", "eqnarray"]:
+            envs["eq"].append(env)
+        elif name in ["figure"]:
+            envs["fig"].append(env)
+        elif name in ["table"]:
+            envs["tab"].append(env)
+        elif name in ["itemize", "enumerate"]:
+            envs["item"].append(env)  # lists keep their own "item" category, not "tab"
+        else:
+            envs["misc"].append(env)
 
-    index = find_matching(
-        text,
-        r"\\begin\{enumerate\*?\}",
-        r"\\end\{enumerate\*?\}",
-        escape=False,
-        closing_match=1,
-    )
-    for i, j in index.items():
-        starting[i:j, r] = i
+    for category, names in envs.items():
+        r = categories.index(category)
+        for name in names:
+            index = find_matching(
+                text,
+                r"\\begin\{" + re.escape(name) + r"\}",
+                r"\\end\{" + re.escape(name) + r"\}",
+                escape=False,
+                closing_match=1,
+            )
+            for i, j in index.items():
+                starting[i:j, r] = i
 
     # "note"
 
@@ -2021,21 +1964,27 @@ def _classify_for_label(text: str) -> tuple[list[str], NDArray[np.int_]]:
         j = braces[match.span()[1] - 1]
         starting[i:j, r] = i
 
-    # "sec"
+    # "sec" / "ch"
 
-    r = categories.index("sec")
-
-    for match in re.finditer(r"(\\)(sub)*(section\s*\{)", text):
-        i = match.span()[0]
-        starting[i:, r] = i
+    patterns = {
+        "sec": r"(\\)(sub)*(section\s*\{)",
+        "ch": r"(\\)(chapter\s*\{)",
+    }
 
-    # "ch"
+    for category, pattern in patterns.items():
 
-    r = categories.index("ch")
+        r = categories.index(category)
 
-    for match in re.finditer(r"(\\)(chapter\s*\{)", text):
-        i = match.span()[0]
-        starting[i:, r] = i
+        for match in re.finditer(pattern, text):
+            i = match.span()[0]
+            j = braces[match.span()[1] - 1]
+            for label in re.finditer(r"(\\)(label\{)", text[j + 1 :]):
+                s = label.span()[1] + j + 1
+                e = braces[s - 1]
+                between = remove_comments(text[j + 1 : s - 7])
+                if len(between.strip()) == 0:
+                    starting[i:e, r] = i
+                break
 
     return categories, np.argmax(starting, axis=1)