#6 Count pure markup files as documentation #170

Open · wants to merge 3 commits into base: main

Changes from 1 commit
2 changes: 2 additions & 0 deletions docs/changes.rst
@@ -11,6 +11,8 @@ Version 1.8.1, 2024-07-xx
`#160 <https://github.com/roskakori/pygount/issues/160>`_).
* Removed deprecated code: (contributed by Marco Gambone and Niels Vanden Bussche, issue
`#47 <https://github.com/roskakori/pygount/issues/47>`_).
* Count pure markup files as documentation: (contributed by Tytus Bucholc, issue
`#6 <https://github.com/roskakori/pygount/issues/6>`_).

Version 1.8.0, 2024-05-13

15 changes: 12 additions & 3 deletions pygount/analysis.py
@@ -135,6 +135,9 @@ class SourceState(Enum):
#: Regular expression to detect plain text files by name.
_PLAIN_TEXT_NAME_REGEX = re.compile(_PLAIN_TEXT_PATTERN, re.IGNORECASE)

_MARK_UP_PATTERN = r".*\.(md|rst|txt|\d+)$"
_MARK_UP_NAME_REGEX = re.compile(_MARK_UP_PATTERN)

#: Mapping for file suffixes to lexers for which pygments offers no official one.
_SUFFIX_TO_FALLBACK_LEXER_MAP = {
"fex": pygount.lexers.MinimalisticWebFocusLexer(),
@@ -148,6 +151,10 @@ class SourceState(Enum):
_SUFFIX_TO_FALLBACK_LEXER_MAP[_oracle_suffix] = pygments.lexers.get_lexer_by_name("plpgsql")


def is_markup_file(source_path: str) -> bool:
    return _MARK_UP_NAME_REGEX.match(os.path.basename(source_path)) is not None

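For orientation, a hedged sketch of what the new pattern accepts; the \d+ alternative is meant to catch numbered man-page suffixes such as .1 or .4 (the file names below are made up for illustration):

# Sketch: which file names the markup regex matches, assuming the
# definitions above.
import re

_MARK_UP_NAME_REGEX = re.compile(r".*\.(md|rst|txt|\d+)$")
for name in ("README.md", "intro.rst", "notes.txt", "hello.1", "script.py"):
    print(name, _MARK_UP_NAME_REGEX.match(name) is not None)
# README.md True, intro.rst True, notes.txt True, hello.1 True, script.py False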

class DuplicatePool:
"""
A pool that collects information about potential duplicate files.
@@ -351,7 +358,8 @@ def from_file(
language = dialect
_log.info("%s: analyze as %s using encoding %s", source_path, language, encoding)
mark_to_count_map = {"c": 0, "d": 0, "e": 0, "s": 0}
for line_parts in _line_parts(lexer, source_code):
is_markup = is_markup_file(source_path)
for line_parts in _line_parts(lexer, source_code, is_markup=is_markup):
mark_to_increment = "e"
for mark_to_check in ("d", "s", "c"):
if mark_to_check in line_parts:
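For context, a hedged usage sketch of the public entry point this change flows through; it assumes pygount is installed and that a README.md exists in the working directory:

from pygount import SourceAnalysis

# With this change, a pure markup file should report its non-empty lines
# as documentation rather than code.
result = SourceAnalysis.from_file("README.md", "example-group", encoding="utf-8")
print(result.language, result.code_count, result.documentation_count)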
@@ -717,7 +725,7 @@ def _pythonized_comments(tokens: Iterator[Tuple[TokenType, str]]) -> Iterator[To
yield result_token_type, result_token_text


def _line_parts(lexer: pygments.lexer.Lexer, text: str) -> Iterator[Set[str]]:
def _line_parts(lexer: pygments.lexer.Lexer, text: str, is_markup: bool) -> Iterator[Set[str]]:
line_marks = set()
tokens = _delined_tokens(lexer.get_tokens(text))
if lexer.name == "Python":
@@ -738,7 +746,8 @@ def _line_parts(lexer: pygments.lexer.Lexer, text: str) -> Iterator[Set[str]]:
else:
is_white_text = (token_text.strip() in white_words) or (token_text.rstrip(white_text) == "")
if not is_white_text:
line_marks.add("c") # 'code'
line_mark = "d" if is_markup else "c"
line_marks.add(line_mark)
if token_text.endswith("\n"):
yield line_marks
line_marks = set()
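To illustrate the effect of the new flag, a minimal sketch against the internal helper (_line_parts is private API, so this may change; the printed result is an expectation, not a guarantee):

import pygments.lexers

from pygount.analysis import _line_parts

lexer = pygments.lexers.get_lexer_by_name("markdown")
# With is_markup=True, non-white lines that would have been marked "c"
# (code) are marked "d" (documentation) instead.
print(list(_line_parts(lexer, "# Title\nSome *text*\n", is_markup=True)))
# Expected along the lines of: [{'d'}, {'d'}]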
52 changes: 47 additions & 5 deletions tests/test_analysis.py
@@ -22,6 +22,7 @@
_pythonized_comments,
base_language,
guess_lexer,
is_markup_file,
)

from ._common import PYGOUNT_PROJECT_FOLDER, PYGOUNT_SOURCE_FOLDER, TempFolderTest
@@ -105,13 +106,13 @@ def test_can_deline_tokens(self):

def test_can_compute_python_line_parts(self):
python_lexer = lexers.get_lexer_by_name("python")
assert list(_line_parts(python_lexer, "#")) == [set("d")]
assert list(_line_parts(python_lexer, "s = 'x' # x")) == [set("cds")]
assert list(_line_parts(python_lexer, "#", False)) == [set("d")]
assert list(_line_parts(python_lexer, "s = 'x' # x", False)) == [set("cds")]

def test_can_detect_white_text(self):
python_lexer = lexers.get_lexer_by_name("python")
assert list(_line_parts(python_lexer, "{[()]};")) == [set()]
assert list(_line_parts(python_lexer, "pass")) == [set()]
assert list(_line_parts(python_lexer, "{[()]};", False)) == [set()]
assert list(_line_parts(python_lexer, "pass", False)) == [set()]

def test_can_convert_python_strings_to_comments(self):
source_code = (
@@ -125,8 +126,9 @@ def test_can_convert_python_strings_to_comments(self):
@staticmethod
def _line_parts(lexer_name: str, source_lines: List[str]) -> List[Set[str]]:
lexer = lexers.get_lexer_by_name(lexer_name)
is_markup = lexer_name in ["markdown", "md", "restructuredtext", "rst", "rest", "groff"]
source_code = "\n".join(source_lines)
return list(_line_parts(lexer, source_code))
return list(_line_parts(lexer, source_code, is_markup=is_markup))

def test_can_analyze_python(self):
source_lines = [
@@ -155,6 +157,20 @@ def test_can_analyze_c(self):
expected_line_parts = [{"d"}, {"d"}, {"d"}, {"c"}, {"c"}, {"c", "s"}, set()]
assert actual_line_parts == expected_line_parts

def test_all_lines_are_d_when_markup_flag_is_true(self):
source_lines = [
"/*",
" * The classic hello world for C99.",
" */",
"#include <stdio.h>",
"int main(void) {",
' puts("Hello, World!");',
"}",
]
actual_line_parts = AnalysisTest._line_parts("markdown", source_lines)
expected_line_parts = [{"d"}, {"d"}, {"d"}, {"d"}, {"d"}, {"d"}, set()]
assert actual_line_parts == expected_line_parts
Owner:
This can probably be expressed more concisely with:

assert all(line_part == {"d"} for line_part in actual_line_parts), f"actual_line_parts={actual_line_parts}"

Collaborator (Author):
I've followed your suggestion and made a small adjustment. Additionally, I've updated the README.md file. Could you please take a look when you have a moment?

[screenshot: updated README.md]

Collaborator (Author), @snowkoli, Jul 18, 2024:
Is it acceptable that not all lines are categorized as documentation?
Markdown details:

{
    "documentationCount": 53,
    "documentationPercentage": 49.074074074074076,
    "codeCount": 0,
    "codePercentage": 0.0,
    "emptyCount": 21,
    "emptyPercentage": 19.444444444444443,
    "fileCount": 3,
    "filePercentage": 7.894736842105263,
    "isPseudoLanguage": false,
    "language": "Markdown",
    "sourceCount": 34,
    "sourcePercentage": 31.48148148148148,
    "stringCount": 34,
    "stringPercentage": 31.48148148148148
}



class _NonSeekableEmptyBytesIO(BytesIO):
# Creates a dummy object that mimics a non-seekable file handle
@@ -275,6 +291,24 @@ def test_fails_on_non_seekable_file_handle_with_encoding_chardet(self):
with pytest.raises(PygountError, match=r".*file handle must be seekable.*"):
analysis.SourceAnalysis.from_file("README.md", "test", file_handle=file_handle, encoding="chardet")

def test_analyzer_treats_textual_files_as_docs_only(self):
test_params = [
("test.rst", 0, 3, "restructuredtext"),
("test.md", 0, 3, "markdown"),
("test.txt", 0, 3, "text only"),
("test.4", 0, 3, "groff"),
]
for filename, code_count, doc_count, lang_lower in test_params:
with self.subTest(filename):
markup_file_path = self.create_temp_file(
filename,
["<!DOCTYPE html>", "{% load i18n %}", '<html lang="{{ language_code }}" />'],
)
source_analysis = analysis.SourceAnalysis.from_file(markup_file_path, "test", encoding="utf-8")
assert source_analysis.language.lower() == lang_lower
assert source_analysis.code_count == code_count
assert source_analysis.documentation_count == doc_count


def test_can_repr_source_analysis_from_file():
source_analysis = analysis.SourceAnalysis("some.py", "Python", "some", 1, 2, 3, 4, analysis.SourceState.analyzed)
@@ -506,3 +540,11 @@ def test_can_detect_duplicate(self):
duplicate_pool = analysis.DuplicatePool()
assert duplicate_pool.duplicate_path(original_path) is None
assert original_path == duplicate_pool.duplicate_path(duplicate_path)


@pytest.mark.parametrize(
"file_extension, expected_result", [(".md", True), (".rst", True), (".py", False), (".4", True), (".c", False)]
)
def test_is_markup_file(file_extension, expected_result):
source_path = f"some_file_name{file_extension}"
assert is_markup_file(source_path) is expected_result