#6 Count pure markup files as documentation #170

Open · wants to merge 3 commits into base: main

Changes from 1 commit
2 changes: 2 additions & 0 deletions docs/changes.rst
@@ -11,6 +11,8 @@ Version 1.8.1, 2024-07-xx
`#160 <https://github.com/roskakori/pygount/issues/160>`_).
* Removed deprecated code: (contributed by Marco Gambone and Niels Vanden Bussche, issue
`#47 <https://github.com/roskakori/pygount/issues/47>`_).
* Count pure markup files as documentation: (contributed by Tytus Bucholc, issue
`#6 <https://github.com/roskakori/pygount/issues/6>`_).

Version 1.8.0, 2024-05-13

15 changes: 12 additions & 3 deletions pygount/analysis.py
@@ -135,6 +135,9 @@ class SourceState(Enum):
#: Regular expression to detect plain text files by name.
_PLAIN_TEXT_NAME_REGEX = re.compile(_PLAIN_TEXT_PATTERN, re.IGNORECASE)

_MARK_UP_PATTERN = r".*\.(md|rst|txt|\d+)$"
_MARK_UP_NAME_REGEX = re.compile(_MARK_UP_PATTERN)

#: Mapping for file suffixes to lexers for which pygments offers no official one.
_SUFFIX_TO_FALLBACK_LEXER_MAP = {
"fex": pygount.lexers.MinimalisticWebFocusLexer(),
@@ -148,6 +151,10 @@ class SourceState(Enum):
_SUFFIX_TO_FALLBACK_LEXER_MAP[_oracle_suffix] = pygments.lexers.get_lexer_by_name("plpgsql")


def is_markup_file(source_path: str) -> bool:
    return _MARK_UP_NAME_REGEX.match(os.path.basename(source_path)) is not None

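For orientation, a hedged sketch of what the new pattern accepts; the \d+ alternative is meant to catch numbered man-page suffixes such as .1 or .4 (the file names below are made up for illustration):

# Sketch: which file names the markup regex matches, assuming the
# definitions above.
import re

_MARK_UP_NAME_REGEX = re.compile(r".*\.(md|rst|txt|\d+)$")
for name in ("README.md", "intro.rst", "notes.txt", "hello.1", "script.py"):
    print(name, _MARK_UP_NAME_REGEX.match(name) is not None)
# README.md True, intro.rst True, notes.txt True, hello.1 True, script.py False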

class DuplicatePool:
"""
A pool that collects information about potential duplicate files.
@@ -351,7 +358,8 @@ def from_file(
language = dialect
_log.info("%s: analyze as %s using encoding %s", source_path, language, encoding)
mark_to_count_map = {"c": 0, "d": 0, "e": 0, "s": 0}
for line_parts in _line_parts(lexer, source_code):
is_markup = is_markup_file(source_path)
for line_parts in _line_parts(lexer, source_code, is_markup=is_markup):
mark_to_increment = "e"
for mark_to_check in ("d", "s", "c"):
if mark_to_check in line_parts:
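For context, a hedged usage sketch of the public entry point this change flows through; it assumes pygount is installed and that a README.md exists in the working directory:

from pygount import SourceAnalysis

# With this change, a pure markup file should report its non-empty lines
# as documentation rather than code.
result = SourceAnalysis.from_file("README.md", "example-group", encoding="utf-8")
print(result.language, result.code_count, result.documentation_count)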
@@ -717,7 +725,7 @@ def _pythonized_comments(tokens: Iterator[Tuple[TokenType, str]]) -> Iterator[To
yield result_token_type, result_token_text


def _line_parts(lexer: pygments.lexer.Lexer, text: str) -> Iterator[Set[str]]:
def _line_parts(lexer: pygments.lexer.Lexer, text: str, is_markup: bool) -> Iterator[Set[str]]:
line_marks = set()
tokens = _delined_tokens(lexer.get_tokens(text))
if lexer.name == "Python":
@@ -738,7 +746,8 @@ def _line_parts(lexer: pygments.lexer.Lexer, text: str) -> Iterator[Set[str]]:
else:
is_white_text = (token_text.strip() in white_words) or (token_text.rstrip(white_text) == "")
if not is_white_text:
line_marks.add("c") # 'code'
line_mark = "d" if is_markup else "c"
line_marks.add(line_mark)
if token_text.endswith("\n"):
yield line_marks
line_marks = set()
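To illustrate the effect of the new flag, a minimal sketch against the internal helper (_line_parts is private API, so this may change; the printed result is an expectation, not a guarantee):

import pygments.lexers

from pygount.analysis import _line_parts

lexer = pygments.lexers.get_lexer_by_name("markdown")
# With is_markup=True, non-white lines that would have been marked "c"
# (code) are marked "d" (documentation) instead.
print(list(_line_parts(lexer, "# Title\nSome *text*\n", is_markup=True)))
# Expected along the lines of: [{'d'}, {'d'}]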
52 changes: 47 additions & 5 deletions tests/test_analysis.py
@@ -22,6 +22,7 @@
_pythonized_comments,
base_language,
guess_lexer,
is_markup_file,
)

from ._common import PYGOUNT_PROJECT_FOLDER, PYGOUNT_SOURCE_FOLDER, TempFolderTest
@@ -105,13 +106,13 @@ def test_can_deline_tokens(self):

def test_can_compute_python_line_parts(self):
python_lexer = lexers.get_lexer_by_name("python")
assert list(_line_parts(python_lexer, "#")) == [set("d")]
assert list(_line_parts(python_lexer, "s = 'x' # x")) == [set("cds")]
assert list(_line_parts(python_lexer, "#", False)) == [set("d")]
assert list(_line_parts(python_lexer, "s = 'x' # x", False)) == [set("cds")]

def test_can_detect_white_text(self):
python_lexer = lexers.get_lexer_by_name("python")
assert list(_line_parts(python_lexer, "{[()]};")) == [set()]
assert list(_line_parts(python_lexer, "pass")) == [set()]
assert list(_line_parts(python_lexer, "{[()]};", False)) == [set()]
assert list(_line_parts(python_lexer, "pass", False)) == [set()]

def test_can_convert_python_strings_to_comments(self):
source_code = (
@@ -125,8 +126,9 @@ def test_can_convert_python_strings_to_comments(self):
@staticmethod
def _line_parts(lexer_name: str, source_lines: List[str]) -> List[Set[str]]:
lexer = lexers.get_lexer_by_name(lexer_name)
is_markup = lexer_name in ["markdown", "md", "restructuredtext", "rst", "rest", "groff"]
source_code = "\n".join(source_lines)
return list(_line_parts(lexer, source_code))
return list(_line_parts(lexer, source_code, is_markup=is_markup))

def test_can_analyze_python(self):
source_lines = [
@@ -155,6 +157,20 @@ def test_can_analyze_c(self):
expected_line_parts = [{"d"}, {"d"}, {"d"}, {"c"}, {"c"}, {"c", "s"}, set()]
assert actual_line_parts == expected_line_parts

def test_all_lines_are_d_when_markup_flag_is_true(self):
source_lines = [
"/*",
" * The classic hello world for C99.",
" */",
"#include <stdio.h>",
"int main(void) {",
' puts("Hello, World!");',
"}",
]
actual_line_parts = AnalysisTest._line_parts("markdown", source_lines)
expected_line_parts = [{"d"}, {"d"}, {"d"}, {"d"}, {"d"}, {"d"}, set()]
assert actual_line_parts == expected_line_parts
Owner:
This can probably be expressed more concisely with:

assert all(line_part == {"d"} for line_part in actual_line_parts), f"actual_line_parts={actual_line_parts}"

Collaborator (Author):
I've followed your suggestion and made a small adjustment. Additionally, I've updated the README.md file. Could you please take a look when you have a moment?

[screenshot: updated README.md]

Collaborator (Author), @snowkoli, Jul 18, 2024:
Is it acceptable that not all lines are categorized as documentation?
Markdown details:

{
    "documentationCount": 53,
    "documentationPercentage": 49.074074074074076,
    "codeCount": 0,
    "codePercentage": 0.0,
    "emptyCount": 21,
    "emptyPercentage": 19.444444444444443,
    "fileCount": 3,
    "filePercentage": 7.894736842105263,
    "isPseudoLanguage": false,
    "language": "Markdown",
    "sourceCount": 34,
    "sourcePercentage": 31.48148148148148,
    "stringCount": 34,
    "stringPercentage": 31.48148148148148
}



class _NonSeekableEmptyBytesIO(BytesIO):
# Creates a dummy object that mimics a non-seekable file handle
@@ -275,6 +291,24 @@ def test_fails_on_non_seekable_file_handle_with_encoding_chardet(self):
with pytest.raises(PygountError, match=r".*file handle must be seekable.*"):
analysis.SourceAnalysis.from_file("README.md", "test", file_handle=file_handle, encoding="chardet")

def test_analyzer_treats_textual_files_as_docs_only(self):
test_params = [
("test.rst", 0, 3, "restructuredtext"),
("test.md", 0, 3, "markdown"),
("test.txt", 0, 3, "text only"),
("test.4", 0, 3, "groff"),
]
for filename, code_count, doc_count, lang_lower in test_params:
with self.subTest(filename):
markup_file_path = self.create_temp_file(
filename,
["<!DOCTYPE html>", "{% load i18n %}", '<html lang="{{ language_code }}" />'],
)
source_analysis = analysis.SourceAnalysis.from_file(markup_file_path, "test", encoding="utf-8")
assert source_analysis.language.lower() == lang_lower
assert source_analysis.code_count == code_count
assert source_analysis.documentation_count == doc_count


def test_can_repr_source_analysis_from_file():
source_analysis = analysis.SourceAnalysis("some.py", "Python", "some", 1, 2, 3, 4, analysis.SourceState.analyzed)
@@ -506,3 +540,11 @@ def test_can_detect_duplicate(self):
duplicate_pool = analysis.DuplicatePool()
assert duplicate_pool.duplicate_path(original_path) is None
assert original_path == duplicate_pool.duplicate_path(duplicate_path)


@pytest.mark.parametrize(
"file_extension, expected_result", [(".md", True), (".rst", True), (".py", False), (".4", True), (".c", False)]
)
def test_is_markup_file(file_extension, expected_result):
source_path = f"some_file_name{file_extension}"
assert is_markup_file(source_path) is expected_result