diff --git a/src/inscriptis/__init__.py b/src/inscriptis/__init__.py index 3ee9215..2eb150e 100644 --- a/src/inscriptis/__init__.py +++ b/src/inscriptis/__init__.py @@ -60,7 +60,8 @@ """ import re -import lxml.html +from lxml.html import fromstring, HtmlElement +from lxml.etree import ParserError from typing import Dict, Optional, Any @@ -70,7 +71,7 @@ RE_STRIP_XML_DECLARATION = re.compile(r'^<\?xml [^>]+?\?>') -def _get_html_tree(html_content: str) -> Optional[lxml.html.HtmlElement]: +def _get_html_tree(html_content: str) -> Optional[HtmlElement]: """Obtain the HTML parse tree for the given HTML content. Args: @@ -87,7 +88,10 @@ def _get_html_tree(html_content: str) -> Optional[lxml.html.HtmlElement]: if html_content.startswith('' + html_content + '') def get_text(html_content: str, config: ParserConfig = None) -> str: diff --git a/src/inscriptis/css_profiles.py b/src/inscriptis/css_profiles.py index 48bb660..3d08c45 100644 --- a/src/inscriptis/css_profiles.py +++ b/src/inscriptis/css_profiles.py @@ -70,7 +70,6 @@ whitespace=WhiteSpace.pre), 'plaintext': HtmlElement(display=Display.block, whitespace=WhiteSpace.pre), - } RELAXED_CSS_PROFILE = STRICT_CSS_PROFILE.copy() diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py index c38899c..85664b7 100644 --- a/src/inscriptis/html_engine.py +++ b/src/inscriptis/html_engine.py @@ -4,6 +4,7 @@ from typing import List import lxml.html +from lxml.etree import Comment from inscriptis.annotation import Annotation from inscriptis.model.html_element import DEFAULT_HTML_ELEMENT @@ -86,25 +87,25 @@ def _parse_html_tree(self, tree): Args: tree: the HTML tree to parse. """ - # ignore comments - if not isinstance(tree.tag, str): - return + if isinstance(tree.tag, str): + self.handle_starttag(tree.tag, tree.attrib) + cur = self.tags[-1] + cur.canvas.open_tag(cur) - self.handle_starttag(tree.tag, tree.attrib) - cur = self.tags[-1] - cur.canvas.open_tag(cur) + self.tags[-1].write(tree.text) - self.tags[-1].write(tree.text) + for node in tree: + self._parse_html_tree(node) - for node in tree: - self._parse_html_tree(node) + self.handle_endtag(tree.tag) + prev = self.tags.pop() + prev.canvas.close_tag(prev) - self.handle_endtag(tree.tag) - prev = self.tags.pop() - prev.canvas.close_tag(prev) + # write the tail text to the element's container + self.tags[-1].write(tree.tail) - # write the tail text to the element's container - self.tags[-1].write_tail(tree.tail) + elif tree.tag is Comment and tree.tail: + self.tags[-1].canvas.write(self.tags[-1], tree.tail) def get_text(self) -> str: """Return the text extracted from the HTML page.""" diff --git a/src/inscriptis/metadata.py b/src/inscriptis/metadata.py index c39d143..2462526 100644 --- a/src/inscriptis/metadata.py +++ b/src/inscriptis/metadata.py @@ -2,6 +2,6 @@ __author__ = 'Albert Weichselbraun, Fabian Odoni' __author_email__ = 'albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch' -__copyright__ = '2016-2021 Albert Weichselbraun, Fabian Odoni' +__copyright__ = '2016-2022 Albert Weichselbraun, Fabian Odoni' __license__ = 'Apache 2.0' -__version__ = '2.2.0' +__version__ = '2.3.0' diff --git a/src/inscriptis/model/html_element.py b/src/inscriptis/model/html_element.py index aa5094c..3ea95fe 100644 --- a/src/inscriptis/model/html_element.py +++ b/src/inscriptis/model/html_element.py @@ -71,20 +71,9 @@ def write(self, text: str): """Write the given HTML text to the element's canvas.""" if not text or self.display == Display.none: return - self.canvas.write(self, ''.join( (self.prefix, text, self.suffix))) - def write_tail(self, text: str): - """Write the given tail text the the element's canvas. - - Args: - text: the text to write - """ - if not text or self.display == Display.none: - return - self.write(text) - def set_canvas(self, canvas) -> 'HtmlElement': self.canvas = canvas return self diff --git a/src/inscriptis/model/table.py b/src/inscriptis/model/table.py index acd91b7..559aa79 100644 --- a/src/inscriptis/model/table.py +++ b/src/inscriptis/model/table.py @@ -115,11 +115,10 @@ def get_annotations(self, idx: int, row_width: int) -> List[Annotation]: # the easy case - the cell has only one line :) if len(self.blocks) == 1: - annotations = horizontal_shift(self.annotations, - self.line_width[0], - self.width, self.align, idx) self.line_width[0] = self.width - return annotations + return horizontal_shift(self.annotations, + self.line_width[0], + self.width, self.align, idx) # the more challenging one - multiple cell lines line_break_pos = list(accumulate(self.line_width)) diff --git a/tests/html/html-comment-ofuscation.html b/tests/html/html-comment-ofuscation.html new file mode 100644 index 0000000..f98d56b --- /dev/null +++ b/tests/html/html-comment-ofuscation.html @@ -0,0 +1 @@ +
$90.74 diff --git a/tests/html/html-comment-ofuscation.txt b/tests/html/html-comment-ofuscation.txt new file mode 100644 index 0000000..cfcccae --- /dev/null +++ b/tests/html/html-comment-ofuscation.txt @@ -0,0 +1 @@ +$90.74 diff --git a/tests/html/table-pre.html b/tests/html/table-pre.html index b505796..c3ba857 100644 --- a/tests/html/table-pre.html +++ b/tests/html/table-pre.html @@ -18,7 +18,7 @@int b = 1; -for (int a=0; a<10; a++) { +for (int a=0; a<10; a++) { System.out.println(a); b = b * a; System.out.println(b); diff --git a/tests/test_empty_string.py b/tests/test_empty_string.py index d436dc1..dd46353 100644 --- a/tests/test_empty_string.py +++ b/tests/test_empty_string.py @@ -9,8 +9,9 @@ def test_empty_and_corrupt(): - assert get_text("test").strip() == "test" - assert get_text(" ") == "" - assert get_text("") == "" - assert get_text("<<<").strip() == "<<" + assert get_text('test').strip() == 'test' + assert get_text(' ') == '' + assert get_text('') == '' + # test for the behaviour of older and recent lxml versions. + assert get_text('<<<').strip() in ('<<<', '<<', '') diff --git a/tests/test_model_html_element_canvas.py b/tests/test_model_html_element_canvas.py index ecc4698..574c047 100644 --- a/tests/test_model_html_element_canvas.py +++ b/tests/test_model_html_element_canvas.py @@ -53,4 +53,4 @@ def test_formatting(): # and prefixes + suffixes h.prefix = '>>' h.suffix = '<<' - assert _get_text(h)== 'first\n\n * >>Ehre sei Gott!<<\n\n\nlast' + assert _get_text(h)== 'first\n\n * >>Ehre sei Gott!<<\n\n\nlast' diff --git a/tox.ini b/tox.ini index 6d20c5f..63c1093 100644 --- a/tox.ini +++ b/tox.ini @@ -1,8 +1,8 @@ # standard unit tests [testenv:pytest] -deps = pytest - pytest-coverage -commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests +deps = pytest ~= 7.1.2 + pytest-cov ~= 3.0.0 +commands = py.test --cov-config=.coveragerc --cov=inscriptis ./tests # python packaging best practices [testenv:pyroma] @@ -12,31 +12,27 @@ commands = pyroma . # checks compatible with flake 4 [testenv:flake8-4] deps = flake8 ~= 4.0.1 - flake8-blind-except ~= 0.2.0 - flake8-bandit ~= 2.1.2 - flake8-bugbear ~= 21.9.2 + flake8-blind-except ~= 0.2.1 + flake8-bandit ~= 3.0.0 + flake8-bugbear ~= 22.7.1 flake8-builtins ~= 1.5.3 flake8-cognitive-complexity ~= 0.1.0 flake8-colors ~= 0.1.9 - flake8-comprehensions ~= 3.7.0 + flake8-comprehensions ~= 3.10.0 flake8-docstrings ~= 1.6.0 - flake8-eradicate ~= 1.2.0 - flake8-expression-complexity ~= 0.0.9 + flake8-encodings ~= 0.5.0.post1 + flake8-eradicate ~= 1.2.1 + flake8-expression-complexity ~= 0.0.11 flake8-string-format ~= 0.3.0 flake8-tuple ~= 0.4.1 flake8-logging-format ~= 0.6.0 flake8-pytest ~= 1.3 flake8-quotes ~= 3.3.1 flake8-raise ~= 0.0.5 - flake8-simplify ~= 0.14.2 - pep8-naming ~= 0.12.1 + flake8-simplify ~= 0.19.2 + pep8-naming ~= 0.13.1 flake8-mutable ~= 1.2.0 -commands = flake8 - -# checks compatible with flake < 4.0.0 -[testenv:flake8-3] -deps = flake8 < 4.0.0 - flake8-use-pathlib ~= 0.2.0 + flake8-use-pathlib ~= 0.2.1 commands = flake8 [flake8] @@ -51,11 +47,10 @@ exclude = .tox # S410 - bind to all IPs is okay in the case of the Web service, since it is # aimed for use with docker. # W503 - replaced with W504 -# E402 - required for importing inscriptis metadata in setup.py # D102 - missing docstring in public method # D105 - missing docstring in magic method (e.g., __str__) # D107 - missing docstring in __init__ -ignore = S104, S410, W503, E402, D107, D105, D102 +ignore = S104, S410, W503, D107, D105, D102 show-source = true enable-extensions=G application-import-names = inscriptis