diff --git a/src/inscriptis/__init__.py b/src/inscriptis/__init__.py index 3ee9215..2eb150e 100644 --- a/src/inscriptis/__init__.py +++ b/src/inscriptis/__init__.py @@ -60,7 +60,8 @@ """ import re -import lxml.html +from lxml.html import fromstring, HtmlElement +from lxml.etree import ParserError from typing import Dict, Optional, Any @@ -70,7 +71,7 @@ RE_STRIP_XML_DECLARATION = re.compile(r'^<\?xml [^>]+?\?>') -def _get_html_tree(html_content: str) -> Optional[lxml.html.HtmlElement]: +def _get_html_tree(html_content: str) -> Optional[HtmlElement]: """Obtain the HTML parse tree for the given HTML content. Args: @@ -87,7 +88,10 @@ def _get_html_tree(html_content: str) -> Optional[lxml.html.HtmlElement]: if html_content.startswith('' + html_content + '') def get_text(html_content: str, config: ParserConfig = None) -> str: diff --git a/src/inscriptis/css_profiles.py b/src/inscriptis/css_profiles.py index 48bb660..3d08c45 100644 --- a/src/inscriptis/css_profiles.py +++ b/src/inscriptis/css_profiles.py @@ -70,7 +70,6 @@ whitespace=WhiteSpace.pre), 'plaintext': HtmlElement(display=Display.block, whitespace=WhiteSpace.pre), - } RELAXED_CSS_PROFILE = STRICT_CSS_PROFILE.copy() diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py index c38899c..85664b7 100644 --- a/src/inscriptis/html_engine.py +++ b/src/inscriptis/html_engine.py @@ -4,6 +4,7 @@ from typing import List import lxml.html +from lxml.etree import Comment from inscriptis.annotation import Annotation from inscriptis.model.html_element import DEFAULT_HTML_ELEMENT @@ -86,25 +87,25 @@ def _parse_html_tree(self, tree): Args: tree: the HTML tree to parse. """ - # ignore comments - if not isinstance(tree.tag, str): - return + if isinstance(tree.tag, str): + self.handle_starttag(tree.tag, tree.attrib) + cur = self.tags[-1] + cur.canvas.open_tag(cur) - self.handle_starttag(tree.tag, tree.attrib) - cur = self.tags[-1] - cur.canvas.open_tag(cur) + self.tags[-1].write(tree.text) - self.tags[-1].write(tree.text) + for node in tree: + self._parse_html_tree(node) - for node in tree: - self._parse_html_tree(node) + self.handle_endtag(tree.tag) + prev = self.tags.pop() + prev.canvas.close_tag(prev) - self.handle_endtag(tree.tag) - prev = self.tags.pop() - prev.canvas.close_tag(prev) + # write the tail text to the element's container + self.tags[-1].write(tree.tail) - # write the tail text to the element's container - self.tags[-1].write_tail(tree.tail) + elif tree.tag is Comment and tree.tail: + self.tags[-1].canvas.write(self.tags[-1], tree.tail) def get_text(self) -> str: """Return the text extracted from the HTML page.""" diff --git a/src/inscriptis/metadata.py b/src/inscriptis/metadata.py index c39d143..2462526 100644 --- a/src/inscriptis/metadata.py +++ b/src/inscriptis/metadata.py @@ -2,6 +2,6 @@ __author__ = 'Albert Weichselbraun, Fabian Odoni' __author_email__ = 'albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch' -__copyright__ = '2016-2021 Albert Weichselbraun, Fabian Odoni' +__copyright__ = '2016-2022 Albert Weichselbraun, Fabian Odoni' __license__ = 'Apache 2.0' -__version__ = '2.2.0' +__version__ = '2.3.0' diff --git a/src/inscriptis/model/html_element.py b/src/inscriptis/model/html_element.py index aa5094c..3ea95fe 100644 --- a/src/inscriptis/model/html_element.py +++ b/src/inscriptis/model/html_element.py @@ -71,20 +71,9 @@ def write(self, text: str): """Write the given HTML text to the element's canvas.""" if not text or self.display == Display.none: return - self.canvas.write(self, ''.join( (self.prefix, text, self.suffix))) - def write_tail(self, text: str): - """Write the given tail text the the element's canvas. - - Args: - text: the text to write - """ - if not text or self.display == Display.none: - return - self.write(text) - def set_canvas(self, canvas) -> 'HtmlElement': self.canvas = canvas return self diff --git a/src/inscriptis/model/table.py b/src/inscriptis/model/table.py index acd91b7..559aa79 100644 --- a/src/inscriptis/model/table.py +++ b/src/inscriptis/model/table.py @@ -115,11 +115,10 @@ def get_annotations(self, idx: int, row_width: int) -> List[Annotation]: # the easy case - the cell has only one line :) if len(self.blocks) == 1: - annotations = horizontal_shift(self.annotations, - self.line_width[0], - self.width, self.align, idx) self.line_width[0] = self.width - return annotations + return horizontal_shift(self.annotations, + self.line_width[0], + self.width, self.align, idx) # the more challenging one - multiple cell lines line_break_pos = list(accumulate(self.line_width)) diff --git a/tests/html/html-comment-ofuscation.html b/tests/html/html-comment-ofuscation.html new file mode 100644 index 0000000..f98d56b --- /dev/null +++ b/tests/html/html-comment-ofuscation.html @@ -0,0 +1 @@ +$90.74 diff --git a/tests/html/html-comment-ofuscation.txt b/tests/html/html-comment-ofuscation.txt new file mode 100644 index 0000000..cfcccae --- /dev/null +++ b/tests/html/html-comment-ofuscation.txt @@ -0,0 +1 @@ +$90.74 diff --git a/tests/html/table-pre.html b/tests/html/table-pre.html index b505796..c3ba857 100644 --- a/tests/html/table-pre.html +++ b/tests/html/table-pre.html @@ -18,7 +18,7 @@

Pre elements that have been nested in a table.

 int b = 1;
-for (int a=0; a<10; a++) {
+for (int a=0; a<10; a++) {
    System.out.println(a);
    b = b * a;
    System.out.println(b);
diff --git a/tests/test_empty_string.py b/tests/test_empty_string.py
index d436dc1..dd46353 100644
--- a/tests/test_empty_string.py
+++ b/tests/test_empty_string.py
@@ -9,8 +9,9 @@
 
 
 def test_empty_and_corrupt():
-    assert get_text("test").strip() == "test"
-    assert get_text("  ") == ""
-    assert get_text("") == ""
-    assert get_text("<<<").strip() == "<<"
+    assert get_text('test').strip() == 'test'
+    assert get_text('  ') == ''
+    assert get_text('') == ''
+    # test for the behaviour of older and recent lxml versions.
+    assert get_text('<<<').strip() in ('<<<', '<<', '')
 
diff --git a/tests/test_model_html_element_canvas.py b/tests/test_model_html_element_canvas.py
index ecc4698..574c047 100644
--- a/tests/test_model_html_element_canvas.py
+++ b/tests/test_model_html_element_canvas.py
@@ -53,4 +53,4 @@ def test_formatting():
     # and prefixes + suffixes
     h.prefix = '>>'
     h.suffix = '<<'
-    assert  _get_text(h)== 'first\n\n * >>Ehre sei Gott!<<\n\n\nlast'
+    assert _get_text(h)== 'first\n\n * >>Ehre sei Gott!<<\n\n\nlast'
diff --git a/tox.ini b/tox.ini
index 6d20c5f..63c1093 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,8 +1,8 @@
 # standard unit tests
 [testenv:pytest]
-deps = pytest
-       pytest-coverage
-commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests
+deps = pytest ~= 7.1.2
+       pytest-cov ~= 3.0.0
+commands = py.test --cov-config=.coveragerc --cov=inscriptis ./tests
 
 # python packaging best practices
 [testenv:pyroma]
@@ -12,31 +12,27 @@ commands = pyroma .
 # checks compatible with flake 4
 [testenv:flake8-4]
 deps = flake8 ~= 4.0.1
-       flake8-blind-except ~= 0.2.0
-       flake8-bandit ~= 2.1.2
-       flake8-bugbear ~= 21.9.2
+       flake8-blind-except ~= 0.2.1
+       flake8-bandit ~= 3.0.0
+       flake8-bugbear ~= 22.7.1
        flake8-builtins ~= 1.5.3
        flake8-cognitive-complexity ~= 0.1.0
        flake8-colors ~= 0.1.9
-       flake8-comprehensions ~= 3.7.0
+       flake8-comprehensions ~= 3.10.0
        flake8-docstrings ~= 1.6.0
-       flake8-eradicate ~= 1.2.0
-       flake8-expression-complexity ~= 0.0.9
+       flake8-encodings ~= 0.5.0.post1
+       flake8-eradicate ~= 1.2.1
+       flake8-expression-complexity ~= 0.0.11
        flake8-string-format ~= 0.3.0
        flake8-tuple ~= 0.4.1
        flake8-logging-format ~= 0.6.0
        flake8-pytest ~= 1.3
        flake8-quotes ~= 3.3.1
        flake8-raise ~= 0.0.5
-       flake8-simplify ~= 0.14.2
-       pep8-naming ~= 0.12.1
+       flake8-simplify ~= 0.19.2
+       pep8-naming ~= 0.13.1
        flake8-mutable ~= 1.2.0
-commands = flake8
-
-# checks compatible with flake < 4.0.0
-[testenv:flake8-3]
-deps = flake8 < 4.0.0
-       flake8-use-pathlib ~= 0.2.0
+       flake8-use-pathlib ~= 0.2.1
 commands = flake8
 
 [flake8]
@@ -51,11 +47,10 @@ exclude = .tox
 # S410 - bind to all IPs is okay in the case of the Web service, since it is
 #        aimed for use with docker.
 # W503 - replaced with W504
-# E402 - required for importing inscriptis metadata in setup.py
 # D102 - missing docstring in public method
 # D105 - missing docstring in magic method (e.g., __str__)
 # D107 - missing docstring in __init__
-ignore = S104, S410, W503, E402, D107, D105, D102
+ignore = S104, S410, W503, D107, D105, D102
 show-source = true
 enable-extensions=G
 application-import-names = inscriptis