diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..b142fb9 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,2 @@ +[run] +omit = tests/ diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index c585c2f..d0b6f94 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -13,7 +13,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.5, 3.7, 3.8, 3.9] + python-version: [3.6, 3.7, 3.8, 3.9] steps: - uses: actions/checkout@v2 diff --git a/.gitignore b/.gitignore index 4473554..cbdd7fa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.pyc +*.pyx .*.swp *.egg-info __pycache__/ @@ -16,3 +17,9 @@ dist/ _build/ .mypy_cache/ .idea/ +venv/ +tests/converted.txt +tests/reference.txt +*.c +paper/*.pdf +htmlcov/ diff --git a/README.rst b/README.rst index b5ed98e..00413d2 100644 --- a/README.rst +++ b/README.rst @@ -26,25 +26,22 @@ inscriptis -- HTML to text conversion library, command line client and Web servi :target: https://badge.fury.io/py/inscriptis :alt: PyPI version -A python based HTML to text conversion library, command line client and Web service with support for **nested tables** and a **subset of CSS**. -Please take a look at the `Rendering `_ document for a demonstration of inscriptis' conversion quality. +A python based HTML to text conversion library, command line client and Web +service with support for **nested tables**, a **subset of CSS** and optional +support for providing an **annotated output**. +Please take a look at the +`Rendering `_ +document for a demonstration of inscriptis' conversion quality. -A Java port of inscriptis is availabe `here `_. +A Java port of inscriptis 1.x is available +`here `_. -Documentation -============= +This document provides a short introduction to Inscriptis. The full +documentation is built automatically and published on +`Read the Docs `_. 
-The full documentation is built automatically and published on `Read the Docs `_. -Table of Contents -================= - -1. `Installation`_ -2. `Python library`_ -3. `Standalone command line client`_ -4. `Web service`_ -5. `Fine tuning`_ -6. `Changelog`_ +.. contents:: Table of Contents Installation @@ -68,12 +65,14 @@ If you want to install from the latest sources, you can do:: Python library ============== -Embedding inscriptis into your code is easy, as outlined below:: +Embedding inscriptis into your code is easy, as outlined below: + +.. code-block:: python import urllib.request from inscriptis import get_text - url = "https://www.informationscience.ch" + url = "https://www.fhgr.ch" html = urllib.request.urlopen(url).read().decode('utf-8') text = get_text(html) @@ -82,48 +81,51 @@ Embedding inscriptis into your code is easy, as outlined below:: Standalone command line client ============================== -The command line client converts HTML files or text retrieved from Web pages to the -corresponding text representation. +The command line client converts HTML files or text retrieved from Web pages to +the corresponding text representation. Command line parameters ----------------------- The inscript.py command line client supports the following parameters:: - usage: inscript.py [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] - [--indentation INDENTATION] [-v] - [input] - - Converts HTML from file or url to a clean text version - - positional arguments: - input Html input either from a file or an url - (default:stdin) - - optional arguments: - -h, --help show this help message and exit - -o OUTPUT, --output OUTPUT - Output file (default:stdout). - -e ENCODING, --encoding ENCODING - Content encoding for reading and writing files - (default:utf-8) - -i, --display-image-captions - Display image captions (default:false). - -d, --deduplicate-image-captions - Deduplicate image captions (default:false). 
- -l, --display-link-targets - Display link targets (default:false). - -a, --display-anchor-urls - Deduplicate image captions (default:false). - --indentation INDENTATION - How to handle indentation (extended or strict; - default: extended). - -v, --version display version information + usage: inscript.py [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR] + [--indentation INDENTATION] [-v] + [input] + + Convert the given HTML document to text. + + positional arguments: + input Html input either from a file or a URL (default:stdin). + + optional arguments: + -h, --help show this help message and exit + -o OUTPUT, --output OUTPUT + Output file (default:stdout). + -e ENCODING, --encoding ENCODING + Input encoding to use (default:utf-8 for files; detected server encoding for Web URLs). + -i, --display-image-captions + Display image captions (default:false). + -d, --deduplicate-image-captions + Deduplicate image captions (default:false). + -l, --display-link-targets + Display link targets (default:false). + -a, --display-anchor-urls + Deduplicate image captions (default:false). + -r ANNOTATION_RULES, --annotation-rules ANNOTATION_RULES + Path to an optional JSON file containing rules for annotating the retrieved text. + -p POSTPROCESSOR, --postprocessor POSTPROCESSOR + Optional component for postprocessing the result (html, surface, xml). + --indentation INDENTATION + How to handle indentation (extended or strict; default: extended). + -v, --version display version information Examples -------- +HTML to text conversion +~~~~~~~~~~~~~~~~~~~~~~~ convert the given page to text and output the result to the screen:: $ inscript.py https://www.fhgr.ch @@ -132,11 +134,94 @@ convert the file to text and save the output to output.txt:: $ inscript.py fhgr.html -o fhgr.txt -convert text provided via stdin and save the output to output.txt:: +convert HTML provided via stdin and save the output to output.txt:: $ echo '

Make it so!

>' | inscript.py -o output.txt +HTML to annotated text conversion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +convert and annotate HTML from a Web page using the provided annotation rules:: + + $ inscript.py https://www.fhgr.ch -r ./examples/annotation-profile.json + +The annotation rules are specified in `annotation-profile.json`: + +.. code-block:: json + + { + "h1": ["heading", "h1"], + "h2": ["heading", "h2"], + "b": ["emphasis"], + "div#class=toc": ["table-of-contents"], + "#class=FactBox": ["fact-box"], + "#cite": ["citation"] + } + +The dictionary maps an HTML tag and/or attribute to the annotations +inscriptis should provide for them. In the example above, for instance, the tag +`h1` yields the annotations `heading` and `h1`, a `div` tag with a +`class` that contains the value `toc` results in the annotation +`table-of-contents`, and all tags with a `cite` attribute are annotated with +`citation`. + +Given these annotation rules the HTML file + +.. code-block:: HTML + +

Chur

+ Chur is the capital and largest town of the Swiss canton of the + Grisons and lies in the Grisonian Rhine Valley. + +yields the following JSONL output + +.. code-block:: json + + {"text": "Chur\n\nChur is the capital and largest town of the Swiss canton + of the Grisons and lies in the Grisonian Rhine Valley.", + "label": [[0, 4, "heading"], [0, 4, "h1"], [6, 10, "emphasis"]]} + +The provided list of labels contains all annotated text elements with their +start index, end index and the assigned label. + +Annotation postprocessors +~~~~~~~~~~~~~~~~~~~~~~~~~ +Annotation postprocessors enable the post processing of annotations to formats +that are suitable for your particular application. Post processors can be +specified with the `-p` or `--postprocessor` command line argument:: + + $ inscript.py https://www.fhgr.ch \ + -r ./examples/annotation-profile.json \ + -p tag + + +Output: + +.. code-block:: json + + {"text": " Chur\n\n Chur is the capital and largest town of the Swiss + canton of the Grisons and lies in the Grisonian Rhine Valley.", + "label": [[0, 6, "heading"], [8, 14, "emphasis"]], + "tag": "Chur\n\nChur is the + capital and largest town of the Swiss canton of the Grisons and + lies in the Grisonian Rhine Valley."} + + + +Currently, inscriptis supports the following postprocessors: + +- surface: returns an additional mapping between the annotation's surface form and its label:: + + ['heading': 'Chur', 'emphasis': 'Chur'] + +- tag: returns an additional annotated text version:: + + Chur + + Chur is the capital and largest town of the Swiss + canton of the Grisons and lies in the Grisonian Rhine Valley. + + Web Service =========== @@ -158,31 +243,96 @@ Start the inscriptis Web service with the following command:: Usage ----- -The Web services receives the HTML file in the request body and returns the corresponding text. 
The file's encoding needs to be specified +The Web service receives the HTML file in the request body and returns the +corresponding text. The file's encoding needs to be specified in the `Content-Type` header (`UTF-8` in the example below):: - $ curl -X POST -H "Content-Type: text/html; encoding=UTF8" --data-binary @test.html http://localhost:5000/get_text + $ curl -X POST -H "Content-Type: text/html; encoding=UTF8" \ + --data-binary @test.html http://localhost:5000/get_text The service also supports a version call:: $ curl http://localhost:5000/version +Advanced topics +=============== + +Annotated text +-------------- +Inscriptis can provide annotations alongside the extracted text which allows +downstream components to draw upon semantics that have only been available in +the original HTML file. + +The extracted text and annotations can be exported in different formats, +including the popular JSONL format which is used by +`doccano `_. + +Example output: + +.. code-block:: json + + {"text": "Chur\n\nChur is the capital and largest town of the Swiss canton + of the Grisons and lies in the Grisonian Rhine Valley.", + "label": [[0, 4, "heading"], [0, 4, "h1"], [6, 10, "emphasis"]]} + +The output above is produced if inscriptis is run with the following +annotation rules: + +.. code-block:: json + + { + "h1": ["heading", "h1"], + "b": ["emphasis"] + } + +The code below demonstrates how inscriptis' annotation capabilities can +be used within a program: + +.. 
code-block:: python + + import urllib.request + from inscriptis import get_annotated_text + + url = "https://www.fhgr.ch" + html = urllib.request.urlopen(url).read().decode('utf-8') + + rules = {'h1': ['heading', 'h1'], + 'h2': ['heading', 'h2'], + 'b': ['emphasis'], + 'table': ['table'] + } + + output = get_annotated_text(html, ParserConfig(annotation_rules=rules)) + print("Text:", output['text']) + print("Annotations:", output['label']) + Fine tuning -=========== +----------- The following options are available for fine tuning inscriptis' HTML rendering: -1. **More rigorous indentation:** call `inscriptis.get_text()` with the parameter `indentation='extended'` to also use indentation for tags such as `
` and `` that do not provide indentation in their standard definition. This strategy is the default in `inscript.py` and many other tools such as lynx. If you do not want extended indentation you can use the parameter `indentation='standard'` instead. +1. **More rigorous indentation:** call `inscriptis.get_text()` with the + parameter `indentation='extended'` to also use indentation for tags such as + `
` and `` that do not provide indentation in their standard + definition. This strategy is the default in `inscript.py` and many other + tools such as lynx. If you do not want extended indentation you can use the + parameter `indentation='standard'` instead. + +2. **Overwriting the default CSS definition:** inscriptis uses CSS definitions + that are maintained in `inscriptis.css.CSS` for rendering HTML tags. You can + override these definitions (and therefore change the rendering) as outlined + below: -2. **Overwriting the default CSS definition:** inscriptis uses CSS definitions that are maintained in `inscriptis.css.CSS` for rendering HTML tags. You can override these definitions (and therefore change the rendering) as outlined below:: +.. code-block:: python from lxml.html import fromstring from inscriptis.css_profiles import CSS_PROFILES, HtmlElement from inscriptis.html_properties import Display from inscriptis.model.config import ParserConfig - # create a custom CSS based on the default style sheet and change the rendering of `div` and `span` elements + # create a custom CSS based on the default style sheet and change the + # rendering of `div` and `span` elements css = CSS_PROFILES['strict'].copy() css['div'] = HtmlElement('div', display=Display.block, padding=2) css['span'] = HtmlElement('span', prefix=' ', suffix=' ') @@ -197,4 +347,5 @@ The following options are available for fine tuning inscriptis' HTML rendering: Changelog ========= -A full list of changes can be found in the `release notes `_. +A full list of changes can be found in the +`release notes `_. diff --git a/benchmarking/a b/benchmarking/a new file mode 100644 index 0000000..fc976e9 --- /dev/null +++ b/benchmarking/a @@ -0,0 +1,113 @@ +justext is not available. Please install it in order to compare with justext. 
+ +URL: www.watson.de +Lynx : 0.15138936042785645 --> fastest +Inscriptis : 0.20263218879699707 +0.051242828369140625 +BeautifulSoup: 0.3756422996520996 +0.22425293922424316 +Html2Text : 0.43219757080078125 +0.2808082103729248 + + +URL: www.watson.ch-Digital20&20Games-Android-134350872-Der-Monster-Akku-in-diesem-Smartphone-hC3A4lt-bis- +Inscriptis : 0.07737088203430176 --> fastest +BeautifulSoup: 0.1150212287902832 +0.037650346755981445 +Lynx : 0.1359405517578125 +0.05856966972351074 +Html2Text : 0.1448962688446045 +0.06752538681030273 + + +URL: www.heise.de +Lynx : 0.15659260749816895 --> fastest +Inscriptis : 0.20164966583251953 +0.045057058334350586 +BeautifulSoup: 0.29897594451904297 +0.14238333702087402 +Html2Text : 0.37505173683166504 +0.2184591293334961 + + +URL: www.heise.de-newsticker-meldung-Fairphone-2-im-Test-Das-erste-modulare-Smartphone-3043417.html +Inscriptis : 0.09370565414428711 --> fastest +Lynx : 0.15947198867797852 +0.0657663345336914 +BeautifulSoup: 0.16203570365905762 +0.06833004951477051 +Html2Text : 0.21861886978149414 +0.12491321563720703 + + +URL: www.nzz.de +Lynx : 0.17096304893493652 --> fastest +Inscriptis : 0.2877614498138428 +0.11679840087890625 +Html2Text : 0.4983334541320801 +0.32737040519714355 +BeautifulSoup: 0.5966424942016602 +0.42567944526672363 + + +URL: www.nzz.ch-mobilitaet-auto-mobil-bekenntnis-zum-stromauto-ld.3630 +Inscriptis : 0.1326134204864502 --> fastest +Lynx : 0.14449405670166016 +0.011880636215209961 +BeautifulSoup: 0.16537070274353027 +0.03275728225708008 +Html2Text : 0.2061011791229248 +0.07348775863647461 + + +URL: de.wikipedia.org-wiki-Wikipedia-Hauptseite +Inscriptis : 0.0768730640411377 --> fastest +BeautifulSoup: 0.1140899658203125 +0.037216901779174805 +Html2Text : 0.1279299259185791 +0.051056861877441406 +Lynx : 0.13344478607177734 +0.05657172203063965 + + +URL: de.wikipedia.org-wiki-Python_(Programmiersprache) +Lynx : 0.15608739852905273 --> fastest +Inscriptis : 0.2505784034729004 +0.09449100494384766 
+BeautifulSoup: 0.3396627902984619 +0.18357539176940918 +Html2Text : 0.407498836517334 +0.25141143798828125 + + +URL: de.wikipedia.org-wiki-Chur +Lynx : 0.19526290893554688 --> fastest +Inscriptis : 0.4372870922088623 +0.24202418327331543 +BeautifulSoup: 0.5105750560760498 +0.31531214714050293 +Html2Text : 0.7925112247467041 +0.5972483158111572 + + +URL: jr-central.co.jp +Inscriptis : 0.030536651611328125 --> fastest +BeautifulSoup: 0.04150390625 +0.010967254638671875 +Html2Text : 0.05070781707763672 +0.020171165466308594 +Lynx : 0.1379244327545166 +0.10738778114318848 + + +URL: www.aljazeera.net-portal +Lynx : 0.18790936470031738 --> fastest +Inscriptis : 0.3582143783569336 +0.1703050136566162 +BeautifulSoup: 0.5611743927001953 +0.37326502799987793 +Html2Text : 0.6482110023498535 +0.46030163764953613 + + +URL: www.aljazeera.net-news-humanrights-2015-12-14-D8A3D988D8A8D8A7D985D8A7-D98AD8ACD8AFD8AF-D8A7D984D8AA +Inscriptis : 0.13330984115600586 --> fastest +Lynx : 0.14847993850708008 +0.015170097351074219 +BeautifulSoup: 0.17941498756408691 +0.046105146408081055 +Html2Text : 0.242262601852417 +0.10895276069641113 + + +URL: www.fhgr.ch +Lynx : 0.20734667778015137 --> fastest +Inscriptis : 0.5514888763427734 +0.34414219856262207 +BeautifulSoup: 0.7790236473083496 +0.5716769695281982 +Html2Text : 0.9708971977233887 +0.7635505199432373 + + +URL: www.diepresse.com +Lynx : 0.18340134620666504 --> fastest +Inscriptis : 0.2943253517150879 +0.11092400550842285 +BeautifulSoup: 0.48204803466796875 +0.2986466884613037 +Html2Text : 0.5474369525909424 +0.36403560638427734 + + +URL: derstandard.at +Lynx : 0.17057490348815918 --> fastest +Inscriptis : 0.3920929431915283 +0.22151803970336914 +BeautifulSoup: 0.4781017303466797 +0.3075268268585205 +Html2Text : 0.5499060153961182 +0.379331111907959 + + +URL: krone.at +Lynx : 0.18678593635559082 --> fastest +Inscriptis : 0.41831398010253906 +0.23152804374694824 +BeautifulSoup: 0.6808819770812988 +0.494096040725708 +Html2Text : 
0.794529914855957 +0.6077439785003662 + diff --git a/benchmarking/b b/benchmarking/b new file mode 100644 index 0000000..fdd3bfc --- /dev/null +++ b/benchmarking/b @@ -0,0 +1,3 @@ +justext is not available. Please install it in order to compare with justext. + +URL: www.watson.de diff --git a/benchmarking/run_benchmarking.py b/benchmarking/run_benchmarking.py index d8b00a4..4ae9652 100755 --- a/benchmarking/run_benchmarking.py +++ b/benchmarking/run_benchmarking.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 # coding:utf-8 -''' +""" Runs a benchmarking suite to compare speed and output of different implementations. -''' - +""" +import argparse from datetime import datetime import operator import os @@ -55,25 +55,25 @@ OUTFILE = 'speed_comparisons.txt' -class AbstractHtmlConverter(): - ''' +class AbstractHtmlConverter: + """ An abstract HTML convert class. - ''' + """ def get_text(self, html): - ''' + """ Returns: a text representation of the given HTML snippet. - ''' + """ raise NotImplementedError def benchmark(self, html): - ''' + """ Benchmarks the classes HTML to text converter. Returns: A tuple of the required time and the obtained text representation. - ''' + """ start_time = time() for _ in range(TRIES): text = self.get_text(html) @@ -81,9 +81,9 @@ def benchmark(self, html): class BeautifulSoupHtmlConverter(AbstractHtmlConverter): - ''' + """ Converts HTML to text using BeautifulSoup. - ''' + """ name = 'BeautifulSoup' def __init__(self): @@ -103,10 +103,10 @@ def get_text(self, html): return result -class JustextHtmlConverter(AbstractHtmlConverter): - ''' +class JustextConverter(AbstractHtmlConverter): + """ Converts HTML to text using Justtext. - ''' + """ name = 'Justtext' def __init__(self): @@ -119,9 +119,9 @@ def get_text(self, html): class Html2TextConverter(AbstractHtmlConverter): - ''' + """ Converts HTML to text using Html2Text. 
- ''' + """ name = 'Html2Text' def __init__(self): @@ -135,11 +135,11 @@ def get_text(self, html): return ''.join(result) -class LynxHtmlConverter(AbstractHtmlConverter): - ''' +class LynxConverter(AbstractHtmlConverter): + """ Converts HTML to text using lynx. - ''' - name = 'lynx' + """ + name = 'Lynx' def __init__(self): try: @@ -174,29 +174,32 @@ def kill_lynx(pid): class InscriptisHtmlConverter(AbstractHtmlConverter): - ''' + """ Converts HTML to text using Inscriptis. - ''' + """ name = 'Inscriptis' def __init__(self): self.available = 'inscriptis' in sys.modules + if self.available: + from inscriptis import get_text + self.get_text = get_text def get_text(self, html): - return inscriptis.get_text(html) + return self.get_text(html) timestamp = str(datetime.now()).replace(' ', '_').replace(':', '-')\ .split('.')[0] -benchmarking_results_dir = os.path.join(BENCHMARKING_ROOT, - 'benchmarking_results', timestamp) -CACHE_DIR = os.path.join(BENCHMARKING_ROOT, 'html_cache') +DEFAULT_RESULT_DIR = os.path.join(BENCHMARKING_ROOT, 'benchmarking_results', + timestamp) +DEFAULT_CACHE_DIR = os.path.join(BENCHMARKING_ROOT, 'html_cache') -def save_to_file(algorithm, url, data): - ''' +def save_to_file(algorithm, url, data, benchmarking_results_dir): + """ Saves a benchmarking result to the given file. - ''' + """ result_file = os.path.join(benchmarking_results_dir, '{}_{}.txt'.format(algorithm, url)) with open(result_file, 'w') as output_file: @@ -204,9 +207,9 @@ def save_to_file(algorithm, url, data): def get_speed_table(times): - ''' + """ Provides the table which compares the conversion speed. - ''' + """ fastest = 999999 for key, value in times.items(): if value < fastest: @@ -241,10 +244,10 @@ def get_speed_table(times): return result -def get_fname(url): - ''' +def get_fname(url) -> str: + """ Transforms a URL to a file name. 
- ''' + """ trash = (('http://', ''), ('https://', ''), ('/', '-'), @@ -257,29 +260,55 @@ def get_fname(url): CONVERTER = (BeautifulSoupHtmlConverter(), - JustextHtmlConverter(), + JustextConverter(), Html2TextConverter(), - LynxHtmlConverter(), + LynxConverter(), InscriptisHtmlConverter()) +def parse_args(): + """ + Parse optional benchmarking arguments. + """ + parser = argparse.ArgumentParser(description='Inscriptis benchmarking ' + 'suite') + parser.add_argument('converter', type=str, nargs='*', + help='The list of converters to benchmark (options:' + 'BeautifulSoup, Justext, Html2Text, Lynx, ' + 'Inscriptis; default: all)') + parser.add_argument('-u', '--benchmarking-urls', + default=os.path.join(BENCHMARKING_ROOT, + 'url_list.txt'), + help='A list of URLs to use in the benchmark.') + parser.add_argument('-r', '--benchmarking-results', + default=DEFAULT_RESULT_DIR, + help='Optional directory for saving the benchmarking ' + 'results.') + parser.add_argument('-c', '--cache', default=DEFAULT_CACHE_DIR, + help='Optional cache directory for the retrieved Web ' + 'pages.') + return parser.parse_args() + + def benchmark(): - ''' + """ Runs the benchmark. 
- ''' + """ + args = parse_args() + # These are a few predefined urls the script will - with open(os.path.join(BENCHMARKING_ROOT, 'url_list.txt')) as url_list: + with open(args.benchmarking_urls) as url_list: sources = [url.strip() for url in url_list] - if not os.path.exists(benchmarking_results_dir): - os.makedirs(benchmarking_results_dir) + if not os.path.exists(args.benchmarking_results): + os.makedirs(args.benchmarking_results) - if not os.path.exists(CACHE_DIR): - os.makedirs(CACHE_DIR) + if not os.path.exists(args.cache): + os.makedirs(args.cache) for source in sources: source_name = get_fname(source) - source_cache_path = os.path.join(CACHE_DIR, source_name) + source_cache_path = os.path.join(args.cache, source_name) if os.path.exists(source_cache_path): html = open(source_cache_path).read() else: @@ -290,26 +319,28 @@ def benchmark(): html = urllib.request.urlopen(req).read().decode('latin1') open(source_cache_path, 'w').write(html) - with open(os.path.join(benchmarking_results_dir, + with open(os.path.join(args.benchmarking_results, 'speed_comparisons.txt'), 'a') as output_file: output_file.write('\nURL: {}\n'.format(source_name)) print('\nURL: {}'.format(source_name)) times = {} for converter in CONVERTER: - if converter.available: + if converter.available and not args.converter or converter.name \ + in args.converter: time_required, text = converter.benchmark(html) times[converter.name] = time_required - save_to_file(converter.name, source_name, text) + save_to_file(converter.name, source_name, text, + args.benchmarking_results) speed_table = get_speed_table(times) print(speed_table) - with open(os.path.join(benchmarking_results_dir, + with open(os.path.join(args.benchmarking_results, OUTFILE), 'a') as output_file: output_file.write(speed_table + '\n') - with open(os.path.join(benchmarking_results_dir, + with open(os.path.join(args.benchmarking_results, OUTFILE), 'a') as output_file: output_file.write('\n') diff --git 
a/benchmarking/speed_comparisons.txt b/benchmarking/speed_comparisons.txt new file mode 100644 index 0000000..2793862 --- /dev/null +++ b/benchmarking/speed_comparisons.txt @@ -0,0 +1,113 @@ + +URL: www.watson.de +inscriptis : 0.0886073112487793 --> fastest +lynx : 0.09243917465209961 +0.0038318634033203125 +html2text : 0.27269411087036133 +0.18408679962158203 +beautifulsoup: 0.3715205192565918 +0.2829132080078125 + + +URL: www.watson.ch-Digital20&20Games-Android-134350872-Der-Monster-Akku-in-diesem-Smartphone-hC3A4lt-bis- +inscriptis : 0.031877756118774414 --> fastest +lynx : 0.06591463088989258 +0.034036874771118164 +html2text : 0.09615325927734375 +0.06427550315856934 +beautifulsoup: 0.10839462280273438 +0.07651686668395996 + + +URL: www.heise.de +inscriptis : 0.0771639347076416 --> fastest +lynx : 0.0936579704284668 +0.016494035720825195 +html2text : 0.2419900894165039 +0.1648261547088623 +beautifulsoup: 0.29470372200012207 +0.21753978729248047 + + +URL: www.heise.de-newsticker-meldung-Fairphone-2-im-Test-Das-erste-modulare-Smartphone-3043417.html +inscriptis : 0.036151885986328125 --> fastest +lynx : 0.0704348087310791 +0.03428292274475098 +html2text : 0.10545611381530762 +0.06930422782897949 +beautifulsoup: 0.12367486953735352 +0.08752298355102539 + + +URL: www.nzz.de +lynx : 0.10388016700744629 --> fastest +inscriptis : 0.11366724967956543 +0.00978708267211914 +html2text : 0.34471607208251953 +0.24083590507507324 +beautifulsoup: 0.37203025817871094 +0.26815009117126465 + + +URL: www.nzz.ch-mobilitaet-auto-mobil-bekenntnis-zum-stromauto-ld.3630 +inscriptis : 0.05420851707458496 --> fastest +lynx : 0.08396458625793457 +0.02975606918334961 +html2text : 0.15306854248046875 +0.09886002540588379 +beautifulsoup: 0.16551637649536133 +0.11130785942077637 + + +URL: de.wikipedia.org-wiki-Wikipedia-Hauptseite +inscriptis : 0.029024839401245117 --> fastest +lynx : 0.0713193416595459 +0.04229450225830078 +beautifulsoup: 0.08946847915649414 +0.06044363975524902 +html2text 
: 0.09077596664428711 +0.06175112724304199 + + +URL: de.wikipedia.org-wiki-Python_(Programmiersprache) +inscriptis : 0.08830070495605469 --> fastest +lynx : 0.09342122077941895 +0.005120515823364258 +html2text : 0.30716776847839355 +0.21886706352233887 +beautifulsoup: 0.3195374011993408 +0.23123669624328613 + + +URL: de.wikipedia.org-wiki-Chur +lynx : 0.110748291015625 --> fastest +inscriptis : 0.16320323944091797 +0.05245494842529297 +html2text : 0.4872932434082031 +0.3765449523925781 +beautifulsoup: 0.4883759021759033 +0.3776276111602783 + + +URL: jr-central.co.jp +inscriptis : 0.012284517288208008 --> fastest +html2text : 0.03157520294189453 +0.019290685653686523 +beautifulsoup: 0.04013681411743164 +0.027852296829223633 +lynx : 0.06790828704833984 +0.055623769760131836 + + +URL: www.aljazeera.net-portal +lynx : 0.11873912811279297 --> fastest +inscriptis : 0.13616037368774414 +0.017421245574951172 +html2text : 0.35196900367736816 +0.2332298755645752 +beautifulsoup: 0.5011019706726074 +0.38236284255981445 + + +URL: www.aljazeera.net-news-humanrights-2015-12-14-D8A3D988D8A8D8A7D985D8A7-D98AD8ACD8AFD8AF-D8A7D984D8AA +inscriptis : 0.04958152770996094 --> fastest +lynx : 0.08647871017456055 +0.03689718246459961 +html2text : 0.1424856185913086 +0.09290409088134766 +beautifulsoup: 0.21869587898254395 +0.169114351272583 + + +URL: www.htwchur.ch +inscriptis : 0.04151415824890137 --> fastest +lynx : 0.07280635833740234 +0.03129220008850098 +html2text : 0.11662626266479492 +0.07511210441589355 +beautifulsoup: 0.1333613395690918 +0.09184718132019043 + + +URL: www.diepresse.com +lynx : 0.10844087600708008 --> fastest +inscriptis : 0.11291694641113281 +0.004476070404052734 +html2text : 0.3410661220550537 +0.23262524604797363 +beautifulsoup: 0.42446470260620117 +0.3160238265991211 + + +URL: derstandard.at +lynx : 0.10470342636108398 --> fastest +inscriptis : 0.14974093437194824 +0.04503750801086426 +html2text : 0.4319000244140625 +0.3271965980529785 +beautifulsoup: 
0.4459238052368164 +0.3412203788757324 + + +URL: krone.at +lynx : 0.11936330795288086 --> fastest +inscriptis : 0.18073749542236328 +0.06137418746948242 +html2text : 0.571204662322998 +0.4518413543701172 +beautifulsoup: 0.6350071430206299 +0.515643835067749 + + diff --git a/benchmarking/url_list.txt b/benchmarking/url_list.txt index 94225ea..e58ed7a 100644 --- a/benchmarking/url_list.txt +++ b/benchmarking/url_list.txt @@ -14,3 +14,5 @@ https://www.fhgr.ch https://www.diepresse.com https://derstandard.at https://krone.at +https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python/46921881 +https://www.chur.ch/churinzahlen diff --git a/docs/benchmarking.rst b/docs/benchmarking.rst index 7b84eea..1898ebc 100644 --- a/docs/benchmarking.rst +++ b/docs/benchmarking.rst @@ -4,12 +4,33 @@ Testing, benchmarking and evaluation Unit tests ========== -Test cases concerning the html to text conversion are located in the `tests/html` directory and consist of two files: +In addition to the standard unit tests that are located in the project's `tests` directory, Inscriptis also contains +test cases that solely focus on the html to text conversion and are located in the `tests/html` directory. +These tests consist of two files: 1. `test-name.html` and 2. `test-name.txt` -the latter one containing the reference text output for the given html file. +The `.txt` file contains the reference text output for the given html file. + +Since Inscriptis 2.0 there may also be a third file named `test-name.json` in the `tests/html` directory which contains a JSON dictionary with keys + + 1. `annotation_rules` containing the annotation rules for extracting metadata from the corresponding html file, and + 2. `result` which stores the surface forms of the extracted metadata. 
+ + +Example:: + + {"annotation_rules": { + "h1": ["heading"], + "b": ["emphasis"] + }, + "result": [ + ["heading", "The first"], + ["heading", "The second"], + ["heading", "Subheading"] + ] + } Text conversion output comparison and benchmarking diff --git a/docs/conf.py b/docs/conf.py index 8b074b8..0f764f0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,7 +22,7 @@ sys.path.insert(0, os.path.abspath('.')) sys.path.insert(0, os.path.abspath('./../src')) -import inscriptis +from inscriptis.metadata import __copyright__, __author__, __version__ # -- General configuration ------------------------------------------------ @@ -52,17 +52,17 @@ # General information about the project. project = 'inscriptis' -copyright = inscriptis.__copyright__ -author = inscriptis.__author__ +copyright = __copyright__ +author = __author__ # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = inscriptis.__version__ +version = __version__ # The full version, including alpha/beta/rc tags. -release = inscriptis.__version__ +release = __version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/inscriptis-module-documentation.rst b/docs/inscriptis-module-documentation.rst index 395f468..b25cb8f 100644 --- a/docs/inscriptis-module-documentation.rst +++ b/docs/inscriptis-module-documentation.rst @@ -5,28 +5,54 @@ Inscriptis module documentation .. automodule:: inscriptis :members: +Inscriptis model +================ + Inscriptis HTML engine -====================== +---------------------- .. automodule:: inscriptis.html_engine :members: Inscriptis HTML properties -========================== +-------------------------- .. automodule:: inscriptis.html_properties :members: Inscriptis CSS model -==================== +-------------------- .. 
automodule:: inscriptis.model.css :members: Inscriptis canvas model -======================= +----------------------- .. automodule:: inscriptis.model.canvas :members: +.. automodule:: inscriptis.model.canvas.block + :members: + +.. automodule:: inscriptis.model.canvas.prefix + :members: + + + Inscriptis table model -====================== +---------------------- .. automodule:: inscriptis.model.table :members: + +.. _annotations: + +Inscriptis annotations +====================== + +.. automodule:: inscriptis.annotation + :members: + + +Annotation processors +--------------------- + +.. automodule:: inscriptis.annotation.output + :members: diff --git a/docs/paper/Makefile b/docs/paper/Makefile new file mode 100644 index 0000000..5bcdf4a --- /dev/null +++ b/docs/paper/Makefile @@ -0,0 +1,6 @@ +all: + docker run --rm \ + --volume `pwd`:/data \ + --user $(id -u):$(id -g) \ + --env JOURNAL=joss \ + openjournals/paperdraft diff --git a/docs/paper/images/annotations.png b/docs/paper/images/annotations.png new file mode 100644 index 0000000..accec05 Binary files /dev/null and b/docs/paper/images/annotations.png differ diff --git a/docs/paper/images/inscriptis-vs-lynx.png b/docs/paper/images/inscriptis-vs-lynx.png new file mode 100644 index 0000000..c741e0c Binary files /dev/null and b/docs/paper/images/inscriptis-vs-lynx.png differ diff --git a/docs/paper/images/inscriptis-vs-lynx.xcf b/docs/paper/images/inscriptis-vs-lynx.xcf new file mode 100644 index 0000000..824527e Binary files /dev/null and b/docs/paper/images/inscriptis-vs-lynx.xcf differ diff --git a/docs/paper/images/raw/inscriptis.png b/docs/paper/images/raw/inscriptis.png new file mode 100644 index 0000000..7b86299 Binary files /dev/null and b/docs/paper/images/raw/inscriptis.png differ diff --git a/docs/paper/images/raw/lynx.png b/docs/paper/images/raw/lynx.png new file mode 100644 index 0000000..5cea2fd Binary files /dev/null and b/docs/paper/images/raw/lynx.png differ diff --git a/docs/paper/paper.bib 
b/docs/paper/paper.bib new file mode 100644 index 0000000..cfd89dc --- /dev/null +++ b/docs/paper/paper.bib @@ -0,0 +1,284 @@ + +@article{wang_review_2020, + title = {A review of emotion sensing: categorization models and algorithms}, + issn = {1573-7721}, + shorttitle = {A review of emotion sensing}, + url = {https://doi.org/10.1007/s11042-019-08328-z}, + doi = {10.1007/s11042-019-08328-z}, + abstract = {Sentiment analysis consists in the identification of the sentiment polarity associated with a target object, such as a book, a movie or a phone. Sentiments reflect feelings and attitudes, while emotions provide a finer characterization of the sentiments involved. With the huge number of comments generated daily on the Internet, besides sentiment analysis, emotion identification has drawn keen interest from different researchers, businessmen and politicians for polling public opinions and attitudes. This paper reviews and discusses existing emotion categorization models for emotion analysis and proposes methods that enhance existing emotion research. We carried out emotion analysis by inviting experts from different research areas to produce comprehensive results. Moreover, a computational emotion sensing model is proposed, and future improvements are discussed in this paper.}, + language = {en}, + urldate = {2020-01-07}, + journal = {Multimedia Tools and Applications}, + author = {Wang, Zhaoxia and Ho, Seng-Beng and Cambria, Erik}, + month = jan, + year = {2020}, + keywords = {Affective computing, Emotion categorization model, Emotion definition, Sentiment analysis}, + annote = {Summary +This paper discusses different emotion models such as Ekman's emotion model, Plutchik's wheel of emotions and the Hourglass of Emotions. + +the models are based on different assumptions and models that may be inspired by psychology or biology. 
+There is a clear distinction between feeling (primarily determined by the person's internal state) and emotion (triggered by the environment). +The discussed models cover a total of 65 different emotions. +there are emotional categories such as surprise which might not have a polarity or for which the polarity might depend on the context. This has also been confirmed by an experiment in which experts evaluated the polarity of 65 emotions. +OCC emotion model - calibrated by considering the author's personality based on openness, conscientiousness, agreeableness, extraversion and neuroticism) + +Datasets + +The paper contains background information and examples of different emotion models +Shaver et al. - hierarchical emotion model +Gui et al. - public dataset based on SINA city news. +}, + file = {Wang et al. - 2020 - A review of emotion sensing categorization models.pdf:/home/albert/Zotero/storage/C5Y56QRX/Wang et al. - 2020 - A review of emotion sensing categorization models.pdf:application/pdf}, +} + +@inproceedings{mikolov_distributed_2013, + title = {Distributed {Representations} of {Words} and {Phrases} and their {Compositionality}}, + url = {http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality}, + booktitle = {Advances in {Neural} {Information} {Processing} {Systems} 26: 27th {Annual} {Conference} on {Neural} {Information} {Processing} {Systems} 2013. {Proceedings} of a meeting held {December} 5-8, 2013, {Lake} {Tahoe}, {Nevada}, {United} {States}}, + author = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Gregory S. and Dean, Jeffrey}, + year = {2013}, + keywords = {word2vec}, + pages = {3111--3119}, + file = {arXiv\:1310.4546 PDF:/home/albert/Zotero/storage/695ESUSG/Mikolov et al. 
- 2013 - Distributed Representations of Words and Phrases a.pdf:application/pdf}, +} + +@inproceedings{pennington_glove:_2014, + address = {Doha, Qatar}, + title = {Glove: {Global} {Vectors} for {Word} {Representation}}, + shorttitle = {Glove}, + url = {https://www.aclweb.org/anthology/D14-1162}, + doi = {10.3115/v1/D14-1162}, + urldate = {2019-12-12}, + booktitle = {Proceedings of the 2014 {Conference} on {Empirical} {Methods} in {Natural} {Language} {Processing} ({EMNLP})}, + publisher = {Association for Computational Linguistics}, + author = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher}, + month = oct, + year = {2014}, + keywords = {word embeddings}, + pages = {1532--1543}, + file = {Pennington et al. - 2014 - Glove Global Vectors for Word Representation.pdf:/home/albert/Zotero/storage/ZBQABJR6/Pennington et al. - 2014 - Glove Global Vectors for Word Representation.pdf:application/pdf}, +} + +@article{scharl_semantic_2017, + title = {Semantic {Systems} and {Visual} {Tools} to {Support} {Environmental} {Communication}}, + volume = {11}, + copyright = {All rights reserved}, + doi = {10.1109/JSYST.2015.2466439}, + abstract = {Given the intense attention that environmental topics such as climate change attract in news and social media coverage, key questions are how different stakeholders perceive observable threats and policy options, how public media react to new scientific insights, and how journalists present climate science knowledge to the public. This paper investigates the potential of semantic technologies to address these questions. After summarizing methods to extract and disambiguate context information, we present visualization techniques to explore the lexical, geospatial and relational context of topics and entities referenced in these repositories. 
The examples stem from the Media Watch on Climate Change, the Climate Resilience Toolkit and the NOAA Media Watch - three applications that aggregate environmental resources from a wide range of online sources. These systems not only show the value of providing comprehensive information to the public, but also have helped to develop a novel communication success metric that goes beyond bipolar assessments of sentiment.}, + number = {2}, + journal = {IEEE Systems Journal}, + author = {Scharl, Arno and Herring, David and Rafelsberger, Walter and Hubmann-Haidvogel, Alexander and Kamolov, Ruslan and Fischl, Daniel and Föls, Michael and Weichselbraun, Albert}, + year = {2017}, + keywords = {annotation, climate science, context information, relation extraction, visual analytics, web intelligence}, + pages = {762--771}, + file = {Scharl et al. - 2017 - Semantic Systems and Visual Tools to Support Envir.pdf:/home/albert/Zotero/storage/3VIFSWB3/Scharl et al. - 2017 - Semantic Systems and Visual Tools to Support Envir.pdf:application/pdf}, +} + +@article{li_effect_2014, + title = {The effect of news and public mood on stock movements}, + volume = {278}, + issn = {0020-0255}, + url = {http://www.sciencedirect.com/science/article/pii/S0020025514003879}, + doi = {10.1016/j.ins.2014.03.096}, + abstract = {With technological advancements that cultivate vibrant creation, sharing, and collaboration among Web users, investors can rapidly obtain more valuable and timely information. Meanwhile, the adaption of user engagement in media effectively magnifies the information in the news. With such rapid information influx, investor decisions tend to be influenced by peer and public emotions. An effective methodology to quantitatively analyze the mechanism of information percolation and its degree of impact on stock markets has yet to be explored. In this article, we propose a quantitative media-aware trading strategy to investigate the media impact on stock markets. 
Our main findings are that (1) fundamental information of firm-specific news articles can enrich the knowledge of investors and affect their trading activities; (2) public sentiments cause emotional fluctuations in investors and intervene in their decision making; and (3) the media impact on firms varies according to firm characteristics and article content.}, + urldate = {2016-02-24}, + journal = {Information Sciences}, + author = {Li, Qing and Wang, TieJun and Li, Ping and Liu, Ling and Gong, Qixu and Chen, Yuanzhu}, + month = sep, + year = {2014}, + keywords = {News, Sentiment Analysis, social media, Stock market, text mining}, + pages = {826--840}, + file = {Li et al. - 2014 - The effect of news and public mood on stock moveme.pdf:/home/albert/Zotero/storage/4AIV6JCP/Li et al. - 2014 - The effect of news and public mood on stock moveme.pdf:application/pdf}, +} + +@inproceedings{weichselbraun_extracting_2016, + address = {Kauai, Hawaii}, + title = {Extracting {Opinion} {Targets} from {Environmental} {Web} {Coverage} and {Social} {Media} {Streams}}, + copyright = {All rights reserved}, + abstract = {Policy makers and environmental organizations have a keen interest in awareness building and the evolution of stakeholder opinions on environmental issues. Mere polarity detection, as provided by many existing methods, does not suffice to understand the emergence of collective awareness. Methods for extracting affective knowledge should be able to pinpoint opinion targets within a thread. Opinion target extraction provides a more accurate and fine-grained identification of opinions expressed in online media. This paper compares two different approaches for identifying potential opinion targets and applies them to comments from the YouTube video sharing platform. The first approach is based on statistical keyword analysis in conjunction with sentiment classification on the sentence level. 
The second approach uses dependency parsing to pinpoint the target of an opinionated term. A case study based on YouTube postings applies the developed methods and measures their ability to handle noisy input data from social media streams.}, + booktitle = {Proceedings of the 49th {Hawaii} {International} {Conference} on {System} {Sciences} ({HICSS}-49)}, + publisher = {IEEE Computer Society Press}, + author = {Weichselbraun, Albert and Scharl, Arno and Gindl, Stefan}, + month = jan, + year = {2016}, + note = {Accepted 17 August 2015}, + keywords = {climate change, keyword analysis, Opinion mining, opinion target extraction, sentiment analysis}, +} + +@article{weichselbraun_adapting_2021, + title = {Adapting {Data}-{Driven} {Research} to the {Fields} of {Social} {Sciences} and the {Humanities}}, + volume = {13}, + issn = {1999-5903}, + doi = {10.3390/fi13030059}, + abstract = {Recent developments in the fields of computer science such as advances in the areas of big data, knowledge extraction and deep learning have triggered the application of data-driven research methods to disciplines such as social sciences and humanities. This article presents a collaborative, interdisciplinary process for adapting data-driven research to research questions within other disciplines that considers the methodological background required to obtain a significant impact on the target discipline, guides the systematic collection and formalization of domain knowledge, the selection of appropriate data sources and methods for analyzing, visualizing and interpreting the results. +Finally, we present a case study that applies the described process to the domain of communication science by creating approaches that aid domain experts in locating, tracking, analyzing and finally better understanding the dynamics of media criticism. 
The study clearly demonstrates the potential of the presented method but also shows that data-driven research approaches require a tighter integration with the methodological framework of the target discipline to really provide a significant impact on the target discipline.}, + number = {3}, + journal = {Future Internet}, + author = {Weichselbraun, Albert and Kuntschik, Philipp and Francolino, Vincenzo and Saner, Mirco and Wyss, Vinzenz}, + year = {2021}, + note = {Accepted 22 February 2021}, +} + +@article{reis_transformers_2021, + title = {Transformers aftermath: current research and rising trends}, + volume = {64}, + issn = {0001-0782}, + shorttitle = {Transformers aftermath}, + url = {https://doi.org/10.1145/3430937}, + doi = {10.1145/3430937}, + abstract = {Attention, particularly self-attention, is a standard in current NLP literature, but to achieve meaningful models, attention is not enough.}, + number = {4}, + urldate = {2021-05-18}, + journal = {Communications of the ACM}, + author = {Reis, Eduardo Souza Dos and Costa, Cristiano André Da and Silveira, Diórgenes Eugênio Da and Bavaresco, Rodrigo Simon and Righi, Rodrigo Da Rosa and Barbosa, Jorge Luis Victória and Antunes, Rodolfo Stoffel and Gomes, Márcio Miguel and Federizzi, Gustavo}, + month = mar, + year = {2021}, + pages = {154--163}, + annote = {Summary +This overview article discusses the transformer architecture, ground breaking models and open issues in the use of transformers for natural language understanding. +Background + +most NLP research relies upon a sentence-to-sentence modelling:- input: x\_1, ... x\_n- output: y\_1, .. y\_n-1 - determine y\_n given intermediate representations z\_1, ... 
z\_n +solution: RNNs - predict y\_n given the distribution of previous inputs- problem:vanishing gradient problem; addressed by a forgetting mechanism- training time increases exponentially with the context vector +encoder-decoder architecture: - fed with input tokens x\_i- yields a hidden states h\_1, ... h\_n- the last hidden state is fed into a decoder which yields y\_1, ... y\_n based on h\_n and the decoders hidden state d\_1, ... d\_n +Transformers draw upon attention rather than recurrence.- recurrence is replaced by a stack of encoders and decoders- input x\_1, ... x\_n is projected into three vector spaces: (i) query (decoder), (ii) keys, and (iii) values (encoder)- queries: features relevant to the previous decoder- dot product yields the similarity between the input and the query keys- exponential softmax function increases the gap between relevant keys and less relevant ones. + +Advantages of attention + +multiple forms of attention identify relevant tokens +faster than recurrent layers, when the input sequence is shorter than the context vector + +Types of attention + +self-attention: correlates positions within the same sequence (e.g., sentence) +multihead attention: each attention head yields different weights (i.e., focuses on a different aspect of the problem) similar to an ensemble model. + +Alternative approaches + +CNNs: more parallelized +multi-step attention: decoder receives a matrix of attention weights from the previous decoder and computes its own + +Unsupervised transformers + +unsupervised pre-training based on large corpora +semi-supervised approaches are fine-tuned on task-specific data (e.g., BERT) +task agnostic: use a single model architecture across all tasks (e.g. by using a delimiter between input and target for summarization or question answering tasks) +GPT2: model size should allow for task solving tasks without the fine-tuning step. 
+ +Research directions + +commonsense learning- RNNs outperform transformers when sentence lengths differ too much (!)- combine knowledge graphs or knowledge distillation with unsupervised pre-training- apply data augmentation and increase data diversity instead of data volume (e.g., by paraphrasing sentences for question answering tasks) +multitask learning + +Enhancements + +Transformer-XL model: map longer dependencies among the input tokens by caching context representations +RoBERTa optimizations for BERT-based models: use bigger training batches and remove the next sentence prediction (NSP) step +XLNet: learns dependencies independent of the token order (i.e., sequence of tokens) +ALBERT:remove the number of parameters by sharing parameters across layers (smaller models but computationally more complex) +GPT-3: large-scale model with 175 billion parameters; produces news articles that are hard to distinguish from human-written text. + +Integrating domain knowledge + +should help in mapping meaning rather than "statistical" predictions +two approaches: + +knowledge graphs (leverage prior world knowledge that no longer needs to be stored in the model; reduces the attention search space; enables the ue of graph-based methods) +knowledge distillation (learn a compressed knowledge representation (student) from a huge (teacher model) while minimizing the performance loss- DistilBERT: 97 \% of model performance at 40\% of the size- Shallow models (shallow bidirectional LSTM achieves comparable results to ELMo with 1/100 of the parameters + + + +Interesting: models that draw upon background knowledge (e.g., Amnervaz et al. 2018)}, + file = {Reis et al. - 2021 - Transformers aftermath current research and risin.pdf:/home/albert/Zotero/storage/YPRNSTBE/Reis et al. 
- 2021 - Transformers aftermath current research and risin.pdf:application/pdf}, +} + +@article{ding_jel_2021, + title = {{JEL}: {Applying} {End}-to-{End} {Neural} {Entity} {Linking} in {JPMorgan} {Chase}}, + volume = {35}, + copyright = {Copyright (c) 2021 Association for the Advancement of Artificial Intelligence}, + issn = {2374-3468}, + shorttitle = {{JEL}}, + url = {https://ojs.aaai.org/index.php/AAAI/article/view/17796}, + language = {en}, + number = {17}, + urldate = {2021-06-03}, + journal = {Proceedings of the AAAI Conference on Artificial Intelligence}, + author = {Ding, Wanying and Chaudhri, Vinay K. and Chittar, Naren and Konakanchi, Krishna}, + month = may, + year = {2021}, + note = {Number: 17}, + keywords = {business intelligence, Deep \& Wide Learning, Named entity linking}, + pages = {15301--15308}, + annote = {Summary +Named entity linking in the business domain is a challenging task, since most state of the art methods rely upon comprehensive context information (e.g., from Wikipedia) that is not available for many business entities. +This paper, therefore, addresses an end-to-end neural entity linking model (JEL) that uses minimal context information and margin loss to generate entity embeddings and a Wide \& Deep Learning model that draws upon these embeddings for entity linking. +Method +The authors use (i) spaCy for entity recognition and (ii) their method for linking to identified entities. +Entity Embedding +triple loss model that selects + +10 terms that are positive context examples for an entity, and +10 randomly selected negative examples (how representative are these examples; effectiveness?) + +to compute contextualized entity embeddings Entity Linking + +Character matching: Wide Character Learning with subword information for matching entities (more effective than verbatim matching since it also considers typos) +Semantic matching: Use deep semantic embeddings to embed mentions based on the context into a vector. 
Compute the similarity between mention-vectors and entity-vectors based on Euclidean distance. + +Application +similar to WISDOM - identify businesses in financial news and propagate the impact of the coverage (e.g., financial difficulties) along the supply chain.}, + file = {Snapshot:/home/albert/Zotero/storage/53XPV5G9/17796.html:text/html;Ding et al. - 2021 - JEL Applying End-to-End Neural Entity Linking in .pdf:/home/albert/Zotero/storage/4KWA6M6F/Ding et al. - 2021 - JEL Applying End-to-End Neural Entity Linking in .pdf:application/pdf}, +} + +@article{fu_spanner_2021, + title = {{SpanNER}: {Named} {Entity} {Re}-/{Recognition} as {Span} {Prediction}}, + shorttitle = {{SpanNER}}, + url = {http://arxiv.org/abs/2106.00641}, + abstract = {Recent years have seen the paradigm shift of Named Entity Recognition (NER) systems from sequence labeling to span prediction. Despite its preliminary effectiveness, the span prediction model's architectural bias has not been fully understood. In this paper, we first investigate the strengths and weaknesses when the span prediction model is used for named entity recognition compared with the sequence labeling framework and how to further improve it, which motivates us to make complementary advantages of systems based on different paradigms. We then reveal that span prediction, simultaneously, can serve as a system combiner to re-recognize named entities from different systems' outputs. We experimentally implement 154 systems on 11 datasets, covering three languages, comprehensive results show the effectiveness of span prediction models that both serve as base NER systems and system combiners. We make all code and datasets available: {\textbackslash}url\{https://github.com/neulab/spanner\}, as well as an online system demo: {\textbackslash}url\{http://spanner.sh\}. 
Our model also has been deployed into the ExplainaBoard platform, which allows users to flexibly perform a system combination of top-scoring systems in an interactive way: {\textbackslash}url\{http://explainaboard.nlpedia.ai/leaderboard/task-ner/\}.}, + urldate = {2021-06-11}, + journal = {arXiv:2106.00641 [cs]}, + author = {Fu, Jinlan and Huang, Xuanjing and Liu, Pengfei}, + month = jun, + year = {2021}, + note = {arXiv: 2106.00641}, + keywords = {Computer Science - Computation and Language}, + annote = {Comment: Accepted by ACL 2021 (Main track)}, + file = {arXiv.org Snapshot:/home/albert/Zotero/storage/4LF375ZK/2106.html:text/html;Fu et al_2021_SpanNER.pdf:/home/albert/Zotero/storage/3IEZ8VGV/Fu et al_2021_SpanNER.pdf:application/pdf}, +} + +@article{convertino_usefulness_2018, + title = {The usefulness of listening social media for pharmacovigilance purposes: a systematic review}, + volume = {17}, + issn = {1744-764X}, + shorttitle = {The usefulness of listening social media for pharmacovigilance purposes}, + doi = {10.1080/14740338.2018.1531847}, + abstract = {INTRODUCTION: Social media mining could be a possible strategy to retrieve drug safety information. The mining of social media is a complex process under progressive evolution, falling into three broad categories: listening (safety data reporting), engaging (follow-up), and broadcasting (risk communication). This systematic review is aimed at evaluating the usefulness and quality of proto-signals by social media listening. Areas covered: In this systematic search, performed according to MOOSE and PRISMA statements, we selected studies, published in MEDLINE, EMBASE, and Google Scholar until 31 December 2017, that listened at least one social media to identify proto-adverse drug events and proto-signals. Expert opinion: The selected 38 studies identified serious and unexpected proto-adverse drug events characterized by poorer information quality as compared with spontaneous reporting databases. 
This feature allows rarely the evaluation of causal relationships. Proto-signals identified by social media listening had the potential of anticipating pre-specified known signals in only six studies. Moreover, the personal perception of patients reported in social media could be used to implement effective risk communication strategies. However, signal detection in social media cannot be currently recommended for routine pharmacovigilance, due to logistic and technical issues.}, + language = {eng}, + number = {11}, + journal = {Expert Opinion on Drug Safety}, + author = {Convertino, Irma and Ferraro, Sara and Blandizzi, Corrado and Tuccori, Marco}, + month = nov, + year = {2018}, + pmid = {30285501}, + keywords = {Adverse Drug Reaction Reporting Systems, Data Mining, Databases, Factual, Drug-Related Side Effects and Adverse Reactions, Humans, pharmacovigilance, Pharmacovigilance, proto-signal, signal detection, Social media, Social Media}, + pages = {1081--1093}, + file = {Convertino et al_2018_The usefulness of listening social media for pharmacovigilance purposes.pdf:/home/albert/Zotero/storage/TKI5AH7M/Convertino et al_2018_The usefulness of listening social media for pharmacovigilance purposes.pdf:application/pdf}, +} + +@article{harris_distributional_1954, + title = {Distributional {Structure}}, + volume = {10}, + issn = {0043-7956}, + url = {https://doi.org/10.1080/00437956.1954.11659520}, + doi = {10.1080/00437956.1954.11659520}, + number = {2-3}, + urldate = {2021-06-29}, + journal = {WORD}, + author = {Harris, Zellig S.}, + month = aug, + year = {1954}, + note = {Publisher: Routledge +\_eprint: https://doi.org/10.1080/00437956.1954.11659520}, + pages = {146--162}, + file = {Harris_1954_Distributional Structure.pdf:/home/albert/Zotero/storage/59ZE8S3N/Harris_1954_Distributional Structure.pdf:application/pdf;Snapshot:/home/albert/Zotero/storage/HTSHI7XH/00437956.1954.html:text/html}, +} diff --git a/docs/paper/paper.md b/docs/paper/paper.md new file mode 
100644 index 0000000..d533802 --- /dev/null +++ b/docs/paper/paper.md @@ -0,0 +1,72 @@ +--- +title: 'Inscriptis - A Python-based HTML to text conversion library optimized for knowledge extraction from the Web' +tags: + - Python + - web mining + - knowledge extraction + - text conversion + - gold standard creation + - annotated text output +authors: + - name: Albert Weichselbraun + orcid: 0000-0001-6399-045X + affiliation: 1 +affiliations: + - name: University of Applied Sciences of the Grisons, Chur, Switzerland + index: 1 +date: 25 June 2021 +bibliography: paper.bib +--- + +# Summary + +``Inscriptis`` provides a library, command line client and Web service for converting HTML content to plain text. In contrast to existing software packages such as [HTML2text](https://github.com/Alir3z4/html2text/), [jusText](https://github.com/miso-belica/jusText/) and [Lynx](https://lynx.invisible-island.net/), it has been tailored towards knowledge extraction pipelines by + +1. providing a layout-aware conversion of textual output that more closely resembles the rendering obtained from standard Web browsers. ``Inscriptis`` excels in terms of conversion quality, since it correctly converts complex HTML constructs such as nested tables and also interprets a subset of HTML (e.g., `align`, `valign`) and CSS (e.g., `display`, `white-space`, `margin-top`, `vertical-align`, etc.) attributes that determine the text alignment. + 2. supporting annotation rules, i.e., user-provided mappings that allow for annotating the extracted text based on structural and semantic information encoded in HTML tags and attributes used for controlling structure and layout in the original HTML document. + +These unique features ensure that downstream Knowledge Extraction components can operate on accurate text representations without drawing upon a heavyweight solution such as [Selenium](https://www.selenium.dev/) which requires interaction with a full-fledged Web browser. 
In addition, its optional annotation support enables downstream components to use information on the structure of the original HTML document. + + +# Statement of need + +Research in a growing number of scientific disciplines relies upon Web content. @li_effect_2014, for instance, studied the impact of company-specific News coverage on stock prices, in medicine and pharmacovigilance social media listening plays an important role in gathering insights into patient needs and the monitoring of adverse drug effects [@convertino_usefulness_2018], and communication sciences draw upon media coverage to obtain information on the perception and framing of issues as well as on the rise and fall of topics within News and social media [@scharl_semantic_2017; @weichselbraun_adapting_2021]. + +Computer science focuses on analyzing content by applying knowledge extraction techniques such as entity recognition [@fu_spanner_2021] to automatically identify entities (e.g., persons, organizations, locations, products, etc.) within text documents, entity linking [@ding_jel_2021] to link these entities to knowledge bases (e.g., Wikidata and DBPedia), and sentiment analysis to automatically assess sentiment polarity (i.e., positive versus negative coverage) and emotions expressed towards these entities [@wang_review_2020]. + +Most knowledge extraction methods operate on text and, therefore, require an accurate conversion of HTML content which also preserves the spatial alignment between text elements. This is particularly true for methods drawing upon algorithms which directly or indirectly leverage information on the proximity between terms, such as word embeddings [@mikolov_distributed_2013; @pennington_glove:_2014] and language models [@reis_transformers_2021], sentiment analysis which often also considers the distance between target and sentiment terms, and automatic keyword and phrase extraction techniques. 
+ +Despite this need from within the research community, many standard HTML to text conversion techniques are not layout aware, yielding text representations that fail to preserve the spatial properties of text snippets, as illustrated below. + +![Text representation of a table from DBpedia computed by ``Inscriptis`` (left) and lynx (right). Lynx fails to correctly interpret the cascaded table and, therefore, does not properly align the temperature values.](images/inscriptis-vs-lynx.png) + +``Inscriptis`` is not only able to correctly render such pages but also offers the option to preserve parts of the original HTML document's semantics (e.g., information on headings, emphasised text, tables, etc.) by complementing the extracted text with annotations obtained from the document. \autoref{fig:annotations} provides an example of annotations extracted from a Wikipedia page. These annotations might be useful for + +- aiding downstream knowledge extraction components with additional information that may be leveraged to improve their respective performance. Text summarization techniques, for instance, can put a stronger emphasis on paragraphs that contain bold and italic text, and sentiment analysis may consider this information in addition to textual clues such as uppercase text. +- assisting manual document annotation processes (e.g., for qualitative analysis or gold standard creation). ``Inscriptis`` supports multiple export formats such as XML, annotated HTML and the JSONL format that is used by the open source annotation tool [doccano](https://github.com/doccano/doccano)^[Please note that doccano currently does not support overlapping annotations and, therefore, cannot import files containing overlapping annotations.]. Support for further annotation formats can be easily added by implementing custom annotation processors. 
+- enabling the use of ``Inscriptis`` for tasks such as content extraction (i.e., extract task-specific relevant content from a Web page) which rely on information on the HTML document's structure. + +![Annotations extracted from the DBpedia entry for Chur using the ``--postprocessor html`` command line option.\label{fig:annotations}](images/annotations.png) + +In conclusion, ``Inscriptis`` provides knowledge extraction components with high quality conversions of HTML documents. +Since its first public release in March 2016, ``Inscriptis`` has been downloaded over 121,000 times from the Python Package Index (PyPI)^[Source: https://pepy.tech/project/inscriptis], has proven its capabilities in national and European research projects and has been integrated into commercial products such as the [webLyzard Web Intelligence and Visual Analytics Platform](https://www.weblyzard.com/visual-analytics-dashboard/). + + + +# Mentions + +The following research projects use ``Inscriptis`` within their knowledge extraction pipelines: + +- [CareerCoach](https://www.fhgr.ch/CareerCoach): Automatic Knowledge Extraction and Recommender Systems for Personalized Re- and Upskilling suggestions funded by Innosuisse. +- [EPOCH project](https://www.epoch-project.eu) funded by the Austrian Federal Ministry for Climate Action, Environment, Energy, Mobility and Technology (BMK) via the ICT of the Future Program. +- [MedMon](https://www.fhgr.ch/medmon): Monitoring of Internet Resources for Pharmaceutical Research and Development funded by Innosuisse. +- [ReTV project](https://www.retv-project.eu) funded by the European Union’s Horizon 2020 Research and Innovation Programme. + + +# Acknowledgements + +Work on ``Inscriptis`` has been conducted within the MedMon and CareerCoach projects funded by Innosuisse. 
+ + +# References + diff --git a/docs/paper/paper.pdf b/docs/paper/paper.pdf new file mode 100644 index 0000000..c4cfca4 Binary files /dev/null and b/docs/paper/paper.pdf differ diff --git a/examples/annotation-profile.json b/examples/annotation-profile.json new file mode 100644 index 0000000..4dfa31b --- /dev/null +++ b/examples/annotation-profile.json @@ -0,0 +1,14 @@ +{ + "h1": ["heading"], + "h2": ["heading"], + "h3": ["heading"], + "h4": ["heading"], + "h5": ["heading"], + "b": ["emphasis"], + "div#class=toc": ["table-of-contents"], + "#class=FactBox": ["fact-box"], + "#class=shortdescription": ["description"], + "table": ["table"], + "tr": ["row"], + "td": ["cell"] +} diff --git a/examples/table-annotation-profile.json b/examples/table-annotation-profile.json new file mode 100644 index 0000000..d00bb3a --- /dev/null +++ b/examples/table-annotation-profile.json @@ -0,0 +1,7 @@ +{ + "table": ["table"], + "th": ["table-heading"], + "tr": ["table-row"], + "td": ["table-cell"], + "b": ["emphasis"] +} diff --git a/examples/unittest.json b/examples/unittest.json new file mode 100644 index 0000000..48a58ec --- /dev/null +++ b/examples/unittest.json @@ -0,0 +1,7 @@ +{ + "h1": ["heading"], + "h2": ["heading"], + "h3": ["heading"], + "b": ["emphasis"], + "table": ["table"] +} diff --git a/examples/wikipedia.json b/examples/wikipedia.json new file mode 100644 index 0000000..c81dfe5 --- /dev/null +++ b/examples/wikipedia.json @@ -0,0 +1,12 @@ +{ + "h1": ["heading"], + "h2": ["heading"], + "h3": ["subheading"], + "h4": ["subheading"], + "h5": ["subheading"], + "i": ["emphasis"], + "b": ["bold"], + "table": ["table"], + "th": ["tableheading"], + "a": ["link"] +} diff --git a/scripts/inscript.py b/scripts/inscript.py index fd601f6..c6b5bc4 100755 --- a/scripts/inscript.py +++ b/scripts/inscript.py @@ -1,34 +1,49 @@ #!/usr/bin/env python3 # coding:utf-8 -''' -Inscriptis command line client. 
-''' +"""Inscriptis command line client.""" import argparse import sys +from json import load, dumps from pathlib import Path import requests -from inscriptis import __version__, __copyright__, __license__ -from inscriptis import get_text +from inscriptis import get_text, get_annotated_text +from inscriptis.metadata import __version__, __copyright__, __license__ from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig +DEFAULT_ENCODING = 'utf8' + + +def get_postprocessor(name): + """Return the postprocessor (if available) for the given name. + + Args: + name: the name of the postprocessor + + Returns: + The matching postprocessing function + """ + pp_class = name.capitalize() + 'Extractor' + mod = __import__('inscriptis.annotation.output.' + name, + fromlist=[pp_class]) + return getattr(mod, pp_class)() + def get_parser(): - """ Parses the arguments if script is run directly via console """ + """Parse the arguments if script is run via console.""" parser = argparse.ArgumentParser( - description='Converts HTML from file or url to a clean text version') + description='Convert the given HTML document to text.') parser.add_argument('input', nargs='?', default=None, - help='Html input either from a file or an url ' - '(default:stdin)') + help='Html input either from a file or a URL ' + '(default:stdin).') parser.add_argument('-o', '--output', type=str, help='Output file (default:stdout).') parser.add_argument('-e', '--encoding', type=str, - help='Content encoding for reading and writing files ' - '(default:utf-8)', - default='utf-8') + help='Input encoding to use (default:utf-8 for ' + 'files; detected server encoding for Web URLs).') parser.add_argument('-i', '--display-image-captions', action='store_true', default=False, help='Display image captions (default:false).') @@ -41,6 +56,13 @@ def get_parser(): parser.add_argument('-a', '--display-anchor-urls', action='store_true', default=False, help='Deduplicate image captions 
(default:false).') + parser.add_argument('-r', '--annotation-rules', default=None, + help='Path to an optional JSON file containing rules ' + 'for annotating the retrieved text.') + parser.add_argument('-p', '--postprocessor', type=get_postprocessor, + default=lambda x: x, + help='Optional component for postprocessing the ' + 'result (html, surface, xml). ') parser.add_argument('--indentation', default='extended', help='How to handle indentation (extended or strict;' ' default: extended).') @@ -50,14 +72,13 @@ def get_parser(): return parser -if __name__ == "__main__": +if __name__ == '__main__': parser = get_parser() args = parser.parse_args() if args.version: - print('Inscript HTML to text conversion ' - '(based on the inscriptis library version {0})'.format( - __version__)) + print('Inscript HTML to text conversion (based on the inscriptis ' + 'library version {0})'.format(__version__)) print('Copyright (C)', __copyright__) print('\nInscript comes with ABSOLUTELY NO WARRANTY.') print('This is free software and you are welcome to redistribute it ' @@ -67,26 +88,48 @@ def get_parser(): if not args.input: html_content = sys.stdin.read() elif Path(args.input).is_file(): - with Path(args.input).open(encoding=args.encoding, + with Path(args.input).open(encoding=args.encoding or DEFAULT_ENCODING, errors='ignore') as f: html_content = f.read() - elif args.input.startswith("http://") or args.input.startswith("https://"): - html_content = requests.get(args.input).text + elif args.input.startswith('http://') or args.input.startswith('https://'): + req = requests.get(args.input) + html_content = req.content.decode(args.encoding or req.encoding) else: print("ERROR: Cannot open input file '{0}'.\n".format(args.input)) parser.print_help() sys.exit(-1) + if args.annotation_rules: + try: + with Path(args.annotation_rules).open() as f: + annotation_rules = load(f) + except IOError: + print("ERROR: Cannot open annotation rule file '{0}'.".format( + args.annotation_rules + )) + 
sys.exit(-1) + else: + annotation_rules = None + css_profile = CSS_PROFILES['relaxed'] if args.indentation == 'extended' \ else CSS_PROFILES['strict'] config = ParserConfig(css=css_profile, display_images=args.display_image_captions, deduplicate_captions=args.deduplicate_image_captions, display_links=args.display_link_targets, - display_anchors=args.display_anchor_urls) - text = get_text(html_content, config) + display_anchors=args.display_anchor_urls, + annotation_rules=annotation_rules) + if not annotation_rules: + output = get_text(html_content, config) + else: + output = args.postprocessor( + get_annotated_text(html_content, config)) + if hasattr(args.postprocessor, 'verbatim') \ + and not args.postprocessor.verbatim: + output = dumps(output) + if args.output: - with Path(args.output).open('w', encoding=args.encoding) as open_file: - open_file.write(text) + with Path(args.output).open('w', encoding=DEFAULT_ENCODING) as open_file: + open_file.write(output) else: - print(text) + print(output) diff --git a/scripts/web-service.py b/scripts/web-service.py index 218977d..cbd9d75 100755 --- a/scripts/web-service.py +++ b/scripts/web-service.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 # coding:utf-8 -''' -Inscriptis Web Service -''' +"""Inscriptis Web Service.""" from flask import request, Response, Flask -from inscriptis import get_text, __version__ + +from inscriptis import get_text +from inscriptis.metadata import __version__ from inscriptis.css_profiles import RELAXED_CSS_PROFILE from inscriptis.model.config import ParserConfig @@ -14,17 +14,15 @@ deduplicate_captions=True, display_links=False) -@app.route("/") +@app.route('/') def index(): - return "Hello" + """Print a short status message for the Web service's base URL.""" + return 'Inscriptis text to HTML Web service.' -@app.route("/get_text", methods=['POST']) +@app.route('/get_text', methods=['POST']) def get_text_call(): - ''' - Returns: - the text representation of the given HTML content. 
- ''' + """Return the text representation of the given HTML content.""" content_type = request.headers['Content-type'] if '; encoding=' in content_type: encoding = content_type.split('; encoding=')[1] @@ -35,15 +33,12 @@ def get_text_call(): return Response(text, mimetype='text/plain') -@app.route("/version", methods=['GET']) +@app.route('/version', methods=['GET']) def get_version_call(): - ''' - Returns: - the used inscriptis version. - ''' + """Return the used inscriptis version.""" return Response(__version__ + '\n', mimetype='text/plain') if __name__ == '__main__': - print("Starting Web service based on Inscriptis", __version__) + print('Starting Web service based on Inscriptis', __version__) app.run(threaded=True, host='0.0.0.0', port=5000) diff --git a/setup.py b/setup.py index c55d113..2415393 100644 --- a/setup.py +++ b/setup.py @@ -1,30 +1,30 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- -import sys +"""Inscriptis setup script.""" + from pathlib import Path from setuptools import setup, find_packages from os import path -here = Path(path.dirname(__file__)).resolve() -sys.path.insert(0, path.join(str(here), 'src')) - -from inscriptis import (__version__, __author__, __author_email__, __license__) +here = Path(path.dirname(__file__)).resolve() +# get version information +with here.joinpath('src/inscriptis/metadata.py').open() as f: + exec(f.read()) # Get the long description from the README.md file -with here.joinpath(Path('README.rst')).open() as f: # , encoding='utf-8' +with here.joinpath('README.rst').open() as f: # , encoding='utf-8' long_description = f.read() setup( # Metadata - name="inscriptis", + name='inscriptis', version=__version__, description='inscriptis - HTML to text converter.', long_description=long_description, author=__author__, author_email=__author_email__, - python_requires='>=3.5', + python_requires='>=3.6', classifiers=[ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', @@ -33,7 +33,6 @@ 'Topic :: 
Text Processing :: Markup :: HTML', 'Topic :: Utilities', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', @@ -56,5 +55,5 @@ install_requires=[ 'lxml', 'requests' - ] + ], ) diff --git a/src/inscriptis/__init__.py b/src/inscriptis/__init__.py index fe43741..3ee9215 100644 --- a/src/inscriptis/__init__.py +++ b/src/inscriptis/__init__.py @@ -1,11 +1,15 @@ -""" -Inscriptis parses HTML content and converts it into a text representation. -Among others it provides support for +r"""Parse HTML content and converts it into a text representation. + +Inscriptis provides support for + + - nested HTML tables + - basic Cascade Style Sheets + - annotations -- nested HTML tables and -- basic Cascade Style Sheets. +The following example provides the text representation of +``_. -Example:: +.. code:: import urllib.request from inscriptis import get_text @@ -17,52 +21,114 @@ print(text) -""" +Use the method :meth:`~inscriptis.get_annotated_text` to obtain text and +annotations. The method requires annotation rules as described in annotations_. -__author__ = 'Albert Weichselbraun, Fabian Odoni' -__author_email__ = 'albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch' -__copyright__ = '2016-2021 Albert Weichselbraun, Fabian Odoni' -__license__ = 'Apache 2.0' -__version__ = '1.2' +.. code:: + import urllib.request + from inscriptis import get_annotated_text -try: - import re - from lxml.html import fromstring + url = "https://www.fhgr.ch" + html = urllib.request.urlopen(url).read().decode('utf-8') - from inscriptis.html_engine import Inscriptis + # annotation rules specify the HTML elements and attributes to annotate. 
+ rules = {'h1': ['heading'], + 'h2': ['heading'], + '#class=FactBox': ['fact-box'], + 'i': ['emphasis']} -except ImportError: - import warnings - warnings.warn( - "Missing dependencies - inscriptis has not been properly installed") + output = get_annotated_text(html, ParserConfig(annotation_rules=rules) + print("Text:", output['text']) + print("Annotations:", output['label']) +The method returns a dictionary with two keys: -RE_STRIP_XML_DECLARATION = re.compile(r'^<\?xml [^>]+?\?>') + 1. `text` which contains the page's plain text and + 2. `label` with the annotations in JSONL format that is used by annotators + such as `doccano `_. +Annotations in the `label` field are returned as a list of triples with + `start index`, `end index` and `label` as indicated below: + +.. code-block:: json + + {"text": "Chur\n\nChur is the capital and largest town of the Swiss canton + of the Grisons and lies in the Grisonian Rhine Valley.", + "label": [[0, 4, "heading"], [6, 10, "emphasis"]]} + +""" + +import re +import lxml.html + +from typing import Dict, Optional, Any + +from inscriptis.model.config import ParserConfig +from inscriptis.html_engine import Inscriptis + +RE_STRIP_XML_DECLARATION = re.compile(r'^<\?xml [^>]+?\?>') -def get_text(html_content, config=None): - """ - Converts an HTML string to text, optionally including and deduplicating - image captions, displaying link targets and using either the standard - or extended indentation strategy. +def _get_html_tree(html_content: str) -> Optional[lxml.html.HtmlElement]: + """Obtain the HTML parse tree for the given HTML content. Args: - html_content (str): the HTML string to be converted to text. - config: An optional ParserConfig object. + html_content: The content to parse. Returns: - str -- The text representation of the HTML content. + The corresponding HTML parse tree. 
""" html_content = html_content.strip() if not html_content: - return '' + return None # strip XML declaration, if necessary if html_content.startswith(' str: + """Provide a text representation of the given HTML content. + + Args: + html_content (str): The HTML content to convert. + config: An optional ParserConfig object. + + Returns: + The text representation of the HTML content. + """ + html_tree = _get_html_tree(html_content) + return Inscriptis(html_tree, config).get_text() if html_tree is not None \ + else '' + + +def get_annotated_text(html_content: str, + config: ParserConfig = None) -> Dict[str, Any]: + """Return a dictionary of the extracted text and annotations. + + Notes: + - the text is stored under the key 'text'. + - annotations are provided under the key 'label' which contains a + list of :class:`Annotation`s. + + Examples: + {"text": "EU rejects German call to boycott British lamb.", " + label": [ [0, 2, "strong"], ... ]} + {"text": "Peter Blackburn", + "label": [ [0, 15, "heading"] ]} + + Returns: + A dictionary of text (key: 'text') and annotations (key: 'label') + """ + html_tree = _get_html_tree(html_content) + if html_tree is None: + return {} + + inscriptis = Inscriptis(html_tree, config) + labels = [(a.start, a.end, a.metadata) + for a in inscriptis.get_annotations()] + return {'text': inscriptis.get_text(), + 'label': labels} diff --git a/src/inscriptis/annotation/__init__.py b/src/inscriptis/annotation/__init__.py new file mode 100644 index 0000000..3d2b626 --- /dev/null +++ b/src/inscriptis/annotation/__init__.py @@ -0,0 +1,60 @@ +"""The model used for saving annotations.""" + +from typing import NamedTuple, Tuple +from typing import List + +from inscriptis.html_properties import HorizontalAlignment + + +class Annotation(NamedTuple): + """An Inscriptis annotation which provides metadata on the extracted text. 
+ + The :attr:`start` and :attr:`end` indices indicate the span of the text + to which the metadata refers, and the attribute :attr:`metadata` contains + the tuple of tags describing this span. + + Example:: + + Annotation(0, 10, ('heading', )) + + The annotation above indicates that the text span between the 1st (index 0) + and 11th (index 10) character of the extracted text contains a *heading*. + """ + + start: int + """the annotation's start index within the text output.""" + end: int + """the annotation's end index within the text output.""" + metadata: Tuple[str] + """a tuple of tags to be attached to the annotation.""" + + +def horizontal_shift(annotations: List[Annotation], content_width: int, + line_width: int, align: HorizontalAlignment, + shift: int = 0) -> List[Annotation]: + r"""Shift annotations based on the given line's formatting. + + Adjusts the start and end indices of annotations based on the line's + formatting and width. + + Args: + annotations: a list of Annotations. + content_width: the width of the actual content + line_width: the width of the line in which the content is placed. + align: the horizontal alignment (left, right, center) to assume for + the adjustment + shift: an optional additional shift + + Returns: + A list of :class:`Annotation`\s with the adjusted start and end + positions. + """ + if align == HorizontalAlignment.left: + h_align = shift + elif align == HorizontalAlignment.right: + h_align = shift + line_width - content_width + else: + h_align = shift + (line_width - content_width) // 2 + + return [Annotation(a.start + h_align, a.end + h_align, a.metadata) + for a in annotations] diff --git a/src/inscriptis/annotation/output/__init__.py b/src/inscriptis/annotation/output/__init__.py new file mode 100644 index 0000000..41a7fb2 --- /dev/null +++ b/src/inscriptis/annotation/output/__init__.py @@ -0,0 +1,45 @@ +r""":class:`AnnotationProcessor`\s transform annotations to an output format. 
+ +All AnnotationProcessor's implement the :class:`AnnotationProcessor` interface +by overwrite the class's :meth:`AnnotationProcessor.__call__` method. + +.. note:: + 1. The AnnotationExtractor class must be put into a package with the + extractor's name (e.g., :mod:`inscriptis.annotation.output.*package*`) + and be named :class:`*PackageExtractor*` (see the examples below). + 2. The overwritten :meth:`__call__` method may either extend the original + dictionary which contains the extracted text and annotations (e.g., + :class:`~inscriptis.annotation.output.surface.SurfaceExtractor`) or + may replace it with an custom output (e.g., + :class:`~inscriptis.annotation.output.html.HtmlExtractor` and + :class:`~inscriptis.annotation.output.xml.XmlExtractor`. + +Currently, Inscriptis supports the following built-in AnnotationProcessors: + + 1. :class:`~inscriptis.annotation.output.html.HtmlExtractor` provides an + annotated HTML output format. + 2. :class:`~inscriptis.annotation.output.xml.XmlExtractor` yields an output + which marks annotations with XML tags. + 3. :class:`~inscriptis.annotation.output.surface.SurfaceExtractor` adds the + key `surface` to the result dictionary which contains the surface forms + of the extracted annotations. + +""" +from typing import Dict, Any + + +class AnnotationProcessor: + """An AnnotationProcessor is called for formatting annotations.""" + + def __call__(self, annotated_text: Dict[str, str]) -> Any: + """Format the given text and annotations. + + Args: + annotated_text: a dictionary that contains the converted text and + all annotations that have been found. + + Returns: + An output representation that has been changed according to the + AnnotationProcessor's design. 
+ """ + raise NotImplementedError diff --git a/src/inscriptis/annotation/output/html.py b/src/inscriptis/annotation/output/html.py new file mode 100644 index 0000000..310f935 --- /dev/null +++ b/src/inscriptis/annotation/output/html.py @@ -0,0 +1,103 @@ +"""HTML Annotation Processor.""" +from collections import defaultdict +from itertools import cycle +from typing import Dict, Any, List + +from inscriptis.annotation.output import AnnotationProcessor + +COLOR_SCHEMA = ('#D8115980', '#8F2D5680', '#21838080', + '#FBB13C80', '#73D2DE80') + + +class HtmlExtractor(AnnotationProcessor): + """Provides an HTML version of the extracted text. + + The generated HTML colors annotations based on the COLOR_SCHEMA + constant. + """ + + verbatim = True + + def __call__(self, annotated_text: Dict[str, Any]) -> str: + tag_indices = defaultdict(list) + + for start, end, label in sorted(annotated_text['label']): + tag_indices[start].append(label) + tag_indices[end].append('/' + label) + + open_tags = [] + tagged_content = ['
']
+        for idx, ch in enumerate(annotated_text['text']):
+            if idx in tag_indices:
+                tags = tag_indices[idx]
+                # close tags:
+                for _ in (t for t in sorted(tags, reverse=True)
+                          if t.startswith('/')):
+                    open_tags.pop()
+                    tagged_content.append('')
+                # open tags
+                for tag in (t for t in sorted(tags, reverse=True)
+                            if not t.startswith('/')):
+                    open_tags.append(tag)
+                    tagged_content.append(
+                        '{tag}'
+                        ''.format(tag=tag))
+
+            if ch == '\n':
+                tagged_content.extend(['' for _ in open_tags])
+                tagged_content.append('
\n
')
+                tagged_content.extend([''.format(tag=tag)
+                                       for tag in open_tags])
+            else:
+                tagged_content.append(ch)
+
+        return ''.join(tagged_content) + '
' + + @staticmethod + def _get_label_colors(labels: List[str]) -> Dict[str, str]: + """Compute the mapping between annotation labels and colors. + + The used color schema is available in the global variable COLOR_SCHEMA. + + Args: + labels: a list of the annotations classes (e.g., heading, etc.) + that need to be color-coded. + Returns: + A mapping between the available labels and the corresponding color + from the COLOR_SCHEMA. + """ + return {label: color + for label, color in zip({a[2] for a in sorted(labels)}, + cycle(COLOR_SCHEMA))} + + def _get_css(self, labels: List[str]) -> str: + """Compute the CSS to be included into the HTML output. + + Args: + labels: a list of the annotations classes (e.g., heading, etc.) + that need to be color-coded. + + Returns: + A string containing the CSS to be embedded into the HTML output. + + """ + css = [] + for label, color in sorted(self._get_label_colors(labels).items()): + css.append( + 'pre{{' + ' position: relative;\n' + '}}\n' + '.{label} {{\n' + ' background-color: {color};\n' + ' border-radius: 0.4em;\n' + '}}\n' + '.{label}-label {{\n' + ' top: -1.0em;\n' + ' content: "{label}";\n' + ' position: absolute;\n' + ' background-color: {color};\n' + ' font-size: 75%; }}\n'.format(label=label, + color=color)) + return '\n'.join(css) diff --git a/src/inscriptis/annotation/output/surface.py b/src/inscriptis/annotation/output/surface.py new file mode 100644 index 0000000..52472d4 --- /dev/null +++ b/src/inscriptis/annotation/output/surface.py @@ -0,0 +1,27 @@ +"""Surface Form Annotation Processor.""" +from typing import Dict, Any + +from inscriptis.annotation.output import AnnotationProcessor + + +class SurfaceExtractor(AnnotationProcessor): + """Extracts the surface form of all annotated labels.""" + + verbatim = False + + def __call__(self, annotated_text: Dict[str, Any]) -> Dict[str, Any]: + """ + Add information on the surface forms to the annotated_text dictionary. 
+ + Args: + annotated_text: a dictionary containing the plain text and the + extracted annotations. + + Returns: + An extended dictionary which contains the extracted surface-forms + of the annotations under the key 'surface'. + """ + surface_forms = [(label, annotated_text['text'][start:end]) + for start, end, label in annotated_text['label']] + annotated_text['surface'] = surface_forms + return annotated_text diff --git a/src/inscriptis/annotation/output/xml.py b/src/inscriptis/annotation/output/xml.py new file mode 100644 index 0000000..6e07f90 --- /dev/null +++ b/src/inscriptis/annotation/output/xml.py @@ -0,0 +1,44 @@ +"""XML Annotation processor.""" +from collections import defaultdict +from typing import Dict, Any + +from inscriptis.annotation.output import AnnotationProcessor + + +class XmlExtractor(AnnotationProcessor): + """Provide the converted text with XML-style annotations.""" + + verbatim = True + + def __call__(self, annotated_text: Dict[str, Any]) -> str: + """Provide an XML version of the given text and annotations. + + Args: + annotated_text: a dictionary containing the plain text and the + extracted annotations. + + Returns: + A string with the XML-version of the content. 
+ """ + tag_indices = defaultdict(list) + + for start, end, label in sorted(annotated_text['label']): + tag_indices[start].append(label) + tag_indices[end].append('/' + label) + + current_idx = 0 + tagged_content = [''] + text = annotated_text['text'] + for index, tags in sorted(tag_indices.items()): + tagged_content.append(text[current_idx:index]) + # close tags + tagged_content.extend(['<' + tag + '>' + for tag in sorted(tags, reverse=True) + if tag.startswith('/')]) + # open tags + tagged_content.extend(['<' + tag + '>' for tag in sorted(tags) + if not tag.startswith('/')]) + current_idx = index + tagged_content.append(text[current_idx:]) + + return ''.join(tagged_content) diff --git a/src/inscriptis/annotation/parser.py b/src/inscriptis/annotation/parser.py new file mode 100644 index 0000000..500df4f --- /dev/null +++ b/src/inscriptis/annotation/parser.py @@ -0,0 +1,101 @@ +"""Parse annotation configuration files. + +Annotation configuration files contain a dictionary that maps tags and +attributes to the corresponding annotation. + + - tags are referenced by their name + - attributes by a `#` (e.g., `#class`) and an optional selector (e.g., + `#class=short-description`) + +Example:: + + { + "h1": ["heading"], + "b": ["emphasis"], + "div#class=toc": ["table-of-contents"], + "#class=short-description]": ["description"] + } +""" +from collections import defaultdict +from copy import copy + +from inscriptis.model.html_element import HtmlElement, DEFAULT_HTML_ELEMENT + + +class ApplyAnnotation: + """Apply an Annotation to the given attribute. + + Arguments: + annotations: a tuple of annotations to be applied to the attribute. + attr: the name of the attribute. + match_tag: only apply annotations to attributes that belong to the + given match_tag. + match_value: only apply annotations to attribute with the given + match_value. 
+ """ + + __slots__ = ('annotations', 'match_tag', 'match_value', 'attr', 'matcher') + + def __init__(self, annotations: tuple, attr: str, match_tag: str = None, + match_value: str = None): + self.annotations = tuple(annotations) + self.attr = attr + self.match_tag = match_tag + self.match_value = match_value + + def apply(self, attr_value: str, html_element: HtmlElement): + """Apply the annotation to HtmlElements with matching tags.""" + if (self.match_tag and self.match_tag != html_element.tag) or ( + self.match_value and self.match_value + not in attr_value.split()): + return + + html_element.annotation += self.annotations + + def __str__(self): + return ' 'AnnotationModel': + """Compute the AnnotationModel from a model dictionary. + + Returns: + the AnnotationModel matching the input dictionary. + """ + tags = defaultdict(list) + attrs = [] + for key, annotations in model.items(): + if '#' in key: + tag, attr = key.split('#') + if '=' in attr: + attr, value = attr.split('=') + else: + value = None + attrs.append(ApplyAnnotation(annotations, attr, + tag, value)) + else: + tags[key].extend(annotations) + return tags, attrs diff --git a/src/inscriptis/css_profiles.py b/src/inscriptis/css_profiles.py index 3b4c084..48bb660 100644 --- a/src/inscriptis/css_profiles.py +++ b/src/inscriptis/css_profiles.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 # coding: utf-8 -""" -Standard CSS profiles shipped with inscriptis. +"""Standard CSS profiles shipped with inscriptis. - `strict`: this profile corresponds to the defaults used by Firefox - `relaxed`: this profile is more suited for text analytics, since it ensures @@ -9,74 +8,75 @@ preventing cases where two words stick together. 
""" -from inscriptis.model.css import HtmlElement +from inscriptis.model.html_element import HtmlElement from inscriptis.html_properties import Display, WhiteSpace STRICT_CSS_PROFILE = { - 'body': HtmlElement('body', display=Display.inline, + 'body': HtmlElement(display=Display.inline, whitespace=WhiteSpace.normal), - 'head': HtmlElement('head', display=Display.none), - 'link': HtmlElement('link', display=Display.none), - 'meta': HtmlElement('meta', display=Display.none), - 'script': HtmlElement('script', display=Display.none), - 'title': HtmlElement('title', display=Display.none), - 'style': HtmlElement('style', display=Display.none), + 'head': HtmlElement(display=Display.none), + 'link': HtmlElement(display=Display.none), + 'meta': HtmlElement(display=Display.none), + 'script': HtmlElement(display=Display.none), + 'title': HtmlElement(display=Display.none), + 'style': HtmlElement(display=Display.none), - 'p': HtmlElement('p', display=Display.block, margin_before=1, + 'p': HtmlElement(display=Display.block, margin_before=1, margin_after=1), - 'figure': HtmlElement('figure', display=Display.block, margin_before=1, + 'figure': HtmlElement(display=Display.block, margin_before=1, margin_after=1), - 'h1': HtmlElement('h1', display=Display.block, margin_before=1, + 'h1': HtmlElement(display=Display.block, margin_before=1, margin_after=1), - 'h2': HtmlElement('h2', display=Display.block, margin_before=1, + 'h2': HtmlElement(display=Display.block, margin_before=1, margin_after=1), - 'h3': HtmlElement('h3', display=Display.block, margin_before=1, + 'h3': HtmlElement(display=Display.block, margin_before=1, margin_after=1), - 'h4': HtmlElement('h4', display=Display.block, margin_before=1, + 'h4': HtmlElement(display=Display.block, margin_before=1, margin_after=1), - 'h5': HtmlElement('h5', display=Display.block, margin_before=1, + 'h5': HtmlElement(display=Display.block, margin_before=1, margin_after=1), - 'h6': HtmlElement('h6', display=Display.block, margin_before=1, + 
'h6': HtmlElement(display=Display.block, margin_before=1, margin_after=1), - 'ul': HtmlElement('ul', display=Display.block, margin_before=0, - margin_after=0, padding=4), - 'ol': HtmlElement('ol', display=Display.block, margin_before=0, - margin_after=0, padding=4), - 'li': HtmlElement('li', display=Display.block), + 'ul': HtmlElement(display=Display.block, margin_before=0, + margin_after=0, padding_inline=4), + 'ol': HtmlElement(display=Display.block, margin_before=0, + margin_after=0, padding_inline=4), + 'li': HtmlElement(display=Display.block), - 'address': HtmlElement('address', display=Display.block), - 'article': HtmlElement('article', display=Display.block), - 'aside': HtmlElement('aside', display=Display.block), - 'div': HtmlElement('div', display=Display.block), - 'footer': HtmlElement('footer', display=Display.block), - 'header': HtmlElement('header', display=Display.block), - 'hgroup': HtmlElement('hgroup', display=Display.block), - 'layer': HtmlElement('layer', display=Display.block), - 'main': HtmlElement('main', display=Display.block), - 'nav': HtmlElement('nav', display=Display.block), - 'figcaption': HtmlElement('figcaption', display=Display.block), + 'address': HtmlElement(display=Display.block), + 'article': HtmlElement(display=Display.block), + 'aside': HtmlElement(display=Display.block), + 'div': HtmlElement(display=Display.block), + 'footer': HtmlElement(display=Display.block), + 'header': HtmlElement(display=Display.block), + 'hgroup': HtmlElement(display=Display.block), + 'layer': HtmlElement(display=Display.block), + 'main': HtmlElement(display=Display.block), + 'nav': HtmlElement(display=Display.block), + 'figcaption': HtmlElement(display=Display.block), - 'blockquote': HtmlElement('blockquote', display=Display.block), + 'blockquote': HtmlElement(display=Display.block), - 'q': HtmlElement('q', prefix='"', suffix='"'), + 'q': HtmlElement(prefix='"', suffix='"'), # Handling of
-    'pre': HtmlElement('pre', display=Display.block,
+    'pre': HtmlElement(display=Display.block,
                        whitespace=WhiteSpace.pre),
-    'xmp': HtmlElement('xmp', display=Display.block,
+    'xmp': HtmlElement(display=Display.block,
                        whitespace=WhiteSpace.pre),
-    'listing': HtmlElement('listing', display=Display.block,
+    'listing': HtmlElement(display=Display.block,
                            whitespace=WhiteSpace.pre),
-    'plaintext': HtmlElement('plaintext', display=Display.block,
+    'plaintext': HtmlElement(display=Display.block,
                              whitespace=WhiteSpace.pre),
+
 }
 
 RELAXED_CSS_PROFILE = STRICT_CSS_PROFILE.copy()
-RELAXED_CSS_PROFILE['div'] = HtmlElement('div', display=Display.block,
-                                         padding=2)
-RELAXED_CSS_PROFILE['span'] = HtmlElement('span', display=Display.inline,
+RELAXED_CSS_PROFILE['div'] = HtmlElement(display=Display.block,
+                                         padding_inline=2)
+RELAXED_CSS_PROFILE['span'] = HtmlElement(display=Display.inline,
                                           prefix=' ', suffix=' ',
                                           limit_whitespace_affixes=True)
 
diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py
index 1587510..595c61d 100644
--- a/src/inscriptis/html_engine.py
+++ b/src/inscriptis/html_engine.py
@@ -1,27 +1,19 @@
 #!/usr/bin/env python
 # coding:utf-8
-"""
-The HTML Engine is responsible for converting HTML to text.
+"""The HTML Engine is responsible for converting HTML to text."""
+from typing import List
 
-Guiding principles:
+import lxml.html
 
- 1. break lines only if we encounter a block element
-"""
-from itertools import chain
-from html import unescape
-
-from inscriptis.model.attribute import apply_attributes
-from inscriptis.model.css import HtmlElement
-from inscriptis.model.canvas import Line
+from inscriptis.annotation import Annotation
+from inscriptis.model.html_element import DEFAULT_HTML_ELEMENT
+from inscriptis.model.canvas import Canvas
 from inscriptis.model.config import ParserConfig
-from inscriptis.model.table import Table
-from inscriptis.html_properties import Display, WhiteSpace
+from inscriptis.model.table import Table, TableCell
 
 
 class Inscriptis:
-    """
-    The Inscriptis class translates an lxml HTML tree to the corresponding
-    text representation.
+    """Translate an lxml HTML tree to the corresponding text representation.
 
     Args:
       html_tree: the lxml HTML tree to convert.
@@ -45,9 +37,8 @@ class Inscriptis:
     UL_COUNTER = ('* ', '+ ', 'o ', '- ')
     UL_COUNTER_LEN = len(UL_COUNTER)
 
-    DEFAULT_ELEMENT = HtmlElement()
-
-    def __init__(self, html_tree, config=None):
+    def __init__(self, html_tree: lxml.html.HtmlElement,
+                 config: ParserConfig = None):
         # use the default configuration, if no config object is provided
         self.config = config or ParserConfig()
 
@@ -74,19 +65,13 @@ def __init__(self, html_tree, config=None):
         }
 
         # instance variables
-        self.current_tag = [self.config.css['body']]
-        self.current_line = [Line()]
-        self.next_line = [Line()]
-
-        # the canvases used for displaying text
-        # clean_text_line[0] refers to the root canvas; tables write into child
-        # canvases that are created for every table line and merged with the
-        # root canvas at the end of a table
-        self.clean_text_lines = [[]]
+        self.canvas = Canvas()
+        self.css = self.config.css
+        self.apply_attributes = self.config.attribute_handler.apply_attributes
 
+        self.tags = [self.css['body'].set_canvas(self.canvas)]
         self.current_table = []
         self.li_counter = []
-        self.li_level = 0
         self.last_caption = None
 
         # used if display_links is enabled
@@ -94,157 +79,91 @@ def __init__(self, html_tree, config=None):
 
         # crawl the html tree
         self._parse_html_tree(html_tree)
-        if self.current_line[-1]:
-            self._write_line()
 
     def _parse_html_tree(self, tree):
-        """
-        Parses the HTML tree.
+        """Parse the HTML tree.
 
         Args:
             tree: the HTML tree to parse.
         """
-        if isinstance(tree.tag, str):
-            self.handle_starttag(tree.tag, tree.attrib)
-            if tree.text:
-                self.handle_data(tree.text)
+        # ignore comments
+        if not isinstance(tree.tag, str):
+            return
 
-            for node in tree:
-                self._parse_html_tree(node)
+        self.handle_starttag(tree.tag, tree.attrib)
+        cur = self.tags[-1]
+        cur.canvas.open_tag(cur)
 
-            self.handle_endtag(tree.tag)
+        self.tags[-1].write(tree.text)
 
-        if tree.tail:
-            self.handle_data(tree.tail)
+        for node in tree:
+            self._parse_html_tree(node)
 
-    def get_text(self):
-        """
-        Returns:
-          str -- A text representation of the parsed content.
-        """
-        return unescape('\n'.join(chain(*self.clean_text_lines))).rstrip()
+        self.handle_endtag(tree.tag)
+        prev = self.tags.pop()
+        prev.canvas.close_tag(prev)
 
-    def _write_line(self, force=False):
-        """
-        Writes the current line to the buffer, provided that there is any
-        data to write.
+        # write the tail text to the element's container
+        self.tags[-1].write_tail(tree.tail)
 
-        Returns:
-          bool -- True, if a line has been writer, otherwise False.
-        """
-        # only write the line if it contains relevant content
-        if not force and (not self.current_line[-1].content
-                          or self.current_line[-1].content.isspace()):
-            self.current_line[-1].margin_before = \
-                max(self.current_line[-1].margin_before,
-                    self.current_tag[-1].margin_before)
-            return False
-
-        line = self.current_line[-1].get_text()
-        self.clean_text_lines[-1].append(line)
-        self.current_line[-1] = self.next_line[-1]
-        self.next_line[-1] = Line()
-        return True
-
-    def _write_line_verbatim(self, text):
-        """
-        Writes the current buffer without any modifications.
+    def get_text(self) -> str:
+        """Return the text extracted from the HTML page."""
+        return self.canvas.get_text()
 
-        Args:
-          text (str): the text to write.
-        """
-        self.clean_text_lines[-1].append(text)
+    def get_annotations(self) -> List[Annotation]:
+        """Return the annotations extracted from the HTML page."""
+        return self.canvas.annotations
 
     def handle_starttag(self, tag, attrs):
-        """
-        Handles HTML start tags.
+        """Handle HTML start tags.
+
+        Compute the style of the current :class:`HtmlElement`, based on
+
+        1. the used :attr:`css`,
+        2. apply attributes and css with :meth:`~Attribute.apply_attributes`
+        3. add the `HtmlElement` to the list of open tags.
+
+        Look up and apply a tag-specific start tag handler in
+        :attr:`start_tag_handler_dict`.
 
         Args:
-          tag (str): the HTML start tag to process.
-          attrs (dict): a dictionary of HTML attributes and their respective
-             values.
+          tag: the HTML start tag to process.
+          attrs: a dictionary of HTML attributes and their respective values.
         """
         # use the css to handle tags known to it :)
-
-        cur = self.current_tag[-1].get_refined_html_element(
-            self.config.css.get(tag, Inscriptis.DEFAULT_ELEMENT))
-        apply_attributes(attrs, html_element=cur)
-        self.current_tag.append(cur)
-
-        self.next_line[-1].padding = self.current_line[-1].padding \
-            + cur.padding
-        # flush text before display:block elements
-        if cur.display == Display.block:
-            if not self._write_line():
-                self.current_line[-1].margin_before = 0 \
-                    if not self.clean_text_lines[0] else max(
-                        self.current_line[-1].margin_before, cur.margin_before)
-                self.current_line[-1].padding = self.next_line[-1].padding
-            else:
-                self.current_line[-1].margin_after = max(
-                    self.current_line[-1].margin_after, cur.margin_after)
+        cur = self.tags[-1].get_refined_html_element(
+            self.apply_attributes(attrs, html_element=self.css.get(
+                tag, DEFAULT_HTML_ELEMENT).__copy__().set_tag(tag)))
+        self.tags.append(cur)
 
         handler = self.start_tag_handler_dict.get(tag, None)
         if handler:
             handler(attrs)
 
     def handle_endtag(self, tag):
-        """
-        Handles HTML end tags.
+        """Handle HTML end tags.
+
+        Look up the handler for closing the tag in :attr:`end_tag_handler_dict`
+        and execute it, if available.
 
         Args:
-          tag(str): the HTML end tag to process.
+          tag: the HTML end tag to process.
         """
-        cur = self.current_tag.pop()
-        self.next_line[-1].padding = self.current_line[-1].padding \
-            - cur.padding
-        self.current_line[-1].margin_after = max(
-            self.current_line[-1].margin_after, cur.margin_after)
-        # flush text after display:block elements
-        if cur.display == Display.block:
-            # propagate the new padding to the current line, if nothing has
-            # been written
-            if not self._write_line():
-                self.current_line[-1].padding = self.next_line[-1].padding
-
         handler = self.end_tag_handler_dict.get(tag, None)
         if handler:
             handler()
 
-    def handle_data(self, data):
-        """
-        Handles text belonging to HTML tags.
-
-        Args:
-          data (str): The text to process.
-        """
-        if self.current_tag[-1].display == Display.none:
-            return
-
-        # protect pre areas
-        if self.current_tag[-1].whitespace == WhiteSpace.pre:
-            data = '\0' + data + '\0'
-
-        # add prefix, if present
-        data = self.current_tag[-1].prefix + data + self.current_tag[-1].suffix
-
-        # determine whether to add this content to a table column
-        # or to a standard line
-        self.current_line[-1].content += data
-
-    def _start_ul(self, attrs):
-        self.li_level += 1
-        self.li_counter.append(Inscriptis.get_bullet(self.li_level - 1))
+    def _start_ul(self, _):
+        self.li_counter.append(self.get_bullet())
 
     def _end_ul(self):
-        self.li_level -= 1
         self.li_counter.pop()
 
     def _start_img(self, attrs):
         image_text = attrs.get('alt', '') or attrs.get('title', '')
         if image_text and not (self.config.deduplicate_captions
                                and image_text == self.last_caption):
-            self.current_line[-1].content += '[{0}]'.format(image_text)
+            self.tags[-1].write('[{0}]'.format(image_text))
             self.last_caption = image_text
 
     def _start_a(self, attrs):
@@ -255,85 +174,80 @@ def _start_a(self, attrs):
             self.link_target = self.link_target or attrs.get('name', '')
 
         if self.link_target:
-            self.current_line[-1].content += '['
+            self.tags[-1].write('[')
 
     def _end_a(self):
         if self.link_target:
-            self.current_line[-1].content += ']({0})'.format(self.link_target)
+            self.tags[-1].write(']({0})'.format(self.link_target))
 
-    def _start_ol(self, attrs):
+    def _start_ol(self, _):
         self.li_counter.append(1)
-        self.li_level += 1
 
     def _end_ol(self):
-        self.li_level -= 1
         self.li_counter.pop()
 
-    def _start_li(self, attrs):
-        self._write_line()
-        if self.li_level > 0:
-            bullet = self.li_counter[-1]
-        else:
-            bullet = "* "
+    def _start_li(self, _):
+        bullet = self.li_counter[-1] if self.li_counter else '* '
         if isinstance(bullet, int):
             self.li_counter[-1] += 1
-            self.current_line[-1].list_bullet = "{0}. ".format(bullet)
+            self.tags[-1].list_bullet = '{0}. '.format(bullet)
         else:
-            self.current_line[-1].list_bullet = bullet
+            self.tags[-1].list_bullet = bullet
 
-    def _start_table(self, attrs):
-        self.current_table.append(Table())
+        self.tags[-1].write('')
 
-    def _start_tr(self, attrs):
-        if self.current_table:
-            # check whether we need to cleanup a  tag that has not been
-            # closed yet
-            if self.current_table[-1].td_is_open:
-                self._end_td()
+    def _start_table(self, _):
+        self.tags[-1].set_canvas(Canvas())
+        self.current_table.append(Table(
+            left_margin_len=self.tags[-1].canvas.left_margin))
 
+    def _start_tr(self, _):
+        if self.current_table:
             self.current_table[-1].add_row()
 
-    def _start_td(self, attrs):
+    def _start_td(self, _):
         if self.current_table:
-            # check whether we need to cleanup a  tag that has not been
-            # closed yet
-            if self.current_table[-1].td_is_open:
-                self._end_td()
-
             # open td tag
-            self.clean_text_lines.append([])
-            self.current_line.append(Line())
-            self.next_line.append(Line())
-            self.current_table[-1].add_cell(self.clean_text_lines[-1],
-                                            align=self.current_tag[-1].align,
-                                            valign=self.current_tag[-1].valign)
-            self.current_table[-1].td_is_open = True
+            table_cell = TableCell(align=self.tags[-1].align,
+                                   valign=self.tags[-1].valign)
+            self.tags[-1].canvas = table_cell
+            self.current_table[-1].add_cell(table_cell)
 
     def _end_td(self):
-        if self.current_table and self.current_table[-1].td_is_open:
-            self.current_table[-1].td_is_open = False
-            self._write_line(force=True)
-            self.clean_text_lines.pop()
-            self.current_line.pop()
-            self.next_line.pop()
-
-    def _end_tr(self):
-        pass
+        if self.current_table:
+            self.tags[-1].canvas.close_tag(self.tags[-1])
 
     def _end_table(self):
-        if self.current_table and self.current_table[-1].td_is_open:
+        if self.current_table:
             self._end_td()
-        self._write_line()
         table = self.current_table.pop()
-        self._write_line_verbatim(table.get_text())
-
-    def _newline(self, attrs):
-        self._write_line(force=True)
-
-    @staticmethod
-    def get_bullet(index):
-        """
-        Returns:
-          str -- The bullet that corresponds to the given index.
-        """
-        return Inscriptis.UL_COUNTER[index % Inscriptis.UL_COUNTER_LEN]
+        # last tag before the table: self.tags[-2]
+        # table tag: self.tags[-1]
+
+        out_of_table_text = self.tags[-1].canvas.get_text().strip()
+        if out_of_table_text:
+            self.tags[-2].write(out_of_table_text)
+            self.tags[-2].canvas.write_newline()
+
+        start_idx = self.tags[-2].canvas.current_block.idx
+        self.tags[-2].write_verbatim_text(table.get_text())
+        self.tags[-2].canvas._flush_inline()
+
+        # transfer annotations from the current tag
+        if self.tags[-1].annotation:
+            end_idx = self.tags[-2].canvas.current_block.idx
+            for a in self.tags[-1].annotation:
+                self.tags[-2].canvas.annotations.append(Annotation(
+                    start_idx, end_idx, a))
+
+        # transfer in-table annotations
+        self.tags[-2].canvas.annotations.extend(
+            table.get_annotations(start_idx, self.tags[-2].canvas.left_margin))
+
+    def _newline(self, _):
+        self.tags[-1].canvas.write_newline()
+
+    def get_bullet(self) -> str:
+        """Return the bullet that corresponds to the current nesting level."""
+        return Inscriptis.UL_COUNTER[
+            len(self.li_counter) % Inscriptis.UL_COUNTER_LEN]
diff --git a/src/inscriptis/html_properties.py b/src/inscriptis/html_properties.py
index a33c6d6..b1d24ea 100644
--- a/src/inscriptis/html_properties.py
+++ b/src/inscriptis/html_properties.py
@@ -1,59 +1,58 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-"""
-This module provides the following properties used for the rendering of HTML
-pages:
+r"""Provide properties used for rendering HTML pages.
 
+Supported attributes::
  1. :class:`Display` properties.
  2. :class:`WhiteSpace` properties.
  3. :class:`HorizontalAlignment` properties.
+ 4. :class:`VerticalAlignment` properties.
 """
 
 from enum import Enum
 
 
 class Display(Enum):
+    """Specify whether content will be rendered as inline, block or none.
+
+    .. note::
+        A display attribute of none indicates that the content should not be
+        rendered at all.
     """
-    This enum specifies whether content will be rendered as inline, block or
-    none (i.e. not rendered).
-    """
+
     inline = 1
     block = 2
     none = 3
 
 
 class WhiteSpace(Enum):
-    """
-    This enum specifies the whitespace handling used for an HTML element as
-    outlined in the `Cascading Style Sheets `_
-    specification.
-
-    .. data:: normal
-
-    Sequences of whitespaces will be collapsed into a single one.
+    """Specify the HTML element's whitespace handling.
 
-    .. data:: pre
-
-    Sequences of whitespaces will preserved.
+    Inscriptis supports the following handling strategies outlined in the
+    `Cascading Style Sheets `_ specification.
     """
+
     normal = 1
+    """Collapse multiple whitespaces into a single one."""
     pre = 3
+    """Preserve sequences of whitespaces."""
 
 
 class HorizontalAlignment(Enum):
-    """
-    This enum specifies the horizontal alignment.
-    """
+    """Specify the content's horizontal alignment."""
+
     left = '<'
+    """Left alignment of the block's content."""
     right = '>'
+    """Right alignment of the block's content."""
     center = '^'
+    """Center the block's content."""
 
 
 class VerticalAlignment(Enum):
-    """
-    This enum specifies the vertical alignment.
-    """
+    """Specify the content's vertical alignment."""
+
     top = 1
+    """Align all content at the top."""
     middle = 2
+    """Align all content in the middle."""
     bottom = 3
+    """Align all content at the bottom."""
diff --git a/src/inscriptis/metadata.py b/src/inscriptis/metadata.py
new file mode 100644
index 0000000..8653994
--- /dev/null
+++ b/src/inscriptis/metadata.py
@@ -0,0 +1,7 @@
+"""Inscriptis metadata information."""
+
+__author__ = 'Albert Weichselbraun, Fabian Odoni'
+__author_email__ = 'albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch'
+__copyright__ = '2016-2021 Albert Weichselbraun, Fabian Odoni'
+__license__ = 'Apache 2.0'
+__version__ = '2.0rc1'
diff --git a/src/inscriptis/model/__init__.py b/src/inscriptis/model/__init__.py
index 8a706f5..c1b40bc 100644
--- a/src/inscriptis/model/__init__.py
+++ b/src/inscriptis/model/__init__.py
@@ -1,5 +1,4 @@
-"""
-The model used for HTML rendering.
+"""The model used for HTML rendering.
 
 - :mod:`inscriptis.model.canvas`: classes required for rendering parts of
     the HTML page.
diff --git a/src/inscriptis/model/attribute.py b/src/inscriptis/model/attribute.py
index fd524f1..0102e3f 100644
--- a/src/inscriptis/model/attribute.py
+++ b/src/inscriptis/model/attribute.py
@@ -1,31 +1,68 @@
 #!/usr/bin/env python
 # encoding: utf-8
 
-"""
-This class handles HTML attributes such as `align`, and `valign` by
-mapping them to the corresponding functions in the CssParse class.
-"""
+"""HTML attribute handling."""
+from copy import copy
+from typing import Dict, List
 
+from inscriptis.annotation.parser import ApplyAnnotation
 from inscriptis.model.css import CssParse
+from inscriptis.model.html_element import HtmlElement
 
-HTML_ATTRIBUTE_MAPPING = {
+DEFAULT_ATTRIBUTE_MAP = {
     'style': CssParse.attr_style,
     'align': CssParse.attr_horizontal_align,
     'valign': CssParse.attr_vertical_align
 }
 
 
-def apply_attributes(attributes, html_element):
-    """
+def merge_function(func1, func2):
+    """Merge two functions with the same arguments into a single one.
 
-    Applies the attributes to the given HTML element.
+    This function is used for cascading functions that operate on HtmlElements
+    and attributes.
 
     Args:
-        attributes: the list of attributes
-        html_element: the HTML element for which the attributes are parsed
+        func1: the first function
+        func2: the second function
+    """
+    def merged(*args):
+        func1(*args)
+        func2(*args)
+    return merged
+
+
+class Attribute:
+    """Handle HTML attributes such as `align` and `valign`.
+
+    This class handles HTML attributes by mapping them to the corresponding
+    functions in the :class:`~inscriptis.model.css.CssParse` class.
+
+    Attributes:
+        attribute_mapping: a mapping of attributes to the corresponding handler
+                           functions.
     """
-    supported_attributes = filter(lambda t: t[0] in HTML_ATTRIBUTE_MAPPING,
-                                  attributes.items())
-    for attr_name, attr_value in supported_attributes:
-        HTML_ATTRIBUTE_MAPPING[attr_name](attr_value, html_element)
-    return html_element
+
+    def __init__(self):
+        self.attribute_mapping = DEFAULT_ATTRIBUTE_MAP
+
+    def apply_attributes(self, attributes: Dict[str, str],
+                         html_element: HtmlElement) -> HtmlElement:
+        """Apply the attributes to the given HTML element.
+
+        Args:
+            attributes: the dictionary of attributes
+            html_element: the HTML element for which the attributes are parsed
+        """
+        supported_attributes = ((name, val) for name, val in attributes.items()
+                                if name in self.attribute_mapping)
+        for attr_name, attr_value in supported_attributes:
+            self.attribute_mapping[attr_name](attr_value, html_element)
+        return html_element
+
+    def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None):
+        attributes = copy(self.attribute_mapping)
+        for a in annotations:
+            attributes[a.attr] = a.apply if a.attr not in attributes \
+                else merge_function(attributes[a.attr], a.apply)
+        self.attribute_mapping = attributes
diff --git a/src/inscriptis/model/canvas.py b/src/inscriptis/model/canvas.py
deleted file mode 100644
index e3e118c..0000000
--- a/src/inscriptis/model/canvas.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-"""
-Elements used for rendering (parts) of the canvas.
-
-The :class:`Line` determines how a single line is rendered.
-"""
-
-
-class Line:
-    """
-    This class represents a line to render.
-
-    Args:
-        margin_before: number of empty lines before the given line.
-        margin_after: number of empty lines before the given line.
-        prefix: prefix add before the line's content.
-        suffix: suffix to add after the line's content.
-        list_bullet: a bullet to add before the line.
-        padding: horizontal padding
-        align: determines the alignment of the line (not used yet)
-        width: total width of the line in characters (not used yet)
-    """
-    __slots__ = ('margin_before', 'margin_after', 'prefix', 'suffix',
-                 'content', 'list_bullet', 'padding', 'align', 'width')
-
-    def __init__(self):
-        self.margin_before = 0
-        self.margin_after = 0
-        self.prefix = ""
-        self.suffix = ""
-        self.content = ""
-        self.list_bullet = ""
-        self.padding = 0
-
-    def get_text(self):
-        """
-        Returns:
-          str -- The text representation of the current line.
-        """
-        if '\0' not in self.content:
-            # standard text without any `WhiteSpace.pre` formatted text.
-            text = self.content.split()
-        else:
-            # content containing `WhiteSpace.pre` formatted text
-            self.content = self.content.replace('\0\0', '')
-            text = []
-            # optional padding to add before every line
-            base_padding = ' ' * self.padding
-
-            for no, data in enumerate(self.content.split('\0')):
-                # handle standard content
-                if no % 2 == 0:
-                    text.extend(data.split())
-                # handle `WhiteSpace.pre` formatted content.
-                else:
-                    text.append(data.replace('\n', '\n' + base_padding))
-
-        return ''.join(('\n' * self.margin_before,
-                        ' ' * (self.padding - len(self.list_bullet)),
-                        self.list_bullet,
-                        self.prefix,
-                        ' '.join(text),
-                        self.suffix,
-                        '\n' * self.margin_after))
-
-    def __str__(self):
-        return "".format(self.get_text())
-
-    def __repr__(self):
-        return str(self)
diff --git a/src/inscriptis/model/canvas/__init__.py b/src/inscriptis/model/canvas/__init__.py
new file mode 100644
index 0000000..dcc573d
--- /dev/null
+++ b/src/inscriptis/model/canvas/__init__.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+# encoding: utf-8
+
+"""Classes used for rendering (parts) of the canvas.
+
+Every parsed :class:`~inscriptis.model.html_element.HtmlElement` writes its
+textual content to the canvas which is managed by the following three classes:
+
+  - :class:`Canvas` provides the drawing board on which the HTML page is
+    serialized and annotations are recorded.
+  - :class:`~inscriptis.model.canvas.block.Block` contains the current line to
+    which text is written.
+  - :class:`~inscriptis.model.canvas.prefix.Prefix` handles indentation
+    and bullets that prefix a line.
+"""
+from html import unescape
+
+from inscriptis.annotation import Annotation
+from inscriptis.html_properties import WhiteSpace, Display
+from inscriptis.model.canvas.block import Block
+from inscriptis.model.html_element import HtmlElement
+from inscriptis.model.canvas.prefix import Prefix
+
+
+class Canvas:
+    r"""The text Canvas on which Inscriptis writes the HTML page.
+
+    Attributes:
+        margin: the current margin to the previous block (this is required to
+            ensure that the `margin_after` and `margin_before` constraints of
+            HTML block elements are met).
+        current_block: A :class:`~inscriptis.model.canvas.block.Block` which
+            merges the input text into a block (i.e., line).
+        blocks: a list of strings containing the completed blocks (i.e.,
+            text lines). Each block spans at least one line.
+        annotations: the list of recorded
+            :class:`~inscriptis.annotation.Annotation`\s.
+        _open_annotations: a map of open tags that contain annotations.
+    """
+
+    __slots__ = ('annotations', 'blocks', 'current_block', '_open_annotations',
+                 'margin')
+
+    def __init__(self):
+        self.margin = 1000  # margin to the previous block
+        self.current_block = Block(0, Prefix())
+        self.blocks = []
+        self.annotations = []
+        self._open_annotations = {}
+
+    def open_tag(self, tag: HtmlElement) -> None:
+        """Register that a tag is opened.
+
+        Args:
+            tag: the tag to open.
+        """
+        if tag.annotation:
+            self._open_annotations[tag] = self.current_block.idx
+
+        if tag.display == Display.block:
+            self.open_block(tag)
+
+    def open_block(self, tag: HtmlElement):
+        """Open an HTML block element."""
+        # write missing bullets, if no content has been written
+        if not self._flush_inline() and tag.list_bullet:
+            self.write_unconsumed_bullet()
+        self.current_block.prefix.register_prefix(tag.padding_inline,
+                                                  tag.list_bullet)
+
+        # write the block margin
+        required_margin = max(tag.previous_margin_after, tag.margin_before)
+        if required_margin > self.margin:
+            required_newlines = required_margin - self.margin
+            self.current_block.idx += required_newlines
+            self.blocks.append('\n' * (required_newlines - 1))
+            self.margin = required_margin
+
+    def write_unconsumed_bullet(self):
+        """Write unconsumed bullets to the blocks list."""
+        bullet = self.current_block.prefix.unconsumed_bullet
+        if bullet:
+            self.blocks.append(bullet)
+            self.current_block = self.current_block.new_block()
+            self.margin = 0
+
+    def write(self, tag: HtmlElement, text: str,
+              whitespace: WhiteSpace = None) -> None:
+        """Write the given text to the current block."""
+        self.current_block.merge(text, whitespace or tag.whitespace)
+
+    def close_tag(self, tag: HtmlElement) -> None:
+        """Register that the given tag is closed.
+
+        Args:
+            tag: the tag to close.
+        """
+        if tag.display == Display.block:
+            # write missing bullets, if no content has been written so far.
+            if not self._flush_inline() and tag.list_bullet:
+                self.write_unconsumed_bullet()
+            self.current_block.prefix.remove_last_prefix()
+            self.close_block(tag)
+
+        if tag in self._open_annotations:
+            start_idx = self._open_annotations.pop(tag)
+            # do not record annotations with no content
+            if start_idx == self.current_block.idx:
+                return
+
+            for annotation in tag.annotation:
+                self.annotations.append(
+                    Annotation(start_idx, self.current_block.idx, annotation))
+
+    def close_block(self, tag: HtmlElement):
+        """Close the given HtmlElement by writing its bottom margin.
+
+        Args:
+            tag: the HTML Block element to close
+        """
+        if tag.margin_after > self.margin:
+            required_newlines = tag.margin_after - self.margin
+            self.current_block.idx += required_newlines
+            self.blocks.append('\n' * (required_newlines - 1))
+            self.margin = tag.margin_after
+
+    def write_newline(self):
+        if not self._flush_inline():
+            self.blocks.append('')
+            self.current_block = self.current_block.new_block()
+
+    def get_text(self) -> str:
+        """Provide a text representation of the Canvas."""
+        self._flush_inline()
+        return unescape('\n'.join(self.blocks))
+
+    def _flush_inline(self) -> bool:
+        """Attempt to flush the content in self.current_block into a new block.
+
+        Notes:
+            - If self.current_block does not contain any content (or only
+              whitespaces) no changes are made.
+            - Otherwise the content of current_block is added to blocks and a
+              new current_block is initialized.
+
+        Returns:
+            True if the attempt was successful, False otherwise.
+        """
+        if not self.current_block.is_empty():
+            self.blocks.append(self.current_block.content)
+            self.current_block = self.current_block.new_block()
+            self.margin = 0
+            return True
+
+        return False
+
+    @property
+    def left_margin(self) -> int:
+        """Return the length of the current line's left margin."""
+        return self.current_block.prefix.current_padding
diff --git a/src/inscriptis/model/canvas/block.py b/src/inscriptis/model/canvas/block.py
new file mode 100644
index 0000000..9a0cbdd
--- /dev/null
+++ b/src/inscriptis/model/canvas/block.py
@@ -0,0 +1,89 @@
+"""Representation of a text block within the HTML canvas."""
+from inscriptis.html_properties import WhiteSpace
+
+
+class Block:
+    """The current block of text.
+
+    A block usually refers to one line of output text.
+
+    .. note::
+        If pre-formatted content is merged with a block, it may also contain
+        multiple lines.
+
+    Args:
+        idx: the current block's start index.
+        prefix: prefix used within the current block.
+    """
+
+    __slots__ = ('idx', 'prefix', '_content', 'collapsable_whitespace')
+
+    def __init__(self, idx: int, prefix: str):
+        self.idx = idx
+        self.prefix = prefix
+        self._content = ''
+        self.collapsable_whitespace = True
+
+    def merge(self, text: str, whitespace: WhiteSpace) -> None:
+        """Merge the given text with the current block.
+
+        Args:
+            text: the text to merge.
+            whitespace: whitespace handling.
+        """
+        if whitespace == WhiteSpace.pre:
+            self.merge_pre_text(text)
+        else:
+            self.merge_normal_text(text)
+
+    def merge_normal_text(self, text: str) -> None:
+        """Merge the given text with the current block.
+
+        Args:
+            text: the text to merge
+        """
+        normalized_text = []
+
+        for ch in text:
+            if not ch.isspace():
+                normalized_text.append(ch)
+                self.collapsable_whitespace = False
+            elif not self.collapsable_whitespace:
+                normalized_text.append(' ')
+                self.collapsable_whitespace = True
+
+        if normalized_text:
+            text = ''.join((self.prefix.first, *normalized_text)) if not \
+                self._content else ''.join(normalized_text)
+            self._content += text
+            self.idx += len(text)
+
+    def merge_pre_text(self, text: str) -> None:
+        """Merge the given pre-formatted text with the current block.
+
+        Args:
+            text: the text to merge
+        """
+        text = ''.join((self.prefix.first,
+                        text.replace('\n', '\n' + self.prefix.rest)))
+        self._content += text
+        self.idx += len(text)
+        self.collapsable_whitespace = False
+
+    def is_empty(self) -> bool:
+        return len(self.content) == 0
+
+    @property
+    def content(self):
+        if not self.collapsable_whitespace:
+            return self._content
+
+        if self._content.endswith(' '):
+            self._content = self._content[:-1]
+            self.idx -= 1
+        return self._content
+
+    def new_block(self) -> 'Block':
+        """Return a new Block based on the current one."""
+        self.prefix.consumed = False
+        return Block(idx=self.idx + 1, prefix=self.prefix)
diff --git a/src/inscriptis/model/canvas/prefix.py b/src/inscriptis/model/canvas/prefix.py
new file mode 100644
index 0000000..ca0b768
--- /dev/null
+++ b/src/inscriptis/model/canvas/prefix.py
@@ -0,0 +1,99 @@
+"""Manage the horizontal prefix (left-indentation, bullets) of canvas lines."""
+
+from contextlib import suppress
+
+
+class Prefix:
+    """Class Prefix manages paddings and bullets that prefix an HTML block.
+
+    Attributes:
+        current_padding: the number of characters used for the current
+                         left-indentation (the sum of all paddings).
+        paddings: the list of paddings for the current and all previous tags.
+        bullets: the list of bullets in the current and all previous tags.
+        consumed: whether the current bullet has already been consumed.
+    """
+
+    __slots__ = ('current_padding', 'paddings', 'bullets', 'consumed')
+
+    def __init__(self) -> None:
+        self.current_padding = 0
+        self.paddings = []
+        self.bullets = []
+        self.consumed = False
+
+    def register_prefix(self, padding_inline: int, bullet) -> None:
+        """Register the given prefix.
+
+        Args:
+            padding_inline: the number of characters used for the padding.
+            bullet: an optional bullet; a falsy value is stored as ''.
+        """
+        self.current_padding += padding_inline
+        self.paddings.append(padding_inline)
+        self.bullets.append(bullet if bullet else '')
+
+    def remove_last_prefix(self) -> None:
+        """Remove the last registered prefix; do nothing if there is none."""
+        with suppress(IndexError):
+            self.current_padding -= self.paddings.pop()
+            del self.bullets[-1]
+
+    def pop_next_bullet(self) -> str:
+        """Pop the most recently registered non-empty bullet ('' if none)."""
+        next_bullet_idx = next((-idx for idx, val
+                                in enumerate(reversed(self.bullets))
+                                if val), 1) - 1
+        # 0 => no bullet is left; otherwise a negative index into self.bullets
+        if not next_bullet_idx:
+            return ''
+
+        bullet = self.bullets[next_bullet_idx]
+        self.bullets[next_bullet_idx] = ''
+        return bullet
+
+    @property
+    def first(self) -> str:
+        """Return the prefix used at the beginning of a tag.
+
+        Note:
+            A new block needs to be prefixed by the current padding and bullet.
+            Once this has happened (i.e., :attr:`consumed` is set to `True`) no
+            further prefixes should be used for a line.
+        """
+        if self.consumed:
+            return ''
+
+        self.consumed = True
+        bullet = self.pop_next_bullet()
+        return ' ' * (self.current_padding - len(bullet)) \
+               + bullet
+
+    @property
+    def unconsumed_bullet(self) -> str:
+        """Return any yet unconsumed bullet.
+
+        Note:
+            This property returns the previous element's bullet, if it has
+            not been consumed yet.
+        """
+        if self.consumed:
+            return ''
+
+        bullet = self.pop_next_bullet()
+        if not bullet:
+            return ''
+
+        padding = self.current_padding - self.paddings[-1]
+        return ' ' * (padding - len(bullet)) \
+               + bullet
+
+    @property
+    def rest(self) -> str:
+        """Return the prefix used for new lines within a block.
+
+        This prefix is used for pre-text that contains newlines. The lines
+        need to be prefixed with the right padding to preserve the
+        indentation.
+        """
+        return ' ' * self.current_padding
diff --git a/src/inscriptis/model/config.py b/src/inscriptis/model/config.py
index 836731d..06ed699 100644
--- a/src/inscriptis/model/config.py
+++ b/src/inscriptis/model/config.py
@@ -1,22 +1,23 @@
 #!/usr/bin/env python
-"""
-Provides configuration objects for the Inscriptis HTML 2 text parser.
-"""
+"""Provide configuration objects for the Inscriptis HTML to text converter."""
+
+from copy import deepcopy
 
 from inscriptis.css_profiles import CSS_PROFILES
+from inscriptis.annotation.parser import AnnotationModel
+from inscriptis.model.attribute import Attribute
 
 DEFAULT_CSS_PROFILE_NAME = 'relaxed'
 
 
 class ParserConfig:
-    """
-    The ParserConfig object encapsulates configuration options and custom CSS
-    definitions used by inscriptis for translating HTML to text.
-    """
+    """Encapsulate configuration options and CSS definitions."""
+
     def __init__(self, css=None, display_images=False,
                  deduplicate_captions=False, display_links=False,
-                 display_anchors=False):
-        """
+                 display_anchors=False, annotation_rules=None):
+        """Create a ParserConfig configuration.
+
         Args:
             css: an optional custom CSS definition.
             display_images: whether to include image tiles/alt texts.
@@ -26,18 +27,30 @@ def __init__(self, css=None, display_images=False,
             display_links: whether to display link targets
                            (e.g. `[Python](https://www.python.org)`).
             display_anchors: whether to display anchors (e.g. `[here](#here)`).
-
+            annotation_rules: an optional dictionary of annotation rules which
+                              specify the tags and attributes to annotate.
         """
-
-        self.css = css or CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME]
         self.display_images = display_images
         self.deduplicate_captions = deduplicate_captions
         self.display_links = display_links
         self.display_anchors = display_anchors
+        self.css = css or CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME]
+        self.attribute_handler = Attribute()
+        if annotation_rules:
+            # ensure that we do not modify the original model or its
+            # members.
+            annotation_model = AnnotationModel(deepcopy(self.css),
+                                               annotation_rules)
+            # css with annotation support
+            self.css = annotation_model.css
+            # attribute handler with annotation support
+            self.attribute_handler.merge_attribute_map(
+                annotation_model.css_attr)
 
-    def parse_a(self):
-        """
-        Returns:
+    def parse_a(self) -> bool:
+        """Indicate whether the text output should contain links or anchors.
+
+        Returns:
             Whether we need to parse <a> tags.
         """
         return self.display_links or self.display_anchors
diff --git a/src/inscriptis/model/css.py b/src/inscriptis/model/css.py
index 563746f..89f2362 100644
--- a/src/inscriptis/model/css.py
+++ b/src/inscriptis/model/css.py
@@ -1,113 +1,30 @@
-#!/usr/bin/env python
-# coding: utf-8
-"""
-This module implements basic CSS support for inscriptis.
+"""Implement basic CSS support for inscriptis.
 
-- The :class:`HtmlElement` class encapsulates all CSS properties of a single
-  HTML element.
+- The :class:`~inscriptis.model.html_element.HtmlElement` class
+  encapsulates all CSS properties of a single HTML element.
 - :class:`CssParse` parses CSS specifications and translates them into the
   corresponding HtmlElements used by Inscriptis for rendering HTML pages.
 """
-from copy import copy
+from contextlib import suppress
 from re import compile as re_compile
 from inscriptis.html_properties import (Display, WhiteSpace,
                                         HorizontalAlignment, VerticalAlignment)
-
-
-class HtmlElement:
-    """
-    The HtmlElement class stores the following CSS properties of HTML
-    elements:
-
-    - tag: tag name of the given HtmlElement.
-    - prefix: specifies a prefix that to insert before the tag's content.
-    - suffix: a suffix to append after the tag's content.
-    - display: :class:`~inscriptis.html_properties.Display` strategy used for
-      the content.
-    - margin_before: vertical margin before the tag's content.
-    - margin_after: vertical margin after the tag's content.
-    - padding: horizontal padding before the tag's content.
-    - whitespace: the :class:`~inscriptis.html_properties.Whitespace` handling
-      strategy.
-    - limit_whitespace_affixes: limit printing of whitespace affixes to
-      elements with `normal` whitespace handling.
-    """
-
-    __slots__ = ('tag', 'prefix', 'suffix', 'display', 'margin_before',
-                 'margin_after', 'padding', 'whitespace',
-                 'limit_whitespace_affixes', 'align', 'valign')
-
-    def __init__(self, tag='/', prefix='', suffix='', display=None,
-                 margin_before=0, margin_after=0, padding=0,
-                 whitespace=None, limit_whitespace_affixes=False,
-                 align=HorizontalAlignment.left,
-                 valign=VerticalAlignment.middle):
-        self.tag = tag
-        self.prefix = prefix
-        self.suffix = suffix
-        self.display = display
-        self.margin_before = margin_before
-        self.margin_after = margin_after
-        self.padding = padding
-        self.whitespace = whitespace
-        self.limit_whitespace_affixes = limit_whitespace_affixes
-        self.align = align
-        self.valign = valign
-
-    def get_refined_html_element(self, new):
-        """
-        Args:
-            new: The new HtmlElement to be applied to the current context.
-
-        Returns:
-            The refined element with the context applied.
-        """
-        refined_element = copy(new)
-
-        # inherit display:none attributes
-        if self.display == Display.none:
-            refined_element.display = Display.none
-
-        # no whitespace set => inherit
-        refined_element.whitespace = refined_element.whitespace \
-            or self.whitespace
-
-        # do not display whitespace only affixes in Whitespace.pre areas
-        # if `limit_whitespace_affixes` is set.
-        if (refined_element.limit_whitespace_affixes
-                and self.whitespace == WhiteSpace.pre):
-            if refined_element.prefix.isspace():
-                refined_element.prefix = ''
-            if refined_element.suffix.isspace():
-                refined_element.suffix = ''
-
-        return refined_element
-
-    def __str__(self):
-        return (
-            '<{self.tag} prefix={self.prefix}, suffix={self.suffix}, '
-            'display={self.display}, margin_before={self.margin_before}, '
-            'margin_after={self.margin_after}, padding={self.padding}, '
-            'whitespace={self.whitespace}, align={self.align}, '
-            'valign={self.valign}>'
-        ).format(self=self)
+from inscriptis.model.html_element import HtmlElement
 
 
 class CssParse:
-    """
-    Parses CSS specifications and translates them into the corresponding
-    HtmlElements.
+    """Parse CSS specifications and apply them to HtmlElements.
 
     The attribute `display: none`, for instance, is translated to
-    `HtmlElement.display=Display.none`.
+    :attr:`HtmlElement.display=Display.none`.
     """
+
     # used to separate value and unit from each other
     RE_UNIT = re_compile(r'(-?[0-9.]+)(\w+)')
 
     @staticmethod
-    def attr_style(style_attribute, html_element):
-        """
-        Applies the provided style attributes to the given html_element.
+    def attr_style(style_attribute: str, html_element: HtmlElement):
+        """Apply the provided style attributes to the given HtmlElement.
 
         Args:
           style_attribute: The attribute value of the given style sheet.
@@ -120,22 +37,25 @@ def attr_style(style_attribute, html_element):
             key, value = (s.strip() for s in style_directive.split(':', 1))
 
             try:
-                apply_style = getattr(CssParse, "attr_"
+                apply_style = getattr(CssParse, 'attr_'
                                       + key.replace('-webkit-', '')
-                                      .replace("-", "_"))
+                                      .replace('-', '_'))
                 apply_style(value, html_element)
             except AttributeError:
                 pass
 
     @staticmethod
-    def _get_em(length):
-        """
+    def _get_em(length: str) -> int:
+        """Convert length specifications into em.
+
+        This function takes a length specification (e.g., 2em, 2px, etc.) and
+        transforms it into em.
+
         Args:
-          length (str): the length (e.g. 2em, 2px, etc.) as specified in the
-                        CSS.
+          length: the length specification.
 
         Returns:
-            int -- the length in em's.
+            the length in em.
         """
         _m = CssParse.RE_UNIT.search(length)
         value = float(_m.group(1))
@@ -150,10 +70,8 @@ def _get_em(length):
     # ------------------------------------------------------------------------
 
     @staticmethod
-    def attr_display(value, html_element):
-        """
-        Apply the given display value.
-        """
+    def attr_display(value: str, html_element: HtmlElement):
+        """Apply the given display value."""
         if html_element.display == Display.none:
             return
 
@@ -165,55 +83,39 @@ def attr_display(value, html_element):
             html_element.display = Display.inline
 
     @staticmethod
-    def attr_white_space(value, html_element):
-        """
-        Apply the given white-space value.
-        """
+    def attr_white_space(value: str, html_element: HtmlElement):
+        """Apply the given white-space value."""
         if value in ('normal', 'nowrap'):
             html_element.whitespace = WhiteSpace.normal
         elif value in ('pre', 'pre-line', 'pre-wrap'):
             html_element.whitespace = WhiteSpace.pre
 
     @staticmethod
-    def attr_margin_top(value, html_element):
-        """
-        Apply the given top margin.
-        """
+    def attr_margin_top(value: str, html_element: HtmlElement):
+        """Apply the given top margin."""
         html_element.margin_before = CssParse._get_em(value)
 
     @staticmethod
-    def attr_margin_bottom(value, html_element):
-        """
-        Apply the provided bottom margin.
-        """
+    def attr_margin_bottom(value: str, html_element: HtmlElement):
+        """Apply the provided bottom margin."""
         html_element.margin_after = CssParse._get_em(value)
 
     @staticmethod
-    def attr_padding_left(value, html_element):
-        """
-        Apply the given left padding.
-        """
-        html_element.padding = CssParse._get_em(value)
+    def attr_padding_left(value: str, html_element: HtmlElement):
+        """Apply the given left padding (stored as `padding_inline`)."""
+        html_element.padding_inline = CssParse._get_em(value)
 
     @staticmethod
-    def attr_horizontal_align(value, html_element):
-        """
-        Apply the provided horizontal alignment.
-        """
-        try:
+    def attr_horizontal_align(value: str, html_element: HtmlElement):
+        """Apply the provided horizontal alignment."""
+        with suppress(KeyError):
             html_element.align = HorizontalAlignment[value]
-        except KeyError:
-            pass
 
     @staticmethod
-    def attr_vertical_align(value, html_element):
-        """
-        Apply the given vertical alignment.
-        """
-        try:
+    def attr_vertical_align(value: str, html_element: HtmlElement):
+        """Apply the given vertical alignment."""
+        with suppress(KeyError):
             html_element.valign = VerticalAlignment[value]
-        except KeyError:
-            pass
 
     # register aliases
     attr_margin_before = attr_margin_top
diff --git a/src/inscriptis/model/html_element.py b/src/inscriptis/model/html_element.py
new file mode 100644
index 0000000..aa5094c
--- /dev/null
+++ b/src/inscriptis/model/html_element.py
@@ -0,0 +1,167 @@
+"""Data structures for handling HTML Elements."""
+from typing import Tuple
+
+from inscriptis.html_properties import Display, HorizontalAlignment, \
+    VerticalAlignment, WhiteSpace
+
+
+class HtmlElement:
+    """The HtmlElement class stores properties and metadata of HTML elements.
+
+    Attributes:
+    - canvas: the canvas to which the HtmlElement writes its content.
+    - tag: tag name of the given HtmlElement.
+    - prefix: a prefix to insert before the tag's content.
+    - suffix: a suffix to append after the tag's content.
+    - display: the :class:`~inscriptis.html_properties.Display` strategy.
+    - margin_before: vertical margin before the tag's content.
+    - margin_after: vertical margin after the tag's content.
+    - padding_inline: horizontal padding before the tag's content.
+    - list_bullet: the element's list bullet (an empty string by default).
+    - whitespace: the :class:`~inscriptis.html_properties.WhiteSpace` handling
+      strategy.
+    - limit_whitespace_affixes: limit printing of whitespace affixes to
+      elements with `normal` whitespace handling.
+    - align: the element's horizontal alignment.
+    - valign: the element's vertical alignment.
+    - previous_margin_after: the `margin_after` of the previous HtmlElement.
+    - annotation: annotations associated with the HtmlElement.
+    """
+
+    __slots__ = ('canvas', 'tag', 'prefix', 'suffix', 'display',
+                 'margin_before', 'margin_after', 'padding_inline',
+                 'list_bullet', 'whitespace', 'limit_whitespace_affixes',
+                 'align', 'valign', 'previous_margin_after', 'annotation')
+
+    def __init__(self, tag='default', prefix='', suffix='',
+                 display: Display = Display.inline,
+                 margin_before: int = 0,
+                 margin_after: int = 0,
+                 padding_inline: int = 0,
+                 list_bullet: str = '',
+                 whitespace: WhiteSpace = None,
+                 limit_whitespace_affixes: bool = False,
+                 align: HorizontalAlignment = HorizontalAlignment.left,
+                 valign: VerticalAlignment = VerticalAlignment.middle,
+                 annotation: Tuple[str, ...] = ()) -> None:
+        self.canvas = None
+        self.tag = tag
+        self.prefix = prefix
+        self.suffix = suffix
+        self.display = display
+        self.margin_before = margin_before
+        self.margin_after = margin_after
+        self.padding_inline = padding_inline
+        self.list_bullet = list_bullet
+        self.whitespace = whitespace
+        self.limit_whitespace_affixes = limit_whitespace_affixes
+        self.align = align
+        self.valign = valign
+        self.previous_margin_after = 0
+        self.annotation = annotation
+
+    def __copy__(self) -> 'HtmlElement':
+        """Return a shallow, slot-by-slot copy (performance-optimized)."""
+        copy = self.__class__.__new__(self.__class__)
+        for attr in self.__slots__:
+            setattr(copy, attr, getattr(self, attr))
+        return copy
+
+    def write(self, text: str) -> None:
+        """Write the text, wrapped in prefix and suffix, to the canvas."""
+        if not text or self.display == Display.none:
+            return
+
+        self.canvas.write(self, ''.join(
+            (self.prefix, text, self.suffix)))
+
+    def write_tail(self, text: str) -> None:
+        """Write the given tail text to the element's canvas.
+
+        Args:
+            text: the text to write
+        """
+        if not text or self.display == Display.none:
+            return
+        self.write(text)
+
+    def set_canvas(self, canvas) -> 'HtmlElement':
+        self.canvas = canvas
+        return self
+
+    def set_tag(self, tag: str) -> 'HtmlElement':
+        self.tag = tag
+        return self
+
+    def write_verbatim_text(self, text: str) -> None:
+        """Write the given text with `WhiteSpace.pre` to the canvas.
+
+        Args:
+            text: the text to write; no prefix or suffix is added.
+        """
+        if not text:
+            return
+
+        if self.display == Display.block:
+            self.canvas.open_block(self)
+
+        self.canvas.write(self, text, whitespace=WhiteSpace.pre)
+
+        if self.display == Display.block:
+            self.canvas.close_block(self)
+
+    def get_refined_html_element(self, new: 'HtmlElement') -> 'HtmlElement':
+        """Compute the new HTML element based on the previous one.
+
+        Adaptations:
+            canvas, `display: none` and an unset whitespace are inherited;
+            `previous_margin_after` is recorded if both elements are blocks.
+
+        Args:
+            new: The new HtmlElement; it is modified in place.
+
+        Returns:
+            The refined element (`new` itself) with the context applied.
+        """
+        new.canvas = self.canvas
+
+        # inherit `display:none` attributes and ignore further refinements
+        if self.display == Display.none:
+            new.display = Display.none
+            return new
+
+        # no whitespace set => inherit
+        new.whitespace = new.whitespace or self.whitespace
+
+        # do not display whitespace only affixes in Whitespace.pre areas
+        # if `limit_whitespace_affixes` is set.
+        if (new.limit_whitespace_affixes
+                and self.whitespace == WhiteSpace.pre):
+            if new.prefix.isspace():
+                new.prefix = ''
+            if new.suffix.isspace():
+                new.suffix = ''
+
+        if new.display == Display.block and self.display == Display.block:
+            new.previous_margin_after = self.margin_after
+
+        return new
+
+    def __str__(self) -> str:
+        return (
+            '<{self.tag} prefix={self.prefix}, suffix={self.suffix}, '
+            'display={self.display}, margin_before={self.margin_before}, '
+            'margin_after={self.margin_after}, '
+            'padding_inline={self.padding_inline}, '
+            'list_bullet={self.list_bullet}, '
+            'whitespace={self.whitespace}, align={self.align}, '
+            'valign={self.valign}, annotation={self.annotation}>'
+        ).format(self=self)
+
+    __repr__ = __str__
+
+
+#: An empty default HTML element, i.e. an :class:`HtmlElement` created
+#: with the constructor's default values only. The instance is shared by
+#: everybody importing it.
+DEFAULT_HTML_ELEMENT = HtmlElement()
diff --git a/src/inscriptis/model/table.py b/src/inscriptis/model/table.py
index 5673738..ed3fbe2 100644
--- a/src/inscriptis/model/table.py
+++ b/src/inscriptis/model/table.py
@@ -1,166 +1,267 @@
 #!/usr/bin/env python3
 # encoding: utf-8
-"""
-Classes for representing Tables, Rows and TableCells.
-"""
+"""Classes used for representing Tables, TableRows and TableCells."""
+
+from typing import List
+from itertools import chain, accumulate
 
-from itertools import chain, zip_longest
 from inscriptis.html_properties import HorizontalAlignment, VerticalAlignment
+from inscriptis.annotation import Annotation, horizontal_shift
+from inscriptis.model.canvas import Canvas
 
 
-class TableCell:
-    """ A single table cell """
+class TableCell(Canvas):
+    """A table cell.
 
-    __slots__ = ('canvas', 'align', 'valign', 'width', 'height')
+    Attributes:
+        line_width: the original line widths per line (required to adjust
+                    annotations after a reformatting)
+        vertical_padding: vertical padding that has been introduced due to
+                          vertical formatting rules.
+    """
 
-    def __init__(self, canvas, align, valign, width=None, height=None):
-        """
-        Args:
-          canvas: canvas to which the table cell is written.
-          align: the :class:`~inscriptis.html_properties.HorizontalAlignment`
-            of the given line.
-         width: line width
-        """
-        self.canvas = canvas
+    __slots__ = ('annotations', 'block_annotations', 'blocks', 'current_block',
+                 'margin', 'annotation_counter', 'align', 'valign', '_width',
+                 'line_width', 'vertical_padding')
+
+    def __init__(self, align: HorizontalAlignment, valign: VerticalAlignment):
+        super().__init__()
         self.align = align
         self.valign = valign
-        self.width = width
-        self.height = height
+        self._width = None
+        self.line_width = None
+        self.vertical_padding = 0
 
-    def get_format_spec(self):
-        """
-        The format specification according to the values of `align` and
-        `width`.
-        """
-        return '{{:{self.align.value}{self.width}}}'.format(self=self)
+    def normalize_blocks(self) -> int:
+        """Split multi-line blocks into multiple one-line blocks.
 
-    def get_cell_lines(self):
-        """
         Returns:
-          list -- A list of all the lines stores within the :class:`TableCell`.
+            The height of the normalized cell.
         """
-        format_spec = self.get_format_spec()
-        # normalize the canvas
-        self.canvas = list(chain(*[line.split('\n') for line in self.canvas]))
-
-        if self.height and False:
-            canvas = self.canvas + ((self.height - len(self.canvas)) * [''])
-        else:
-            canvas = self.canvas
-
-        # horizontal alignment
-        rows = [format_spec.format(line) if self.width else line
-                for line in canvas]
-
-        # vertical alignment
-        if self.height and len(rows) < self.height:
-            empty_line = [' ' * self.width] if self.width else ['']
-            if self.valign == VerticalAlignment.bottom:
-                rows = ((self.height - len(rows)) * empty_line) + rows
-            elif self.valign == VerticalAlignment.middle:
-                rows = ((self.height - len(rows) - 1) // 2) * empty_line + rows
-                rows = rows + ((self.height - len(rows)) * empty_line)
-            else:
-                rows = rows + ((self.height - len(rows)) * empty_line)
-
-        return rows
+        self._flush_inline()
+        self.blocks = list(chain(*(line.split('\n') for line in self.blocks)))
+        if not self.blocks:
+            self.blocks = ['']
+        return len(self.blocks)
 
+    @property
+    def height(self):
+        """Compute the table cell's height.
 
-class Row:
-    """ A single row within a table """
-    __slot__ = ('columns', )
+        Returns:
+            The cell's current height.
+        """
+        return max(1, len(self.blocks))
 
-    def __init__(self):
-        self.columns = []
+    @property
+    def width(self):
+        """Compute the table cell's width.
 
-    def get_cell_lines(self, column_idx):
+        Returns:
+            The cell's current width.
         """
-        Computes the list of lines in the cell specified by the column_idx.
+        if self._width:
+            return self._width
+        return max((len(line) for line in chain(*(block.split('\n')
+                                                  for block in self.blocks))))
+
+    @width.setter
+    def width(self, width):
+        """Set the cell's width and apply its horizontal formatting.
 
         Args:
-          column_idx: The column index of the cell.
-        Returns:
-          list -- The list of lines in the cell specified by the column_idx or
-                  an empty list if the column does not exist.
+            width: the cell's expected width.
         """
-        return [] if column_idx >= len(self.columns) \
-            else self.columns[column_idx].get_cell_lines()
+        # save the original line widths before reformatting
+        self.line_width = [len(block) for block in self.blocks]
 
-    def get_text(self):
+        # record new width and start reformatting
+        self._width = width
+        format_spec = '{{:{align}{width}}}'.format(align=self.align.value,
+                                                   width=width)
+        self.blocks = [format_spec.format(b) for b in self.blocks]
+
+    @height.setter
+    def height(self, height: int):
+        """Set the cell's height to the given value.
+
+        Notes:
+            Depending on the height and the cell's vertical formatting this
+            might require the introduction of empty lines.
         """
+        rows = len(self.blocks)
+        if rows < height:
+            empty_line = ['']
+            if self.valign == VerticalAlignment.bottom:
+                self.vertical_padding = (height - rows)
+                self.blocks = self.vertical_padding * empty_line + self.blocks
+            elif self.valign == VerticalAlignment.middle:
+                self.vertical_padding = (height - rows) // 2
+                self.blocks = self.vertical_padding * empty_line + \
+                    self.blocks + ((height - rows + 1) // 2 * empty_line)
+            else:
+                self.blocks = self.blocks + ((height - rows) * empty_line)
+
+    def get_annotations(self, idx: int, row_width: int) -> List[Annotation]:
+        """Return a list of all annotations within the TableCell.
+
         Returns:
-          str -- A rendered string representation of the given row.
+            A list of annotations that have been adjusted to the cell's
+            position.
         """
-        row_lines = ['  '.join(line)
-                     for line in zip_longest(*[column.get_cell_lines()
-                                               for column in self.columns],
-                                             fillvalue=' ')]
+        self.current_block.idx = idx
+        if not self.annotations:
+            return []
+
+        # the easy case - the cell has only one line :)
+        if len(self.blocks) == 1:
+            annotations = horizontal_shift(self.annotations,
+                                           self.line_width[0],
+                                           self.width, self.align, idx)
+            self.line_width[0] = self.width
+            return annotations
+
+        # the more challenging one - multiple cell lines
+        line_break_pos = list(accumulate(self.line_width))
+        annotation_lines = [[] for _ in self.blocks]
+
+        # assign annotations to the corresponding line
+        for a in self.annotations:
+            for no, line_break in enumerate(line_break_pos):
+                if a.start <= (line_break + no):         # consider newline
+                    annotation_lines[no + self.vertical_padding].append(a)
+                    break
+
+        # compute the annotation index based on its line and delta :)
+        result = []
+        idx += self.vertical_padding   # newlines introduced by the padding
+        for line_annotations, line_len in zip(annotation_lines,
+                                              self.line_width):
+            result.extend(horizontal_shift(line_annotations, line_len,
+                                           self.width, self.align, idx))
+            idx += row_width - line_len
+        self.line_width = [self.width for _ in self.line_width]
+        return result
+
+
+class TableRow:
+    """A single row within a table: its cells and their separator."""
+
+    __slots__ = ('columns', 'cell_separator')
+
+    def __init__(self, cell_separator: str = '  ') -> None:
+        self.columns: List[TableCell] = []
+        self.cell_separator = cell_separator
+
+    def __len__(self) -> int:
+        return len(self.columns)
+
+    def get_text(self) -> str:
+        """Return the row's text; line i joins the i-th block of each cell."""
+        row_lines = [self.cell_separator.join(line)
+                     for line in zip(*[column.blocks
+                                       for column in self.columns])]
         return '\n'.join(row_lines)
 
+    @property
+    def width(self) -> int:
+        """Return the summed cell widths plus the separators between them."""
+        if not self.columns:
+            return 0
+
+        return sum((cell.width for cell in self.columns)) + len(
+            self.cell_separator) * (len(self.columns) - 1)
 
 class Table:
-    """ A HTML table. """
+    """An HTML table.
+
+    Attributes:
+        rows: the table's rows.
+        left_margin_len: length of the left margin before the table.
+    """
 
-    __slot__ = ('rows', 'td_is_open')
+    __slots__ = ('rows', 'left_margin_len')
 
-    def __init__(self):
+    def __init__(self, left_margin_len: int):
         self.rows = []
-        # keep track of whether the last td tag has been closed
-        self.td_is_open = False
+        self.left_margin_len = left_margin_len
 
     def add_row(self):
-        """
-        Adds an empty :class:`Row` to the table.
-        """
-        self.rows.append(Row())
+        """Add an empty :class:`TableRow` to the table."""
+        self.rows.append(TableRow())
 
-    def add_cell(self, canvas, align=HorizontalAlignment.left,
-                 valign=VerticalAlignment.top):
-        """
-        Adds a new :class:`TableCell` to the table's last row. If no row
-        exists yet, a new row is created.
-        """
-        if not self.rows:
-            self.add_row()
-        self.rows[-1].columns.append(
-            TableCell(canvas, align, valign))
+    def add_cell(self, table_cell: TableCell):
+        """Add a new :class:`TableCell` to the table's last row.
 
-    def compute_column_width_and_height(self):
-        """
-        Compute and set the column width and height for all columns in the
-        table.
+        .. note::
+            If no row exists yet, a new row is created.
         """
-        # skip tables with no row
         if not self.rows:
-            return
+            self.add_row()
+        self.rows[-1].columns.append(table_cell)
 
-        # determine row height
+    def _set_row_height(self):
+        """Set the cell height for all :class:`TableCell`s in the table."""
         for row in self.rows:
-            max_row_height = (max((len(cell.get_cell_lines())
-                                   for cell in row.columns))
-                              if row.columns else 1)
+            max_row_height = max((cell.normalize_blocks()
+                                  for cell in row.columns)) \
+                if row.columns else 0
             for cell in row.columns:
                 cell.height = max_row_height
 
+    def _set_column_width(self):
+        """Set the column width for all :class:`TableCell`s in the table."""
         # determine maximum number of columns
         max_columns = max((len(row.columns) for row in self.rows))
 
-        for column_idx in range(max_columns):
-            # determine max_column_width
-            row_cell_lines = [row.get_cell_lines(column_idx)
-                              for row in self.rows]
-            max_column_width = max((len(line)
-                                    for line in chain(*row_cell_lines)))
+        for cur_column_idx in range(max_columns):
+            # determine the required column width for the current column
+            max_column_width = max((row.columns[cur_column_idx].width
+                                    for row in self.rows
+                                    if len(row) > cur_column_idx))
 
-            # set column width in all rows
+            # set column width for all TableCells in the current column
             for row in self.rows:
-                if len(row.columns) > column_idx:
-                    row.columns[column_idx].width = max_column_width
+                if len(row) > cur_column_idx:
+                    row.columns[cur_column_idx].width = max_column_width
 
     def get_text(self):
-        """
+        """Render the table and return its text representation."""
+        if not self.rows:
+            return '\n'
+
+        self._set_row_height()
+        self._set_column_width()
+        return '\n'.join((row.get_text() for row in self.rows)) + '\n'
+
+    def get_annotations(self, idx: int,
+                        left_margin_len: int) -> List[Annotation]:
+        r"""Return all annotations in the given table.
+
+        Args:
+            idx: the table's start index.
+            left_margin_len: length of the left margin (required for adapting
+                             the position of annotations).
+
         Returns:
-          A rendered string representation of the given table.
+            A list of all :class:`~inscriptis.annotation.Annotation`\s present
+            in the table.
         """
-        self.compute_column_width_and_height()
-        return '\n'.join((row.get_text() for row in self.rows))
+        if not self.rows:
+            return []
+
+        annotations = []
+        idx += left_margin_len
+        for row in self.rows:
+            if not row.columns:
+                continue
+            row_width = row.width + left_margin_len
+            cell_idx = idx
+            for cell in row.columns:
+                annotations += cell.get_annotations(cell_idx, row_width)
+                cell_idx += cell.width + len(row.cell_separator)
+
+            idx += (row_width + 1) * cell.height   # linebreak
+
+        return annotations
diff --git a/tests/html/advanced-prefix-test.html b/tests/html/advanced-prefix-test.html
new file mode 100644
index 0000000..f441b0f
--- /dev/null
+++ b/tests/html/advanced-prefix-test.html
@@ -0,0 +1,19 @@
+
    +
  1. first
  2. +
  3. +
      +
    • y=0
      +for x in range(3,10):
      +   print(x)
      +   y += x
      +print(y)
      +
    • +
    • print("Hallo")
      +print("Echo")
      +print("123")
      +            
    • +
    • +
    +
  4. third
  5. +
+ diff --git a/tests/html/advanced-prefix-test.txt b/tests/html/advanced-prefix-test.txt new file mode 100644 index 0000000..79fddeb --- /dev/null +++ b/tests/html/advanced-prefix-test.txt @@ -0,0 +1,13 @@ + 1. first + 2. + + y=0 + for x in range(3,10): + print(x) + y += x + print(y) + + print("Hallo") + print("Echo") + print("123") + + + + 3. third diff --git a/tests/html/invalid-table3.html b/tests/html/invalid-table3.html new file mode 100644 index 0000000..2c40632 --- /dev/null +++ b/tests/html/invalid-table3.html @@ -0,0 +1,10 @@ +Good day + + first second third +
+forth + + oho + beta
alphaepsilon +
gamma +
diff --git a/tests/html/invalid-table3.txt b/tests/html/invalid-table3.txt new file mode 100644 index 0000000..8120203 --- /dev/null +++ b/tests/html/invalid-table3.txt @@ -0,0 +1,4 @@ +Good day first second third +forth oho beta + alpha epsilon +gamma \ No newline at end of file diff --git a/tests/html/invisible3.html b/tests/html/invisible3.html new file mode 100644 index 0000000..75ddc90 --- /dev/null +++ b/tests/html/invisible3.html @@ -0,0 +1,2 @@ + + diff --git a/tests/html/invisible3.txt b/tests/html/invisible3.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/html/nested-list.html b/tests/html/nested-list.html new file mode 100644 index 0000000..8dd7d8b --- /dev/null +++ b/tests/html/nested-list.html @@ -0,0 +1,36 @@ + + + First +
    +
  • +
  • +
  • 1 +
  • 2 +
  • 3 +
  • +
  • +
+ + Second +
    +
  • +
  • +
  • +
      +
    • 1 +
    • +
        +
      • a +
      • b +
      • c +
      +
    • +
    • 3 +
    +
  • +
  • +
  • +
      + + + diff --git a/tests/html/nested-list.txt b/tests/html/nested-list.txt new file mode 100644 index 0000000..7c65d7b --- /dev/null +++ b/tests/html/nested-list.txt @@ -0,0 +1,21 @@ +First + * + * + * 1 + * 2 + * 3 + * + * +Second + * + * + * + + 1 + + + o a + o b + o c + + 3 + * + * + * diff --git a/tests/html/nested-table-alignment-css.txt b/tests/html/nested-table-alignment-css.txt index 607bafa..690ad59 100644 --- a/tests/html/nested-table-alignment-css.txt +++ b/tests/html/nested-table-alignment-css.txt @@ -1,7 +1,7 @@ column with nested table column 2 column 3 column 4 nested table Tom -11 12 Joe -21 22 +11 12 +21 22 Joe 31 32 Sue -last line +last line \ No newline at end of file diff --git a/tests/html/nested-table-alignment.txt b/tests/html/nested-table-alignment.txt index 6184db7..08642b5 100644 --- a/tests/html/nested-table-alignment.txt +++ b/tests/html/nested-table-alignment.txt @@ -1,7 +1,7 @@ column with nested table column 2 column 3 column 4 nested table Tom - 11 12 Joe - 21 22 + 11 12 + 21 22 Joe 31 32 Sue -last line +last line \ No newline at end of file diff --git a/tests/html/nested-table.html b/tests/html/nested-table.html index fa71723..6cc6bc3 100644 --- a/tests/html/nested-table.html +++ b/tests/html/nested-table.html @@ -1,4 +1,4 @@ - +
      @@ -9,7 +9,7 @@
      column with nested table column 2 column 3
      12
      - Tom + Tom Joe diff --git a/tests/html/nested-table.txt b/tests/html/nested-table.txt index a94a71f..2f4a746 100644 --- a/tests/html/nested-table.txt +++ b/tests/html/nested-table.txt @@ -1,5 +1,5 @@ column with nested table column 2 column 3 -nested table Tom Joe -1 2 +nested table +1 2 Tom Joe -last line +last line \ No newline at end of file diff --git a/tests/html/stackoverflow-list-snippet.html b/tests/html/stackoverflow-list-snippet.html new file mode 100644 index 0000000..6d148a7 --- /dev/null +++ b/tests/html/stackoverflow-list-snippet.html @@ -0,0 +1,34 @@ +
    • +
      +
      +
      +
      +
      +
    • +
    • +
      +
      +
      +
      +
      +
      + + @nbedou docs.python.org/3/library/typing.html#typing.NamedTuple + +– nodakai + Oct 3 '18 at 7:44 +
      +
      +
    • diff --git a/tests/html/stackoverflow-list-snippet.txt b/tests/html/stackoverflow-list-snippet.txt new file mode 100644 index 0000000..38bfc38 --- /dev/null +++ b/tests/html/stackoverflow-list-snippet.txt @@ -0,0 +1,2 @@ +* I obtain "NameError: name 'NamedTuple' is not defined" – nbedou Jul 6 '18 at 12:45 +* @nbedou docs.python.org/3/library/typing.html#typing.NamedTuple – nodakai Oct 3 '18 at 7:44 diff --git a/tests/html/subsequent-headings.html b/tests/html/subsequent-headings.html new file mode 100644 index 0000000..ca3dc1a --- /dev/null +++ b/tests/html/subsequent-headings.html @@ -0,0 +1,24 @@ + + + Test the spacing between subsequent headings + +

      The first

      + + And text, concerning the first heading. + +

      The second

      + Text concerning the second heading. + +

      Subheading

      + Sub1 + +

      This is a subsubtopic

      + +

      Another subheading

      + Sub2 + +

      The third

      + The third and final heading. + + + diff --git a/tests/html/subsequent-headings.json b/tests/html/subsequent-headings.json new file mode 100644 index 0000000..247cf27 --- /dev/null +++ b/tests/html/subsequent-headings.json @@ -0,0 +1,18 @@ +{"annotation_rules": { + "h1": ["heading"], + "h2": ["heading"], + "h3": ["heading"], + "b": ["emphasis"], + "table": ["table"], + "th": ["table-heading"], + "td": ["table-cell"] + }, + "result": [ + ["heading", "The first\n\n"], + ["heading", "\nThe second\n\n"], + ["heading", "\nSubheading\n\n"], + ["heading", "\nThis is a subsubtopic\n\n"], + ["heading", "Another subheading\n\n"], + ["heading", "\nThe third\n\n"] + ] +} diff --git a/tests/html/subsequent-headings.txt b/tests/html/subsequent-headings.txt new file mode 100644 index 0000000..28ceb96 --- /dev/null +++ b/tests/html/subsequent-headings.txt @@ -0,0 +1,21 @@ +The first + +And text, concerning the first heading. + +The second + +Text concerning the second heading. + +Subheading + +Sub1 + +This is a subsubtopic + +Another subheading + +Sub2 + +The third + +The third and final heading. diff --git a/tests/html/table-empty-row.html b/tests/html/table-empty-row.html new file mode 100644 index 0000000..5d34f45 --- /dev/null +++ b/tests/html/table-empty-row.html @@ -0,0 +1,9 @@ + + + Leer + +
      Hallo + Echo +
      (1) + (2) +
      diff --git a/tests/html/table-empty-row.txt b/tests/html/table-empty-row.txt new file mode 100644 index 0000000..3f81159 --- /dev/null +++ b/tests/html/table-empty-row.txt @@ -0,0 +1,5 @@ +Leer +Hallo Echo + +(1) (2) + diff --git a/tests/html/table-in-table.html b/tests/html/table-in-table.html index 70f51af..e108496 100644 --- a/tests/html/table-in-table.html +++ b/tests/html/table-in-table.html @@ -1,35 +1,35 @@

      Single

      First

      - +
      redgreen
      redgreen
      blue
      redgreen

      Second

      - - +
      blue
      redgreen
      +
      blue
      red?green
      blue

      Nested

      - +
      - - @@ -43,6 +43,6 @@

      Nested

      + - + +
      redgreen
      -
      redgreen.
      blue
      redgreen
      +
      blue
      redgreen
      blue
      redgreen
      blue
      -
      blue
      redgreen
      blue
      blue
      + +
      blue
      blue.
      redgreen
      blue
      -
      redgreen
      blue
      redgreen
      redgreen!
      blue
      redgreen
      blue
      blue
      -
      blue
      redgreen
      blue
      + blue! diff --git a/tests/html/table-in-table.json b/tests/html/table-in-table.json new file mode 100644 index 0000000..09e7475 --- /dev/null +++ b/tests/html/table-in-table.json @@ -0,0 +1,25 @@ +{"annotation_rules": { + "h1": ["heading"], + "h2": ["heading"], + "h3": ["heading"], + "table#border": ["table"], + "b": ["bold"], + "i": ["italic"] + }, + "result": [ + ["heading", "Single\n\n"], + ["heading", "First\n\n"], + ["table", "red green\n blue \nred green\n\n"], + ["heading", "\nSecond\n\n"], + ["table", " blue \nred? green\n blue \n\n"], + ["bold", "red?"], + ["heading", "\nNested\n\n"], + ["table", "red green. blue blue \n blue red green red green \nred green blue blue \n \n blue. red green blue \nred green blue red green \n blue red green! blue \n \nred green blue blue \n blue red green red green\nred green blue blue! \n \n"], + ["italic", "green."], + ["italic", "blue."], + ["bold", "green!"], + ["bold", "blue!"] + ] +} + + diff --git a/tests/html/table-in-table.txt b/tests/html/table-in-table.txt index 7582610..2fb35fd 100644 --- a/tests/html/table-in-table.txt +++ b/tests/html/table-in-table.txt @@ -1,28 +1,29 @@ Single - First red green blue red green + Second - blue -red green - blue + blue +red? green + blue + Nested -red green blue blue - blue red green red green -red green blue blue - - blue red green blue -red green blue red green - blue red green blue - -red green blue blue - blue red green red green -red green blue blue +red green. blue blue + blue red green red green +red green blue blue + + blue. red green blue +red green blue red green + blue red green! blue + +red green blue blue + blue red green red green +red green blue blue! 
\ No newline at end of file diff --git a/tests/html/table-pre.txt b/tests/html/table-pre.txt index 3633aa0..c170f55 100644 --- a/tests/html/table-pre.txt +++ b/tests/html/table-pre.txt @@ -9,5 +9,4 @@ for a in range(10): for (int a=0; a<10; a++) { print(b) System.out.println(b); } - 3.8 14 diff --git a/tests/html/table.html b/tests/html/table.html index 8e2a86a..87dfc98 100644 --- a/tests/html/table.html +++ b/tests/html/table.html @@ -4,7 +4,7 @@ Third a - b + b c diff --git a/tests/html/table.json b/tests/html/table.json new file mode 100644 index 0000000..ddaf853 --- /dev/null +++ b/tests/html/table.json @@ -0,0 +1,20 @@ +{"annotation_rules": { + "h1": ["heading"], + "h2": ["heading"], + "h3": ["heading"], + "b": ["emphasis"], + "table": ["table"], + "th": ["table-heading"], + "td": ["table-cell"] + }, + "result": [ + ["table", "First Second Third\na b c \n"], + ["table-heading", "First"], + ["table-heading", "Second"], + ["table-heading", "Third"], + ["table-cell", "a"], + ["emphasis", "b"], + ["table-cell", "b"], + ["table-cell", "c"] + ] +} diff --git a/tests/html/wikipedia-consequtive-links-and-umlauts.html b/tests/html/wikipedia-consequtive-links-and-umlauts.html new file mode 100644 index 0000000..6a6c2c8 --- /dev/null +++ b/tests/html/wikipedia-consequtive-links-and-umlauts.html @@ -0,0 +1,16 @@ +

      Araschgen | +Chur City | +Dreibündenquartier | +Fürstenwald | +Giacomettiquartier | +Kornquader | +Lacunaquartier | +Masans | +Niederlachen-Untere Au | +Rheinquartier | +Rossboden | +Sand | +Sommerau | +Tittwiesen | +Wiesental +

      diff --git a/tests/html/wikipedia-consequtive-links-and-umlauts.txt b/tests/html/wikipedia-consequtive-links-and-umlauts.txt new file mode 100644 index 0000000..60dc66d --- /dev/null +++ b/tests/html/wikipedia-consequtive-links-and-umlauts.txt @@ -0,0 +1 @@ +Araschgen | Chur City | Dreibündenquartier | Fürstenwald | Giacomettiquartier | Kornquader | Lacunaquartier | Masans | Niederlachen-Untere Au | Rheinquartier | Rossboden | Sand | Sommerau | Tittwiesen | Wiesental diff --git a/tests/html/wikipedia-consequtive-tables.html b/tests/html/wikipedia-consequtive-tables.html new file mode 100644 index 0000000..dbe86d7 --- /dev/null +++ b/tests/html/wikipedia-consequtive-tables.html @@ -0,0 +1,243 @@ + + +
      Hallo
      + + + +
      +
      Monatliche Durchschnittstemperaturen und -niederschläge für Chur 1981–2010
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +Jan +Feb +Mär +Apr +Mai +Jun +Jul +Aug +Sep +Okt +Nov +Dez + + +
      Max. Temperatur (°C) +4,8 +6,4 +11,2 +15,1 +20,0 +22,7 +24,9 +24,1 +20,0 +16,1 +9,5 +5,3 +Ø +15,1 +
      Min. Temperatur (°C) +−2,6 +−2,0 +1,6 +4,6 +8,9 +11,8 +13,8 +13,7 +10,3 +6,6 +1,7 +−1,4 +Ø +5,6 +
      Temperatur (°C) +0,7 +1,8 +5,9 +9,7 +14,3 +17,1 +19,1 +18,5 +14,8 +10,8 +5,2 +1,7 +Ø +10 +
      +
      Niederschlag (mm) +51 +47 +55 +49 +71 +92 +109 +112 +81 +56 +70 +55 +Σ +848 +
      +
      Sonnenstunden (h/d) +3,1 +4,0 +4,5 +4,9 +5,5 +5,9 +6,5 +6,0 +5,2 +4,4 +3,1 +2,6 +Ø +4,6 +
      +
      Regentage (d) +7,3 +6,6 +8,1 +7,5 +9,9 +11,2 +11,0 +11,2 +8,4 +7,0 +8,5 +7,9 +Σ +104,6 +
      + diff --git a/tests/html/wikipedia-consequtive-tables.json b/tests/html/wikipedia-consequtive-tables.json new file mode 100644 index 0000000..9c59689 --- /dev/null +++ b/tests/html/wikipedia-consequtive-tables.json @@ -0,0 +1,32 @@ +{ + "annotation_rules": { + "h1": ["heading"], + "h2": ["heading"], + "h3": ["subheading"], + "h4": ["subheading"], + "h5": ["subheading"], + "i": ["emphasis"], + "b": ["bold"], + "th": ["tableheading"], + "a": ["link"] + }, + "result": [ + ["bold", "Monatliche Durchschnittstemperaturen und -niederschl\u00e4ge f\u00fcr Chur 1981\u20132010"], + ["link", "Temperatur"], + ["bold", "\u00d8"], + ["bold", "15,1"], + ["bold", "\u00d8"], + ["bold", "5,6"], + ["bold", "\u00d8"], + ["bold", "10"], + ["link", "Niederschlag"], + ["bold", "\u03a3"], + ["bold", "848"], + ["link", "Sonnenstunden"], + ["bold", "\u00d8"], + ["bold", "4,6"], + ["link", "Regentage"], + ["bold", "\u03a3"], + ["bold", "104,6"] + ] +} diff --git a/tests/html/wikipedia-enumeration-annotation.html b/tests/html/wikipedia-enumeration-annotation.html new file mode 100644 index 0000000..f6ac43d --- /dev/null +++ b/tests/html/wikipedia-enumeration-annotation.html @@ -0,0 +1,66 @@ +
      +

      Inhaltsverzeichnis

      +
      +Another marker. + + +

      End of enumeration

      + +Closing remarks and an emphasized text portion. diff --git a/tests/html/wikipedia-enumeration-annotation.json b/tests/html/wikipedia-enumeration-annotation.json new file mode 100644 index 0000000..1d066a4 --- /dev/null +++ b/tests/html/wikipedia-enumeration-annotation.json @@ -0,0 +1,19 @@ +{"annotation_rules": { + "h1": ["heading"], + "h2": ["heading"], + "h3": ["heading"], + "b": ["emphasis"], + "table": ["table"], + "th": ["table-heading"], + "td": ["table-cell"] + }, + "result": [ + ["heading", "Inhaltsverzeichnis\n\n"], + ["emphasis", "marker"], + ["emphasis", "marker2"], + ["emphasis", "marker3"], + ["emphasis", "marker31"], + ["heading", "\nEnd of enumeration\n\n"], + ["emphasis", "emphasized text portion"] + ] +} diff --git a/tests/html/wikipedia-enumeration-annotation.txt b/tests/html/wikipedia-enumeration-annotation.txt new file mode 100644 index 0000000..a6b487e --- /dev/null +++ b/tests/html/wikipedia-enumeration-annotation.txt @@ -0,0 +1,45 @@ +Inhaltsverzeichnis + +Another marker. + * 1 Name und Aussprache - marker2 + * 2 Geographie - marker3 + + 2.1 Stadtquartiere - marker31 + + 2.2 Klima + * 3 Geschichte + + 3.1 Vorrömische Zeit + + 3.2 Antike + + 3.3 Mittelalter + + 3.4 Wende zur Neuzeit + + 3.5 Reformation und Dreissigjähriger Krieg + + 3.6 19. Jahrhundert + + 3.7 Moderne und Gegenwart + * 4 Bevölkerung + + 4.1 Sprachen + + 4.2 Religionen + * 5 Wappen + * 6 Politik + + 6.1 Stadtpräsidenten + + 6.2 Partnerstädte + * 7 Wirtschaft und Infrastruktur + + 7.1 Wirtschaft + + 7.2 Land- und Alpwirtschaft + + 7.3 Verkehr + + 7.4 Bildung + + 7.5 Medien + + 7.6 Kultur + + 7.7 Justiz + + 7.8 Friedhöfe + + 7.9 Sportvereine + * 8 Sehenswürdigkeiten und Tourismus + + 8.1 Tourismus + * 9 Besonderes + * 10 Galerie + * 11 Persönlichkeiten + * 12 Siehe auch + * 13 Literatur + * 14 Weblinks + * 15 Einzelnachweise + +End of enumeration + +Closing remarks and an emphasized text portion. 
diff --git a/tests/html/wikipedia-equation.html b/tests/html/wikipedia-equation.html new file mode 100644 index 0000000..f9851b7 --- /dev/null +++ b/tests/html/wikipedia-equation.html @@ -0,0 +1,10 @@ + + +
      int factorial(int x) {
      +    if (x <= 1)
      +            return 1;
      +
      +                return x * factorial(x - 1);
      +                }
      +
      + diff --git a/tests/html/wikipedia-equation.txt b/tests/html/wikipedia-equation.txt new file mode 100644 index 0000000..cae169e --- /dev/null +++ b/tests/html/wikipedia-equation.txt @@ -0,0 +1,7 @@ +int factorial(int x) { + if (x <= 1) + return 1; + + return x * factorial(x - 1); + } + diff --git a/tests/html/wikipedia-table-bordercase-verticial-alignmnet.html b/tests/html/wikipedia-table-bordercase-verticial-alignmnet.html new file mode 100644 index 0000000..58f54d8 --- /dev/null +++ b/tests/html/wikipedia-table-bordercase-verticial-alignmnet.html @@ -0,0 +1,28 @@ + + + + + +
      + + + + + +
      diff --git a/tests/html/wikipedia-table-bordercase-verticial-alignmnet.json b/tests/html/wikipedia-table-bordercase-verticial-alignmnet.json new file mode 100644 index 0000000..800e196 --- /dev/null +++ b/tests/html/wikipedia-table-bordercase-verticial-alignmnet.json @@ -0,0 +1,31 @@ +{ + "annotation_rules": { + "h1": ["heading"], + "h2": ["heading"], + "h3": ["subheading"], + "h4": ["subheading"], + "h5": ["subheading"], + "i": ["emphasis"], + "b": ["bold"], + "th": ["tableheading"], + "a": ["link"] + }, + "result": [ + ["link", " * Araschgen"], + ["link", " * F\u00fcrstenwald"], + ["link", " * Masans"], + ["link", " * Niederlachen-Untere Au"], + ["link", " * Lacuna"], + ["link", " * Giacomettiquartier"], + ["link", " * Chur West"], + ["link", " * Dreib\u00fcnden"], + ["link", " * Sand"], + ["link", " * Kornquader"], + ["link", " * Rheinquartier"], + ["link", " * Rossboden"], + ["link", "Sommerau"], + ["link", " * Wiesental"], + ["link", " * Tittwiesen"], + ["link", "[8]"] + ] +} diff --git a/tests/html/wikipedia-table-bordercase1.html b/tests/html/wikipedia-table-bordercase1.html new file mode 100644 index 0000000..c751947 --- /dev/null +++ b/tests/html/wikipedia-table-bordercase1.html @@ -0,0 +1,21 @@ +
      +Dieser Artikel behandelt den Bündner Hauptort. Für andere Bedeutungen siehe Chur (Begriffsklärung).
      + + + + + + + + + + + + +
      Chur +
      Wappen von Chur
      +
      Staat: +SchweizSchweiz Schweiz +
      Kanton: +Kanton GraubündenKanton Graubünden Graubünden (GR) +
      diff --git a/tests/html/wikipedia-table-bordercase1.json b/tests/html/wikipedia-table-bordercase1.json new file mode 100644 index 0000000..7f96ed9 --- /dev/null +++ b/tests/html/wikipedia-table-bordercase1.json @@ -0,0 +1,21 @@ +{ + "annotation_rules": { + "h1": ["heading"], + "h2": ["heading"], + "h3": ["subheading"], + "h4": ["subheading"], + "h5": ["subheading"], + "i": ["emphasis"], + "b": ["bold"], + "th": ["tableheading"], + "a": ["link"] + }, + "result": [ + ["link", "Chur (Begriffskl\u00e4rung)"], + ["tableheading", "Chur "], + ["link", "Staat"], + ["link", "Schweiz"], + ["link", "Kanton"], + ["link", "Graub\u00fcnden"] + ] +} diff --git a/tests/html/wikipedia-table.html b/tests/html/wikipedia-table.html index 2474362..6d207ad 100644 --- a/tests/html/wikipedia-table.html +++ b/tests/html/wikipedia-table.html @@ -1,3 +1,6 @@ +

      Ehre sei Gott in der Höhe!

      +und Friede den Menschen, die guten Willens sind. +

      Bevölkerung[Bearbeiten]

      diff --git a/tests/html/wikipedia-table.json b/tests/html/wikipedia-table.json new file mode 100644 index 0000000..d648f07 --- /dev/null +++ b/tests/html/wikipedia-table.json @@ -0,0 +1,33 @@ +{ + "annotation_rules": { + "h1": ["heading"], + "h2": ["heading"], + "h3": ["subheading"], + "h4": ["subheading"], + "h5": ["subheading"], + "i": ["emphasis"], + "b": ["bold"], + "table": ["table"], + "th": ["tableheading"], + "a": ["link"] + }, + "result": [ + ["heading", "Ehre sei Gott in der H\u00f6he!\n\n"], + ["link", "Bearbeiten"], + ["heading", "\nBev\u00f6lkerung[Bearbeiten]\n\n"], + ["table", "Bev\u00f6lkerungsentwicklung[6]\nJahr 1500 1860 1900 1950 1970 2000 2005 2011 2012 \nEinwohner ca. 1500 3990 11'532 19'382 31'193 32'989 32'409 36'690 37'036\n"], + ["link", "[6]"], + ["tableheading", "Bev\u00f6lkerungsentwicklung[6]"], + ["tableheading", "Jahr"], + ["tableheading", "1500"], + ["tableheading", "1860"], + ["tableheading", "1900"], + ["tableheading", "1950"], + ["tableheading", "1970"], + ["tableheading", "2000"], + ["tableheading", "2005"], + ["tableheading", "2011"], + ["tableheading", "2012"], + ["tableheading", "Einwohner"] + ] +} diff --git a/tests/html/wikipedia-table.txt b/tests/html/wikipedia-table.txt index f1fe136..d39986e 100644 --- a/tests/html/wikipedia-table.txt +++ b/tests/html/wikipedia-table.txt @@ -1,5 +1,9 @@ +Ehre sei Gott in der Höhe! + +und Friede den Menschen, die guten Willens sind. + Bevölkerung[Bearbeiten] Bevölkerungsentwicklung[6] Jahr 1500 1860 1900 1950 1970 2000 2005 2011 2012 -Einwohner ca. 1500 3990 11'532 19'382 31'193 32'989 32'409 36'690 37'036 +Einwohner ca. 
1500 3990 11'532 19'382 31'193 32'989 32'409 36'690 37'036 \ No newline at end of file diff --git a/tests/test_annotation.py b/tests/test_annotation.py new file mode 100644 index 0000000..b19ddeb --- /dev/null +++ b/tests/test_annotation.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python +# encoding: utf-8 + +""" +Tests the Table formatting with different parameters such as width and +alignment +""" + +from inscriptis.annotation import Annotation, horizontal_shift +from inscriptis.html_properties import HorizontalAlignment + + +def test_horizontal_shift(): + a = [Annotation(0, 4, 'test')] + + # no shift + assert horizontal_shift(a, + content_width=5, + line_width=10, + align=HorizontalAlignment.left, + shift=0).pop() == Annotation(0, 4, 'test') + + # shift + assert horizontal_shift(a, + content_width=5, + line_width=10, + align=HorizontalAlignment.left, + shift=3).pop() == Annotation(3, 7, 'test') + + # realignment to the right + assert horizontal_shift(a, + content_width=len('test'), + line_width=10, + align=HorizontalAlignment.right, + shift=0).pop() == Annotation(6, 10, 'test') + assert '{:>10}'.format('test')[6:10] == 'test' + + + # shift + realignment to the right + assert horizontal_shift(a, + content_width=len('test'), + line_width=10, + align=HorizontalAlignment.right, + shift=3).pop() == Annotation(9, 13, 'test') + + # realignment to the center + assert horizontal_shift(a, + content_width=len('test'), + line_width=10, + align=HorizontalAlignment.center, + shift=0).pop() == Annotation(3, 7, 'test') + assert '{:^10}'.format('test')[3:7] == 'test' + + assert horizontal_shift(a, + content_width=len('test'), + line_width=11, + align=HorizontalAlignment.center, + shift=0).pop() == Annotation(3, 7, 'test') + assert '{:^11}'.format('test')[3:7] == 'test' + + # realignment + shift + assert horizontal_shift(a, + content_width=len('test'), + line_width=11, + align=HorizontalAlignment.center, + shift=7).pop() == Annotation(10, 14, 'test') + diff --git 
a/tests/test_annotation_output_processor.py b/tests/test_annotation_output_processor.py new file mode 100644 index 0000000..ea025ec --- /dev/null +++ b/tests/test_annotation_output_processor.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python + +""" +Test the annotation output formatter. +""" + +import pytest + +from inscriptis.annotation.output import AnnotationProcessor +from inscriptis.annotation.output.html import HtmlExtractor +from inscriptis.annotation.output.surface import SurfaceExtractor +from inscriptis.annotation.output.xml import XmlExtractor + +EXAMPLE_OUTPUT = {'text': 'Chur\n\nChur is the capital and largest town of ' + 'the Swiss canton of the Grisons and lies in the ' + 'Grisonian Rhine Valley.', + 'label': [[0, 4, 'heading'], + [0, 4, 'h1'], + [6, 10, 'emphasis']]} + + +def test_abstract_class(): + processor = AnnotationProcessor() + + with pytest.raises(NotImplementedError): + result = processor(EXAMPLE_OUTPUT) + + +def test_surface_annotator(): + processor = SurfaceExtractor() + result = processor(EXAMPLE_OUTPUT) + + # the old keys haven't been changed + assert 'text' in result + assert 'label' in result + + # and we have additional information on surface forms :) + assert result['surface'] == [('heading', 'Chur'), + ('h1', 'Chur'), + ('emphasis', 'Chur')] + + +def test_xml_annotator(): + processor = XmlExtractor() + result = processor(EXAMPLE_OUTPUT) + + # and we have additional information on surface forms :) + assert result == ('' + '

      Chur

      \n\n' + 'Chur is the capital and largest town ' + 'of the Swiss canton of the Grisons and lies in ' + 'the Grisonian Rhine Valley.') + + +def test_html_annotator(): + processor = HtmlExtractor() + result = processor(EXAMPLE_OUTPUT) + + assert result.startswith('' + '
      heading'
      +                           ''
      +                           'h1'
      +                           'Chur
      \n' + '
      \n'
      +                           '
      emphasis'
      +                           'Chur is the capital '
      +                           'and largest town of the Swiss canton of the '
      +                            'Grisons and lies in the Grisonian Rhine Valley.'
      +                           '
      ') + + +def test_trailing_tag_annotation(): + processor = XmlExtractor() + result = processor({'text': 'Ehre sei Gott!', + 'label': [[9, 14, 'emphasis']]}) + + assert result == ('' + 'Ehre sei Gott!') diff --git a/tests/test_annotation_rule_parsing.py b/tests/test_annotation_rule_parsing.py new file mode 100644 index 0000000..fef265a --- /dev/null +++ b/tests/test_annotation_rule_parsing.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python +# encoding: utf-8 + +""" +Tests the Table formatting with different parameters such as width and +alignment +""" + +from copy import deepcopy + +from inscriptis.css_profiles import CSS_PROFILES +from inscriptis.annotation.parser import AnnotationModel, ApplyAnnotation +from inscriptis.model.attribute import Attribute +from inscriptis.model.html_element import HtmlElement + + +def test_parse(): + """ + basic rule parsing. + """ + rules = {'table#border=1': ['table'], + 'hr': ['horizontal-line']} + tags, attrs = AnnotationModel._parse(rules) + + assert tags == {'hr': ['horizontal-line']} + + apply_annotation= attrs[0] + assert apply_annotation.match_tag == 'table' + assert apply_annotation.match_value == '1' + assert apply_annotation.attr == 'border' + + e = HtmlElement(tag='table') + apply_annotation.apply('1', e) + assert e.annotation == ('table', ) + + +def test_apply_annotation(): + """ + rule application. 
+ """ + rules = {'table#border=1': ['table'], + 'hr': ['horizontal-line'], + '#color=red': ['red'], + '#bgcolor': ['bgcolor']} + + css = deepcopy(CSS_PROFILES['strict']) + annotation_model = AnnotationModel(css, rules) + assert annotation_model.css['hr'].annotation == ('horizontal-line', ) + + attribute_handler = Attribute() + attribute_handler.merge_attribute_map(annotation_model.css_attr) + assert 'table#border=1' in str(attribute_handler.attribute_mapping['border']) + assert '{any}#color=red' in str(attribute_handler.attribute_mapping['color']) + assert '{any}#bgcolor={any}' in str(attribute_handler.attribute_mapping['bgcolor']) + +def test_merged_attribute(): + """ + test multiple rules per attribute + """ + rules = {'#color=white': ['white'], + '#color=yellow': ['yellow']} + css = deepcopy(CSS_PROFILES['strict']) + annotation_model = AnnotationModel(css, rules) + + attribute_handler = Attribute() + attribute_handler.merge_attribute_map(annotation_model.css_attr) + + e = HtmlElement() + attribute_handler.attribute_mapping['color']('green', e) + assert e.annotation == () + attribute_handler.attribute_mapping['color']('yellow', e) + assert e.annotation == ('yellow', ) + attribute_handler.attribute_mapping['color']('white', e) + assert e.annotation == ('yellow', 'white') + + diff --git a/tests/test_block.py b/tests/test_block.py new file mode 100644 index 0000000..21ac592 --- /dev/null +++ b/tests/test_block.py @@ -0,0 +1,65 @@ +""" +Test cases for the Block class. 
+""" +from inscriptis.model.canvas.block import Block +from inscriptis.model.canvas.prefix import Prefix + + +def test_merge_normal_text_collapsable_whitespaces(): + """ + test cases where the block has collapsable whitespaces + """ + b = Block(0, Prefix()) + b.merge_normal_text("Hallo") + assert b._content == 'Hallo' + assert not b.collapsable_whitespace + + b = Block(0, Prefix()) + b.merge_normal_text(" Hallo ") + assert b._content == 'Hallo ' + assert b.collapsable_whitespace + + b = Block(0, Prefix()) + b.merge_normal_text('') + assert b._content == '' + assert b.collapsable_whitespace + + b.merge_normal_text(' ') + assert b._content == '' + assert b.collapsable_whitespace + + b.merge_normal_text(' ') + assert b._content == '' + assert b.collapsable_whitespace + + +def test_merge_normal_non_collapsable_whitespaces(): + b = Block(0, Prefix()) + b.collapsable_whitespace = False + b.merge_normal_text("Hallo") + assert b._content == 'Hallo' + assert not b.collapsable_whitespace + + b = Block(0, Prefix()) + b.collapsable_whitespace = False + b.merge_normal_text(" Hallo ") + assert b._content == ' Hallo ' + assert b.collapsable_whitespace + + b = Block(0, Prefix()) + b.collapsable_whitespace = False + b.merge_normal_text('') + assert b._content == '' + assert not b.collapsable_whitespace + + b = Block(0, Prefix()) + b.collapsable_whitespace = False + b.merge_normal_text(' ') + assert b._content == ' ' + assert b.collapsable_whitespace + + b = Block(0, Prefix()) + b.collapsable_whitespace = False + b.merge_normal_text(' ') + assert b._content == ' ' + assert b.collapsable_whitespace diff --git a/tests/test_broken_table_handling.py b/tests/test_broken_table_handling.py index 124c6da..bd210e9 100644 --- a/tests/test_broken_table_handling.py +++ b/tests/test_broken_table_handling.py @@ -14,14 +14,16 @@ def test_forgotten_td_close_tag(): # one line (i.e., missing before the next - html = (u'hallo
      and the next
      ' + html = ('hallo
      ' '' - u'
      12
      echo') - assert get_text(html, config) == u'hallo\n1 2\necho' + '
      echo') + print(html) + # assert get_text(html, config) == u'hallo\n1 2\necho' # two lines (i.e. missing before the and before the - html = (u'hallo' + html = ('hallo
      ' '
      12' '
      34' - u'
      echo') - assert get_text(html, config) == u'hallo\n1 2\n3 4\necho' + 'echo') + print(html) + assert get_text(html, config) == u'hallo\n1 2\n3 4\n\necho' diff --git a/tests/test_double_a.py b/tests/test_double_a.py index 83465b4..24623bd 100644 --- a/tests/test_double_a.py +++ b/tests/test_double_a.py @@ -7,6 +7,7 @@ from inscriptis import get_text + def test_successive_a(): html = 'first' \ 'second' diff --git a/tests/test_empty_string.py b/tests/test_empty_string.py index 4e1a10c..d436dc1 100644 --- a/tests/test_empty_string.py +++ b/tests/test_empty_string.py @@ -7,6 +7,7 @@ from inscriptis import get_text + def test_empty_and_corrupt(): assert get_text("test").strip() == "test" assert get_text(" ") == "" diff --git a/tests/test_engine.py b/tests/test_engine.py new file mode 100644 index 0000000..728191b --- /dev/null +++ b/tests/test_engine.py @@ -0,0 +1,11 @@ +# test borderline cases + +from inscriptis import get_text, get_annotated_text + + +def test_text_from_empty_content(): + assert get_text('') == '' + + +def test_annotations_from_empty_content(): + assert get_annotated_text('') == {} diff --git a/tests/test_html_snippets.py b/tests/test_html_snippets.py index f2efca5..9e7197f 100644 --- a/tests/test_html_snippets.py +++ b/tests/test_html_snippets.py @@ -1,8 +1,8 @@ #!/usr/bin/env python -""" ensures that two successive text contain - a space between each other, if there is a linebreak - or space between the tags. +""" +Test HTML snippets in the project's HTML directory. The corresponding .txt file +contains the reference conversion. 
""" from os.path import dirname, join from glob import glob @@ -22,18 +22,21 @@ def test_html_snippets(filter_str=''): with open(testcase_txt) as f: reference_txt = f.read().rstrip() - with open(testcase_txt.replace(".txt", ".html")) as f: + with open(testcase_txt.replace('.txt', '.html')) as f: print(f.name) - html = "{}".format(f.read()) + html = '{}'.format(f.read()) converted_txt = get_text(html, ParserConfig( css=CSS_PROFILES['strict'])).rstrip() - reference_txt = '\n'.join([line + "<" for line in reference_txt.split("\n")]) - converted_txt = '\n'.join([line + "<" for line in converted_txt.split("\n")]) if converted_txt != reference_txt: - print("File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}" + print('File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}' .format(testcase_txt, html, reference_txt, converted_txt)) + print('HTML file:', testcase_txt.replace('.txt', '.html')) + print("Visualize differences with `vimdiff reference.txt " + "converted.txt`") + open("reference.txt", "w").write(reference_txt) + open("converted.txt", "w").write(converted_txt) assert converted_txt == reference_txt diff --git a/tests/test_html_snippets_annotations.py b/tests/test_html_snippets_annotations.py new file mode 100644 index 0000000..9655afa --- /dev/null +++ b/tests/test_html_snippets_annotations.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python + +""" +This test case verifies that annotation are correctly computed. 
+""" +import os +from json import load +from glob import glob +from typing import List + +from inscriptis import get_annotated_text +from inscriptis.css_profiles import CSS_PROFILES +from inscriptis.model.config import ParserConfig + +TESTCASE_PATTERN = os.path.join(os.path.dirname(__file__), 'html/*.json') + + +def assert_equal_ignoring_whitespace(reference: List[str], + converted: List[str]) -> bool: + for (ref_tag, ref_str), (conv_tag, conv_str) in zip(reference, converted): + + assert ref_tag == conv_tag + assert ''.join(ref_str.split()) == ''.join(conv_str.split()) + + +def test_html_annotations(filter_str=''): + for annotation_file in glob(TESTCASE_PATTERN): + if filter_str not in annotation_file: + continue + + with open(annotation_file) as f: + reference = load(f) + + with open(annotation_file.replace('.json', '.html')) as f: + print(f.name) + html = '{}'.format(f.read()) + + for indentation_strategy in ('strict', 'relaxed'): + result = get_annotated_text(html, ParserConfig( + css=CSS_PROFILES[indentation_strategy], + annotation_rules=reference['annotation_rules'])) + + converted = [[a[2], result['text'][a[0]:a[1]]] + for a in result['label']] + + if reference['result'] != converted: + print("Reference:") + print(reference['result']) + print("\nConverted (indentation strategy: {})".format(indentation_strategy)) + print(converted) + + if indentation_strategy == 'strict': + assert reference['result'] == converted + else: + assert_equal_ignoring_whitespace(reference['result'], + converted) + + +if __name__ == '__main__': + from sys import argv + + filter_str = argv[1] if len(argv) > 1 else '' + test_html_annotations(filter_str) diff --git a/tests/test_limit_whitespace_affixes.py b/tests/test_limit_whitespace_affixes.py index 202451e..20d6666 100644 --- a/tests/test_limit_whitespace_affixes.py +++ b/tests/test_limit_whitespace_affixes.py @@ -4,11 +4,12 @@ Tests different HTML to text conversion options. 
""" +from copy import copy from inscriptis import get_text from inscriptis.css_profiles import RELAXED_CSS_PROFILE from inscriptis.html_properties import Display, WhiteSpace from inscriptis.model.config import ParserConfig -from inscriptis.model.css import HtmlElement +from inscriptis.model.html_element import HtmlElement def test_html_element_refinement(): @@ -18,24 +19,24 @@ def test_html_element_refinement(): code = HtmlElement('code') # refinement with pre and whitespaces - refined = pre.get_refined_html_element(new) + refined = pre.get_refined_html_element(copy(new)) assert refined.prefix == '' assert refined.suffix == '' # refinement with code and whitespaces - refined = code.get_refined_html_element(new) + refined = code.get_refined_html_element(copy(new)) assert refined.prefix == ' ' assert refined.suffix == ' ' # refinement with pre and non-whitespaces new.prefix = ' 1. ' new.suffix = '<' - refined = pre.get_refined_html_element(new) + refined = pre.get_refined_html_element(copy(new)) assert refined.prefix == ' 1. ' assert refined.suffix == '<' # refinement with code and non-whitespaces - refined = code.get_refined_html_element(new) + refined = code.get_refined_html_element(copy(new)) assert refined.prefix == ' 1. ' assert refined.suffix == '<' @@ -53,7 +54,6 @@ def hallo(): ''' config = ParserConfig(css=RELAXED_CSS_PROFILE) assert get_text(html, config).strip() == \ - 'hallo echo\n' \ - '\n' \ + 'hallo echo\n\n' \ 'def hallo():\n' \ ' print("echo")' diff --git a/tests/test_margin_before_at_start.py b/tests/test_margin_before_at_start.py index 1a4a343..bcadbc5 100644 --- a/tests/test_margin_before_at_start.py +++ b/tests/test_margin_before_at_start.py @@ -15,11 +15,11 @@ def test_content(): def test_margin_before(): html = '

      first

      ' - assert get_text(html) == 'first' + assert get_text(html) == 'first\n' html = 'first

      ' \ 'second

      ' - assert get_text(html) == 'first\nsecond' + assert get_text(html) == 'first\n\nsecond\n' def test_br(): diff --git a/tests/test_margin_handling.py b/tests/test_margin_handling.py new file mode 100644 index 0000000..c09d944 --- /dev/null +++ b/tests/test_margin_handling.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# encoding: utf-8 + +""" +Tests different white-space handling. +""" + +from inscriptis import get_text +from inscriptis.css_profiles import CSS_PROFILES +from inscriptis.model.config import ParserConfig + +config = ParserConfig(css=CSS_PROFILES['strict']) + + +def test_margin_handling(): + html = u'''Hallo +
      Echo +
      Mecho
      +
      + sei Gott + ''' + assert get_text(html, config) == u'Hallo\n\nEcho\n\n\nMecho\n\nsei Gott' + + html = u'''Hallo +
      Echo
      +
      Mecho
      + sei Gott + ''' + assert get_text(html, config) == u'Hallo\n\nEcho\n\n\nMecho\nsei Gott' + + html = u'''Hallo +
      +
      Ehre
      +
      + sei Gott + ''' + assert get_text(html, config) == u'Hallo\n\n\nEhre\n\nsei Gott' diff --git a/tests/test_metadata.py b/tests/test_metadata.py new file mode 100644 index 0000000..9ffe217 --- /dev/null +++ b/tests/test_metadata.py @@ -0,0 +1,19 @@ +from inscriptis.metadata import (__author__, __author_email__, __copyright__, + __license__, __version__) + + +def test_metadata(): + """Test inscriptis package metadata.""" + assert 'Albert Weichselbraun' in __author__ + assert 'Fabian Odoni' in __author__ + + assert '@' in __author_email__ + + assert '2016-' in __copyright__ + assert 'Albert Weichselbraun' in __copyright__ + assert 'Fabian Odoni' in __copyright__ + + assert __license__ == 'Apache 2.0' + + assert __version__[0].isnumeric() + assert '.' in __version__ diff --git a/tests/test_model_html_element_canvas.py b/tests/test_model_html_element_canvas.py new file mode 100644 index 0000000..ecc4698 --- /dev/null +++ b/tests/test_model_html_element_canvas.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# encoding: utf-8 + +""" +Tests the rendering of a single table line. +""" + +from inscriptis.model.canvas import Canvas +from inscriptis.model.html_element import HtmlElement +from inscriptis.html_properties import Display + + +def _get_text(html_element): + """ + Returns + the text formatted based on the current HTML element. 
+ """ + c = Canvas() + html_element.canvas = c + + HtmlElement().set_canvas(c).write("first") + + c.open_tag(html_element) + html_element.write("Ehre sei Gott!") + c.close_tag(html_element) + + HtmlElement().set_canvas(c).write("last") + c._flush_inline() + return '\n'.join(c.blocks) + + +def test_formatting(): + # standard line + + h = HtmlElement() + assert _get_text(h) == 'firstEhre sei Gott!last' + + h.display = Display.block + h.margin_before = 1 + h.margin_after = 2 + print(h) + print(_get_text(h)) + assert _get_text(h) == 'first\n\nEhre sei Gott!\n\n\nlast' + + # list bullet without padding_inline + h.list_bullet = "* " + assert _get_text(h) == 'first\n\n* Ehre sei Gott!\n\n\nlast' + + # add a padding_inline + h.padding_inline = 3 + assert _get_text(h) == 'first\n\n * Ehre sei Gott!\n\n\nlast' + + # and prefixes + suffixes + h.prefix = '>>' + h.suffix = '<<' + assert _get_text(h)== 'first\n\n * >>Ehre sei Gott!<<\n\n\nlast' diff --git a/tests/test_model_line.py b/tests/test_model_line.py deleted file mode 100644 index 98a7a2f..0000000 --- a/tests/test_model_line.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python -# encoding: utf-8 - -""" -Tests the rendering of a single table line. -""" - -from inscriptis.model.canvas import Line - - -def test_cell_formatting(): - # standard line - line = Line() - line.margin_before = 0 - line.margin_after = 0 - line.prefix = '' - line.suffix = '' - line.content = 'Ehre sei Gott!' - line.list_bullet = '' - line.padding = 0 - - assert line.get_text() == 'Ehre sei Gott!' 
- # string representation - assert str(line) == \ - "" - assert repr(line) == str(line) - - # add margins - line.margin_before = 1 - line.margin_after = 2 - assert line.get_text() == '\nEhre sei Gott!\n\n' - - # list bullet without padding - line.list_bullet = "* " - assert line.get_text() == '\n* Ehre sei Gott!\n\n' - - # add a padding - line.padding = 3 - assert line.get_text() == '\n * Ehre sei Gott!\n\n' - - # and prefixes + suffixes - line.prefix = '>>' - line.suffix = '<<' - assert line.get_text() == '\n * >>Ehre sei Gott!<<\n\n' diff --git a/tests/test_model_prefix.py b/tests/test_model_prefix.py new file mode 100644 index 0000000..6682bbb --- /dev/null +++ b/tests/test_model_prefix.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# encoding: utf-8 + +""" +Tests the rendering of a single table line. +""" + +from inscriptis.model.canvas import Prefix + + +def test_simple_prefix(): + p = Prefix() + + p.register_prefix(5, '1. ') + + # first use + assert p.first == ' 1. ' + + # the prefix has been consumed + assert p.first == '' + + # prefix used to indent lines separated with newlines + assert p.rest == ' ' + + +def test_combined_prefix(): + p = Prefix() + + p.register_prefix(5, '1. ') + p.register_prefix(2, '') + + assert p.first == ' 1. ' + assert p.first == '' + + p.remove_last_prefix() + assert p.first == '' + + p.remove_last_prefix() + # final consumption - no prefix + assert p.first == '' + + # ensure that there are no interactions between different runs with + # bullets + p.consumed = False + p.register_prefix(5, '2. ') + p.register_prefix(2, '- ') + + assert p.first == ' - ' + assert p.first == '' + assert p.rest == ' ' + + p.consumed = False + p.remove_last_prefix() + assert p.first == ' 2. 
' + assert p.rest == ' ' + diff --git a/tests/test_parse_css.py b/tests/test_parse_css.py index 3c3c54d..9822644 100644 --- a/tests/test_parse_css.py +++ b/tests/test_parse_css.py @@ -9,13 +9,14 @@ from inscriptis.css_profiles import CSS_PROFILES from inscriptis.html_properties import (Display, WhiteSpace, VerticalAlignment, HorizontalAlignment) -from inscriptis.model.css import CssParse, HtmlElement +from inscriptis.model.css import CssParse +from inscriptis.model.html_element import HtmlElement def test_css_parsing(): html_element = copy(CSS_PROFILES['strict']['div']) CssParse.attr_style('padding_left: 8px; display: block', html_element) - assert html_element.padding == 1 + assert html_element.padding_inline == 1 assert html_element.display == Display.block CssParse.attr_style('margin_before: 8em; display: inline', html_element) @@ -28,13 +29,15 @@ def test_html_element_str(): Tests the string representation of an HtmlElement. ''' html_element = HtmlElement('div', '', '', Display.inline, 0, 0, 0, - WhiteSpace.pre) + '', WhiteSpace.pre) assert str(html_element) == ('
      ') + 'valign=VerticalAlignment.middle, ' + 'annotation=()>') def test_parse_vertical_align(): @@ -46,6 +49,7 @@ def test_parse_vertical_align(): CssParse.attr_vertical_align('unknown', html_element) assert html_element.valign == VerticalAlignment.top + def test_parse_horizontal_align(): html_element = HtmlElement() CssParse.attr_horizontal_align('center', html_element) diff --git a/tests/test_style_parsing.py b/tests/test_style_parsing.py index 4956346..8efce8f 100644 --- a/tests/test_style_parsing.py +++ b/tests/test_style_parsing.py @@ -4,7 +4,8 @@ Tests inscriptis' parsing of CSS style definitions. """ -from inscriptis.model.css import CssParse, HtmlElement +from inscriptis.model.css import CssParse +from inscriptis.model.html_element import HtmlElement def test_style_unit_parsing(): diff --git a/tests/test_table_cell.py b/tests/test_table_cell.py index fce80cb..8c728b2 100644 --- a/tests/test_table_cell.py +++ b/tests/test_table_cell.py @@ -9,48 +9,39 @@ from inscriptis.model.table import TableCell from inscriptis.html_properties import HorizontalAlignment, VerticalAlignment +def test_height(): + cell = TableCell(HorizontalAlignment.left, VerticalAlignment.top) + + cell.blocks = ['hallo'] + cell.normalize_blocks() + assert cell.height == len('\n'.join(cell.blocks).split('\n')) + + cell.blocks = ['hallo', 'echo'] + cell.normalize_blocks() + assert cell.height == 2 + + cell.blocks = ['hallo\necho'] + cell.normalize_blocks() + assert cell.height == 2 + + cell.blocks = ['hallo\necho', 'Ehre sei Gott', 'Jump\n&\nRun!\n\n\n'] + cell.normalize_blocks() + assert cell.height == 9 + assert cell.height == len('\n'.join(cell.blocks).split('\n')) + +def test_width(): + cell = TableCell(HorizontalAlignment.left, VerticalAlignment.top) + + cell.blocks = ['hallo'] + cell.normalize_blocks() + assert cell.width == len(cell.blocks[0]) + + cell.blocks = ['hallo\necho', 'Ehre sei Gott', 'Jump\n&\nRun!\n\n\n'] + cell.normalize_blocks() + assert cell.width == len('Ehre sei 
Gott') + + # fixed set width + cell.width = 95 + cell.normalize_blocks() + assert cell.width == 95 -def test_horizontal_cell_formatting(): - - canvas = [] - cell = TableCell(canvas=canvas, align=HorizontalAlignment.left, - valign=VerticalAlignment.top) - cell.width = 16 - canvas.append('Ehre sei Gott!') - - # left alignment - assert cell.get_cell_lines() == ['Ehre sei Gott! '] - - # right alignment - cell.align = HorizontalAlignment.right - assert cell.get_cell_lines() == [' Ehre sei Gott!'] - - -def test_vertical_cell_formatting(): - canvas = [] - cell = TableCell(canvas=canvas, align=HorizontalAlignment.left, - valign=VerticalAlignment.top) - cell.width = 16 - cell.height = 4 - canvas.append('Ehre sei Gott!') - - # default top alignment - assert cell.get_cell_lines() == ['Ehre sei Gott! ', - ' ', - ' ', - ' '] - - # bottom alignment - cell.valign = VerticalAlignment.bottom - assert cell.get_cell_lines() == [' ', - ' ', - ' ', - 'Ehre sei Gott! '] - - # middle alignment - cell.valign = VerticalAlignment.middle - print(cell.get_cell_lines()) - assert cell.get_cell_lines() == [' ', - 'Ehre sei Gott! ', - ' ', - ' '] diff --git a/tests/test_table_cell_formatting.py b/tests/test_table_cell_formatting.py new file mode 100644 index 0000000..7062d78 --- /dev/null +++ b/tests/test_table_cell_formatting.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python +# encoding: utf-8 + +""" +Tests the Table formatting with different parameters such as width and +alignment +""" + +from inscriptis.model.table import TableCell +from inscriptis.html_properties import HorizontalAlignment, VerticalAlignment + + +def test_horizontal_cell_formatting(): + + cell = TableCell(align=HorizontalAlignment.left, + valign=VerticalAlignment.top) + # left alignment + cell.blocks = ['Ehre sei Gott!'] + cell.width = 16 + assert cell.blocks == ['Ehre sei Gott! 
'] + + # right alignment + cell.align = HorizontalAlignment.right + cell.blocks = ['Ehre sei Gott!'] + cell.width = 16 + assert cell.blocks == [' Ehre sei Gott!'] + + +def test_vertical_cell_formatting(): + cell = TableCell(align=HorizontalAlignment.left, + valign=VerticalAlignment.top) + + # default top alignment + cell.blocks = ['Ehre sei Gott!'] + cell.width = 16 + cell.height = 4 + assert cell.blocks == ['Ehre sei Gott! ', + '', + '', + ''] + + # bottom alignment + cell.blocks = ['Ehre sei Gott!'] + cell.valign = VerticalAlignment.bottom + cell.width = 16 + cell.height = 4 + assert cell.blocks == ['', + '', + '', + 'Ehre sei Gott! '] + + # middle alignment + cell.blocks = ['Ehre sei Gott!'] + cell.valign = VerticalAlignment.middle + cell.width = 16 + cell.height = 4 + assert cell.blocks == ['', + 'Ehre sei Gott! ', + '', + ''] diff --git a/tests/test_table_row.py b/tests/test_table_row.py new file mode 100644 index 0000000..c09049b --- /dev/null +++ b/tests/test_table_row.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python +# encoding: utf-8 + +""" +Test borderline cases for table rows +""" + +from inscriptis.model.table import TableRow + +def test_empty_row(): + tr = TableRow() + + assert tr.width == 0 + assert tr.get_text() == '' diff --git a/tests/test_version_information.py b/tests/test_version_information.py deleted file mode 100644 index f578d58..0000000 --- a/tests/test_version_information.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python - -import sys -import pytest -import importlib - -BLACKLISTED_MODULE = 'lxml' - - -def secure_importer(name, globals=None, locals=None, fromlist=(), level=0): - if name.startswith(BLACKLISTED_MODULE): - raise ImportError("Cannot import module %s." % name) - return importlib.__import__(name, globals, locals, fromlist, level) - - -def test_package_metadata(): - """ - verify that the package metadata is available, even if no dependencies - are installed. 
- """ - # clear the python search path to verify whether we can import - # inscriptis even if its dependencies are not available - # (required for building the docs and setup.py) - saved_importer = __builtins__['__import__'] - saved_modules = {} - with pytest.warns(UserWarning): - # delete cached modules - saved = {} - for module in list(sys.modules): - if module.startswith('lxml') or module == 'inscriptis': - saved_modules[module] = sys.modules[module] - del sys.modules[module] - - # overwrite import mechanism - __builtins__['__import__'] = secure_importer - from inscriptis import (__version__, __author__, __author_email__, - __copyright__, __license__) - - assert __version__ - assert 'Albert' in __author__ and 'Fabian' in __author__ - assert '@fhgr' in __author_email__ - assert 'Albert' in __copyright__ and 'Fabian' in __copyright__ - assert 'Apache' in __license__ - - sys.modules.update(saved_modules) - __builtins__['__import__'] = saved_importer diff --git a/tests/test_white_space_handling.py b/tests/test_white_space_handling.py index 6f1fead..cf43d4d 100644 --- a/tests/test_white_space_handling.py +++ b/tests/test_white_space_handling.py @@ -32,3 +32,48 @@ def test_white_space(): html = (u'12\n3' u'') assert get_text(html, config) == u'12\n3' + + +def test_borderline_cases(): + """ + testing of borderline cases based on the behavior found in Firefox and + Google Chrome. 
+ """ + # change of whitespace handling between terms; no whitespace + # between the terms + html = u'Halloecho versus' + assert get_text(html, config) == u'Halloecho versus' + + # change of whitespace handling between terms; one whitespace + # between the terms; option 1 + html = u'Hallo echo versus' + assert get_text(html, config) == u'Hallo echo versus' + + # change of whitespace handling between terms; one whitespace + # between the terms; option 2 + html = u'Hallo echo versus' + assert get_text(html, config) == u'Hallo echo versus' + + # change of whitespace handling between terms; two whitespaces + # between the terms + html = u'Hallo echo versus' + assert get_text(html, config) == u'Hallo echo versus' + + # change of whitespace handling between terms; multiple whitespaces + # between the terms + html = u'Hallo echo versus' + assert get_text(html, config) == u'Hallo echo versus' + + # change of whitespace handling between terms; multiple whitespaces + # between the terms + html = u'Hallo echo versus' + assert get_text(html, config) == u'Hallo echo versus' + + +def test_tail(): + """ + ensure that the tail elements are formated based on the container element. 
+ """ + html = (u'Hi 1 3 ' + u' versus 1 3') + assert get_text(html, config) == u'Hi 1 3 versus 1 3' diff --git a/tox.ini b/tox.ini index 142fae6..8f62fd7 100644 --- a/tox.ini +++ b/tox.ini @@ -2,7 +2,8 @@ # standard unit tests [testenv:pytest] deps = pytest -commands = pytest ./tests + pytest-coverage +commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests # python packaging best practices [testenv:pyroma] @@ -16,21 +17,29 @@ deps = flake8 flake8-bandit flake8-bugbear flake8-builtins + flake8-cognitive-complexity flake8-colors flake8-comprehensions + flake8-docstrings + flake8-eradicate + flake8-expression-complexity flake8-mutable flake8-pathlib flake8-pytest + flake8-quotes flake8-raise + flake8-simplify flake8-string-format flake8-tuple flake8-logging-format + pep8-naming commands = flake8 [flake8] exclude = .tox docs benchmarking + setup.py tests # S104 - do not cleanup XML data prior to processing @@ -38,7 +47,15 @@ exclude = .tox # aimed for use with docker. # W503 - replaced with W504 # E402 - required for importing inscriptis metadata in setup.py -ignore = S104, S410, W503, E402 +# D102 - missing docstring in public method +# D105 - missing docstring in magic method (e.g., __str__) +# D107 - missing docstring in __init__ +ignore = S104, S410, W503, E402, D107, D105, D102 show-source = true enable-extensions=G application-import-names = inscriptis + +# flake8 cognitive complexity +max-cognitive-complexity=13 + +#