Merge pull request #64 from jesper-raemaekers/utils-clean-html

Utils clean html
jesper-raemaekers · Apr 16, 2022 · 9fc2531 · 9fc2531
2 parents 3ddb051 + ec005b8
commit 9fc2531
Show file tree

Hide file tree

Showing 6 changed files with 247 additions and 1 deletion.
diff --git a/docs/index.rst b/docs/index.rst
@@ -17,6 +17,7 @@ Welcome to Polarion's documentation!
    testrecord
    document
    plan
+   utils
 
 
 

diff --git a/docs/utils.rst b/docs/utils.rst
@@ -0,0 +1,31 @@
+Polarion Utils
+================
+
+Other useful functions that do not fit anywhere else.
+
+HTML parsing
+------------
+
+You can get the parsed description of a workitem by feeding it to the parser, as shown in the example below.
+Passing the project to the parser is only needed when you want to look up the title of linked workitems in the HTML text.
+If no project is supplied, only the ID of the linked workitem is listed.
+
+.. code:: python
+
+    # if you have a project and workitem
+    project = pol.getProject('')
+    workitem = project.getWorkitem('')
+
+    # get the parsed description like this
+    parser = DescriptionParser(project)
+    parser.feed(workitem.getDescription())
+    print(parser.data)
+
+
+Document functions
+------------------
+
+.. autofunction:: polarion.utils.strip_html
+
+.. autoclass:: polarion.utils.DescriptionParser
+    :members:
diff --git a/polarion/utils.py b/polarion/utils.py
@@ -0,0 +1,136 @@
+import re
+from abc import ABC
+from html.parser import HTMLParser
+from polarion.project import Project
+from xml.etree import ElementTree
+from texttable import Texttable
+
+
+class DescriptionParser(HTMLParser, ABC):
+    """
+    HTML parser that reduces a Polarion rich text description to plain text.
+
+    Text outside of tables is copied unchanged, Polarion workitem links and formulas are replaced by a textual
+    representation and every outermost table is drawn as a text table using Texttable.
+    Table cells are collected from the parser events, so a table does not have to be well-formed XML, may share
+    a line with other content and may be fed in several chunks. The text of a table nested inside a cell is
+    added to the text of that cell.
+    """
+
+    def __init__(self, polarion_project: Project = None):
+        """
+        A HTMLParser to clean the HTML tags from a string.
+        Can lookup Polarion links in HTML, present tables in a readable format and extract formulas to text
+
+        @param polarion_project: A polarion project used to search for the title of a workitem if the link type is 'long'.
+        """
+        # HTMLParser.__init__ calls reset(), which creates all parsing state attributes
+        super(DescriptionParser, self).__init__()
+        self._polarion_project = polarion_project
+
+    @property
+    def data(self):
+        """
+        The parsed data
+        @return: string
+        """
+        return self._data
+
+    def reset(self):
+        """
+        Reset the parsing state
+        @return: None
+        """
+        super(DescriptionParser, self).reset()
+        self._data = ''
+        # position of the outermost open table, None while outside of a table
+        self._table_start = None
+        self._table_end = None
+        # number of currently open (nested) table tags
+        self._table_depth = 0
+        # rows of the outermost open table, each row is a list of cell texts
+        self._table_rows = []
+        # text fragments of the currently open cell, None while no cell is open
+        self._cell_parts = None
+
+    def handle_data(self, data):
+        """
+        Handles the data within HTML tags
+        @param data: the data inside a HTML tag
+        @return: None
+        """
+        self._emit(data)
+
+    def handle_starttag(self, tag, attrs):
+        """
+        Handles the start of a HTML tag. In some cases the start tag is the only tag and then it parses the attributes
+        depending on the tag.
+        @param tag: Tag identifier
+        @param attrs: A list of (name, value) attribute pairs
+        @return: None
+        """
+        attributes = dict(attrs)
+
+        if tag == 'span':
+            css_class = attributes.get('class')
+            if css_class == 'polarion-rte-link':
+                self._handle_polarion_rte_link(attributes)
+            elif css_class == 'polarion-rte-formula':
+                self._handle_polarion_rte_formula(attributes)
+        elif tag == 'table':
+            if self._table_depth == 0:
+                self._table_start = self.getpos()
+                self._table_rows = []
+                self._cell_parts = None
+            self._table_depth += 1
+        elif self._table_depth == 1:
+            # only rows and cells of the outermost table define the table structure
+            if tag == 'tr':
+                self._close_cell()
+                self._table_rows.append([])
+            elif tag in ('th', 'td'):
+                # a new cell implicitly closes a cell that was left open
+                self._close_cell()
+                if not self._table_rows:
+                    # cell without an enclosing row
+                    self._table_rows.append([])
+                self._cell_parts = []
+
+    def handle_endtag(self, tag):
+        """
+        Handles the end of a tag.
+        @param tag: Name of the tag
+        @return: None
+        """
+        if tag == 'table':
+            if self._table_depth == 0:
+                # closing tag without a matching start tag, nothing to draw
+                return
+            self._table_depth -= 1
+            if self._table_depth == 0:
+                self._handle_table()
+        elif self._table_depth == 1 and tag in ('th', 'td', 'tr'):
+            self._close_cell()
+
+    def _emit(self, text):
+        """
+        Adds text to the output. Outside of a table it is appended to the parsed data, inside a table cell it is
+        added to that cell. Text inside a table but outside of any cell is dropped.
+        @param text: the text to add
+        @return: None
+        """
+        if self._table_start is None:
+            self._data += text
+        elif self._cell_parts is not None:
+            self._cell_parts.append(text)
+
+    def _close_cell(self):
+        """
+        Finishes the currently open table cell, if any, and stores its text in the current row.
+        @return: None
+        """
+        if self._cell_parts is None:
+            return
+        # collapse whitespace runs, line breaks and indentation in the HTML source are not cell content
+        cell_text = ' '.join(''.join(self._cell_parts).split())
+        self._table_rows[-1].append(cell_text)
+        self._cell_parts = None
+
+    def _handle_table(self):
+        """
+        Handles the HTML tables. It draws the collected rows of the table in a readable format.
+        @return: None
+        """
+        self._table_end = self.getpos()
+        self._close_cell()
+        rows = [row for row in self._table_rows if row]
+        if rows:
+            # Texttable requires every row to have the same number of cells
+            width = max(len(row) for row in rows)
+            rows = [row + [''] * (width - len(row)) for row in rows]
+            self._data += Texttable().add_rows(rows).draw()
+        self._table_rows = []
+        self._table_start = None
+        self._table_end = None
+
+    def _handle_polarion_rte_link(self, attributes):
+        """
+        Gets either the workitem id from a link (short) or the workitem id and title (long)
+        Only the id is used when no project is available. A link without an item id is ignored.
+        @param attributes: attributes to the link tag
+        @return: None
+        """
+        item_id = attributes.get('data-item-id')
+        if item_id is None:
+            return
+        if attributes.get('data-option-id') == 'short' or self._polarion_project is None:
+            self._emit(item_id)
+        else:
+            linked_item = self._polarion_project.getWorkitem(item_id)
+            self._emit(str(linked_item))
+
+    def _handle_polarion_rte_formula(self, attributes):
+        """
+        Gets the formula for a polarion formula tag. A formula tag without a source is ignored.
+        @param attributes: attributes to the formula tag
+        @return: None
+        """
+        formula = attributes.get('data-source')
+        if formula is not None:
+            self._emit(formula)
+
+
+
+def strip_html(raw_html):
+    """
+    Strips all HTML tags from HTML code leaving only plain text with no formatting.
+    Tags that span multiple lines are removed as well. Character references (like &amp;) are left untouched.
+    :param raw_html: HTML string
+    :return: plain text string
+    """
+    # [^>] also matches line breaks, '.' does not and would leave tags that wrap over multiple lines
+    return re.sub(r'<[^>]*>', '', raw_html)
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
 requests~=2.25.1
 urllib3~=1.26.6
 zeep~=4.0.0
+texttable~=1.6.4
diff --git a/setup.py b/setup.py
@@ -22,7 +22,7 @@
         "Operating System :: OS Independent",
         "Development Status :: 3 - Alpha"
     ],
-    install_requires=["zeep", "lxml"],
+    install_requires=["zeep", "lxml", "texttable"],
     packages=setuptools.find_packages(),
     python_requires='>=3.6',
 )
diff --git a/tests/test_polarion_utils.py b/tests/test_polarion_utils.py
@@ -0,0 +1,77 @@
+import unittest
+from mock import patch
+from polarion.utils import *
+
+
+class TestPolarionUtils(unittest.TestCase):
+    """Tests for the HTML helpers in polarion.utils."""
+
+    def test_clean_html(self):
+        """strip_html removes all tags and leaves only the text between them."""
+        core_text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc eu libero ipsum. Nullam eget augue'
+        html_text = f'<p><img src="bla"/>{core_text}</p></br>'
+
+        clean_text = strip_html(html_text)
+
+        self.assertEqual(core_text, clean_text, msg="Cleaned text was not equal to expected core text")
+
+    def test_html_formatter(self):
+        """DescriptionParser keeps plain text, extracts the formula source and draws the table as text."""
+        html_text = """big text<br/>
+                        normal text<br/>
+                        <span data-source="a=b" data-inline="false" class="polarion-rte-formula"></span>
+                        <table id="polarion_wiki macro name=table" class="polarion-Document-table" style="width: 80%;margin-left: auto;margin-right: auto;border: 1px solid #CCCCCC;empty-cells: show;border-collapse: collapse;">
+                          <tbody>
+                            <tr>
+                              <th style="font-weight: bold;background-color: #F0F0F0;text-align: left;vertical-align: top;height: 12px;border: 1px solid #CCCCCC;padding: 5px;">1</th>
+                              <th style="font-weight: bold;background-color: #F0F0F0;text-align: left;vertical-align: top;height: 12px;border: 1px solid #CCCCCC;padding: 5px;">2</th>
+                            </tr>
+                            <tr>
+                              <td style="text-align: left;vertical-align: top;height: 12px;border: 1px solid #CCCCCC;padding: 5px;">3</td>
+                              <td style="text-align: left;vertical-align: top;height: 12px;border: 1px solid #CCCCCC;padding: 5px;">4</td>
+                            </tr>
+                          </tbody>
+                        </table>
+                        <br/>"""
+
+        expected_output = ('big text\n'
+                           'normal text\n'
+                           'a=b\n'
+                           '+---+---+\n'
+                           '| 1 | 2 |\n'
+                           '+===+===+\n'
+                           '| 3 | 4 |\n'
+                           '+---+---+\n')
+
+        parser = DescriptionParser()
+
+        parser.feed(html_text)
+        actual_output = parser.data.replace(" ", "")  # remove spaces for easier comparison
+        expected_output = expected_output.replace(" ", "")  # remove spaces for easier comparison
+
+        self.assertEqual(expected_output, actual_output, msg='Parser result deviated from expected.')
+
+    @patch('polarion.project.Project')
+    def test_links(self, project_mock):
+        """Workitem links resolve to the title for 'long' links when a project is given, else to the id."""
+        html_text = '<span class="polarion-rte-link" data-type="workItem" id="fake" data-item-id="PYTH-510" data-option-id="long"></span>' \
+                    '<span class="polarion-rte-link" data-type="workItem" id="fake" data-item-id="PYTH-510" data-option-id="short"></span>' \
+                    '</span> <br/>'
+
+        workitem_pyth_510_title = 'title of 510'
+        expected_text = f'{workitem_pyth_510_title}PYTH-510'
+        project_mock.getWorkitem.return_value = workitem_pyth_510_title
+
+        # test with a project mock supplied:
+        # it should replace the 'long' tag with the title
+        parser = DescriptionParser(project_mock)
+        parser.feed(html_text)
+
+        self.assertEqual(expected_text.strip(), parser.data.strip(),
+                         msg='Parser workitem text did not match using project to find title')
+
+        # with no project supplied it should default back to short
+        expected_text = 'PYTH-510PYTH-510'
+
+        parser = DescriptionParser()
+        parser.feed(html_text)
+
+        self.assertEqual(expected_text.strip(), parser.data.strip(), msg='Parser workitem text did not match')
+
-Original file line number
+Diff line change
@@ Expand Up / @@ -17,6 +17,7 @@ Welcome to Polarion's documentation! @@
        testrecord
        document
        plan
+       utils
@@ Expand Down @@