Skip to content

Commit

Permalink
Merge pull request #64 from jesper-raemaekers/utils-clean-html
Browse files Browse the repository at this point in the history
Utils clean html
  • Loading branch information
jesper-raemaekers authored Apr 16, 2022
2 parents 3ddb051 + ec005b8 commit 9fc2531
Show file tree
Hide file tree
Showing 6 changed files with 247 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Welcome to Polarion's documentation!
testrecord
document
plan
utils



Expand Down
31 changes: 31 additions & 0 deletions docs/utils.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
Polarion Utils
================

Other useful functions that do not fit anywhere else.

HTML parsing
------------

You can get the parsed description by using the parser like the example below.
Passing the project to the parser is only needed when you want to find the title for linked workitems in the HTML text.
If not supplied it will only list the linked ID.

.. code:: python

    # if you have a project and workitem
    project = pol.getProject('')
    workitem = project.getWorkitem('')
    # get the parsed description like this
    parser = DescriptionParser(project)
    parser.feed(workitem.getDescription())
    print(parser.data)

Document functions
------------------

.. autofunction:: polarion.utils.strip_html

.. autoclass:: polarion.utils.DescriptionParser
    :members:
136 changes: 136 additions & 0 deletions polarion/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import re
from abc import ABC
from html.parser import HTMLParser
from polarion.project import Project
from xml.etree import ElementTree
from texttable import Texttable


class DescriptionParser(HTMLParser, ABC):
    """
    HTMLParser that converts a Polarion rich-text (HTML) description to plain text.

    Text outside of tables is copied as-is, Polarion work item links are replaced by the work item id (or
    by the string form of the work item), Polarion formulas are replaced by their source and HTML tables
    are rendered as text tables using Texttable. Feed HTML with feed() and read the result from `data`.
    """

    def __init__(self, polarion_project: "Project" = None):
        """
        A HTMLParser to clean the HTML tags from a string.
        Can lookup Polarion links in HTML, present tables in a readable format and extracts formulas to text
        @param polarion_project: A polarion project used to search for the title of a workitem if the link type
                                 is 'long'. The annotation is quoted so it is not evaluated at definition time.
        """
        # HTMLParser.__init__() calls reset(), which (re)initialises the state declared here.
        self._data = ''
        self._table_depth = 0     # number of currently open <table> tags
        self._table_rows = []     # rows (lists of cell strings) of the outermost open table
        self._cell_parts = None   # text fragments of the open cell, None when no cell is open
        super(DescriptionParser, self).__init__()
        self._polarion_project = polarion_project

    @property
    def data(self):
        """
        The parsed data
        @return: string
        """
        return self._data

    def reset(self):
        """
        Reset the parsing state
        @return: None
        """
        super(DescriptionParser, self).reset()
        self._data = ''
        self._table_depth = 0
        self._table_rows = []
        self._cell_parts = None

    def handle_data(self, data):
        """
        Handles the data within HTML tags
        @param data: the data inside a HTML tag
        @return: None
        """
        self._append_text(data)

    def handle_starttag(self, tag, attrs):
        """
        Handles the start of a HTML tag. In some cases the start tag is the only tag and then it parses the attributes
        depending on the tag.
        @param tag: Tag identifier
        @param attrs: A list of (name, value) attribute pairs
        @return: None
        """
        if tag == 'table':
            self._table_depth += 1
            if self._table_depth == 1:
                # a new outermost table starts: begin collecting its rows
                self._table_rows = []
                self._cell_parts = None
            return

        # Only the outermost table defines rows and cells. The text of a nested table ends up in the
        # enclosing cell because that cell is still open while the nested table is parsed.
        if self._table_depth == 1:
            if tag == 'tr':
                self._close_cell()
                self._table_rows.append([])
                return
            if tag in ('td', 'th'):
                self._close_cell()  # HTML allows omitting </td> and </th>
                if not self._table_rows:
                    self._table_rows.append([])  # cell without an enclosing <tr>
                self._cell_parts = []
                return

        if tag == 'span':
            # for a repeated attribute the last value wins
            attributes = dict(attrs)
            span_class = attributes.get('class')
            if span_class == 'polarion-rte-link':
                self._handle_polarion_rte_link(attributes)
            elif span_class == 'polarion-rte-formula':
                self._handle_polarion_rte_formula(attributes)

    def handle_endtag(self, tag):
        """
        Handles the end of a tag.
        @param tag: Name of the tag
        @return: None
        """
        if self._table_depth == 0:
            # not inside a table; this also ignores a stray </table>
            return
        if tag == 'table':
            self._table_depth -= 1
            if self._table_depth == 0:
                self._handle_table()
        elif self._table_depth == 1 and tag in ('td', 'th', 'tr'):
            self._close_cell()

    def _append_text(self, text):
        """
        Adds text to the output, or to the open table cell when the parser is inside a table.
        Text inside a table but outside of any cell (e.g. whitespace between tags) is dropped.
        @param text: text to add
        @return: None
        """
        if self._table_depth == 0:
            self._data += text
        elif self._cell_parts is not None:
            self._cell_parts.append(text)

    def _close_cell(self):
        """
        Stores the text of the open table cell (if any) in the current row.
        @return: None
        """
        if self._cell_parts is None:
            return
        # surrounding whitespace is only HTML layout and would add empty lines to the rendered cell
        self._table_rows[-1].append(''.join(self._cell_parts).strip())
        self._cell_parts = None

    def _handle_table(self):
        """
        Handles the HTML tables. It renders the collected rows to a readable text table and adds it to the
        parsed data. The first row is used as table header. Tables without any cell are skipped.
        @return: None
        """
        self._close_cell()
        rows = [row for row in self._table_rows if row]
        self._table_rows = []
        if not rows:
            # Texttable cannot render a table without rows
            return
        # Texttable requires every row to have the same number of cells (rows differ with colspan)
        width = max(len(row) for row in rows)
        rows = [row + [''] * (width - len(row)) for row in rows]
        table = Texttable()
        # render cell text verbatim; the default 'auto' type reformats text that looks like a number
        table.set_cols_dtype(['t'] * width)
        table.add_rows(rows)
        self._data += table.draw()

    def _handle_polarion_rte_link(self, attributes):
        """
        Gets either the workitem id from a link (short) or the string form of the linked workitem (long).
        Without a project the workitem cannot be looked up and the id is used. Links without an item id
        are ignored.
        @param attributes: attributes to the link tag
        @return: None
        """
        item_id = attributes.get('data-item-id')
        if not item_id:
            return
        if attributes.get('data-option-id') == 'short' or self._polarion_project is None:
            self._append_text(item_id)
        else:
            linked_item = self._polarion_project.getWorkitem(item_id)
            self._append_text(str(linked_item))

    def _handle_polarion_rte_formula(self, attributes):
        """
        Gets the formula for a polarion formula tag. Formula tags without a source add nothing.
        @param attributes: attributes to the formula tag
        @return: None
        """
        self._append_text(attributes.get('data-source') or '')



def strip_html(raw_html):
    """
    Strips all HTML tags from HTML code leaving only plain text with no formatting.
    Character references (e.g. ``&amp;``) are left untouched.

    :param raw_html: HTML string, None is treated as an empty string
    :return: plain text string
    """
    if not raw_html:
        return ''
    # DOTALL: a tag may span several lines, e.g. when its attributes are wrapped
    return re.sub('<.*?>', '', raw_html, flags=re.DOTALL)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
requests~=2.25.1
urllib3~=1.26.6
zeep~=4.0.0
texttable~=1.6.4
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"Operating System :: OS Independent",
"Development Status :: 3 - Alpha"
],
install_requires=["zeep", "lxml"],
install_requires=["zeep", "lxml", "texttable"],
packages=setuptools.find_packages(),
python_requires='>=3.6',
)
77 changes: 77 additions & 0 deletions tests/test_polarion_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import unittest
from mock import patch
from polarion.utils import *


class TestPolarionUtils(unittest.TestCase):
    """Unit tests for strip_html and DescriptionParser from polarion.utils."""

    def test_clean_html(self):
        """strip_html removes all tags and keeps only the text between them."""
        core_text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc eu libero ipsum. Nullam eget augue'
        html_text = f'<p><img src="bla"/>{core_text}</p></br>'

        clean_text = strip_html(html_text)

        self.assertEqual(core_text, clean_text, msg="Cleaned text was not equal to expected core text")

    def test_html_formatter(self):
        """DescriptionParser keeps plain text, extracts the formula source and renders the table as text."""
        html_text = """big text<br/>
normal text<br/>
<span data-source="a=b" data-inline="false" class="polarion-rte-formula"></span>
<table id="polarion_wiki macro name=table" class="polarion-Document-table" style="width: 80%;margin-left: auto;margin-right: auto;border: 1px solid #CCCCCC;empty-cells: show;border-collapse: collapse;">
<tbody>
<tr>
<th style="font-weight: bold;background-color: #F0F0F0;text-align: left;vertical-align: top;height: 12px;border: 1px solid #CCCCCC;padding: 5px;">1</th>
<th style="font-weight: bold;background-color: #F0F0F0;text-align: left;vertical-align: top;height: 12px;border: 1px solid #CCCCCC;padding: 5px;">2</th>
</tr>
<tr>
<td style="text-align: left;vertical-align: top;height: 12px;border: 1px solid #CCCCCC;padding: 5px;">3</td>
<td style="text-align: left;vertical-align: top;height: 12px;border: 1px solid #CCCCCC;padding: 5px;">4</td>
</tr>
</tbody>
</table>
<br/>"""

        expected_output = ('big text\n'
                           'normal text\n'
                           'a=b\n'
                           '+---+---+\n'
                           '| 1 | 2 |\n'
                           '+===+===+\n'
                           '| 3 | 4 |\n'
                           '+---+---+\n')

        parser = DescriptionParser()

        parser.feed(html_text)
        actual_output = parser.data.replace(" ", "")  # remove spaces for easier comparison
        expected_output = expected_output.replace(" ", "")  # remove spaces for easier comparison

        self.assertEqual(expected_output, actual_output, msg='Parser result deviated from expected.')

    @patch('polarion.project.Project')
    def test_links(self, project_mock):
        """Work item links resolve through the project when supplied, otherwise only the id is listed."""
        html_text = '<span class="polarion-rte-link" data-type="workItem" id="fake" data-item-id="PYTH-510" data-option-id="long"></span>' \
                    '<span class="polarion-rte-link" data-type="workItem" id="fake" data-item-id="PYTH-510" data-option-id="short"></span>' \
                    '</span> <br/>'

        workitem_pyth_510_title = 'title of 510'
        expected_text = f'{workitem_pyth_510_title}PYTH-510'
        project_mock.getWorkitem.return_value = workitem_pyth_510_title

        # test with a project mock supplied.
        # it should replace the 'long' tag with the title
        parser = DescriptionParser(project_mock)
        parser.feed(html_text)

        self.assertEqual(expected_text.strip(), parser.data.strip(),
                         msg='Parser workitem text did not match using project to find title')

        # with no project supplied it should default back to short
        expected_text = 'PYTH-510PYTH-510'

        parser = DescriptionParser()
        parser.feed(html_text)

        self.assertEqual(expected_text.strip(), parser.data.strip(), msg='Parser workitem text did not match')

0 comments on commit 9fc2531

Please sign in to comment.