-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #64 from jesper-raemaekers/utils-clean-html
Utils clean html
- Loading branch information
Showing
6 changed files
with
247 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,7 @@ Welcome to Polarion's documentation! | |
testrecord | ||
document | ||
plan | ||
utils | ||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
Polarion Utils | ||
================ | ||
|
||
Other useful functions that do not fit anywhere else. | ||
|
||
HTML parsing | ||
------------ | ||
|
||
You can get the parsed description by using the parser like the example below. | ||
Passing the project to the parser is only needed when you want to find the title for linked workitems in the HTML text. | ||
If not supplied it will only list the linked ID. | ||
|
||
.. code:: python | ||
# if you have a project and workitem | ||
project = pol.getProject('') | ||
workitem = project.getWorkitem('') | ||
# get the parsed description like this | ||
parser = DescriptionParser(project) | ||
parser.feed(workitem.getDescription()) | ||
print(parser.data) | ||
Document functions | ||
------------------ | ||
|
||
.. autofunction:: polarion.utils.strip_html | ||
.. autoclass:: polarion.utils.DescriptionParser | ||
:members: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
import re | ||
from abc import ABC | ||
from html.parser import HTMLParser | ||
from polarion.project import Project | ||
from xml.etree import ElementTree | ||
from texttable import Texttable | ||
|
||
|
||
class DescriptionParser(HTMLParser, ABC):
    """
    HTMLParser that turns a Polarion rich text description into plain text.
    Feed it HTML with feed() and read the result from the data property.
    """

    # The annotation is a string (forward reference) so it is not evaluated when the class is defined.
    def __init__(self, polarion_project: 'Project' = None):
        """
        A HTMLParser to clean the HTML tags from a string.
        Can lookup Polarion links in HTML, present tables in a readable format and extracts formula's to text
        @param polarion_project: A polarion project used to search for the title of a workitem if the link type is 'long'.
        """
        # HTMLParser.__init__ calls reset(), which already initialises the parsing state below
        super(DescriptionParser, self).__init__()
        self._polarion_project = polarion_project

    @property
    def data(self):
        """
        The parsed data
        @return: string
        """
        return self._data

    def reset(self):
        """
        Reset the parsing state
        @return: None
        """
        super(DescriptionParser, self).reset()
        self._data = ''
        # nesting level of <table> tags; 0 means we are outside of any table
        self._table_depth = 0
        # rows (lists of cell strings) of the outermost table that is currently open
        self._table_rows = []
        # text fragments of the cell that is currently open, None when no cell is open
        self._cell_parts = None

    def handle_data(self, data):
        """
        Handles the data within HTML tags
        @param data: the data inside a HTML tag
        @return: None
        """
        self._append_text(data)

    def handle_starttag(self, tag, attrs):
        """
        Handles the start of a HTML tag. In some cases the start tag is the only tag and then it parses the attributes
        depending on the tag.
        @param tag: Tag identifier
        @param attrs: A list of (name, value) attribute tuples
        @return: None
        """
        attributes = dict(attrs)

        if tag == 'span':
            # the class attribute may be absent, valueless (None) or hold several class names
            classes = (attributes.get('class') or '').split()
            if 'polarion-rte-link' in classes:
                self._handle_polarion_rte_link(attributes)
            elif 'polarion-rte-formula' in classes:
                self._handle_polarion_rte_formula(attributes)
        elif tag == 'table':
            self._table_depth += 1
            if self._table_depth == 1:
                self._table_rows = []
        elif self._table_depth == 1:
            # only the outermost table defines rows and cells; the text of a nested table ends up in the open cell
            if tag == 'tr':
                self._close_cell()
                self._table_rows.append([])
            elif tag in ('th', 'td'):
                self._close_cell()
                if not self._table_rows:
                    # cell without an enclosing <tr>: start a row implicitly
                    self._table_rows.append([])
                self._cell_parts = []

    def handle_endtag(self, tag):
        """
        Handles the end of a tag.
        @param tag: Name of the tag
        @return: None
        """
        if self._table_depth == 0:
            # nothing to do outside of tables, this also ignores a </table> that was never opened
            return
        if tag == 'table':
            self._table_depth -= 1
            if self._table_depth == 0:
                self._handle_table()
        elif self._table_depth == 1 and tag in ('th', 'td', 'tr'):
            self._close_cell()

    def _append_text(self, text):
        """
        Adds text to the open table cell, or to the parsed data when outside of a table.
        Text inside a table but outside of a cell (layout whitespace) is dropped.
        @param text: the text to add
        @return: None
        """
        if self._cell_parts is not None:
            self._cell_parts.append(text)
        elif self._table_depth == 0:
            self._data += text

    def _close_cell(self):
        """
        Stores the text of the open cell (if any) in the current row.
        @return: None
        """
        if self._cell_parts is None:
            return
        self._table_rows[-1].append(''.join(self._cell_parts).strip())
        self._cell_parts = None

    def _handle_table(self):
        """
        Handles the HTML tables. It renders the collected rows in a readable format, the first row is the header.
        @return: None
        """
        self._close_cell()
        rows = [row for row in self._table_rows if row]
        self._table_rows = []
        if not rows:
            # nothing to draw for a table without cells
            return
        # Texttable requires every row to have the same number of cells
        width = max(len(row) for row in rows)
        padded_rows = [row + [''] * (width - len(row)) for row in rows]
        self._data += Texttable().add_rows(padded_rows).draw()

    def _handle_polarion_rte_link(self, attributes):
        """
        Gets either the workitem id from a link (short) or the workitem id and title (long)
        Without a project the id is used for every link type.
        @param attributes: attributes to the link tag
        @return: None
        """
        item_id = attributes.get('data-item-id')
        if item_id is None:
            # link without a target, there is nothing to show
            return
        if attributes.get('data-option-id') == 'short' or self._polarion_project is None:
            self._append_text(item_id)
        else:
            linked_item = self._polarion_project.getWorkitem(item_id)
            self._append_text(str(linked_item))

    def _handle_polarion_rte_formula(self, attributes):
        """
        Gets the formula for a polarion formula tag
        @param attributes: attributes to the formula tag
        @return: None
        """
        formula = attributes.get('data-source')
        if formula is not None:
            self._append_text(formula)
|
||
|
||
|
||
def strip_html(raw_html):
    """
    Strips all HTML tags from HTML code leaving only plain text with no formatting.
    A tag is everything from a '<' up to the next '>', also when the tag is wrapped over several lines.
    Character references such as '&amp;' are left untouched.
    :param raw_html: HTML string
    :return: plain text string
    """
    # [^>] also matches newlines (a plain '.' does not), so tags spanning multiple lines are removed as well
    return re.sub(r'<[^>]*>', '', raw_html)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
requests~=2.25.1 | ||
urllib3~=1.26.6 | ||
zeep~=4.0.0 | ||
texttable~=1.6.4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import unittest | ||
from mock import patch | ||
from polarion.utils import * | ||
|
||
|
||
class TestPolarionUtils(unittest.TestCase):
    """Tests for strip_html and DescriptionParser from polarion.utils."""

    def test_clean_html(self):
        """strip_html keeps only the text between the tags."""
        core_text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc eu libero ipsum. Nullam eget augue'
        html_text = '<p><img src="bla"/>' + core_text + '</p></br>'

        self.assertEqual(core_text, strip_html(html_text), msg="Cleaned text was not equal to expected core text")

    def test_html_formatter(self):
        """Plain text, a formula and a table are rendered to readable text."""
        html_text = """big text<br/>
normal text<br/>
<span data-source="a=b" data-inline="false" class="polarion-rte-formula"></span>
<table id="polarion_wiki macro name=table" class="polarion-Document-table" style="width: 80%;margin-left: auto;margin-right: auto;border: 1px solid #CCCCCC;empty-cells: show;border-collapse: collapse;">
<tbody>
<tr>
<th style="font-weight: bold;background-color: #F0F0F0;text-align: left;vertical-align: top;height: 12px;border: 1px solid #CCCCCC;padding: 5px;">1</th>
<th style="font-weight: bold;background-color: #F0F0F0;text-align: left;vertical-align: top;height: 12px;border: 1px solid #CCCCCC;padding: 5px;">2</th>
</tr>
<tr>
<td style="text-align: left;vertical-align: top;height: 12px;border: 1px solid #CCCCCC;padding: 5px;">3</td>
<td style="text-align: left;vertical-align: top;height: 12px;border: 1px solid #CCCCCC;padding: 5px;">4</td>
</tr>
</tbody>
</table>
<br/>"""

        expected_lines = [
            'big text',
            'normal text',
            'a=b',
            '+---+---+',
            '| 1 | 2 |',
            '+===+===+',
            '| 3 | 4 |',
            '+---+---+',
        ]
        # every expected line, including the last one, ends with a newline
        expected_output = ''.join(line + '\n' for line in expected_lines)

        parser = DescriptionParser()
        parser.feed(html_text)

        # spaces are removed on both sides so indentation and cell padding do not influence the comparison
        self.assertEqual(expected_output.replace(" ", ""), parser.data.replace(" ", ""),
                         msg='Parser result deviated from expected.')

    @patch('polarion.project.Project')
    def test_links(self, project_mock):
        """A 'long' link shows the workitem title when a project is given, otherwise only the id."""
        html_text = ('<span class="polarion-rte-link" data-type="workItem" id="fake" data-item-id="PYTH-510" data-option-id="long"></span>'
                     '<span class="polarion-rte-link" data-type="workItem" id="fake" data-item-id="PYTH-510" data-option-id="short"></span>'
                     '</span> <br/>')

        title = 'title of 510'
        project_mock.getWorkitem.return_value = title

        # with a project mock supplied the 'long' link is replaced by the title
        with_project = DescriptionParser(project_mock)
        with_project.feed(html_text)
        self.assertEqual((title + 'PYTH-510').strip(), with_project.data.strip(),
                         msg='Parser workitem text did not match using project to find title')

        # with no project supplied the 'long' link falls back to the id, like a 'short' link
        without_project = DescriptionParser()
        without_project.feed(html_text)
        self.assertEqual('PYTH-510PYTH-510'.strip(), without_project.data.strip(),
                         msg='Parser workitem text did not match')
|