Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use a better parser #1

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
vendor
composer.lock
/test.*
/test/changed/
/test/changed/
.phpunit.result.cache
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
"ext-xml": "*",
"ext-mbstring": "*",
"psr/log": "^1.0",
"masterminds/html5": "^2.0",
"mensbeam/html-parser": "^1.2.0",
"league/uri": "^6.4"
},
"require-dev": {
Expand Down
2 changes: 1 addition & 1 deletion src/Nodes/DOM/DOMDocument.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class DOMDocument extends \DOMDocument
{
use NodeTrait;

public function __construct($version, $encoding)
public function __construct($version = "1.0", $encoding = "")
{
parent::__construct($version, $encoding);

Expand Down
76 changes: 43 additions & 33 deletions src/Readability.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
use fivefilters\Readability\Nodes\DOM\DOMText;
use fivefilters\Readability\Nodes\NodeUtility;
use Psr\Log\LoggerInterface;
use \Masterminds\HTML5;
use League\Uri\Http;
use League\Uri\UriResolver;
use MensBeam\HTML\Parser;
use MensBeam\HTML\Parser\Config as ParserConfig;

/**
* Class Readability.
Expand Down Expand Up @@ -286,48 +287,52 @@ private function loadHTML($html)
{
$this->logger->debug('[Loading] Loading HTML...');

// To avoid throwing a gazillion of errors on malformed HTMLs
libxml_use_internal_errors(true);

//$html = preg_replace('/(<br[^>]*>[ \n\r\t]*){2,}/i', '</p><p>', $html);

if ($this->configuration->getParser() === 'html5') {
$this->logger->debug('[Loading] Using HTML5 parser...');
$html5 = new HTML5(['disable_html_ns' => true, 'target_document' => new DOMDocument('1.0', 'utf-8')]);
$dom = $html5->loadHTML($html);
$config = new ParserConfig();
$config->documentClass = DOMDocument::class;
$config->encodingFallback = "UTF-8";
$dom = Parser::parse($html, "", $config)->document;
//TODO: Improve this so it looks inside <html><head><base>, not just any <base>
$base = $dom->getElementsByTagName('base');
if ($base->length > 0) {
$base = $base->item(0);
$base = $base->getAttribute('href');
$base = $base->item(0)->getAttribute('href');
if ($base != '') {
$this->baseURI = $base;
}
}
} else {
$this->logger->debug('[Loading] Using libxml parser...');
$dom = new DOMDocument('1.0', 'utf-8');
if ($this->configuration->getNormalizeEntities()) {
$this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
// Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
$html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
}
}

if (!$this->configuration->getSubstituteEntities()) {
// Keep the original HTML entities
$dom->substituteEntities = false;
}
// To avoid throwing a gazillion of errors on malformed HTMLs
$libxml_err = libxml_use_internal_errors(true);
try {
$dom = new DOMDocument('1.0', 'utf-8');
if ($this->configuration->getNormalizeEntities()) {
$this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
// Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
$html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
}
if (!$this->configuration->getSubstituteEntities()) {
// Keep the original HTML entities
$dom->substituteEntities = false;
}
if ($this->configuration->getSummonCthulhu()) {
$this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
$html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html);
}

if ($this->configuration->getSummonCthulhu()) {
$this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
$html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html);
}
// Prepend the XML tag to avoid having issues with special characters. Should be harmless.
$dom->loadHTML('<?xml encoding="UTF-8">' . $html);
$this->baseURI = $dom->baseURI;
} finally {
if (!$libxml_err) {
libxml_clear_errors();
libxml_use_internal_errors(false);
}
}

// Prepend the XML tag to avoid having issues with special characters. Should be harmless.
if ($this->configuration->getParser() !== 'html5') {
$dom->loadHTML('<?xml encoding="UTF-8">' . $html);
$this->baseURI = $dom->baseURI;
}
$dom->encoding = 'UTF-8';

Expand All @@ -341,6 +346,11 @@ private function loadHTML($html)

$this->prepDocument($dom);

// remove any DOCTYPE so that it is not later printed
if ($dom->doctype) {
$dom->removeChild($dom->doctype);
}

$this->logger->debug('[Loading] Loaded HTML successfully.');

return $dom;
Expand Down Expand Up @@ -1836,7 +1846,7 @@ public function _fixLazyImages(DOMDocument $article)
**/
public function _cleanStyles($node)
{
if (property_exists($node, 'tagName') && $node->tagName === 'svg') {
if (property_exists($node, 'localName') && $node->localName === 'svg' && property_exists($node, 'namespaceURI') && $node->namespaceURI === 'http://www.w3.org/2000/svg') {
return;
}

Expand Down Expand Up @@ -2294,10 +2304,10 @@ protected function setTitle($title)
public function getContent()
{
if ($this->content instanceof DOMDocument) {
$html5 = new HTML5(['disable_html_ns' => true]);
// by using childNodes below we make sure HTML5PHP's serialiser
// doesn't output the <!DOCTYPE html> string at the start.
return $html5->saveHTML($this->content->childNodes);
$config = new ParserConfig;
$config->serializeForeignVoidEndTags = false;
$config->serializeBooleanAttributeValues = false;
return Parser::serialize($this->content, $config);
} else {
return null;
}
Expand Down
12 changes: 6 additions & 6 deletions test/test-pages/citylab-1/expected.html
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
<article itemscope="itemscope" itemtype="https://schema.org/NewsArticle" xmlns:xlink="http://www.w3.org/1999/xlink">
<article itemscope itemtype="https://schema.org/NewsArticle">
<meta itemprop="datePublished" content="2019-04-30T13:39:00-04:00">
<meta itemprop="dateModified" content="2019-04-30T13:40:00-04:00">
<meta itemprop="mainEntityOfPage" content="https://www.citylab.com/design/2019/04/neon-signage-20th-century-history/588400/">

<figure itemprop="image" itemscope="itemscope" itemtype="http://schema.org/ImageObject">
<figure itemprop="image" itemscope itemtype="http://schema.org/ImageObject">
<picture><source srcset="https://cdn.citylab.com/media/img/citylab/2019/04/mr1/940.jpg?mod=1556645448" media="(min-width: 1024px)"> <source srcset="https://cdn.citylab.com/media/img/citylab/2019/04/mr1/lead_large.jpg?mod=1556645448" media="(min-width: 576px)"></picture>
<meta itemprop="height" content="128">
<meta itemprop="width" content="300">
<meta itemprop="url" content="https://cdn.citylab.com/media/img/citylab/2019/04/mr1/300.jpg?mod=1556645448"><picture><source srcset="https://cdn.citylab.com/media/img/citylab/2019/04/mr1/300.jpg?mod=1556645448" media="(max-width: 575px)"><img src="https://cdn.citylab.com/media/img/citylab/2019/04/mr1/300.jpg?mod=1556645448" alt srcset="https://cdn.citylab.com/media/img/citylab/2019/04/mr1/300.jpg?mod=1556645448"></picture>
<meta itemprop="url" content="https://cdn.citylab.com/media/img/citylab/2019/04/mr1/300.jpg?mod=1556645448"><picture><source srcset="https://cdn.citylab.com/media/img/citylab/2019/04/mr1/300.jpg?mod=1556645448" media="(max-width: 575px)"><img src="https://cdn.citylab.com/media/img/citylab/2019/04/mr1/300.jpg?mod=1556645448" alt="" srcset="https://cdn.citylab.com/media/img/citylab/2019/04/mr1/300.jpg?mod=1556645448"></picture>
<figcaption>
<span itemprop="caption">The Moulin Rouge cabaret in Paris</span> <span itemprop="creator">Benoit Tessier/Reuters</span>
</figcaption>
Expand Down Expand Up @@ -52,7 +52,7 @@ <h2 itemprop="description">
In the following decades, neon’s nonstop glow and vibrant colors turned ordinary buildings and surfaces into 24/7 billboards for businesses, large and small, that wanted to convey a sense of always being open. The first examples of neon in the United States debuted in Los Angeles, where the Packard Motor Car Company commissioned two large blue-and-orange <span>Packard</span> signs that literally stopped traffic because they distracted motorists. The lighting also featured heavily at the Chicago Century of Progress Exposition in 1933 and at the 1939 World’s Fair in New York. At the latter event, a massive neon sign reading <span>Futurama</span> lit the way to a General Motors exhibition that heralded “The World of Tomorrow.”
</p>
<figure>
<picture><img alt data-srcset="https://cdn.theatlantic.com/assets/media/img/posts/2019/04/AP_8912060228/cbd32b0e1.jpg" src="https://cdn.theatlantic.com/assets/media/img/posts/2019/04/AP_8912060228/cbd32b0e1.jpg"></picture>
<picture><img alt="" data-srcset="https://cdn.theatlantic.com/assets/media/img/posts/2019/04/AP_8912060228/cbd32b0e1.jpg" src="https://cdn.theatlantic.com/assets/media/img/posts/2019/04/AP_8912060228/cbd32b0e1.jpg"></picture>
<figcaption>
Workers remove a hammer and sickle from a neon sign that reads “Glory to Communism,” visible on the roof of the Communist-run electricity-board headquarters in Czechoslovakia in 1989. (AP)
</figcaption>
Expand All @@ -76,7 +76,7 @@ <h2>
De Miranda understands this evolution by zooming out and looking at the 1900s as the “neon century.” The author draws a parallel between the physical form of neon lights, which again are essentially containers for electrified gases, and that of a glass capsule—suggesting they are a kind of message in a bottle from a time before the First World War. “Since then, [neon lights] have witnessed all the transformations that have created the world we live in,” de Miranda writes. “Today, they sometimes seem to maintain a hybrid status, somewhere between junkyards and museums, not unlike European capitals themselves.”
</p>
<figure>
<picture><img alt data-srcset="https://cdn.theatlantic.com/assets/media/img/posts/2019/04/AP_945361213236/888fdd750.jpg" src="https://cdn.theatlantic.com/assets/media/img/posts/2019/04/AP_945361213236/888fdd750.jpg"></picture>
<picture><img alt="" data-srcset="https://cdn.theatlantic.com/assets/media/img/posts/2019/04/AP_945361213236/888fdd750.jpg" src="https://cdn.theatlantic.com/assets/media/img/posts/2019/04/AP_945361213236/888fdd750.jpg"></picture>
<figcaption>
Martin Wartman, a student at Northern Kentucky University, works on a neon sign at the Neonworks of Cincinnati workshop connected to the American Sign Museum, in 2016. (John Minchillo / AP)
</figcaption>
Expand All @@ -95,7 +95,7 @@ <h2>
<h4>
About the Author
</h4>
<div itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person">
<div itemprop="author" itemscope itemtype="http://schema.org/Person">
<h5 itemprop="name">
<a href="https://www.citylab.com/authors/sarah-archer/">Sarah Archer</a>
</h5>
Expand Down
Loading