-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathContent.php
178 lines (154 loc) · 4.81 KB
/
Content.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
<?php
declare(strict_types=1);
namespace RedditImage;
use RedditImage\Exception\InvalidContentException;
/**
 * Value object wrapping the HTML content of a Reddit feed entry.
 *
 * On construction the content is split into its optional preprocessed part
 * and its raw feed part, then the metadata, the "[link]" / "[comments]"
 * links and the "real" content are extracted from the raw part. Content that
 * lacks the metadata or one of the two links is rejected.
 */
class Content
{
    private string $preprocessed = '';
    private string $metadata = '';
    private ?string $contentLink = null;
    private ?string $commentsLink = null;
    private string $raw;
    private string $real = '';

    /**
     * @param string $content HTML content of the feed entry.
     *
     * @throws InvalidContentException When the metadata, the content link or
     *                                 the comments link cannot be extracted.
     */
    public function __construct(string $content)
    {
        $this->raw = $content;

        // Order matters: splitContent() may replace $this->raw, which the
        // three extract*() steps read afterwards.
        $this->splitContent($content);
        $this->extractMetadata();
        $this->extractLinks();
        $this->extractReal();

        if (!$this->isValid()) {
            throw new InvalidContentException($content);
        }
    }

    /**
     * Tell whether every mandatory piece of information was extracted.
     */
    private function isValid(): bool
    {
        return $this->metadata !== ''
            && $this->contentLink !== null
            && $this->commentsLink !== null;
    }

    public function getContentLink(): ?string
    {
        return $this->contentLink;
    }

    public function getCommentsLink(): ?string
    {
        return $this->commentsLink;
    }

    public function getPreprocessed(): string
    {
        return $this->preprocessed;
    }

    public function getMetadata(): string
    {
        return $this->metadata;
    }

    public function getRaw(): string
    {
        return $this->raw;
    }

    public function getReal(): string
    {
        return $this->real;
    }

    public function hasBeenPreprocessed(): bool
    {
        return '' !== $this->preprocessed;
    }

    public function hasReal(): bool
    {
        return '' !== $this->real;
    }

    /**
     * Build a DOM document from an HTML fragment.
     *
     * The fragment goes through a decode / encode / decode round trip so that
     * characters outside the ASCII range reach the parser as HTML entities
     * while the markup itself is left untouched.
     *
     * Returns null when there is nothing to parse. On PHP 8,
     * DOMDocument::loadHTML() throws a ValueError for an empty source, and
     * htmlentities() yields an empty string when the input is not valid
     * UTF-8, so the guard keeps such content on the regular
     * InvalidContentException path instead of leaking a ValueError.
     */
    private function loadDom(string $html): ?\DOMDocument
    {
        $markup = htmlspecialchars_decode(htmlentities(html_entity_decode($html)));
        if ($markup === '') {
            return null;
        }

        $dom = new \DOMDocument('1.0', 'UTF-8');
        if (!$dom->loadHTML($markup, LIBXML_NOERROR)) {
            return null;
        }

        return $dom;
    }

    /**
     * Split the content when needed
     *
     * The content can be preprocessed to save time for resources that cannot
     * be fetched quickly, for instance when API calls are involved. Thus we
     * need to separate the feed raw content from the preprocessed content.
     *
     * The split only happens when exactly one div whose class contains
     * "reddit-image" is found: the first child of its parent becomes the
     * preprocessed content and the last child becomes the raw content.
     */
    private function splitContent(string $content): void
    {
        $dom = $this->loadDom($content);
        if ($dom === null) {
            return;
        }

        $xpath = new \DOMXPath($dom);
        $redditImage = $xpath->query("//div[contains(@class,'reddit-image')]");
        if ($redditImage === false || $redditImage->length !== 1) {
            return;
        }

        $container = $redditImage->item(0)->parentNode;
        $this->preprocessed = $dom->saveHTML($container->firstChild) ?: '';
        $this->raw = $dom->saveHTML($container->lastChild) ?: '';
    }

    /**
     * Extract metadata available in the feed raw content
     *
     * Here the search is done with a regex instead of the DOM since the raw
     * content has different ways to represent its content. The metadata
     * contains the link to the author page, the link to the current message,
     * and the link to the current message comment section.
     */
    private function extractMetadata(): void
    {
        if (preg_match('#(?P<metadata>\s*?submitted.*</span>)#', $this->raw, $matches)) {
            $this->metadata = $matches['metadata'];
        }
    }

    /**
     * Extract links available in the feed raw content
     *
     * At the moment, those are the extracted links:
     * - content link: href of the anchor whose text is "[link]".
     * - comments link: href of the anchor whose text is "[comments]".
     *
     * When several anchors carry the same text, the last one wins.
     */
    private function extractLinks(): void
    {
        $dom = $this->loadDom($this->raw);
        if ($dom === null) {
            return;
        }

        foreach ($dom->getElementsByTagName('a') as $link) {
            if (!$link instanceof \DOMElement) {
                continue;
            }

            $label = $link->textContent;
            if ($label === '[link]') {
                $this->contentLink = $link->getAttribute('href');
            } elseif ($label === '[comments]') {
                $this->commentsLink = $link->getAttribute('href');
            }
        }
    }

    /**
     * Extract the real content from the feed raw content
     *
     * The real content is contained in a div with the md class attribute. The
     * class attribute is sanitized to data-sanitized-class attribute when
     * processed by SimplePie. Nothing is extracted unless exactly one such
     * div is found.
     */
    private function extractReal(): void
    {
        $dom = $this->loadDom($this->raw);
        if ($dom === null) {
            return;
        }

        $xpath = new \DOMXPath($dom);
        $mdNode = $xpath->query("//div[contains(@data-sanitized-class,'md')]");
        if ($mdNode !== false && $mdNode->length === 1) {
            $this->real = $dom->saveHTML($mdNode->item(0)) ?: '';
        }
    }
}