Skip to content

Commit

Permalink
Updated the process of getting information about the page by adding the `Embed` ext.
Browse files Browse the repository at this point in the history
  • Loading branch information
Igor Chepurnoy committed May 19, 2016
1 parent 9a9e4ce commit ee725b8
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 226 deletions.
218 changes: 28 additions & 190 deletions Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@

namespace yii2mod\linkpreview;

use yii\helpers\ArrayHelper;
use yii2mod\linkpreview\helpers\ContentHelper;
use yii2mod\linkpreview\helpers\MediaHelper;
use yii2mod\linkpreview\helpers\UrlHelper;
use yii\base\Exception;
use Embed\Adapters\Adapter;
use Embed\Embed;
use Embed\Exceptions\InvalidUrlException;
use yii\base\InvalidConfigException;
use yii\base\Object;
use yii\helpers\HtmlPurifier;
use yii\helpers\ArrayHelper;

/**
* Class Crawler
Expand All @@ -17,212 +16,51 @@
class Crawler extends Object
{
/**
* @var string content given from widget
* @var string content given from the widget
*/
public $content;

/**
* @var array default curl options
*/
public $curlOptions = [];

/**
* @var array html purifier settings
*/
public $htmlPurifierSettings = [
'HTML.Allowed' => ''
];

/**
* @var string page url
* @var array Embed config
*/
protected $url;
public $config = [];

/**
* @var string page title
* @var string url regex
*/
protected $title;
public $regexUrl = '/https?\:\/\/[^\" ]+/i';

/**
* @var string page description
* Return page info
* @return array|Adapter
* @throws InvalidConfigException
*/
protected $description;

/**
* @var string image url from page content
*/
protected $imageUrl;

/**
* Initialize object
*/
public function init()
public function getPageInfo()
{
$this->curlOptions = ArrayHelper::merge([
CURLOPT_USERAGENT => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36',
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_AUTOREFERER => true,
CURLOPT_CONNECTTIMEOUT => 120,
CURLOPT_TIMEOUT => 120,
CURLOPT_MAXREDIRS => 10,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HEADER => false,
CURLOPT_ENCODING => 'UTF-8'
], $this->curlOptions);

parent::init();
}

/**
* Return page preview array data in json format
* @return null|array
*/
public function getPagePreview()
{
$this->url = $this->getUrlFromContent();
if ($this->url !== null) {
if (ContentHelper::isImageUrl($this->url)) {
$this->imageUrl = $this->url;
} else {
$pageData = $this->performRequest();
if (!$pageData["content"] && strpos($this->url, "//www.") === false) {
if (strpos($this->url, "http://") !== false) {
$this->url = str_replace("http://", "http://www.", $this->url);
} elseif (strpos($this->url, "https://") !== false) {
$this->url = str_replace("https://", "https://www.", $this->url);
}
$pageData = $this->performRequest();
}
if ($pageData === null) {
return $this->getResponseData();
}
$this->url = $pageData['url'];
$content = $pageData['content'];
$metaTags = ContentHelper::getMetaTags($content);
$this->title = $this->getTitle($content, $metaTags);
$this->description = $this->getDescription($content, $metaTags);
$media = $this->getMedia();
$this->imageUrl = count($media) === 0 ? ContentHelper::trimText($metaTags["image"]) : $media['imgUrl'];
if (empty($this->imageUrl)) {
$this->imageUrl = ContentHelper::getImageSrc($content, $this->url);
}
}
return $this->getResponseData();
if (empty($this->content)) {
throw new InvalidConfigException("The 'content' property is required.");
}
return null;
}

/**
* Get link from content
* @param null $default
* @return mixed|null
*/
protected function getUrlFromContent($default = null)
{
$this->content = str_replace("\n", " ", $this->content);
if (preg_match(ContentHelper::$regexList['url'], $this->content, $match)) {
if (strpos($match[0], " ") === 0) {
$match[0] = "http://" . substr($match[0], 1);
$url = $this->getUrlFromContent();
if (!empty($url)) {
try {
return Embed::create($url, $this->config);
} catch (InvalidUrlException $e) {
// Invalid url
}
return str_replace("https://", "http://", $match[0]);
}
return $default;
}

/**
* Performs HTTP request
* Return page content, url and header info
*
* @throws Exception if request failed
* @return mixed
*/
protected function performRequest()
{
$response = [];
$curl = curl_init($this->url);
curl_setopt_array($curl, $this->curlOptions);
$body = curl_exec($curl);
$header = curl_getinfo($curl);
if ($body !== false) {
$responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
curl_close($curl);
if ($responseCode >= 200 && $responseCode < 300) {
$response['content'] = $body;
$response['url'] = $header['url'];
return $response;
}
}
return null;
}

/**
* Get page media data
* @return array
*/
protected function getMedia()
{
$result = [];
foreach (MediaHelper::$videoServiceConfig as $domainName => $methodName) {
if (strpos($this->url, $domainName) !== false) {
$result = MediaHelper::$methodName($this->url);
}
}
return $result;
}

/**
* Get page title
* @param $content
* @param $metaTags
* @return string
*/
protected function getTitle($content, $metaTags)
{
$title = ContentHelper::trimText($metaTags["title"]);
if (empty($title)) {
if (preg_match(ContentHelper::$regexList['title'], str_replace("\n", " ", $content), $matching)) {
$title = $matching[2];
}
}
if (ContentHelper::isJson($title)) {
$title = "";
}
return ContentHelper::trimText($title);
return [];
}

/**
* Get page description
* @param $content
* @param $metaTags
* @return mixed|string
* Get link from content
* @return mixed|null
*/
protected function getDescription($content, $metaTags)
protected function getUrlFromContent()
{
$description = ContentHelper::trimText($metaTags["description"]);
if (empty($description)) {
$description = ContentHelper::parse($content);
}
if (ContentHelper::isJson($description)) {
$description = "";
}
$description = HtmlPurifier::process($description, $this->htmlPurifierSettings);

return ContentHelper::trimText($description);
}
preg_match($this->regexUrl, $this->content, $matches);

/**
* Return response array data
* @return array
*/
protected function getResponseData()
{
return [
'status' => 'success',
'title' => $this->title,
'url' => $this->url,
'canonicalUrl' => UrlHelper::canonicalPage($this->url),
'description' => $this->description,
'image' => $this->imageUrl
];
return ArrayHelper::getValue($matches, 0);
}
}
22 changes: 13 additions & 9 deletions actions/LinkPreviewAction.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
use Yii;
use yii\base\Action;
use yii\web\Response;
use yii2mod\linkpreview\Crawler;
use yii2mod\linkpreview\models\LinkPreviewModel;

/**
Expand All @@ -15,8 +14,14 @@
class LinkPreviewAction extends Action
{
/**
* Template view path
* @var string
* @var array crawler config
*/
public $crawlerConfig = [
'class' => 'yii2mod\linkpreview\Crawler'
];

/**
* @var string Template view path
*/
public $view = '@vendor/yii2mod/yii2-link-preview/views/template';

Expand All @@ -26,16 +31,15 @@ class LinkPreviewAction extends Action
public function run()
{
Yii::$app->response->format = Response::FORMAT_JSON;

$linkPreviewModel = new LinkPreviewModel();
$content = Yii::$app->request->post('content');
$linkPreview = new Crawler([
'content' => $content
]);
$result = $linkPreview->getPagePreview();
$crawler = Yii::createObject($this->crawlerConfig);
$crawler->content = Yii::$app->request->post('content');
$pageInfo = $crawler->getPageInfo();
$pjaxContainerId = str_replace('#', '', Yii::$app->request->post('_pjax'));

return $this->controller->render($this->view, [
'result' => $result,
'pageInfo' => $pageInfo,
'linkPreviewModel' => $linkPreviewModel,
'pjaxContainerId' => $pjaxContainerId
]);
Expand Down
17 changes: 16 additions & 1 deletion assets/css/linkPreview.css
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#preview-url {
font-size: 1em;
color: #bbb;
text-transform: uppercase;
}

#preview-title {
Expand All @@ -40,10 +41,24 @@
}
}

@media only screen and (min-width: 992px) {
@media (max-width: 475px) {
#preview-description {
display: none !important;
}

#preview-image {
max-width: 100px;
}
}

@media (min-width: 475px) {
#preview-image {
max-width: 180px;
}

#preview-description {
display: block;
}
}

.close-preview-btn {
Expand Down
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
],
"require": {
"yiisoft/yii2": ">=2.0.4",
"symfony/dom-crawler": "~2.4,!=2.4.5"
"embed/embed": "^2.6",
"yii2mod/yii2-behaviors": "*"
},
"autoload": {
"psr-4": {
Expand Down
3 changes: 2 additions & 1 deletion migrations/m150213_182853_init_link_preview_table.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,15 @@ public function up()
if ($this->db->driverName === 'mysql') {
$tableOptions = 'CHARACTER SET utf8 COLLATE utf8_unicode_ci ENGINE=InnoDB';
}

$this->createTable('{{%LinkPreview}}', [
'id' => Schema::TYPE_PK,
'title' => Schema::TYPE_TEXT,
'description' => Schema::TYPE_TEXT,
'url' => Schema::TYPE_STRING . ' NOT NULL',
'canonicalUrl' => Schema::TYPE_STRING . ' NOT NULL',
'image' => Schema::TYPE_TEXT,
'code' => Schema::TYPE_TEXT,
'createdAt' => Schema::TYPE_INTEGER . ' NOT NULL',
'updatedAt' => Schema::TYPE_INTEGER . ' NOT NULL',
], $tableOptions);
Expand Down
Loading

0 comments on commit ee725b8

Please sign in to comment.