Skip to content

Commit

Permalink
附件回调函数添加文件更多信息
Browse files Browse the repository at this point in the history
  • Loading branch information
YangZetao authored and YangZetao committed Sep 25, 2016
1 parent dc306a0 commit 1bce83a
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 79 deletions.
3 changes: 2 additions & 1 deletion config/inc_mimetype.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
*/

$GLOBALS['config']['mimetype'] = array(
'application/octet-stream' => '二进制',
'application/octet-stream' => 'binary',
//'text/xml' => 'xml',
//'text/html' => 'html',
//'text/htm' => 'htm',
Expand All @@ -20,6 +20,7 @@
'application/pdf' => 'pdf',
'audio/mp3' => 'mp3',
'video/avi' => 'avi',
'video/mp4' => 'mp4',
'application/x-msdownload' => 'exe',
'application/vnd.iphone' => 'ipa',
'application/x-bittorrent' => 'torrent',
Expand Down
39 changes: 28 additions & 11 deletions core/phpspider.php
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,9 @@ public function start()
$this->collect_page($link);
}

echo date("H:i:s")." 爬取完成\n";
$spider_time_use = round(microtime(true) - self::$spider_time_start, 3);
echo date("H:i:s")." 爬取完成 \n";
echo "总耗时:{$spider_time_use}\n";
echo "总共爬取链接数:".self::$collect_url_num."\n";
echo "成功爬取链接数:".self::$collected_urls_num."\n";
}
Expand Down Expand Up @@ -581,6 +583,8 @@ public function request_url($url, $options = array())
// 如果定义了获取附件回调函数,直接拦截了
if ($this->on_attachment_file)
{
$mime_types = $GLOBALS['config']['mimetype'];

stream_context_set_default(
array(
'http' => array(
Expand All @@ -595,19 +599,38 @@ public function request_url($url, $options = array())
$url = $headers['Location'];
$headers = get_headers($url, 1);
}
//print_r($headers);
$fileinfo = array();
$pathinfo = pathinfo($url);
$fileinfo = array(
'basename' => isset($pathinfo['basename']) ? $pathinfo['basename'] : '',
'filename' => isset($pathinfo['filename']) ? $pathinfo['filename'] : '',
'fileext' => isset($pathinfo['extension']) ? $pathinfo['extension'] : '',
'filesize' => isset($headers['Content-Length']) ? $headers['Content-Length'] : 0,
//'filesize' => isset($headers['Content-Length']) ? util::format_bytes($headers['Content-Length']) : 0,
'atime' => isset($headers['Date']) ? strtotime($headers['Date']) : time(),
'mtime' => isset($headers['Last-Modified']) ? strtotime($headers['Last-Modified']) : time(),
);

$mime_type = 'html';
$content_type = isset($headers['Content-Type']) ? $headers['Content-Type'] : '';
if (!empty($content_type))
{
$mime_type = isset($GLOBALS['config']['mimetype'][$content_type]) ? $GLOBALS['config']['mimetype'][$content_type] : $mime_type;
}
$mime_types_flip = array_flip($mime_types);
// 判断一下是不是文件名被加什么后缀了,比如 http://www.xxxx.com/test.jpg?token=xxxxx
if (!isset($mime_types_flip[$fileinfo['fileext']]))
{
$fileinfo['fileext'] = $mime_type;
$fileinfo['basename'] = $fileinfo['filename'].'.'.$mime_type;
}

// 如果不是html
if ($mime_type != 'html')
{
echo util::colorize(date("H:i:s")." 发现{$mime_type}文件:".$url."\n");
call_user_func($this->on_attachment_file, $url, $mime_type);
call_user_func($this->on_attachment_file, $url, $fileinfo);
return false;
}
}
Expand Down Expand Up @@ -1067,7 +1090,6 @@ public function get_fields($confs, $html, $url, $page)
*/
public function get_fields_xpath($html, $selector, $fieldname)
{
//var_dump($html);
$dom = new DOMDocument();
@$dom->loadHTML('<?xml encoding="UTF-8">'.$html);
//libxml_use_internal_errors(true);
Expand All @@ -1080,9 +1102,7 @@ public function get_fields_xpath($html, $selector, $fieldname)
//}

$xpath = new DOMXpath($dom);
//$selector = "//*[@id='single-next-link']//div[contains(@class,'content')]/text()[1]";
$elements = @$xpath->query($selector);
//var_dump($elements);exit;
if ($elements === false)
{
echo util::colorize(date("H:i:s") . " field(\"{$fieldname}\")中selector的xpath(\"{$selector}\")语法错误\n\n", 'fail');
Expand All @@ -1094,9 +1114,11 @@ public function get_fields_xpath($html, $selector, $fieldname)
{
foreach ($elements as $element)
{
//var_dump($element);
$nodeName = $element->nodeName;
$nodeType = $element->nodeType; // 1.Element 2.Attribute 3.Text
//$nodeAttr = $element->getAttribute('src');
//$nodes = util::node_to_array($dom, $element);
//echo $nodes['@src']."\n";
// 如果是img标签,直接取src值
if ($nodeType == 1 && in_array($nodeName, array('img')))
{
Expand All @@ -1115,11 +1137,6 @@ public function get_fields_xpath($html, $selector, $fieldname)
$content = preg_replace(array("#^<{$nodeName}.*>#isU","#</{$nodeName}>$#isU"), array('', ''), $content);
}
$array[] = trim($content);
//$nodes = util::node_to_array($dom, $element);
//echo $nodes['@src']."\n";
//echo "name: ".$element->nodeName."\n";
//echo "value: ".$element->nodeValue."\n";
//echo "attr: ".$element->getAttribute('src')."\n\n";
}
}
return $array;
Expand Down
67 changes: 0 additions & 67 deletions demo/attachment_download.php

This file was deleted.

31 changes: 31 additions & 0 deletions demo/attachment_file.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<?php
// Raise the memory limit for this script to 1024M.
ini_set("memory_limit", "1024M");
// Load core/init.php, resolved relative to this file's directory.
require dirname(__FILE__).'/../core/init.php';

/* Do NOT delete this comment */
/* 不要删除这段注释 */

// Create the spider instance that the attachment callback below is attached to.
$spider = new phpspider();

// Attachment callback for the spider.
//
// Parameters:
//   $attachment_url  URL of the attachment that was detected.
//   $fileinfo        Array describing the file; this demo only reads the
//                    'fileext' entry (file extension without the dot).
//
// Only attachments whose extension is 'jpg' are downloaded; everything else
// is ignored. The callback returns nothing.
$spider->on_attachment_file = function($attachment_url, $fileinfo)
{
    // Uncomment to inspect what the spider passes in.
    //var_dump($attachment_url, $fileinfo);

    // Guard against a missing 'fileext' entry and compare strictly.
    if (!isset($fileinfo['fileext']) || $fileinfo['fileext'] !== 'jpg')
    {
        return;
    }

    // uniqid() builds an identifier from the current time in microseconds.
    // It is not random; it is merely unique enough for naming demo downloads.
    $filename = uniqid();
    // Target path of the image inside the data directory.
    $filepath = PATH_DATA."/{$filename}.jpg";

    // Download with the system's wget. Both arguments are shell-escaped:
    // URLs routinely contain characters such as '?' and '&' that the shell
    // would otherwise interpret, and the URL comes from crawled (untrusted)
    // content, so it must never be interpolated into the command verbatim.
    exec('wget '.escapeshellarg($attachment_url).' -O '.escapeshellarg($filepath));

    // Pure-PHP alternative. It loads the whole file into memory and can
    // exhaust the memory limit on large attachments, so use with care.
    //$data = file_get_contents($attachment_url);
    //file_put_contents($filepath, $data);
};

// Sample attachment URL; note the query string that follows the .jpg extension.
$url = "http://ocnt0imhl.bkt.clouddn.com/imgs/1637/2015-07/k306n1wzvkq669nm.jpg?token=xxx";
// Request the URL directly through the spider.
$spider->request_url($url);

0 comments on commit 1bce83a

Please sign in to comment.