From 1bce83a6db33728aeb05ed727f401bf339b96bf7 Mon Sep 17 00:00:00 2001 From: YangZetao Date: Sun, 25 Sep 2016 13:38:13 +0800 Subject: [PATCH] =?UTF-8?q?=E9=99=84=E4=BB=B6=E5=9B=9E=E8=B0=83=E5=87=BD?= =?UTF-8?q?=E6=95=B0=E6=B7=BB=E5=8A=A0=E6=96=87=E4=BB=B6=E6=9B=B4=E5=A4=9A?= =?UTF-8?q?=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/inc_mimetype.php | 3 +- core/phpspider.php | 39 +++++++++++++++------ demo/attachment_download.php | 67 ------------------------------------ demo/attachment_file.php | 31 +++++++++++++++++ 4 files changed, 61 insertions(+), 79 deletions(-) delete mode 100644 demo/attachment_download.php create mode 100644 demo/attachment_file.php diff --git a/config/inc_mimetype.php b/config/inc_mimetype.php index c07d316..c1d157b 100644 --- a/config/inc_mimetype.php +++ b/config/inc_mimetype.php @@ -5,7 +5,7 @@ */ $GLOBALS['config']['mimetype'] = array( - 'application/octet-stream' => '二进制', + 'application/octet-stream' => 'binary', //'text/xml' => 'xml', //'text/html' => 'html', //'text/htm' => 'htm', @@ -20,6 +20,7 @@ 'application/pdf' => 'pdf', 'audio/mp3' => 'mp3', 'video/avi' => 'avi', + 'video/mp4' => 'mp4', 'application/x-msdownload' => 'exe', 'application/vnd.iphone' => 'ipa', 'application/x-bittorrent' => 'torrent', diff --git a/core/phpspider.php b/core/phpspider.php index 83c3ad7..ed68881 100644 --- a/core/phpspider.php +++ b/core/phpspider.php @@ -424,7 +424,9 @@ public function start() $this->collect_page($link); } - echo date("H:i:s")." 爬取完成\n"; + $spider_time_use = round(microtime(true) - self::$spider_time_start, 3); + echo date("H:i:s")." 爬取完成 \n"; + echo "总耗时:{$spider_time_use} 秒\n"; echo "总共爬取链接数:".self::$collect_url_num."\n"; echo "成功爬取链接数:".self::$collected_urls_num."\n"; } @@ -581,6 +583,8 @@ public function request_url($url, $options = array()) // 如果定义了获取附件回调函数,直接拦截了 if ($this->on_attachment_file) { + $mime_types = $GLOBALS['config']['mimetype']; + stream_context_set_default( array( 'http' => array( @@ -595,6 +599,18 @@ public function request_url($url, $options = array()) $url = $headers['Location']; $headers = get_headers($url, 1); } + //print_r($headers); + $fileinfo = array(); + $pathinfo = pathinfo($url); + $fileinfo = array( + 'basename' => isset($pathinfo['basename']) ? $pathinfo['basename'] : '', + 'filename' => isset($pathinfo['filename']) ? $pathinfo['filename'] : '', + 'fileext' => isset($pathinfo['extension']) ? $pathinfo['extension'] : '', + 'filesize' => isset($headers['Content-Length']) ? $headers['Content-Length'] : 0, + //'filesize' => isset($headers['Content-Length']) ? util::format_bytes($headers['Content-Length']) : 0, + 'atime' => isset($headers['Date']) ? strtotime($headers['Date']) : time(), + 'mtime' => isset($headers['Last-Modified']) ? strtotime($headers['Last-Modified']) : time(), + ); $mime_type = 'html'; $content_type = isset($headers['Content-Type']) ? $headers['Content-Type'] : ''; @@ -602,12 +618,19 @@ public function request_url($url, $options = array()) { $mime_type = isset($GLOBALS['config']['mimetype'][$content_type]) ? $GLOBALS['config']['mimetype'][$content_type] : $mime_type; } + $mime_types_flip = array_flip($mime_types); + // 判断一下是不是文件名被加什么后缀了,比如 http://www.xxxx.com/test.jpg?token=xxxxx + if (!isset($mime_types_flip[$fileinfo['fileext']])) + { + $fileinfo['fileext'] = $mime_type; + $fileinfo['basename'] = $fileinfo['filename'].'.'.$mime_type; + } // 如果不是html if ($mime_type != 'html') { echo util::colorize(date("H:i:s")." 发现{$mime_type}文件:".$url."\n"); - call_user_func($this->on_attachment_file, $url, $mime_type); + call_user_func($this->on_attachment_file, $url, $fileinfo); return false; } } @@ -1067,7 +1090,6 @@ public function get_fields($confs, $html, $url, $page) */ public function get_fields_xpath($html, $selector, $fieldname) { - //var_dump($html); $dom = new DOMDocument(); @$dom->loadHTML(''.$html); //libxml_use_internal_errors(true); @@ -1080,9 +1102,7 @@ public function get_fields_xpath($html, $selector, $fieldname) //} $xpath = new DOMXpath($dom); - //$selector = "//*[@id='single-next-link']//div[contains(@class,'content')]/text()[1]"; $elements = @$xpath->query($selector); - //var_dump($elements);exit; if ($elements === false) { echo util::colorize(date("H:i:s") . " field(\"{$fieldname}\")中selector的xpath(\"{$selector}\")语法错误\n\n", 'fail'); @@ -1094,9 +1114,11 @@ public function get_fields_xpath($html, $selector, $fieldname) { foreach ($elements as $element) { - //var_dump($element); $nodeName = $element->nodeName; $nodeType = $element->nodeType; // 1.Element 2.Attribute 3.Text + //$nodeAttr = $element->getAttribute('src'); + //$nodes = util::node_to_array($dom, $element); + //echo $nodes['@src']."\n"; // 如果是img标签,直接取src值 if ($nodeType == 1 && in_array($nodeName, array('img'))) { @@ -1115,11 +1137,6 @@ public function get_fields_xpath($html, $selector, $fieldname) $content = preg_replace(array("#^<{$nodeName}.*>#isU","#$#isU"), array('', ''), $content); } $array[] = trim($content); - //$nodes = util::node_to_array($dom, $element); - //echo $nodes['@src']."\n"; - //echo "name: ".$element->nodeName."\n"; - //echo "value: ".$element->nodeValue."\n"; - //echo "attr: ".$element->getAttribute('src')."\n\n"; } } return $array; diff --git a/demo/attachment_download.php b/demo/attachment_download.php deleted file mode 100644 index e72039a..0000000 --- a/demo/attachment_download.php +++ /dev/null @@ -1,67 +0,0 @@ - array( - 'ocnt0imhl.bkt.clouddn.com', - ), - 'scan_urls' => array( - //"http://ocnt0imhl.bkt.clouddn.com/imgs/1637/2015-07/k306n1wzvkq669nm.jpg", - ), - 'list_url_regexes' => array( - ), - 'content_url_regexes' => array( - ), - 'fields' => array( - ), -); - -$spider = new phpspider($configs); - -$spider->on_start = function($phpspider) -{ - $url = "http://ocnt0imhl.bkt.clouddn.com/imgs/1637/2015-07/k306n1wzvkq669nm.jpg"; - $phpspider->request_url($url); -}; -$spider->on_attachment_file = function($attachment_url, $mime_type) -{ - // 输出文件URL地址和文件类型 - //var_dump($attachment_url, $mime_type); - - if ($mime_type == 'jpg') - { - // 以纳秒为单位生成随机数 - $filename = uniqid(); - // 在data目录下生成图片 - $filepath = PATH_DATA."/{$filename}.jpg"; - // 用系统自带的下载器wget下载 - exec("wget {$attachment_url} -O {$filepath}"); - - // 用PHP函数下载,容易耗尽内存,慎用 - //$data = file_get_contents($attachment_url); - //file_put_contents($filepath, $attachment_url); - } -}; - -$spider->on_extract_field = function($fieldname, $data, $page) -{ - if ($fieldname == 'contents') - { - if (!empty($data)) - { - $contents = $data; - $data = ""; - foreach ($contents as $content) - { - $data .= $content['page_content']; - } - } - } - return $data; -}; - -$spider->start(); diff --git a/demo/attachment_file.php b/demo/attachment_file.php new file mode 100644 index 0000000..460a786 --- /dev/null +++ b/demo/attachment_file.php @@ -0,0 +1,31 @@ +on_attachment_file = function($attachment_url, $fileinfo) +{ + // 输出文件URL地址和文件类型 + //var_dump($attachment_url, $fileinfo); + + if ($fileinfo['fileext'] == 'jpg') + { + // 以纳秒为单位生成随机数 + $filename = uniqid(); + // 在data目录下生成图片 + $filepath = PATH_DATA."/{$filename}.jpg"; + // 用系统自带的下载器wget下载 + exec("wget {$attachment_url} -O {$filepath}"); + + // 用PHP函数下载,容易耗尽内存,慎用 + //$data = file_get_contents($attachment_url); + //file_put_contents($filepath, $attachment_url); + } +}; + +$url = "http://ocnt0imhl.bkt.clouddn.com/imgs/1637/2015-07/k306n1wzvkq669nm.jpg?token=xxx"; +$spider->request_url($url);