1 Star 0 Fork 44

dongfang / spider-utils-for-php

forked from mz / spider-utils-for-php 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
spider.class.php 44.49 KB
一键复制 编辑 原始数据 按行查看 历史
mz 提交于 2019-12-28 15:43 . update spider.class.php.

<?php
class spider
{
/**
 * HTTP status code of the most recent response recorded by fetch_url().
 * Stays at -1 until a request has recorded one.
 *
 * @var int
 */
public static $last_response_code = -1;
/**
 * URL the most recent request ended up at: fetch_url() stores the redirect
 * target (socket transport) or cURL's effective URL here.
 *
 * @var string
 */
public static $url = '';
/**
 * Response headers of the most recent cURL request, as produced by
 * extract_header().
 *
 * @var array
 */
public static $last_header = array();
/**
 * Strip every "<...>" span from a string using the mask replacer.
 *
 * @param string $html markup to clean
 *
 * @return mixed the input with each "<" ... ">" span removed
 */
public static function no_html($html) {
    $tag_mask = array('<(*)>' => '');
    $stripped = self::reg_replace($html, $tag_mask);
    return $stripped;
}
/**
 * Convert an HTML document to plain text.
 *
 * Drops the XML prolog, comments, doctype and non-content elements
 * (head, script, style, ...), turns block-level tags into line breaks,
 * strips the remaining tags, decodes entities and collapses blank lines.
 *
 * @param string $html
 *
 * @return mixed|string
 */
public static function html2txt($html) {
    // Translate these entities by hand: letting html_entity_decode() handle
    // &nbsp; produced garbled output.
    $html = strtr($html, array(
        '&nbsp;' => ' ',
        '&rdquo;' => '”',
        '&ldquo;' => '“',
    ));
    $html = preg_replace('/^[\s\t]+/is', ' ', $html);
    // The "?" must be escaped: unescaped it made "<" optional, so the pattern
    // deleted everything from any literal "xml" up to the next ">".
    $html = preg_replace('#<\?xml[\s\S]*?>#is', '', $html);
    $html = preg_replace('#<!--[\s\S]*?-->#is', '', $html);
    $html = preg_replace('#<!doc[\s\S]*?>#is', '', $html);
    // \b keeps "<head" from matching "<header", "<p" from matching "<param", etc.
    $html = preg_replace('#<(head|script|iframe|frame|noscript|noframes|option|style)\b[\s\S]*?</\1>#is', '', $html);
    $html = preg_replace('#<(br|hr|li|ol|ul|dl|h\d|dd|dt|center|form|table|tr|marquee|div|pre|p|blockquote)\b.*?>#is', "\n", $html);
    // Use the class's own strip_tags(): the built-in one garbled the text.
    $html = self::strip_tags($html);
    // decode entities
    $html = html_entity_decode($html, ENT_COMPAT, 'UTF-8');
    $html = preg_replace('#([\r\n]\s+[\r\n])+#is', "\n", $html);
    $html = str_replace(array("\r", "\n\n"), "\n", $html);
    // collapse any remaining runs of blank lines
    while (strpos($html, "\n\n") !== false) {
        $html = str_replace("\n\n", "\n", $html);
    }
    return $html;
}
/**
 * Regex-based replacement for PHP's strip_tags().
 *
 * Comments are always removed. Elements in the "block" set (head, script,
 * iframe, frame, noscript, noframes, option, style) are removed together
 * with their content unless explicitly allowed. Every other tag is removed
 * but its content is kept.
 *
 * @param string $text markup to clean
 * @param string $tags allowed tags, e.g. '<b><a>' (same format as strip_tags())
 *
 * @return mixed
 */
public static function strip_tags($text, $tags = '') {
    // Elements whose content is dropped too. Read-only: this is static, so
    // mutating it (as the previous code did) corrupted every later call.
    static $block_set = array(
        'head' => 1,
        'script' => 1,
        'iframe' => 1,
        'frame' => 1,
        'noscript' => 1,
        'noframes' => 1,
        'option' => 1,
        'style' => 1,
    );
    preg_match_all('/<([\w\-\.]+)[\s]*\/?[\s]*>/si', strtolower(trim($tags)), $found);
    // tag name => regex-safe tag name
    $allowed = array();
    foreach (array_unique($found[1]) as $tag) {
        if ($tag !== '') {
            $allowed[$tag] = preg_quote($tag, '#');
        }
    }
    $searches = array(
        // comments
        '#<!--[\s\S]*?-->#is',
        // IE conditional comments
        '#<!--\[if[^\]]*?\]>[\s\S]*?<!\[endif\]-->#is',
    );
    // block elements that were not explicitly allowed lose their content as well
    $block_tags = array_keys(array_diff_key($block_set, $allowed));
    if ($block_tags) {
        $searches[] = '#<(' . implode('|', $block_tags) . ')\b[\s\S]*?</\1>#is';
    }
    if ($allowed) {
        // \b applies to opening and closing tags alike, so allowing <b>
        // does not also keep <br> or <body>
        $searches[] = '#<(?!/?(?:' . implode('|', $allowed) . ')\b)[^>]*?>#si';
    } else {
        $searches[] = '#<\/?[^>]*?>#si';
    }
    return preg_replace($searches, '', $text);
}
/**
 * Return the part of $html after the first $start and before the next $end.
 *
 * Both delimiters are located case-insensitively. An empty delimiter means
 * "no limit on that side". When a requested delimiter is not present the
 * result is always '' (previously it was false or '' depending on the
 * branch and PHP version).
 *
 * @param string $html
 * @param string $start
 * @param string $end
 *
 * @return string
 */
public static function cut_str($html, $start = '', $end = '') {
    $html = (string)$html;
    if ((string)$start !== '') {
        $tail = stristr($html, $start);
        if ($tail === false) {
            return '';
        }
        // the cast covers PHP < 8, where substr() returns false at end of string
        $html = (string)substr($tail, strlen($start));
    }
    if ((string)$end !== '') {
        $head = stristr($html, $end, true);
        if ($head === false) {
            return '';
        }
        $html = $head;
    }
    return $html;
}
//
/*
*/
/**
 * Match with a "(*)" wildcard mask and return what the wildcard captured.
 *
 * spider::mask_match('123abc123', '123(*)123') = abc
 * spider::mask_match('abc123', '(*)123') = abc
 * spider::mask_match('123abcabc', '(*)abc') = 123
 * spider::mask_match('123abcdef', '(*)abc', true) = 123abc
 * spider::mask_match('a1b1c', '1(*)') = b1c
 *
 * Only the text before and after the first "(*)" is used. A two-sided mask
 * is matched case-insensitively; one-sided masks are case-sensitive. An
 * empty capture counts as "not found".
 *
 * @param string $html
 * @param string $pattern mask containing "(*)"
 * @param bool|false $returnfull include the delimiters in the result
 *
 * @return string the capture (or the full matched span), '' when not found
 */
public static function mask_match($html, $pattern, $returnfull = false) {
    $part = explode('(*)', $pattern);
    if (count($part) == 1) {
        return '';
    }
    $html = (string)$html;
    $left = $part[0];
    $right = $part[1];
    if ($left !== '' && $right !== '') {
        $from = stripos($html, $left);
        if ($from === false) {
            return '';
        }
        $inner_at = $from + strlen($left);
        $to = stripos($html, $right, $inner_at);
        if ($to === false) {
            return '';
        }
        $inner = (string)substr($html, $inner_at, $to - $inner_at);
        if ($inner === '') {
            return '';
        }
        // Return the text as it appears in $html, not rebuilt from the mask:
        // the match is case-insensitive, so a rebuilt string may not occur in
        // $html at all and callers that str_replace() it would never progress.
        return $returnfull ? substr($html, $from, $to + strlen($right) - $from) : $inner;
    }
    if ($left !== '') {
        //pattern=xxx(*) : everything after the first xxx
        $pieces = explode($left, $html, 2);
        if (count($pieces) < 2 || $pieces[1] === '') {
            return '';
        }
        return $returnfull ? $left . $pieces[1] : $pieces[1];
    }
    if ($right !== '') {
        //pattern=(*)xxx : everything before the first xxx
        $pieces = explode($right, $html, 2);
        if (count($pieces) < 2 || $pieces[0] === '') {
            return '';
        }
        return $returnfull ? $pieces[0] . $right : $pieces[0];
    }
    return '';
}
//
/*
//replace single mode
*/
/**
 * Replace by a map of search => replacement. Each search key is one of:
 *  - a mask containing "(*)" (see mask_match()),
 *  - a delimited regular expression (delimiter # / | ! or @),
 *  - a plain string.
 *
 * example :
 * spider::reg_replace('abcdefg', 'e(*)') = abcd
 * spider::reg_replace('abcdefg', array('#e.+$#is'=> 'hij')) = abcdhij
 * spider::reg_replace('abcd123', array('#\d+#s'=> '')) = abcd
 * spider::reg_replace('abcd123', array('cd'=> 'dc')) = abdc123
 * //replace multi pattern
 * spider::reg_replace('abcd123', array(
 * 'cd'=> 'dc',
 * '1(*)'=> '321',
 * '#\d+#s'=> '111',
 * )) = abdc111
 *
 * @param string $html
 * @param array|string $patterns map of search => replacement; a bare string
 *                               is treated as array($patterns => '')
 *
 * @return mixed
 */
public static function reg_replace($html, $patterns) {
    if (!is_array($patterns)) {
        $patterns = array($patterns => '');
    }
    foreach ($patterns as $search => $replace) {
        // numeric-looking keys arrive as integers
        $search = (string)$search;
        if (strpos($search, '(*)') !== false) {
            // mask replace: repeat until the mask no longer matches.
            // If the replacement itself matches the mask, repeating would
            // never finish, so only one pass is made in that case.
            $single_pass = (bool)self::mask_match((string)$replace, $search, true);
            while ($found = self::mask_match($html, $search, true)) {
                $updated = str_replace($found, $replace, $html);
                if ($updated === $html) {
                    // nothing changed: stop rather than spin forever
                    break;
                }
                $html = $updated;
                if ($single_pass) {
                    break;
                }
            }
        } elseif (preg_match('/^([\#\/\|\!\@]).+\\1([ismSMI]+)?$/is', $search)) {
            //regexp replace
            $html = preg_replace($search, $replace, $html);
        } else {
            //str replace
            $html = str_replace($search, $replace, $html);
        }
    }
    return $html;
}
//match
/*
*/
/**
 * Extract values from $html according to a map of result key => rule.
 *
 * $url = 'http://www.sogou.com/web?query='.urlencode($key).'&ie=utf8';
 * $html = spider::fetch_url($url, '', array('Referer'=>'http://www.sogou.com/'));
 *
 * #usage 1
 * // get title by regexp
 * $list = spider::match($html, array('listblock' => array('title' => '/<title>(.*?)<\/title>/is',)));
 * // get title by mask match
 * $list = spider::match($html, array('listblock' => array('title2' => '<title>(*)</title>',)));
 *
 * #usage 2
 * $keywordlist = spider::match($html, array('list'=>array(
 * 'cut' => '相关搜索</caption>(*)</tr></table>',
 * 'pattern' => '#id="sogou_\d+_\d+">(?<key>[^>]*?)</a>#is',
 * )));
 * $newarr = array();
 * foreach($keywordlist['list'] as $key=>$val){
 * $newarr[$val['key']] = array('key'=>$val['key']);
 * }
 *
 * Reserved top-level keys: "_sleep" (microseconds to pause before matching)
 * and "_replace" (replacements applied to $html first, see reg_replace()).
 * A rule with a "pattern" key may also carry "cut", "_replace" and "process".
 *
 * @param string $html
 * @param array $patterns result key => rule
 * @param array $option passed through to str_match()
 *
 * @return array result key => matched value (NULL when nothing matched)
 */
public static function match($html, $patterns, $option = array('url' => '')) {
    $tmplist = array();
    //sleep
    if (isset($patterns['_sleep'])) {
        usleep($patterns['_sleep']);
        unset($patterns['_sleep']);
    }
    //pre process =replace
    if (isset($patterns['_replace'])) {
        if (!is_array($patterns['_replace'])) {
            $patterns['_replace'] = array($patterns['_replace'] => '');
        }
        $html = self::reg_replace($html, $patterns['_replace']);
        unset($patterns['_replace']);
    }
    $dom = NULL;
    foreach ($patterns as $key => $val) {
        $value = NULL;
        if (!is_array($val)) {
            $val = array($val);
        }
        if (isset($val['pattern'])) {
            //pre process
            $matchhtml = self::match_pre_process($html, $val);
            //support multi pattern
            if (!is_array($val['pattern'])) {
                $val['pattern'] = array($val['pattern']);
            }
            // "process" is optional; reading it unguarded raised a warning
            $process = isset($val['process']) ? $val['process'] : NULL;
            // the first pattern that yields a value wins
            foreach ($val['pattern'] as $pattern) {
                if (strpos($pattern, '(*)') === false) {
                    //regexp match it
                    $value = self::reg_match($matchhtml, $pattern);
                    if ($value) {
                        if (is_string($value)) {
                            self::match_process($value, $process);
                        } else if (is_array($value)) {
                            // process each field
                            foreach ($value as &$data) {
                                foreach ($data as $value_field => &$value_item) {
                                    if (isset($process[$value_field])) {
                                        self::match_process($value_item, $process[$value_field]);
                                    }
                                }
                                unset($value_item);
                            }
                            unset($data);
                        }
                        break;
                    }
                } else {
                    // match field by mask_match
                    $value = self::mask_match($matchhtml, $pattern);
                    if ($value) {
                        self::match_process($value, $process);
                        break;
                    }
                }
            }
        } elseif (isset($val['selector'])) {
            // selector rules are not implemented; the value stays NULL
        } else {
            //multi mask match pattern
            // NOTE(review): an entry that is already array('pattern' => ...)
            // is iterated element by element below and never matches; only
            // the wrapped form produced here is effective. Confirm intent.
            foreach ($val as &$pattern_array) {
                if (!is_array($pattern_array) || !isset($pattern_array['pattern'])) {
                    $pattern_array = array(
                        array('pattern' => array($pattern_array))
                    );
                }
                $find_value = false;
                foreach ($pattern_array as $pattern_info) {
                    if (!isset($pattern_info['pattern'])) {
                        continue;
                    }
                    //pre process
                    $matchhtml = self::match_pre_process($html, $val);
                    //not html to match then match next pattern
                    if (!$matchhtml) {
                        continue;
                    }
                    foreach ($pattern_info['pattern'] as $pattern) {
                        // NOTE(review): this matches against $html, not the
                        // pre-processed $matchhtml - kept as found; confirm.
                        $value = self::str_match($html, $pattern, $dom, $option);
                        if ($value) {
                            $find_value = true;
                            // when find processor
                            self::match_process($value, isset($pattern_info['process']) ? $pattern_info['process'] : NULL);
                            break;
                        }
                        //or match next pattern
                    }
                }
                if ($find_value) {
                    break;
                }
            }
            // drop the reference so later writes cannot alias into $val
            unset($pattern_array);
        }
        $tmplist[$key] = $value;
    }
    return $tmplist;
}
/**
 * Run a matched value through its post-processors, in order.
 *
 * A processor is either a Closure, called with the value, or a string
 * "callable|arg1|arg2|...". With no arguments the callable receives the
 * value; with arguments, each "_VALUE_" inside an argument is replaced by
 * the current value and the callable receives exactly those arguments.
 *
 * @param mixed $value value to transform (updated in place)
 * @param mixed $process one processor or an array of processors
 */
private static function match_process(&$value, $process) {
    if (!$process) {
        return;
    }
    $chain = is_array($process) ? $process : array($process);
    foreach ($chain as $step) {
        if ($step instanceof Closure) {
            $value = $step($value);
            continue;
        }
        $pieces = explode('|', $step);
        $callable = array_shift($pieces);
        if (!$pieces) {
            $arguments = array($value);
        } else {
            $arguments = array();
            foreach ($pieces as $piece) {
                $arguments[] = strpos($piece, '_VALUE_') === false
                    ? $piece
                    : strtr($piece, array('_VALUE_' => $value));
            }
        }
        $value = call_user_func_array($callable, $arguments);
    }
}
/**
 * Narrow and clean the html before a rule's patterns are applied.
 *
 * "cut" masks are tried in order and the first non-empty capture becomes
 * the working html (if none captures anything the working html is empty).
 * "_replace" is then applied through reg_replace(). Scalar "cut" and
 * "_replace" entries are normalised to arrays in the caller's rule.
 *
 * @param string $html
 * @param array $pattern_info rule (normalised in place)
 *
 * @return mixed|string
 */
private static function match_pre_process($html, &$pattern_info) {
    $scoped = $html;
    // cutting first keeps the later matching short and fast
    if (isset($pattern_info['cut'])) {
        $cuts = $pattern_info['cut'];
        if (!is_array($cuts)) {
            $cuts = array($cuts);
            $pattern_info['cut'] = $cuts;
        }
        foreach ($cuts as $mask) {
            $scoped = self::mask_match($html, $mask);
            if ($scoped) {
                break;
            }
        }
    }
    if (isset($pattern_info['_replace'])) {
        $replacements = $pattern_info['_replace'];
        if (!is_array($replacements)) {
            $replacements = array($replacements => '');
            $pattern_info['_replace'] = $replacements;
        }
        $scoped = self::reg_replace($scoped, $replacements);
    }
    return $scoped;
}
/**
 * Match a single pattern and return one value.
 *
 * The pattern kind is chosen in this order:
 *  - contains "(*)"   : mask match, see mask_match()
 *  - starts with DOM: : delegated to dom_match()
 *  - contains "("     : delimited regular expression; the first match's
 *                       first capture group is returned
 *
 * spider::str_match('123', '1(*)3', $dom, array()) = 2
 * spider::str_match('123', '#1(\d+)3#', $dom, array()) = 2
 *
 * @param string $str
 * @param string $pattern
 * @param mixed $dom passed through to dom_match()
 * @param array $option passed through to dom_match()
 *
 * @return mixed|string '' when nothing matched
 */
public static function str_match($str, $pattern, &$dom, $option) {
    if (strpos($pattern, '(*)') !== false) {
        return self::mask_match($str, $pattern);
    }
    if (substr($pattern, 0, 4) == 'DOM:') {
        return self::dom_match($str, $pattern, $dom, $option);
    }
    if (strpos($pattern, '(') !== false) {
        // Guard the group lookup: without a match the old $value[1][0]
        // read raised an undefined-offset warning.
        if (preg_match($pattern, $str, $groups) && isset($groups[1])) {
            return $groups[1];
        }
    }
    return '';
}
/**
 * DOM based matching (deprecated).
 *
 * Forwards to a global function named dom_match() when one is defined;
 * otherwise nothing is matched.
 *
 * @param string $html
 * @param string $pattern
 * @param mixed $dom
 * @param array $option
 *
 * @return mixed '' when no global dom_match() exists
 */
public static function dom_match($html, $pattern, &$dom, $option) {
    if (!function_exists('dom_match')) {
        return '';
    }
    return call_user_func('dom_match', $html, $pattern, $dom, $option);
}
/**
 * Match by regular expression.
 *
 * With named groups the result is a list of rows, one per match, each
 * keyed by group name (see filter_list()); $return_index selects a single
 * row. Without named groups the first match's first capture group is
 * returned.
 *
 * @param string $html
 * @param string $reg delimited regular expression
 * @param int $return_index row to return, -1 for all rows
 *
 * @return mixed rows, one row, a string, or NULL when there is no such match
 */
public static function reg_match($html, $reg, $return_index = -1) {
    $list = array();
    preg_match_all($reg, $html, $list);
    if (!is_array($list)) {
        $list = array();
    }
    // Named groups: (?<name>...), (?P<name>...) or (?'name'...). A bare
    // "(?<" test also fired on lookbehinds, which discarded every group.
    if (preg_match('/\(\?(?:P?<[A-Za-z_]|\'[A-Za-z_])/', $reg)) {
        self::filter_list($list);
        if ($return_index == -1) {
            return $list;
        }
        return isset($list[$return_index]) ? $list[$return_index] : NULL;
    }
    // no match (or no capture group) used to raise an undefined-offset warning
    return isset($list[1][0]) ? $list[1][0] : NULL;
}
/**
 * Turn a preg_match_all() result into rows keyed by group name.
 *
 * Numerically keyed groups are discarded; the remaining name => values
 * columns are transposed so that $list[$i][$name] holds the value of the
 * named group in the i-th match.
 *
 * @param array $list preg_match_all() matches (rewritten in place)
 */
private static function filter_list(&$list) {
    $rows = array();
    foreach ($list as $name => $column) {
        if (is_numeric($name)) {
            continue;
        }
        foreach ($column as $position => $captured) {
            $rows[$position][$name] = $captured;
        }
    }
    $list = $rows;
}
/**
 * Resolve a (possibly relative) URL against a base URL.
 *
 * Handles absolute URLs (returned unchanged), protocol-relative "//host/.."
 * URLs, root-relative and document-relative paths, "." and ".." segments
 * and query-only references. "&amp;" is decoded in the path and the query.
 * The fragment of $src_url is not included in the result.
 *
 * @param string $base_url absolute URL of the referring document
 * @param string $src_url URL found in that document
 *
 * @return string '' when $src_url is empty
 */
public static function abs_url($base_url, $src_url) {
    if (!$src_url) {
        return '';
    }
    $base_info = parse_url($base_url);
    if (!is_array($base_info)) {
        $base_info = array();
    }
    $scheme = isset($base_info['scheme']) ? $base_info['scheme'] : '';
    $base_path = isset($base_info['path']) ? $base_info['path'] : '/';
    // start with //
    if (strpos($src_url, '//') === 0) {
        $src_url = $scheme . ':' . $src_url;
    }
    $src_info = parse_url($src_url);
    if (!is_array($src_info)) {
        $src_info = array();
    }
    if (isset($src_info['scheme'])) {
        return $src_url;
    }
    $url = $scheme . '://' . (isset($base_info['host']) ? $base_info['host'] : '');
    // keep a non-default port of the base (it used to be dropped)
    if (isset($base_info['port'])) {
        $url .= ':' . $base_info['port'];
    }
    $src_path = isset($src_info['path']) ? $src_info['path'] : '';
    if (substr($src_path, 0, 1) == '/') {
        $path = $src_path;
    } elseif ($src_path === '') {
        // query-only reference: keep the base document
        $path = $base_path;
    } elseif (substr($base_path, -1) == '/') {
        $path = $base_path . $src_path;
    } else {
        $path = dirname($base_path) . '/' . $src_path;
    }
    // dirname() can return "\" on Windows; normalise before splitting
    $path = str_replace('\\', '/', $path);
    $raw_segments = explode('/', $path);
    $segments = array();
    foreach ($raw_segments as $dir) {
        if ($dir === '..') {
            // ".." above the root is ignored instead of leaking into the
            // result (which produced URLs like http://host../a)
            array_pop($segments);
        } elseif ($dir !== '' && $dir !== '.') {
            $segments[] = $dir;
        }
    }
    $path = '/' . implode('/', $segments);
    $last = end($raw_segments);
    // a path that ended in "/", "." or ".." refers to a directory
    if ($segments && ($last === '' || $last === '.' || $last === '..')) {
        $path .= '/';
    }
    $url .= str_ireplace('&amp;', '&', $path);
    if (isset($src_info['query']) && $src_info['query'] !== '') {
        $url .= '?' . str_ireplace('&amp;', '&', $src_info['query']);
    }
    return $url;
}
/**
 * HTTP GET: fetch_url() with an empty request body.
 *
 * @param string $url
 * @param array $headers
 * @param int $timeout
 * @param int $deep
 *
 * @return bool|string
 * @throws Exception
 */
public static function GET($url, $headers = array(), $timeout = 5, $deep = 0) {
    $no_body = '';
    $response = self::fetch_url($url, $no_body, $headers, $timeout, $deep);
    return $response;
}
/**
 * HTTP POST: fetch_url() with $post as the request body.
 *
 * @param string $url
 * @param array|string $post
 * @param array $headers
 * @param int $timeout
 * @param int $deep
 *
 * @return bool|string
 * @throws Exception
 */
public static function POST($url, $post, $headers = array(), $timeout = 5, $deep = 0) {
    $response = self::fetch_url($url, $post, $headers, $timeout, $deep);
    return $response;
}
/**
 * HTTP PUT: fetch_url() with the "method" pseudo-header set to PUT.
 *
 * @param string $url
 * @param array|string $post request body
 * @param array $headers
 * @param int $timeout
 * @param int $deep
 *
 * @return bool|string
 * @throws Exception
 */
public static function PUT($url, $post, $headers = array(), $timeout = 5, $deep = 0) {
    // fetch_url() tolerates a non-array $headers; do the same here before
    // writing into it
    if (!is_array($headers)) {
        $headers = array();
    }
    $headers['method'] = 'PUT';
    return self::fetch_url($url, $post, $headers, $timeout, $deep);
}
/**
 * HTTP DELETE: fetch_url() with the "method" pseudo-header set to DELETE.
 *
 * @param string $url
 * @param array|string $post request body
 * @param array $headers
 * @param int $timeout
 * @param int $deep
 *
 * @return bool|string
 * @throws Exception
 */
public static function DELETE($url, $post, $headers = array(), $timeout = 5, $deep = 0) {
    // fetch_url() tolerates a non-array $headers; do the same here before
    // writing into it
    if (!is_array($headers)) {
        $headers = array();
    }
    $headers['method'] = 'DELETE';
    return self::fetch_url($url, $post, $headers, $timeout, $deep);
}
/**
 * HTTP HEAD: fetch_url() with the "method" pseudo-header set to HEAD and
 * an empty request body.
 *
 * @param string $url
 * @param array $headers
 * @param int $timeout
 * @param int $deep
 *
 * @return bool|string
 * @throws Exception
 */
public static function HEAD($url, $headers = array(), $timeout = 5, $deep = 0) {
    // fetch_url() tolerates a non-array $headers; do the same here before
    // writing into it
    if (!is_array($headers)) {
        $headers = array();
    }
    $headers['method'] = 'HEAD';
    return self::fetch_url($url, '', $headers, $timeout, $deep);
}
/**
 * Fetch a URL and return the body converted to UTF-8.
 *
 * Transport, in order of preference: cURL, a raw socket (plain http only),
 * then file_get_contents().
 *
 * $headers holds request headers plus these pseudo-entries, which are not
 * sent as headers:
 *  - 'method'  : HTTP verb (HEAD, PUT, DELETE, ...)
 *  - 'charset' : charset of the response; 'bin' returns the body untouched
 *  - 'curl'    : force the cURL transport
 *  - 'ip'      : outgoing interface (cURL only)
 *  - 'proxy'   : array('type' => 'HTTP'|'SOCKET', 'host' => 'ip:port',
 *                'auth' => 'BASIC:user:pass' or 'NTLM:user:pass') (cURL only)
 *  - CURLOPT_SSLCERT, CURLOPT_SSLKEY, CURLOPT_SSLCERTTYPE, CURLOPT_SSLKEYTYPE,
 *    CURLOPT_CAINFO, CURLOPT_CAPATH, CURLOPT_SSL_VERIFYPEER,
 *    CURLOPT_SSL_VERIFYHOST as integer keys (cURL + https only)
 *
 * In $post, an array value of the form "@/path/to/file" that names an
 * existing file is uploaded as a file.
 *
 * Side effects: updates self::$url, self::$last_response_code and (cURL
 * only) self::$last_header.
 *
 * @param string $url
 * @param array|string $post request body; empty for a GET
 * @param array $headers
 * @param int $timeout seconds
 * @param int $deep current redirect depth (socket transport)
 *
 * @return bool|string body, '' on an empty cURL response, FALSE on failure
 * @throws Exception past 5 redirects, or https without the openssl extension
 */
public static function fetch_url($url, $post = '', $headers = array(), $timeout = 5, $deep = 0) {
    if ($deep > 5) throw new Exception('超出 fetch_url() 最大递归深度!');
    static $stream_wraps = null;
    if ($stream_wraps === null) {
        $stream_wraps = stream_get_wrappers();
    }
    static $allow_url_fopen = null;
    if ($allow_url_fopen === null) {
        $allow_url_fopen = strtolower(ini_get('allow_url_fopen'));
        $allow_url_fopen = (empty($allow_url_fopen) || $allow_url_fopen == 'off') ? 0 : 1;
    }
    !is_array($headers) && $headers = array();
    // kept untouched so a redirect can repeat the request as the caller made it
    $orig_post = $post;
    $orig_headers = $headers;
    //headers
    $HTTP_USER_AGENT = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)';
    $matches = parse_url($url);
    if (!is_array($matches)) {
        $matches = array();
    }
    $host = isset($matches['host']) ? $matches['host'] : '';
    // the query must survive even when the URL has no path
    $path = (isset($matches['path']) ? $matches['path'] : '/') . (!empty($matches['query']) ? '?' . $matches['query'] : '');
    $port = !empty($matches['port']) ? $matches['port'] : 80;
    $https = isset($matches['scheme']) && strtolower($matches['scheme']) == 'https';
    $charset = '';
    $defheaders = array(
        'Accept' => '*/*',
        'User-Agent' => $HTTP_USER_AGENT,
        'Accept-Encoding' => 'gzip, deflate',
        // a non-default port is part of the Host header
        'Host' => $host . (!empty($matches['port']) ? ':' . $matches['port'] : ''),
        'Connection' => 'Close',
        'Accept-Language' => 'zh-cn',
    );
    if (!empty($post)) {
        $defheaders['Cache-Control'] = 'no-cache';
    }
    $socketmode = !$https && function_exists('fsockopen') && function_exists('mime_content_type') ? true : false;
    // curl or socket
    $fetchmode = function_exists('curl_init') || isset($headers['curl']) ? 'curl' : ($socketmode ? 'socket' : '');
    // pull the pseudo-headers out before anything is sent
    if (!empty($headers['charset'])) {
        $charset = $headers['charset'];
    }
    $method = !empty($headers['method']) ? strtoupper($headers['method']) : '';
    unset($headers['curl'], $headers['charset'], $headers['method']);
    // merge headers
    foreach ($headers as $key => $val) {
        $defheaders[$key] = $val;
    }
    if ($fetchmode == 'socket') {
        $limit = 1024000000;
        $ip = '';
        $return = '';
        // options only the cURL transport understands must not be sent as headers
        foreach (array_keys($defheaders) as $hkey) {
            if (is_int($hkey) || $hkey === 'ip' || $hkey === 'proxy') {
                unset($defheaders[$hkey]);
            }
        }
        // build post
        if (is_array($post)) {
            $boundary = '';
            $post_body = '';
            foreach ($post as $k => $v) {
                if (is_string($v) && $v !== '' && $v[0] == '@') {
                    $v = substr($v, 1);
                    if ($v && is_file($v)) {
                        if (!$boundary) {
                            $boundary = '---------------upload' . uniqid('spider');
                        }
                        $mime = mime_content_type($v);
                        $post_body .= "\r\n" . 'Content-Disposition: form-data; name="' . $k . '"; filename="' . $v . '"' . "\r\n"
                            . 'Content-Type: ' . $mime . "\r\n\r\n" . file_get_contents($v) . "\r\n--" . $boundary;
                        unset($post[$k]);
                    }
                }
            }
            if ($boundary) {
                if ($post) {
                    foreach ($post as $k => $v) {
                        $post_body .= "\r\n" . 'Content-Disposition: form-data; name="' . $k . '"' . "\r\n\r\n" . $v . "\r\n--" . $boundary;
                    }
                }
                $post_body = '--' . $boundary . $post_body . '--';
                $post = $post_body;
                $defheaders['Content-Type'] = 'multipart/form-data; boundary=' . $boundary;
            } else {
                $post = http_build_query($post);
            }
        }
        if (!empty($post)) {
            // default body type only when the caller did not choose one
            if (!isset($defheaders['Content-Type'])) {
                $defheaders['Content-Type'] = 'application/x-www-form-urlencoded';
            }
            // string bodies need a length too, not only array ones
            $defheaders['Content-Length'] = strlen($post);
        }
        $verb = $method ? $method : (!empty($post) ? 'POST' : 'GET');
        $out = "{$verb} {$path} HTTP/1.0\r\n";
        foreach ($defheaders as $hkey => $hval) {
            $out .= $hkey . ': ' . $hval . "\r\n";
        }
        $out .= "\r\n";
        //append post body
        if (!empty($post)) {
            $out .= $post;
        }
        $host == 'localhost' && $ip = '127.0.0.1';
        $fp = @fsockopen(($ip ? $ip : $host), $port, $errno, $errstr, $timeout);
        if (!$fp) {
            return FALSE;
        } else {
            stream_set_blocking($fp, TRUE);
            stream_set_timeout($fp, $timeout);
            @fwrite($fp, $out);
            $status = stream_get_meta_data($fp);
            $gzip = false;
            if (!$status['timed_out']) {
                $starttime = time();
                // response headers, up to the first blank line
                while (!feof($fp)) {
                    $raw_header = @fgets($fp);
                    if ($raw_header && ($raw_header == "\r\n" || $raw_header == "\n")) {
                        break;
                    }
                    // a stalled peer must not keep this loop alive forever
                    if (time() - $starttime > $timeout) break;
                    $header = strtolower((string)$raw_header);
                    if (strpos($header, 'http/') === 0 && preg_match('#^http/[\d.]+\s+(\d{3})#', $header, $code)) {
                        self::$last_response_code = (int)$code[1];
                    } else if (substr($header, 0, 9) == 'location:') {
                        // take the value from the raw line: lowercasing it
                        // would corrupt case-sensitive paths
                        $location = self::abs_url($url, trim(substr($raw_header, 9)));
                        if ($location) {
                            self::$url = $location;
                            @fclose($fp);
                            return self::fetch_url($location, $orig_post, $orig_headers, $timeout, $deep + 1);
                        }
                    } else if (strpos($header, 'content-encoding:') !== false
                        && strpos($header, 'gzip') !== false
                    ) {
                        //is gzip
                        $gzip = true;
                    } else if (!$charset && strpos($header, 'content-type:') !== false) {
                        // a charset given by the caller wins (e.g. 'bin')
                        preg_match('@Content-Type:\s*([\w/+]+)(;\s*charset\s*=\s*"?([\w-]+))?@i', $header, $charsetmatch);
                        if (isset($charsetmatch[3])) {
                            $charset = $charsetmatch[3];
                        }
                    }
                }
                $stop = false;
                while (!feof($fp) && !$stop) {
                    $data = fread($fp, ($limit == 0 || $limit > 8192 ? 8192 : $limit));
                    $return .= $data;
                    if ($limit) {
                        $limit -= strlen($data);
                        $stop = $limit <= 0;
                    }
                    if (time() - $starttime > $timeout) break;
                }
                if ($gzip) {
                    $return = self::gzdecode($return);
                }
            }
            @fclose($fp);
            return self::convert_html_charset($return, $charset);
        }
    } elseif ($fetchmode == 'curl') {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
        curl_setopt($ch, CURLINFO_HEADER_OUT, 1);
        if ($https) {
            // client certificate
            if (isset($defheaders[CURLOPT_SSLCERT]) && isset($defheaders[CURLOPT_SSLKEY])) {
                $ssl_verifypeer = 1;
                // certificate and key live in two separate .pem files
                curl_setopt($ch, CURLOPT_SSLCERT, realpath($defheaders[CURLOPT_SSLCERT]));
                curl_setopt($ch, CURLOPT_SSLKEY, realpath($defheaders[CURLOPT_SSLKEY]));
                if (isset($defheaders[CURLOPT_SSLCERTTYPE])) {
                    curl_setopt($ch, CURLOPT_SSLCERTTYPE, $defheaders[CURLOPT_SSLCERTTYPE]);
                }
                if (isset($defheaders[CURLOPT_SSLKEYTYPE])) {
                    curl_setopt($ch, CURLOPT_SSLKEYTYPE, $defheaders[CURLOPT_SSLKEYTYPE]);
                }
            } else if (isset($defheaders[CURLOPT_SSLCERT])) {
                $ssl_verifypeer = 1;
                // single file holding both certificate and key
                curl_setopt($ch, CURLOPT_SSLCERT, realpath($defheaders[CURLOPT_SSLCERT]));
            } else {
                // NOTE(review): peer verification is off unless the caller
                // passes CURLOPT_SSL_VERIFYPEER - kept as found.
                $ssl_verifypeer = 0;
            }
            // support cainfo
            if (isset($defheaders[CURLOPT_CAINFO])) {
                curl_setopt($ch, CURLOPT_CAINFO, realpath($defheaders[CURLOPT_CAINFO]));
            }
            // support capath
            if (isset($defheaders[CURLOPT_CAPATH])) {
                curl_setopt($ch, CURLOPT_CAPATH, realpath($defheaders[CURLOPT_CAPATH]));
            }
            // strict certificate check
            if (isset($defheaders[CURLOPT_SSL_VERIFYPEER])) {
                curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $defheaders[CURLOPT_SSL_VERIFYPEER]);
            } else {
                curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $ssl_verifypeer);
            }
            // check that the certificate matches the host name
            if (isset($defheaders[CURLOPT_SSL_VERIFYHOST])) {
                curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, $defheaders[CURLOPT_SSL_VERIFYHOST]);
            } else {
                curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
            }
        }
        // "@file" uploads need safe-upload off on PHP 5.5/5.6 only. From
        // PHP 7 the option cannot be disabled and PHP 8 throws on the attempt.
        if (defined('CURLOPT_SAFE_UPLOAD') && version_compare(PHP_VERSION, '7.0.0', '<')) {
            @curl_setopt($ch, CURLOPT_SAFE_UPLOAD, false);
        }
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        // support method
        if ($method && $method != 'POST') {
            switch ($method) {
                case 'GET':
                    break;
                case 'HEAD':
                    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 0);
                    curl_setopt($ch, CURLOPT_NOBODY, 1);
                    curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'HEAD');
                    break;
                case 'PUT':
                    curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'PUT');
                    $post_is_array = is_array($post);
                    curl_setopt($ch, CURLOPT_POSTFIELDS, $post_is_array ? json_encode($post, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE) : $post);
                    // goes through $defheaders: CURLOPT_HTTPHEADER is set
                    // once, further down, from that array
                    if ($post_is_array && !isset($defheaders['Content-Type'])) {
                        $defheaders['Content-Type'] = 'application/json';
                    }
                    break;
                default:
                    // DELETE and any other verb; send the body when there is one
                    curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);
                    if (!empty($post)) {
                        curl_setopt($ch, CURLOPT_POSTFIELDS, is_array($post) ? http_build_query($post) : $post);
                    }
                    break;
            }
        } else if ($post) {
            curl_setopt($ch, CURLOPT_POST, 1);
            // 0: urlencoded array, 1: raw string, 2: multipart with file(s)
            $is_multi_part = 0;
            if (is_array($post)) {
                $is_curl_file = class_exists('CURLFile');
                foreach ($post as $index => $value) {
                    // only "@path" values naming a real file are uploads
                    if (is_string($value) && $value !== '' && $value[0] == '@' && is_file(substr($value, 1))) {
                        if ($is_curl_file) {
                            $post[$index] = new CURLFile(realpath(substr($value, 1)));
                        }
                        $is_multi_part = 2;
                    }
                }
            } else {
                //is string
                $is_multi_part = 1;
            }
            curl_setopt($ch, CURLOPT_POSTFIELDS, $is_multi_part ? $post : http_build_query($post));
        }
        // outgoing interface on multi-homed hosts
        if (isset($defheaders['ip'])) {
            curl_setopt($ch, CURLOPT_INTERFACE, $defheaders['ip']);
        }
        //gzip compress
        if (isset($defheaders['Accept-Encoding'])) {
            curl_setopt($ch, CURLOPT_ENCODING, $defheaders['Accept-Encoding']);
            unset($defheaders['Accept-Encoding']);
        }
        // proxy
        if (!empty($defheaders['proxy']) && is_array($defheaders['proxy'])) {
            $proxy = $defheaders['proxy'];
            $proxy_type = isset($proxy['type']) && strtoupper($proxy['type']) == 'SOCKET' ? CURLPROXY_SOCKS5 : CURLPROXY_HTTP;
            curl_setopt($ch, CURLOPT_PROXYTYPE, $proxy_type);
            if (isset($proxy['host'])) {
                curl_setopt($ch, CURLOPT_PROXY, $proxy['host']);
            }
            // proxy authentication, "TYPE:user:pass"
            if (!empty($proxy['auth'])) {
                // limit 3 so the password may contain ":"
                list($auth_type, $auth_user, $auth_pass) = array_pad(explode(':', $proxy['auth'], 3), 3, '');
                $auth_type = strtoupper($auth_type) == 'NTLM' ? CURLAUTH_NTLM : CURLAUTH_BASIC;
                curl_setopt($ch, CURLOPT_PROXYAUTH, $auth_type);
                curl_setopt($ch, CURLOPT_PROXYUSERPWD, $auth_user . ':' . $auth_pass);
            }
        }
        curl_setopt($ch, CURLOPT_MAXREDIRS, $deep ? $deep : 5);
        //build curl headers
        $header_array = array();
        foreach ($defheaders as $key => $val) {
            // integer keys are CURLOPT_* options, 'ip'/'proxy' are handled
            // above, and non-scalar values cannot form a header line
            if (is_int($key) || $key === 'ip' || $key === 'proxy' || !is_scalar($val)) {
                continue;
            }
            $header_array[] = $key . ': ' . $val;
        }
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header_array);
        $data = curl_exec($ch);
        if (!$data) {
            curl_close($ch);
            return '';
        }
        $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        self::$last_response_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        self::$url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
        curl_close($ch);
        $header = substr($data, 0, $header_size);
        $data = substr($data, $header_size);
        //extract last response header
        self::$last_header = self::extract_header($header);
        // with redirects there are several header blocks; use the last one
        $header = explode("\r\n\r\n", trim($header));
        $header = array_pop($header);
        //match charset
        if (!$charset) {
            preg_match('@Content-Type:\s*([\w\/+]+)(;\s*charset\s*=\s*"?([\w-]+))?@is', $header, $charsetmatch);
            if (isset($charsetmatch[3])) {
                $charset = $charsetmatch[3];
            }
        }
        return self::convert_html_charset($data, $charset);
    } elseif ($https && $allow_url_fopen && in_array('https', $stream_wraps)) {
        if (extension_loaded('openssl')) {
            return file_get_contents($url);
        } else {
            throw new Exception('unopen openssl extension');
        }
    } elseif ($allow_url_fopen && empty($post)
        && in_array('http', $stream_wraps)
    ) {
        // last resort: plain GET through the http stream wrapper
        $opts = array('http' => array('method' => 'GET', 'timeout' => $timeout));
        $context = stream_context_create($opts);
        $html = file_get_contents($url, false, $context);
        return self::convert_html_charset($html, $charset);
    } else {
        return FALSE;
    }
}
/**
 * Parse a raw response header block into an array.
 *
 * Keys are lower-cased header names and values are trimmed. All Set-Cookie
 * values are collected, in order, under the key "cookie". Lines without a
 * colon (status lines, blank separators) are skipped. When the block holds
 * several responses (redirects), later headers overwrite earlier ones.
 *
 * @param string $header
 *
 * @return array
 */
private static function extract_header($header) {
    $result = array();
    foreach (explode("\n", (string)$header) as $line) {
        // status and blank lines have no "name: value" shape
        if (strpos($line, ':') === false) {
            continue;
        }
        list($key, $val) = explode(':', $line, 2);
        $key = trim(strtolower($key));
        $val = trim($val);
        if ($key === '') {
            continue;
        }
        if ($key === 'set-cookie') {
            $result['cookie'][] = $val;
        } else {
            $result[$key] = $val;
        }
    }
    return $result;
}
/**
 * Decode a gzip-compressed body.
 *
 * Uses PHP's own gzdecode() when available, since it understands the
 * optional gzip header fields. The fallback assumes a bare 10-byte header
 * and 8-byte trailer. Data that cannot be decoded is returned unchanged.
 *
 * @param string $data
 *
 * @return string
 */
private static function gzdecode($data) {
    $data = (string)$data;
    if ($data === '') {
        return '';
    }
    if (function_exists('gzdecode')) {
        // unqualified call inside a method: this is the global function
        $decoded = @gzdecode($data);
        if ($decoded !== false) {
            return $decoded;
        }
    }
    $decoded = @gzinflate(substr($data, 10, -8));
    return $decoded === false ? $data : $decoded;
}
/**
 * Convert a fetched document to $tocharset, detecting the source charset.
 *
 * The source charset is taken from $charset when given (normally the HTTP
 * header), otherwise from the document's <meta> charset, and an XML
 * prolog's encoding= overrides the meta value. 'bin' disables conversion.
 * gb2312 and iso-8859-1 are both treated as gbk. If no charset is found,
 * or the detected one is not known to mbstring, the input is returned
 * unchanged.
 *
 * @param string $html
 * @param string $charset charset from the transport layer, '' to detect
 * @param string $tocharset
 *
 * @return string
 */
private static function convert_html_charset($html, $charset, $tocharset = 'utf-8') {
    if ($charset == 'bin') {
        return $html;
    }
    // nothing to detect or convert (e.g. a failed fetch)
    if (!is_string($html) || $html === '') {
        return $html;
    }
    $detect_charset = '';
    if ($charset) {
        // the transport-level charset takes priority
        $detect_charset = $charset;
    } else {
        //html file
        if (stripos($html, '<meta') !== false && stripos($html, 'charset') !== false) {
            $found = '';
            $head = self::mask_match(strtolower($html), '(*)</head>');
            if ($head) {
                $head = self::reg_replace($head, array(
                    '<script(*)/script>' => '',
                    '<style(*)/style>' => '',
                    '<link(*)>' => '',
                    "\r" => '',
                    "\n" => '',
                    "\t" => '',
                    " " => '',
                    "'" => ' ',
                    "\"" => ' ',
                ));
                if (preg_match('/charset\s*?=\s*?([\-\w]+)/', $head, $m)) {
                    $found = $m[1];
                }
            } else {
                // No </head>: read the charset straight from the meta tag.
                // The old fallback used the wrong capture group and took
                // the quote character as the charset.
                if (preg_match('/<meta[^>]*?\bcharset\s*=\s*["\']?([\w\-]+)/i', $html, $m)) {
                    $found = $m[1];
                }
            }
            if ($found !== '') {
                $detect_charset = $found;
            }
        }
        //xml file, e.g. an xml prolog with encoding="UTF-8"
        if (stripos($html, '<?xml') !== false && stripos($html, 'encoding=') !== false) {
            $head = self::mask_match($html, '<' . '?xml(*)?' . '>');
            if (preg_match('/encoding=["\']?([-\w]+)/is', (string)$head, $m) && !empty($m[1])) {
                $detect_charset = $m[1];
            }
        }
    }
    //alias
    if (in_array(strtolower($detect_charset), array('gb2312', 'iso-8859-1'))) {
        $detect_charset = 'gbk';
    }
    if (!$detect_charset) {
        return $html;
    }
    // An unknown charset makes mb_convert_encoding() return false with a
    // warning on PHP 7 and throw ValueError on PHP 8; keep the document.
    try {
        $converted = @mb_convert_encoding($html, $tocharset, $detect_charset);
    } catch (\ValueError $e) {
        return $html;
    }
    return ($converted === false || $converted === null) ? $html : $converted;
}
/**
 * Fetch several URLs concurrently with curl_multi.
 *
 * Without curl_multi support the URLs are fetched one after another
 * through fetch_url(). Unlike fetch_url(), the concurrent path returns the
 * bodies as received (no charset conversion).
 *
 * @param array $urls key => url
 *
 * @return array key => body, with the same keys as $urls
 * @throws Exception
 */
public static function multi_fetch_url($urls) {
    if (!function_exists('curl_multi_init')) {
        $data = array();
        foreach ($urls as $k => $url) {
            $data[$k] = self::fetch_url($url);
        }
        return $data;
    }
    // connect timeout, in seconds
    $timeout = 3;
    $multi_handle = curl_multi_init();
    $conn = $data = array();
    foreach ($urls as $i => $url) {
        $conn[$i] = curl_init($url);
        curl_setopt($conn[$i], CURLOPT_ENCODING, '');
        curl_setopt($conn[$i], CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, $timeout);
        curl_setopt($conn[$i], CURLOPT_FOLLOWLOCATION, 1);
        curl_multi_add_handle($multi_handle, $conn[$i]);
    }
    $active = null;
    do {
        $mrc = curl_multi_exec($multi_handle, $active);
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    while ($active && $mrc == CURLM_OK) {
        // select() can return -1 right away; the old loop then skipped
        // curl_multi_exec() and spun forever. Pause briefly and keep
        // driving the transfers.
        if (curl_multi_select($multi_handle) == -1) {
            usleep(100);
        }
        do {
            $mrc = curl_multi_exec($multi_handle, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    }
    foreach ($urls as $i => $url) {
        $data[$i] = curl_multi_getcontent($conn[$i]);
        curl_multi_remove_handle($multi_handle, $conn[$i]);
        curl_close($conn[$i]);
    }
    curl_multi_close($multi_handle);
    return $data;
}
}
?>
PHP
1
https://gitee.com/401429542/spider-utils-for-php.git
git@gitee.com:401429542/spider-utils-for-php.git
401429542
spider-utils-for-php
spider-utils-for-php
master

搜索帮助