1 Star 10 Fork 3

青梧商城 / PHP行政地址采集

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
index.php 3.49 KB
一键复制 编辑 原始数据 按行查看 历史
青梧商城 提交于 2020-07-22 22:02 . 20200722
<?php
require 'vendor/autoload.php';
use QL\QueryList;
use QL\Ext\CurlMulti;
// The script goes on to crawl every province and city page, so raise the
// execution time limit before any network work starts.
set_time_limit(400);
header("Content-type:text/html;charset=utf-8");

// Holds the merged province/city/district rows; assigned at the end of the script.
$area_list = [];

// Index page of the 2019 division-code listing. Relative links found on the
// scraped pages are resolved against this base URL.
$url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/';

// Download the index page and transcode the body from GBK to UTF-8 before parsing.
$client = new GuzzleHttp\Client();
$res = $client->get($url);
$html = mb_convert_encoding((string)$res->getBody(), 'UTF-8', 'gbk');
// Scrape rules, one selector + field map per administrative level.
// Each field maps to [CSS selector, attribute to read].

// Province level: every table cell inside a ".provincetr" row is one item.
$range = '.provincetr td';
$rules = [
    'url'  => ['a', 'href'], // link target
    'name' => ['a', 'text'], // link text
];

// City level: every ".citytr" row is one item.
$range1 = '.citytr';
$rules1 = [
    'url'  => ['td:eq(0) a', 'href'], // link target
    'name' => ['td:eq(1) a', 'text'], // display name
    'code' => ['td:eq(0) a', 'text'], // division code
];

// District/county level: every ".countytr" row is one item.
$range2 = '.countytr';
$rules2 = [
    'name' => ['td:eq(1)', 'text'], // display name
    'code' => ['td:eq(0)', 'text'], // division code
];
// Converts one scraped province cell into a top-level area row:
// derives the 12-character code from the relative link, makes the link
// absolute, and marks the row as a root entry (pid 0, deep 0).
$toProvinceRow = function($cell) use($url){
    $relative = $cell['url'];
    // Strip ".html" from the relative link and right-pad with zeros to 12 characters.
    $cell['code'] = str_pad(str_replace('.html', '', $relative), 12, '0');
    $cell['url'] = $url.$relative;
    $cell['pid'] = 0;
    $cell['deep'] = 0;
    return $cell;
};

// Run the province rules over the index page.
$data = QueryList::html($html)
    ->rules($rules)
    ->range($range)
    ->query()
    ->getData($toProvinceRow);

$province = $data->all(); // province rows
$city = [];               // city rows, filled by the province-page crawl
$region = [];             // district/county rows, filled by the city-page crawl
// Shared QueryList instance with the CurlMulti plugin registered.
$ql = QueryList::getInstance();
$ql->use(CurlMulti::class);

// Collect the page URL of every province that has a non-empty name.
$province_url = [];
foreach($province as $item){
    if(!empty($item['name'])){
        $province_url[] = $item['url'];
    }
}

// Handles one downloaded province page: parses its city rows and appends
// them to $city.
$onProvincePage = function(QueryList $ql,CurlMulti $curl,$r) use($range1,$url,&$city){
    // Transcode the body from GBK to UTF-8 before parsing.
    $body = mb_convert_encoding($r['body'],'UTF-8','gbk');
    $rows = $ql->setHtml($body)->range($range1)->query()->getData(function($row) use($url){
        // The first path segment of the relative link is used as the parent prefix.
        $segments = explode('/',$row['url']);
        $row['url'] = $url.$row['url'];
        $row['pid'] = str_pad($segments[0],12,"0");
        $row['deep'] = 1;
        return $row;
    })->all();
    if(!empty($rows)){
        $city = array_merge($city,$rows);
    }
    // Release the parsed document before the next page is handled.
    QueryList::destructDocuments();
};

$ql->rules($rules1)->curlMulti($province_url)->success($onProvincePage)->start([
    // Maximum concurrency; this value can be changed dynamically at run time.
    'maxThread' => 50,
    // Maximum retries before a curl error or user error is raised; once
    // exceeded, the callback given to $error is invoked.
    'maxTry' => 3,
]);
// Collect the page URL of every city that has a non-empty name.
$city_url = [];
foreach($city as $item){
    if(!empty($item['name'])){
        $city_url[] = $item['url'];
    }
}

// Handles one downloaded city page: parses its district/county rows and
// appends them to $region.
$onCityPage = function(QueryList $ql,CurlMulti $curl,$r) use($range2,&$region){
    // The parent code comes from the last path segment of the fetched URL:
    // ".html" is stripped and the rest is right-padded with zeros to 12 characters.
    $segments = explode('/',$r['info']['url']);
    $parentCode = str_pad(str_replace('.html','',end($segments)),12,'0');

    // Transcode the body from GBK to UTF-8 before parsing.
    $body = mb_convert_encoding($r['body'],'UTF-8','gbk');
    $rows = $ql->setHtml($body)->range($range2)->query()->getData(function($row) use($parentCode){
        $row['pid'] = $parentCode;
        $row['deep'] = 2;
        return $row;
    })->all();
    if(!empty($rows)){
        $region = array_merge($region,$rows);
    }
    // Release the parsed document before the next page is handled.
    QueryList::destructDocuments();
};

$ql->rules($rules2)->curlMulti($city_url)->success($onCityPage)->start([
    // Maximum concurrency; this value can be changed dynamically at run time.
    'maxThread' => 80,
    // Maximum retries before a curl error or user error is raised; once
    // exceeded, the callback given to $error is invoked.
    'maxTry' => 3,
]);
// Merge all three levels into one list, serialise every named row as a
// value tuple, and write the result to area.txt in the current working directory.
$area_list = array_merge($province,$city,$region);

// Build the tuples in memory first so the file is written once instead of
// being reopened and appended to for every row.
$tuples = [];
foreach($area_list as $v){
    // Rows without a name are skipped.
    if(empty($v['name'])){
        continue;
    }
    // Output format is unchanged: each tuple carries a leading comma, so the
    // file content reads `,(...),(...)`.
    $tuples[] = ',("'.$v['code'].'","'.$v['name'].'","'.$v['pid'].'",'.$v['deep'].')';
}

// A single write also truncates any previous content, which replaces the
// separate "empty the file first" step.
$target = getcwd().'/area.txt';
if(file_put_contents($target,implode('',$tuples)) === false){
    // Do not report success when nothing could be written.
    echo 'Failed to write '.$target;
    exit(1);
}

// Report the number of rows actually written; counting $area_list would
// also include the rows that were skipped for having no name.
echo '爬到'.count($tuples).'条数据.存放在根目录【area.txt】';
?>
1
https://gitee.com/qingwuitcn/address_collection_php.git
git@gitee.com:qingwuitcn/address_collection_php.git
qingwuitcn
address_collection_php
PHP行政地址采集
master

搜索帮助