- 代码:
/**
 * @desc: Single-threaded crawler.
 *
 * Starting at the URL given to the constructor, run() fetches each page through
 * the $callcontent callback, passes whatever that callback returned to the
 * $calltodo callback, and follows the anchor links found in the page until the
 * requested depth is used up.
 *
 * @author [Lee] <[]>
 * @property
 * 1. callcontent  callback invoked with a URL; its return value is the page content
 * 2. calltodo     callback invoked with the page content (business logic, e.g.
 *                 post-process the page and store it in a database)
 * @method
 * run  execute the crawl
 * @param depth  number of levels to crawl, default 2 (1 = start page only)
 * @return void
 *
 * Behaviour notes:
 * - Every page is handled by THIS instance, so callbacks assigned to
 *   $callcontent / $calltodo apply to the whole crawl, not only the start page.
 * - Pages are visited breadth-first and each distinct URL is fetched at most
 *   once per run() call.
 * - Only http/https targets are followed; javascript:, mailto: and other
 *   schemes, empty hrefs and pure "#fragment" links are skipped.
 */
class crawl
{
    public $callcontent = 'getcontent'; # callback that returns the content of a given URL
    public $calltodo = 'todo'; # callback that performs the business logic on a page's content
    private $url; # start URL of the crawl

    /*
     @desc: constructor, stores the start URL
     @param url URL the crawl starts from
    */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /*
     @desc: main entry point, runs the crawl
     @param depth number of levels to process, default 2; values <= 0 do nothing
    */
    public function run($depth = 2)
    {
        $frontier = array($this->url);
        $seen = array($this->url => true);

        for ($left = $depth; $left > 0 && !empty($frontier); $left--) {
            $next = array();
            foreach ($frontier as $pageurl) {
                $content = $this->getcontent($pageurl);
                $this->todo($content);

                // Links found on the last level would never be fetched.
                if ($left - 1 <= 0) {
                    continue;
                }
                foreach ($this->geturl($content) as $href) {
                    $target = $this->resolveurl($pageurl, $href);
                    if ($target === null || isset($seen[$target])) {
                        continue;
                    }
                    $seen[$target] = true;
                    $next[] = $target;
                }
            }
            $frontier = $next;
        }
    }

    /*
     @desc: internal, asks the content callback for a page
     @param url argument passed to the callback
     @return whatever the callback returned
    */
    private function getcontent($url)
    {
        return call_user_func($this->callcontent, $url);
    }

    /*
     @desc: internal, hands a page over to the business-logic callback
     @param content argument passed to the callback
    */
    private function todo($content)
    {
        call_user_func($this->calltodo, $content);
    }

    /*
     @desc: internal, extracts the href values of the <a> tags in a page
     @param content page content; anything but a non-empty string yields no links
     @return array of distinct raw href values (keys are not re-indexed)
    */
    private function geturl($content)
    {
        if (!is_string($content) || $content === '') {
            return array();
        }
        // "<a" must be followed by whitespace so <abbr>, <article>, ... are not
        // mistaken for anchors, and [^>] keeps the match inside a single tag.
        $preg = '/<a\s+(?:[^>]*?\s)?href\s*=\s*[\'"]?([^>\'"\s]*)[^>]*>/i';
        if (!preg_match_all($preg, $content, $res)) {
            return array();
        }
        return array_unique($res[1]);
    }

    /*
     @desc: internal, turns an href found on a page into an absolute http(s) URL
     @param base URL of the page the href was found on
     @param href raw href value
     @return absolute URL without fragment, or null when the link must not be followed
    */
    private function resolveurl($base, $href)
    {
        $href = trim($href);
        $hash = strpos($href, '#');
        if ($hash !== false) {
            $href = substr($href, 0, $hash); // a fragment never changes the fetched document
        }
        if ($href === '') {
            return null;
        }
        if (preg_match('/^https?:\/\//i', $href)) {
            return $href;
        }
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href)) {
            return null; // javascript:, mailto:, tel:, data:, ...
        }

        $info = parse_url($base);
        if (!is_array($info) || empty($info['host'])) {
            return null; // nothing to resolve a relative reference against
        }
        if (strncmp($href, '//', 2) === 0) {
            // protocol-relative: reuse the scheme of the current page
            return (isset($info['scheme']) ? $info['scheme'] : 'http') . ':' . $href;
        }

        $mark = strpos($href, '?');
        $query = $mark === false ? '' : substr($href, $mark);
        $hrefpath = $mark === false ? $href : substr($href, 0, $mark);
        $basepath = (isset($info['path']) && $info['path'] !== '') ? $info['path'] : '/';

        if ($hrefpath === '') {
            $path = $basepath; // "?a=b" keeps the current path
        } elseif ($hrefpath[0] === '/') {
            $path = $this->normalizepath($hrefpath);
        } else {
            // relative to the directory of the current page
            $dir = substr($basepath, 0, strrpos($basepath, '/') + 1);
            $path = $this->normalizepath($dir . $hrefpath);
        }
        return $this->origin($info) . $path . $query;
    }

    /*
     @desc: internal, rebuilds "scheme://[user:pass@]host[:port]" from parse_url() output
     @param info parse_url() result that contains at least "host"
     @return origin part of the URL; scheme defaults to http
    */
    private function origin($info)
    {
        $origin = (isset($info['scheme']) ? $info['scheme'] : 'http') . '://';
        if (!empty($info['user']) && !empty($info['pass'])) {
            $origin .= $info['user'] . ':' . $info['pass'] . '@';
        }
        $origin .= $info['host'];
        if (!empty($info['port'])) {
            $origin .= ':' . $info['port'];
        }
        return $origin;
    }

    /*
     @desc: internal, removes "." and ".." segments from an absolute path
     @param path path starting with "/"
     @return path without dot segments; ".." never climbs above the root
    */
    private function normalizepath($path)
    {
        $segments = explode('/', $path);
        $last = end($segments);
        $kept = array();
        foreach ($segments as $segment) {
            if ($segment === '.') {
                continue;
            }
            if ($segment === '..') {
                if (count($kept) > 1) { // index 0 is the empty segment before the leading "/"
                    array_pop($kept);
                }
                continue;
            }
            $kept[] = $segment;
        }
        if ($last === '.' || $last === '..') {
            $kept[] = ''; // a trailing dot segment denotes a directory: keep the final "/"
        }
        return implode('/', $kept);
    }
}
- 测试:
// Demo run: process only the start page (depth 1) with the default
// 'getcontent' / 'todo' callbacks defined below.
// The class is named "crawl"; the former "new scrawl(...)" could not be resolved.
$crawl = new crawl('https://blog.51cto.com/12173069');
$crawl->run(1);
/*
 @desc: content callback - reads whatever $url points at and returns it
 @param url location handed to file_get_contents()
 @return the result of file_get_contents() (false when the read fails)
*/
function getcontent($url)
{
    return file_get_contents($url);
}
/*
 @desc: business-logic callback - dumps the distinct anchor hrefs found in a page
 @param content page content handed over by the crawler
*/
function todo($content)
{
    // "<a" must be followed by whitespace so tags such as <abbr> or <article>
    // are not treated as anchors, and [^>] keeps the match inside one tag
    // (the former "[a|A].*?" could pick up the href of a later <link> tag).
    $preg = '/<a\s+(?:[^>]*?\s)?href\s*=\s*[\'"]?([^>\'"\s]*)[^>]*>/i';
    $urls = array();
    if (preg_match_all($preg, $content, $res)) {
        $urls = $res[1];
    }
    // array_unique keeps the key of each first occurrence, so the dumped
    // indexes can have gaps.
    var_dump(array_unique($urls));
}
- 输出:
array(72) {
[0]=>
string(22) "https://blog.51cto.com/"
[2]=>
string(30) "https://blog.51cto.com/original"
[3]=>
string(34) "https://blog.51cto.com/cloumn/index"
[4]=>
string(28) "https://blog.51cto.com/expert"
[5]=>
string(35) "https://blog.51cto.com/blogger/index"
[6]=>
string(19) "javascript:void(0);"
[7]=>
string(20) "http://edu.51cto.com"
[8]=>
string(21) "https://blog.51cto.com"
[9]=>
string(21) "http://down.51cto.com"
[10]=>
string(21) "http://home.51cto.com"
[11]=>
string(20) "http://bbs.51cto.com"
[12]=>
string(18) "http://x.51cto.com"
[13]=>
string(0) ""
[14]=>
string(20) "http://wot.51cto.com"
[15]=>
string(20) "http://www.51cto.com"
[16]=>
string(89) "http://home.51cto.com/user/register?reback=http%253A%252F%252Fblog.51cto.com%252F12173069"
[17]=>
string(78) "https://blog.51cto.com/user/login?reback=http%3A%2F%2Fblog.51cto.com%2F12173069"
[18]=>
string(12) "javascript:;"
[19]=>
string(34) "https://blog.51cto.com/search/index"
[23]=>
string(40) "http://home.51cto.com/space?uid=12163069"
[27]=>
string(37) "https://blog.51cto.com/12173069?type=1"
[28]=>
string(37) "https://blog.51cto.com/12173069?type=2"
[29]=>
string(37) "https://blog.51cto.com/12173069?type=3"
[30]=>
string(30) "https://blog.51cto.com/12173069"
[37]=>
string(33) "https://blog.51cto.com/12173069?s="
[38]=>
string(34) "https://blog.51cto.com/12173069?s=3"
[39]=>
string(34) "https://blog.51cto.com/12173069?s=4"
[40]=>
string(34) "https://blog.51cto.com/12173069?s=5"
[41]=>
string(34) "https://blog.51cto.com/12173069?s=6"
[50]=>
string(38) "https://blog.51cto.com/12173069/2126752"
[55]=>
string(38) "https://blog.51cto.com/12173069/2126693"
[60]=>
string(38) "https://blog.51cto.com/12173069/2126661"
[65]=>
string(38) "https://blog.51cto.com/12173069/2126657"
[70]=>
string(38) "https://blog.51cto.com/12173069/2126596"
[75]=>
string(38) "https://blog.51cto.com/12173069/2126591"
[80]=>
string(38) "https://blog.51cto.com/12173069/2126496"
[85]=>
string(38) "https://blog.51cto.com/12173069/2126420"
[90]=>
string(38) "https://blog.51cto.com/12173069/2126324"
[95]=>
string(38) "https://blog.51cto.com/12173069/2126210"
[100]=>
string(38) "https://blog.51cto.com/12173069/2126090"
[105]=>
string(38) "https://blog.51cto.com/12173069/2125724"
[110]=>
string(38) "https://blog.51cto.com/12173069/2125666"
[115]=>
string(38) "https://blog.51cto.com/12173069/2125424"
[120]=>
string(38) "https://blog.51cto.com/12173069/2125359"
[125]=>
string(38) "https://blog.51cto.com/12173069/2124937"
[130]=>
string(38) "https://blog.51cto.com/12173069/2124923"
[135]=>
string(38) "https://blog.51cto.com/12173069/2124720"
[140]=>
string(38) "https://blog.51cto.com/12173069/2124693"
[145]=>
string(38) "https://blog.51cto.com/12173069/2124499"
[147]=>
string(33) "https://blog.51cto.com/12173069/p1"
[148]=>
string(33) "https://blog.51cto.com/12173069/p2"
[149]=>
string(33) "https://blog.51cto.com/12173069/p3"
[150]=>
string(33) "https://blog.51cto.com/12173069/p4"
[151]=>
string(33) "https://blog.51cto.com/12173069/p5"
[152]=>
string(33) "https://blog.51cto.com/12173069/p6"
[153]=>
string(33) "https://blog.51cto.com/12173069/p7"
[154]=>
string(33) "https://blog.51cto.com/12173069/p8"
[156]=>
string(34) "https://blog.51cto.com/12173069/p19"
[159]=>
string(39) "https://blog.51cto.com/ityouknow/2124403"
[160]=>
string(35) "https://blog.51cto.com/wyait/2125708"
[161]=>
string(39) "https://blog.51cto.com/lumay0526/2124116"
[162]=>
string(38) "https://blog.51cto.com/11010461/2123639"
[163]=>
string(35) "https://blog.51cto.com/qiuyt/2124456"
[164]=>
string(30) "https://blog.51cto.com/13716231"
[166]=>
string(30) "https://blog.51cto.com/13108471"
[168]=>
string(30) "https://blog.51cto.com/10316297"
[170]=>
string(30) "https://blog.51cto.com/13718637"
[172]=>
string(30) "https://blog.51cto.com/13681316"
[174]=>
string(20) "http://www.51CTO.com"
[175]=>
string(37) "https://blog.51cto.com/blogger/publish"
[176]=>
string(71) "http://wpa.qq.com/msgrd?v=3&uin=3591348659&site=qq&menu=yes"
[177]=>
string(39) "https://blog.51cto.com/51ctoblog/2057444"
}