emmm,没什么好说的,逻辑通了就很好解决
上代码:
ini_set('memory_limit','2000M');// raise PHP's memory limit to 2000 MB for this script
set_time_limit(360000);// allow the script to run for up to 360000 seconds (100 hours)
/*
 * Read the <meta> tags of a page.
 *
 * Thin wrapper around get_meta_tags(). The caller reads the 'description'
 * entry of the returned array.
 *
 * @param string $url URL or local file path of the page to inspect
 * @return array meta name => content map; an empty array when the page
 *               cannot be read
 * @author L
 */
function getDescription($url){
    // Unreachable hosts and missing files are expected while crawling, so the
    // warning is suppressed and the failure is reported as "no tags".
    $meta_array = @get_meta_tags($url);
    return is_array($meta_array) ? $meta_array : array();
}
/*
 * Resolve a Baidu "link?url=..." redirect link to the address it points at.
 *
 * Sends a bare HTTP/1.1 GET request to the link's host and reads only the
 * response headers, returning the value of the Location header.
 *
 * @param string $url redirect link to resolve
 * @return string $rewrite value of the Location header, or '' when the URL
 *                has no host, the connection fails, or the response carries
 *                no Location header
 * @author L
 * @time 2019-02-24 20:41:01
 */
function linkUrl($url){
    $info = parse_url($url);
    if (!is_array($info) || empty($info['host'])) {
        return '';
    }

    // Honour the scheme and an explicit port instead of always using port 80.
    $isHttps = isset($info['scheme']) && strtolower($info['scheme']) === 'https';
    $port = isset($info['port']) ? (int) $info['port'] : ($isHttps ? 443 : 80);
    $hostHeader = $info['host'] . (isset($info['port']) ? ':' . $port : '');

    $target = (isset($info['path']) && $info['path'] !== '') ? $info['path'] : '/';
    if (isset($info['query'])) {
        $target .= '?' . $info['query'];
    }

    $fp = @fsockopen(($isHttps ? 'ssl://' : '') . $info['host'], $port, $errno, $errstr, 30);
    if ($fp === false) {
        // Without this guard feof() on an invalid handle never reports EOF
        // and the header loop below would not terminate.
        return '';
    }
    stream_set_timeout($fp, 30);

    fputs($fp, "GET {$target} HTTP/1.1\r\n");
    fputs($fp, "Host: {$hostHeader}\r\n");
    fputs($fp, "Connection: close\r\n\r\n");

    $rewrite = '';
    while (!feof($fp)) {
        $line = fgets($fp);
        // A blank line ends the header section; false means a read error or timeout.
        if ($line === false || $line === "\r\n") {
            break;
        }
        // Header names are case-insensitive; only accept a real Location header,
        // not e.g. Content-Location.
        if (stripos($line, 'Location:') === 0) {
            $rewrite = trim(substr($line, strlen('Location:')));
        }
    }
    fclose($fp);

    return $rewrite;
}
/*
 * Fetch a page while identifying as the Baidu spider.
 *
 * @param string $url address of the page to fetch
 * @return string|false $temp response body, or false when the URL is empty
 *                      or the transfer fails
 * @author L
 */
function vspider_get($url){
    // linkUrl() yields '' when a redirect cannot be resolved; there is nothing to fetch then.
    if (!is_string($url) || $url === '') {
        return false;
    }

    $ch2 = curl_init();
    // User agent string of the Baidu spider, which this request imitates.
    $user_agent = "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)";
    curl_setopt($ch2, CURLOPT_URL, $url);
    curl_setopt($ch2, CURLOPT_HEADER, false);
    curl_setopt($ch2, CURLOPT_RETURNTRANSFER, 1);
    // Referer sent with the request; the home page of the crawled site would also do.
    curl_setopt($ch2, CURLOPT_REFERER, 'www.baidu.com');
    curl_setopt($ch2, CURLOPT_USERAGENT, $user_agent);
    // Bound the wait so a single stalled host cannot block the whole crawl.
    curl_setopt($ch2, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch2, CURLOPT_TIMEOUT, 30);
    $temp = curl_exec($ch2);
    return $temp;
}
/*
 * Convert a string to UTF-8.
 *
 * The encoding is guessed with mb_detect_encoding() from the candidates
 * UTF-8, GB2312, LATIN1 and BIG5. Values that empty() treats as empty are
 * returned untouched.
 *
 * @param string $data text to convert
 * @return string $data the text, converted when it was not detected as UTF-8
 * @author L
 */
function characet($data){
    if (empty($data)) {
        return $data;
    }

    $candidates = array('UTF-8','GB2312','LATIN1','BIG5');
    $detected = mb_detect_encoding($data, $candidates);

    return ($detected != 'UTF-8')
        ? mb_convert_encoding($data, 'utf-8', $detected)
        : $data;
}
/*
 * Extract the text of the first <title> element from an HTML document.
 *
 * Spaces are removed from the title. HTML entities are decoded only when the
 * part of the title before the first '_' contains no Chinese character.
 *
 * @param string $contents HTML source of the page
 * @return string the title, or '' when the document has no <title> element
 */
function getTitle($contents){
    // Accept attributes on the tag (<title id="x">) as well as the bare form.
    $match = '/<title\b[^>]*>(.*?)<\/title>/si';
    $raw = '';
    if (preg_match($match, (string) $contents, $ma)) {
        $raw = $ma[1];
    }

    $res = str_replace(' ', '', $raw);
    $strArr = explode('_', $res);

    // Any character in the CJK range U+4E00..U+9FA5 counts as Chinese.
    // preg_match() returns false on invalid UTF-8, which also takes the decode path.
    $ru = preg_match('/[\x{4e00}-\x{9fa5}]/u', $strArr[0]);
    if ($ru < 1) {
        return html_entity_decode($res);
    }
    return $res;
}
require 'vendor/phpQuery.php';
require 'vendor/QueryList.php';
use QL\QueryList;
// Crawl request handler: for every comma-separated keyword, walk the Baidu
// result pages 0..max and append one line per result to "./<keyword>.txt".
// Each line holds the selected fields (title, link, description) joined by "||".
if (!empty($_POST)) {
    $post = $_POST;

    // Ticked output fields: "1" = title, "2" = link, "3" = description.
    $checks = (isset($post['check']) && is_array($post['check'])) ? $post['check'] : array();
    $wantTitle = in_array('1', $checks, true);
    $wantUrl = in_array('2', $checks, true);
    $wantDesc = in_array('3', $checks, true);

    $strArr = (isset($post['keyword']) && is_string($post['keyword']))
        ? explode(',', $post['keyword'])
        : array();
    // Last page index to request; pages 0..max are fetched (inclusive).
    $maxPage = isset($post['max']) ? (int) $post['max'] : 0;

    // With no field selected there is nothing to write, so skip crawling altogether.
    if ($wantTitle || $wantUrl || $wantDesc) {
        foreach ($strArr as $v) {
            $v = trim($v);
            if ($v === '') {
                continue;
            }
            // The keyword comes from the request: strip path separators so the
            // output file cannot be placed outside the current directory.
            $outFile = './' . str_replace(array('/', '\\', "\0"), '_', $v) . '.txt';

            for ($b = 0; $b <= $maxPage; $b++) {
                // Collect the link and heading of every result on this page.
                $url = 'http://www.baidu.com/s?wd=' . urlencode($v) . '&pn=' . $b * 10;
                $data = QueryList::Query($url, array("url" => array('h3 a', 'href'), "title" => array('h3', 'text')))->data;
                if (!is_array($data)) {
                    continue;
                }
                foreach ($data as $val) {
                    if (empty($val['url'])) {
                        continue;
                    }

                    $parts = array();
                    $newUrl = '';
                    if ($wantTitle || $wantUrl) {
                        // Resolve Baidu's redirect link to the real address.
                        $newUrl = linkUrl($val['url']);
                    }
                    if ($wantTitle) {
                        // Fetch the page as the spider and take its <title>.
                        $parts[] = getTitle(vspider_get($newUrl));
                    }
                    if ($wantUrl) {
                        $parts[] = $newUrl;
                    }
                    if ($wantDesc) {
                        $content = getDescription($val['url']);
                        $parts[] = (is_array($content) && isset($content['description']))
                            ? $content['description']
                            : '';
                    }

                    $strData = characet(implode('||', $parts));
                    file_put_contents($outFile, $strData . PHP_EOL, FILE_APPEND);
                    unset($strData);
                }
                unset($data);
            }
        }
    }
    die;
}
?>
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>模拟蜘蛛爬取title</title>
<style>
html, body, h1, form, fieldset, legend, ol ,li{
padding:0;
margin:0;
}
ol{
list-style:none;
}
body{
background:#fff;
color:#111;
padding:20px;
}
form#payment{
background:#9cbc2c;
-webkit-border-radius:5px;
border-radius:5px;
padding:20px;
width:400px;
}
form#payment fieldset{
border:none;
margin-bottom:10px;
}
form#payment fieldset:last-of-type{ margin-bottom:0; }
form#payment legend{
color:#384313;
font-size:16px;
font-weight:bold;
padding-bottom:10px;
text-shadow:0px 1px 1px #c0d576;
}
form#payment > fieldset>legend:before{
content:"Step" counter(fieldsets)":";
counter-increment:fieldsets;
}
form#payment fieldset fieldset legend{
color:#111;
font-size:13px;
font-weight:normal;
padding-bottom:0;
}
form#payment ol li{
background:#b9cf6a;
background:rgba(255, 255, 255, 0.3);
border:#e3ebc3;
border-color:rgba(255, 255, 255, 0.6);
border-style:solid;
border-width:2px;
-webkit-border-radius:5px;
line-height:30px;
padding:5px 10px;
margin-bottom:2px;
}
form#payment ol ol li{
background:none;
border:none;
float:left;
}
form#payment label{
float:left;
font-size:13px;
width:110px;
}
form#payment fieldset fieldset label{
background:none no-repeat left 50%;
line-height:20px;
padding:0 0 0 30px;
width:auto;
}
form#payment fieldset fieldset label:hover{cursor:pointer;}
form#payment input:not([type=radio]), form#payment textarea{
background:#fff;
border:#fc3 solid 1px;
-webkit-border-radius:3px;
outline:none;
padding:5px;
}
</style>
</head>
<body>
<form id=payment action="" method="POST">
<fieldset>
<legend>模拟蜘蛛爬取title</legend>
<ol>
<li>
<label for="name">关键字:</label>
<input type="text" id="name" name="keyword" placeholder="输入关键字,多个请用英文‘,’隔开" required autofocus>
</li>
<li>
<label for="phone">最大页数</label>
<input type="tel" placeholder="输入要爬取的最大页数" id="phone" name="max">
</li>
<li>
<label for="phone">网址:</label>
<input name="check[]" type="checkbox" value="1" />标题
<input name="check[]" type="checkbox" value="2" />连接
<input name="check[]" type="checkbox" value="3" />描述
</li>
</ol>
</fieldset>
<fieldset>
<button type="submit">提交</button>
</fieldset>
</form>
</body>
</html>