<?php // The code is as follows:
// The crawl below is long-running: allow up to 100 hours of execution
// and raise the PHP memory ceiling to 2000 MB.
set_time_limit(100 * 60 * 60);
ini_set('memory_limit', '2000M');
/**
 * Fetch a page while identifying as Baiduspider.
 *
 * Sends a GET request with the Baiduspider User-Agent and a Baidu referer,
 * and returns the response body without headers.
 *
 * @param string $url URL to request
 * @return string|false the response body, or false if the handle could not be
 *                      created or the transfer failed (including timeouts)
 * @author L
 */
function vspider_get($url){
    $ch2 = curl_init();
    if ($ch2 === false) {
        return false;
    }
    // Present ourselves as the Baidu crawler.
    $user_agent = "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)";
    curl_setopt_array($ch2, array(
        CURLOPT_URL            => $url,
        CURLOPT_HEADER         => false,
        CURLOPT_RETURNTRANSFER => 1,
        // Referer sent with the request; the home page of the crawled site would also work.
        CURLOPT_REFERER        => 'www.baidu.com',
        CURLOPT_USERAGENT      => $user_agent,
        // Without timeouts a single unresponsive server blocks the caller indefinitely.
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => 30,
    ));
    $temp = curl_exec($ch2);
    // Before PHP 8 the handle is a resource and must be released explicitly,
    // otherwise every call leaks one handle. From PHP 8 on it is an object
    // that is freed automatically and curl_close() has no effect.
    if (PHP_VERSION_ID < 80000) {
        curl_close($ch2);
    }
    return $temp;
}
/**
 * Generate a random alphanumeric string.
 *
 * Characters are drawn uniformly from 0-9, a-z and A-Z using mt_rand(), so the
 * result is not suitable for security-sensitive purposes.
 *
 * @param int $length number of characters to generate; 0 or less yields ''
 * @return string the random string
 * @author L
 */
function randomkeys($length)
{
    // Each alphanumeric character appears exactly once so the draw is unbiased.
    $pattern = '1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
    // Derive the upper bound from the alphabet instead of hard-coding it.
    $maxIndex = strlen($pattern) - 1;
    $key = '';
    for ($i = 0; $i < $length; ++$i) {
        // Square-bracket offsets: the curly-brace form is a parse error in PHP 8.
        $key .= $pattern[mt_rand(0, $maxIndex)];
    }
    return $key;
}
/*
 * Form handler. For each comma-separated prefix in $_POST['key'] it requests
 * http://<url>/<prefix><6-12 random characters> 100001 times as Baiduspider and
 * appends one "<prefix>:<title>" line per request to ./<url>.txt, where <title>
 * is the part of the page title before the first underscore.
 *
 * NOTE(review): the target host is taken from the request as-is, so anyone who
 * can reach this page can make the server issue requests to arbitrary hosts;
 * access to it should be restricted.
 */
if (!empty($_POST['url']) && is_string($_POST['url'])) {
    $host = $_POST['url'];
    $keys = (isset($_POST['key']) && is_string($_POST['key'])) ? $_POST['key'] : '';

    // The host doubles as the output file name: neutralise path separators and
    // other special characters so the file cannot be placed outside the
    // working directory.
    $outFile = './' . preg_replace('/[\/\\\\:*?"<>|\x00-\x1f]/', '_', $host) . '.txt';

    $titlePattern = '/<TITLE>([\w\W]*?)<\/TITLE>/si';
    $chinesePattern = '/[\x{4e00}-\x{9fa5}]/u';

    foreach (explode(',', $keys) as $v) {
        for ($a = 0; $a <= 100000; $a++) {
            $url = 'http://' . $host . '/' . $v . randomkeys(rand(6, 12));
            $contents = vspider_get($url);

            // A failed fetch or a page without <title> still produces a line,
            // with an empty title.
            $str = '';
            if (is_string($contents) && preg_match($titlePattern, $contents, $ma) === 1) {
                $title = $ma[1];

                // An empty //u pattern matches any valid UTF-8 string, so a
                // non-match means the title is in another encoding; it is then
                // treated as GB2312. Conversion is best effort: on failure the
                // raw bytes are kept. It runs before the byte-level edits below
                // so multi-byte characters are not cut apart.
                if ($title !== '' && preg_match('//u', $title) !== 1) {
                    $converted = @iconv('GB2312', 'UTF-8//IGNORE', $title);
                    if ($converted !== false) {
                        $title = $converted;
                    }
                }

                // Keep only the part before the first underscore, spaces removed.
                $strArr = explode('_', str_replace(' ', '', $title));
                $str = $strArr[0];

                // Titles without literal Chinese characters may carry them as
                // HTML entities; decode those.
                if (preg_match($chinesePattern, $str) !== 1) {
                    $str = html_entity_decode($str, ENT_QUOTES, 'UTF-8');
                }
            }

            file_put_contents($outFile, $v . ':' . $str . PHP_EOL, FILE_APPEND);
            unset($ma, $str);
        }
    }
}
?>
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>模拟蜘蛛爬取title</title>
<style>
/* Reset default spacing on the elements used by the form. */
html, body, h1, form, fieldset, legend, ol, li {
    padding: 0;
    margin: 0;
}
ol {
    list-style: none;
}
body {
    background: #fff;
    color: #111;
    padding: 20px;
}
/* Form container. */
form#payment {
    background: #9cbc2c;
    -webkit-border-radius: 5px;
    border-radius: 5px;
    /* Start the step counter that the top-level legends increment. */
    counter-reset: fieldsets;
    padding: 20px;
    width: 400px;
}
form#payment fieldset {
    border: none;
    margin-bottom: 10px;
}
form#payment fieldset:last-of-type { margin-bottom: 0; }
form#payment legend {
    color: #384313;
    font-size: 16px;
    font-weight: bold;
    padding-bottom: 10px;
    text-shadow: 0px 1px 1px #c0d576;
}
/* Prefix each top-level legend with its step number; the counter read here
   must be the same one that is incremented. */
form#payment > fieldset > legend:before {
    content: "Step" counter(fieldsets) ":";
    counter-increment: fieldsets;
}
form#payment fieldset fieldset legend {
    color: #111;
    font-size: 13px;
    font-weight: normal;
    padding-bottom: 0;
}
/* One row per field; the solid colour is the fallback for browsers without rgba(). */
form#payment ol li {
    background: #b9cf6a;
    background: rgba(255, 255, 255, 0.3);
    border: #e3ebc3;
    border-color: rgba(255, 255, 255, 0.6);
    border-style: solid;
    border-width: 2px;
    -webkit-border-radius: 5px;
    border-radius: 5px;
    line-height: 30px;
    padding: 5px 10px;
    margin-bottom: 2px;
}
/* Rows of a nested list are laid out inline without their own box. */
form#payment ol ol li {
    background: none;
    border: none;
    float: left;
}
form#payment label {
    float: left;
    font-size: 13px;
    width: 110px;
}
form#payment fieldset fieldset label {
    background: none no-repeat left 50%;
    line-height: 20px;
    padding: 0 0 0 30px;
    width: auto;
}
form#payment fieldset fieldset label:hover { cursor: pointer; }
form#payment input:not([type=radio]), form#payment textarea {
    background: #fff;
    border: #fc3 solid 1px;
    -webkit-border-radius: 3px;
    border-radius: 3px;
    outline: none;
    padding: 5px;
}
</style>
</head>
<body>
<!-- Crawl request form: posts the target host ("url") and the comma-separated
     path prefixes ("key") back to this page. -->
<form id="payment" action="" method="POST">
    <fieldset>
        <legend>模拟蜘蛛爬取title</legend>
        <ol>
            <li>
                <label for="name">网址:</label>
                <input type="text" id="name" name="url" placeholder="输入需要爬取title的网址" required autofocus>
            </li>
            <li>
                <label for="phone">后缀关键字母</label>
                <!-- Plain text input: the field takes letters, which a telephone keypad cannot enter. -->
                <input type="text" placeholder="多条请用英文‘,’隔开" id="phone" name="key">
            </li>
        </ol>
    </fieldset>
    <fieldset>
        <button type="submit">提交</button>
    </fieldset>
</form>
</body>
</html>