在完成这个功能之前呢,我们首先要了解下需要使用的工具
1.queryList V3 不会的小伙伴请自行百度,或者私聊我请教QQ:3319647017
2.下载百度通用翻译API的PHP版本Demo,然后申请百度通用翻译API的APP ID和密钥(SEC_KEY)
接下来就是逻辑:
1.用queryList爬取网站的url和title
2.foreach遍历结果,然后调用百度的API翻译title。
3.将翻译结果写入到txt
4.再用queryList按链接爬取文章详情页
5.foreach得到的结果
6.翻译文章
7.将翻译结果写入txt
以下是代码:
/*
* * .....................我佛慈悲........................
* _oo0oo_
* o8888888o
* 88" . "88
* (| -_- |)
* 0\ = /0
* ___/`---'\___
* .' \\| |// '.
* / \\||| : |||// \
* / _||||| -卍-|||||- \
* | | \\\ - /// | |
* | \_| ''\---/'' |_/ |
* \ .-\__ '-' ___/-. /
* ___'. .' /--.--\ `. .'___
* ."" '< `.___\_<|>_/___.' >' "".
* | | : `- \`.;`\ _ /`;.`/ - ` : | |
* \ \ `_. \_ __\ /__ _/ .-` / /
* =====`-.____`.___ \_____/___.-`___.-'=====
* `=---='
* -卍-卍-卍-卍-卍-卍-卍-卍-卍-卍-卍-卍-卍-卍-卍-卍-卍-
*
*..................佛祖开光 ,永无BUG...................
* */
/*
 * Form handler: scrape an English list page, translate every article title
 * and body to Chinese through the Baidu translate helper, and append the
 * result to "./<host>.txt".
 *
 * Flow:
 *   1. read the list URL and the three selector rules from the POSTed form;
 *   2. scrape (href, title) pairs from the list page;
 *   3. for each pair, translate the title, scrape the detail page with the
 *      content rule, translate every matched fragment;
 *   4. append two blank lines, the title, then one line per fragment.
 *
 * NOTE(review): the list URL comes straight from the form, so anyone who can
 * reach this page can make the server fetch arbitrary hosts. Keep the page
 * behind authentication or on a trusted network.
 */
ini_set('memory_limit', '2000M'); // raise the PHP memory limit for the crawl
set_time_limit(360000); // allow up to 360000 seconds (100 hours) of execution
require 'baiduApi.php';
require 'vendor/phpQuery.php';
require 'vendor/QueryList.php';
use QL\QueryList;

/**
 * Read one text field from the submitted form.
 *
 * @param string $name Form field name.
 * @return string Trimmed value, or '' when the field is missing or not a string.
 */
function crawlerPostField($name)
{
    if (!isset($_POST[$name]) || !is_string($_POST[$name])) {
        return '';
    }

    return trim($_POST[$name]);
}

/**
 * Turn an href scraped from the list page into an absolute URL.
 *
 * @param mixed  $href   Raw href attribute value.
 * @param string $domain Host name of the list page.
 * @return string Absolute URL, or '' when the href cannot be fetched over HTTP.
 */
function crawlerResolveUrl($href, $domain)
{
    $href = is_string($href) ? trim($href) : '';
    if ($href === '') {
        return '';
    }

    // Already absolute: use it untouched instead of prefixing a second scheme.
    if (preg_match('~^https?://~i', $href)) {
        return $href;
    }

    // Protocol-relative ("//host/path"): only the scheme is missing.
    if (strpos($href, '//') === 0) {
        return 'http:' . $href;
    }

    // Starts with the host itself ("host/path"): a match at offset 0 is a
    // real match, so it must not be confused with "not found".
    if (preg_match('~^' . preg_quote($domain, '~') . '(?:[/?#:]|$)~i', $href)) {
        return 'http://' . $href;
    }

    // Any other scheme (javascript:, mailto:, ...) is not a crawlable page.
    if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) {
        return '';
    }

    // Relative path. It is resolved against the site root, which is exact for
    // "/path" links; "path" links relative to a sub-directory are only
    // approximated this way.
    return 'http://' . $domain . '/' . ltrim($href, '/');
}

/**
 * Run a QueryList scrape and always hand back an array of rows.
 *
 * @param string $pageUrl Page to fetch.
 * @param array  $rules   QueryList rule set (field => array(selector, attribute)).
 * @return array Scraped rows; empty when the fetch or the parse failed.
 */
function crawlerScrape($pageUrl, $rules)
{
    try {
        // The "@" is retained from the original call; presumably it silences
        // parser warnings on malformed markup - TODO confirm it is still needed.
        $result = @QueryList::Query($pageUrl, $rules);
    } catch (Exception $e) {
        error_log('Scrape failed for ' . $pageUrl . ': ' . $e->getMessage());
        return array();
    }

    if (!is_object($result)) {
        return array();
    }

    $rows = $result->data;

    return is_array($rows) ? $rows : array();
}

/**
 * Translate English text to Chinese with the translate() helper loaded from
 * baiduApi.php.
 *
 * Every entry of "trans_result" is kept (joined with PHP_EOL) rather than
 * only the first one, so multi-line input is not truncated.
 *
 * @param mixed $text Text to translate.
 * @return string Translated text, or '' when the input is empty or the
 *                response carries no translation.
 */
function crawlerTranslate($text)
{
    $text = is_string($text) ? trim($text) : '';
    if ($text === '') {
        return '';
    }

    $response = translate($text, 'en', 'zh');
    if (!is_array($response) || empty($response['trans_result']) || !is_array($response['trans_result'])) {
        return '';
    }

    $lines = array();
    foreach ($response['trans_result'] as $segment) {
        if (is_array($segment) && isset($segment['dst'])) {
            $lines[] = $segment['dst'];
        }
    }

    return implode(PHP_EOL, $lines);
}

if (!empty($_POST)) {
    $listRule = crawlerPostField('listRule');
    $titleRule = crawlerPostField('titleRule');
    $contentRule = crawlerPostField('contentRule');

    // The form expects a URL without scheme; drop one if the user typed it so
    // the result is never "https://https://...".
    $url = 'https://' . preg_replace('~^[a-z][a-z0-9+.\-]*://~i', '', crawlerPostField('listUrl'));
    $domain = parse_url($url, PHP_URL_HOST); // host of the list page

    $inputComplete = is_string($domain) && $domain !== ''
        && $listRule !== '' && $titleRule !== '' && $contentRule !== '';

    if ($inputComplete) {
        // The host becomes the file name; neutralise path separators so the
        // output can never land outside the current directory.
        $outputFile = './' . str_replace(array('/', '\\', "\0"), '_', $domain) . '.txt';

        // Scrape the link and title of every entry on the list page.
        $data = crawlerScrape($url, array(
            'url' => array($listRule, 'href'),
            'title' => array($titleRule, 'text'),
        ));

        foreach ($data as $v) {
            $href = isset($v['url']) ? $v['url'] : '';
            $title = isset($v['title']) ? $v['title'] : '';

            // Two blank lines separate articles, then the translated title.
            $buffer = PHP_EOL . PHP_EOL . crawlerTranslate($title) . PHP_EOL;

            $newUrl = crawlerResolveUrl($href, $domain);
            if ($newUrl !== '') {
                // Scrape the detail page and translate each matched fragment.
                $urlData = crawlerScrape($newUrl, array('content' => array($contentRule, 'text')));
                foreach ($urlData as $value) {
                    $content = isset($value['content']) ? $value['content'] : '';
                    $buffer .= crawlerTranslate($content) . PHP_EOL;
                }
            }

            // One append per article; stop as soon as the file is not writable
            // instead of silently translating everything for nothing.
            if (file_put_contents($outputFile, $buffer, FILE_APPEND) === false) {
                error_log('Cannot write translation output to ' . $outputFile);
                break;
            }
        }
    }
}
?>
<!DOCTYPE html>
<!-- Input form for the crawler: posts the list URL and the three selector rules back to this same script. -->
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>英文站采集</title>
    <style>
        /* Reset default spacing on the elements used by the form. */
        html, body, h1, form, fieldset, legend, ol, li {
            padding: 0;
            margin: 0;
        }
        ol {
            list-style: none;
        }
        body {
            background: #fff;
            color: #111;
            padding: 20px;
        }
        form#payment {
            background: #9cbc2c;
            -webkit-border-radius: 5px;
            border-radius: 5px;
            /* Start the step counter here so the legends below can increment it. */
            counter-reset: fieldsets;
            padding: 20px;
            width: 400px;
        }
        form#payment fieldset {
            border: none;
            margin-bottom: 10px;
        }
        form#payment fieldset:last-of-type {
            margin-bottom: 0;
        }
        form#payment legend {
            color: #384313;
            font-size: 16px;
            font-weight: bold;
            padding-bottom: 10px;
            text-shadow: 0 1px 1px #c0d576;
        }
        /* Prefix each top-level legend with its step number; the counter name
           must be the same in counter-increment and counter(). */
        form#payment > fieldset > legend:before {
            content: "Step" counter(fieldsets) ":";
            counter-increment: fieldsets;
        }
        form#payment fieldset fieldset legend {
            color: #111;
            font-size: 13px;
            font-weight: normal;
            padding-bottom: 0;
        }
        form#payment ol li {
            background: #b9cf6a; /* fallback for browsers without rgba() */
            background: rgba(255, 255, 255, 0.3);
            border: #e3ebc3;
            border-color: rgba(255, 255, 255, 0.6);
            border-style: solid;
            border-width: 2px;
            -webkit-border-radius: 5px;
            border-radius: 5px;
            line-height: 30px;
            padding: 5px 10px;
            margin-bottom: 2px;
        }
        form#payment ol ol li {
            background: none;
            border: none;
            float: left;
        }
        form#payment label {
            float: left;
            font-size: 13px;
            width: 110px;
        }
        form#payment fieldset fieldset label {
            background: none no-repeat left 50%;
            line-height: 20px;
            padding: 0 0 0 30px;
            width: auto;
        }
        form#payment fieldset fieldset label:hover {
            cursor: pointer;
        }
        form#payment input:not([type=radio]), form#payment textarea {
            background: #fff;
            border: #fc3 solid 1px;
            -webkit-border-radius: 3px;
            border-radius: 3px;
            outline: none;
            padding: 5px;
        }
    </style>
</head>
<body>
<!-- No action attribute: the form submits to the URL of this page. -->
<form id="payment" method="POST">
    <fieldset>
        <legend>英文站采集</legend>
        <ol>
            <!-- Each input has its own id so that every label focuses the matching field. -->
            <li>
                <label for="listUrl">列表页连接:</label>
                <input type="text" placeholder="请输入列表页连接" id="listUrl" name="listUrl" required>
            </li>
            <li>
                <label for="listRule">列表连接规则:</label>
                <input type="text" placeholder="请输入列表连接规则" id="listRule" name="listRule" required>
            </li>
            <li>
                <label for="titleRule">列表标题:</label>
                <input type="text" placeholder="请输入列表标题规则" id="titleRule" name="titleRule" required>
            </li>
            <li>
                <label for="contentRule">内容页规则:</label>
                <input type="text" placeholder="请输入内容页规则" id="contentRule" name="contentRule" required>
            </li>
        </ol>
    </fieldset>
    <fieldset>
        <button type="submit">提交</button>
    </fieldset>
</form>
</body>
</html>