时间:2021-07-01 10:21:17 帮助过:35人阅读
- <br><!--?php <BR-->session_start(); <br>header("content-type:text/html;charset=gbk"); <br>require("stole_config.php"); <br>require("conn.php"); <br>require("keyword.php"); <br>$searchStr=$_GET["searchStr"]; <br>$ss=explode(" ",$searchStr);//拆分搜索关键字 <br>$word="";//关键字设为空 <br>foreach($ss as $key=>$t) <br>{ <br>if($key>0) <br>{ <br>$word .="+"; <br>} <br>$word .=urlencode($t); <br>} <br>$jl=intval($_GET['jl']); <br>if(isset($_GET['page'])) <br>{ <br>$page=intval($_GET['page']); <br>}else{ <br>$page=1; <br>} <br>$rs=intval($_GET['rs']); <br>if($rs>=10) <br>{ <br>$rs=0; <br>$page++; <br>} <br>if($page>76) <br>{ <br>echo "采集完毕 ${jl}"; <br>exit(); <br>} <br>if(!empty($searchStr))//如果搜索 <br>{ <br>//获取问题页面 <br>$content=@file_get_contents("http://zhidao.baidu.com/q?ct=17&lm=0&tn=ikaslist&pn=".(($page-1)*10)."&rn=10&word=".$word); <br>//获取问题列表 <br>preg_match_all("/$uid=$uid[1];//获取详细页文章 <br>$uid=$uid[$rs]; <br>//判断数据是否存在 <br>$suid="bd{$uid}"; <br>$sct=mysql_query("select count(*) from {$table_prefix}c_article where suid='$suid' "); <br>$sct=mysql_fetch_array($sct); <br>$sct=$sct[0]; <br>if($sct==0) <br>{ <br>$content=@file_get_contents("http://zhidao.baidu.com/question/".$uid.".html") ; <br>$arr=explode('<cq>',$content); <br>$art_title=$arr[1]; <br>$arr=explode('</cq>',$art_title); <br>$art_title=$arr[0];//获取标题结束 <br>//判断内容是否符合 <br>$word_arr=explode(",",$cj_word); <br>$word_allow=false;//初始化是否允许采集 <br>$word_count=count($word_arr);//关键字总数 <br>for($i=0;$i<$word_count;$i++) <br>{ <br>if(substr_count($art_title,$word_arr[$i])>0) <br>{ <br>$word_allow=1; <br>$i=$word_count; <br>} <br>} <br>if($word_allow)//如果满足条件 <br>{ <br>$arr=explode('<cd><pre class="brush:php;toolbar:false layui-box layui-code-view layui-code-notepad"><ol class="layui-code-ol"><li>',$content); <br>$contentQuestion=$arr[1]; <br>$arr=explode('</li></ol></pre></cd>',$contentQuestion); <br>$contentQuestion=$arr[0]; <br>echo "开始采集内容<br>"; <br>echo "$art_title<br>"; <br>@preg_match_all('/(<ca>|<cn>)<pre class="brush:php;toolbar:false layui-box layui-code-view layui-code-notepad"><ol class="layui-code-ol"><li>(.*)<\/pre>(<\/ca>|<\/cn>)/iUs',$content,$answerArr); <br>$answerArr=$answerArr[2]; <br>if($arr_order==1)//随机排序 <br>{ <br>shuffle($answerArr); <br>} <br>if($arr_order==2)//倒序 <br>{ <br>$answerArr=krsort($answerArr);//倒序 <br>} <br>foreach($answerArr as $t) <br>{ <br>$answerTemp=str_replace('<ca><pre class="brush:php;toolbar:false layui-box layui-code-view layui-code-notepad"><ol class="layui-code-ol"><li>','',$t); <br>$answerTemp=str_replace('</li></ol></pre></ca>','',$answerTemp); <br>$answerTemp=str_replace('<cn><pre class="brush:php;toolbar:false layui-box layui-code-view layui-code-notepad"><ol class="layui-code-ol"><li>','',$answerTemp); <br>$answerTemp=str_replace('</li></ol></pre></cn>','',$answerTemp); <br>if(strlen($answerTemp)>$min_t1) <br>{ <br>$art_content .=$answerTemp."<br>"; <br>} <br>} <br>//去除链接 <br>$s1="/()(.*)<\/a>/iUs"; <br>$art_content=preg_replace($s1,${2},trim($art_content)); <br>$art_content=str_replace("\n\r","<br>",$art_content); <br>if(strlen($art_content)>$min_t2) <br>{ <br>$title_ct=mysql_query("select count(*) from {$table_prefix}c_article where art_title ='$art_title' ");//查看标题是否重复 <br>$title_ct=@mysql_fetch_array($title_ct); <br>$title_ct=$title_ct[0]; <br>if($title_ct>0) <br>{ <br>$art_title .="{$same_title}{$title_ct}"; <br>} <br>$art_time=date("Y-m-d"); <br>$art_content=strtr($art_content,$keyword); <br>$sql="insert into {$table_prefix}c_article(art_title,art_content,art_time,art_author,suid) values('$art_title','$art_content','$art_time','$art_author','$suid')";//插入采集表 <br>mysql_query($sql); <br>if(empty($t_catx_id))//如果无分类 <br>{ <br>$sql2="insert into {$t_table}({$t_art_title},{$t_art_content},{$t_art_time},{$t_artx_author}) values('$art_title','$art_content','$art_time','$art_author')"; <br>}else <br>{ <br>$sql2="insert into {$t_table}({$t_art_title},{$t_art_content},{$t_art_time},{$t_artx_author},{$t_catx_id}) values('$art_title','$art_content','$art_time','$art_author','$cat_id')"; <br>} <br>mysql_query($sql2);//插入文章表 <br>$jl++; <br>//数据库处理完毕 <br>}else <br>{ <br>echo "内容长度不够"; <br>} <br>//获取文章内容结束 <br>}else <br>{ <br>echo "主题不符合要求"; <br>} <br>}else <br>{ <br>echo "已经存在"; <br>}$rs++; <br>file_put_contents("bd.txt","采集{$searchStr}到第{$page}第{$rs}条"); <br>echo ""; <br>exit(); <br>} <br>?> <br> <br><br><br><br><br><table width="700" border="0" align="center" cellspacing="1" bgcolor="#CCCCCC"> <tbody><tr> <td height="50" align="center" bgcolor="#00CC00"><h2>荐礼啦知道问问采集插件</h2></td> </tr> </tbody></table> <br><br><br><br><br><table width="700" border="0" align="center" cellspacing="1" bgcolor="#CCCCCC" style="margin-top:6px; margin-bottom:6px;"> <tbody><tr> <td height="30" align="center" bgcolor="#FFFFFF">采集设置 卸载采集 查看采集记录 采集帮助 知道采集 问问采集</td> </tr> </tbody></table> <br><table width="537" height="45" align="center" style="margin-top:30px;"><tbody><tr><td height="39"> <br> <br></td></tr></tbody></table> <br> <br>问问采集代码: <br><span style="CURSOR: pointer" onclick="doCopy('code89459')"><u></u></span> 代码如下:<pre class="brush:php;toolbar:false layui-box layui-code-view layui-code-notepad"><ol class="layui-code-ol"><li><br><!--?php <BR-->session_start(); <br>header("content-type:text/html;charset=utf-8"); <br>require("stole_config.php"); <br>require("conn.php"); <br>require("keyword.php"); <br>if(!empty($_POST['ask'])) <br>{ <br>$ask=urlencode(trim($_POST['ask']));//获取表单提交的问题 <br>$sp="S".$ask; <br>}else <br>{ <br>$sp=urlencode($_GET['sp']); <br>} <br>if(empty($_GET['jl'])) <br>{ <br>$_GET['jl']=1; <br>} <br>$jl=$_GET['jl']; <br>$pg=intval($_GET['pg']);//获取页数 <br>$rs=intval($_GET['rs']);//获得 记录的参数 <br>if($rs>9) <br>{ <br>$rs=0; <br>$pg++; <br>} <br>if($pg>51) <br>{ <br>echo "采集完毕! 总共采集 ".urldecode($sp)." ".$jl."条记录"; <br>exit(); <br>} <br>if($sp)//有设定答案才开始 <br>{ <br>$str=@file_get_contents("http://wenwen.soso.com/z/Search.e?sp={$sp}&pg={$pg}"); <br>@preg_match("/<ol class="\"result_list\"">(.*)<\/ol>/iUs",$str,$asklist);//获取问答列表 <br>//echo $asklist[1]; <br>$url="/@preg_match_all($url,$asklist[1],$urllist);//获取 所有的问题 <br>$t=$urllist[1][$rs]; <br>$uid=$t; <br>$suid="ww{$uid}"; <br>$sct=mysql_query("select count(*) from {$table_prefix}c_article where suid='$suid' "); <br>$sct=mysql_fetch_array($sct); <br>$sct=$sct[0]; <br>if($sct==0) <br>{ <br>$html=@file_get_contents("http://wenwen.soso.com/z/${t}"); <br>$html=str_replace("<pre class="brush:php;toolbar:false layui-box layui-code-view layui-code-notepad"><ol class="layui-code-ol"><li>","",str_replace("</li></ol></pre>","",$html)); <br>$html=str_replace("<br><br><br>","<br><br>",$html); <br>//echo $html; <br>@preg_match("/.*<h3>(.*)<\/h3>/iUs",$html,$ask_title); <br>$art_title=$ask_title[1]; <br>@preg_match("/(.*)<\/div>/iUs",$html,$answer); <br>$j=count($answer)-1; <br>$art_content="";//商品详细 <br>for($i=$j;$i>=1;$i--) <br>{ <br>if(strlen($answer[$i])>$min_t1) <br>{ <br>$art_content .= $answer[$i]; <br>} <br>} <br>$art_content=trim($art_content); <br>$s1="/()(.*)<\/a>/iUs"; <br>$art_content=preg_replace($s1,${2},trim($art_content)); <br>$word_arr=explode(",",iconv("gbk","utf-8",$cj_word)); <br>$word_allow=false;//初始化是否允许采集 <br>$word_count=count($word_arr);//总数 <br>for($i=0;$i<$word_count;$i++) <br>{ <br>if(substr_count($art_title,$word_arr[$i])>0) <br>{ <br>$word_allow=1; <br>$i=$word_count; <br>} <br>} <br>if($word_allow)//如果合法 <br>{ //开始处理数据库 <br>if(strlen($art_content)>$min_t2) <br>{ <br>echo "<font color="red">添加中............................</font><br>"; <br>echo $art_title."<br>"; <br>$art_title=iconv('utf-8','gbk', $art_title); <br>$title_ct=mysql_query("select count(*) from {$table_prefix}c_article where art_title ='$art_title' ");//查看标题是否重复 <br>$title_ct=@mysql_fetch_array($title_ct); <br>$title_ct=$title_ct[0]; <br>if($title_ct>0) <br>{ <br>$art_title .="{$same_title}{$title_ct}"; <br>} <br>$art_content=iconv('utf-8','gbk',str_replace("\r\n","<br>",$art_content)); <br>$art_content=strtr($art_content,$keyword); <br>$art_time=date("Y-m-d"); <br>$sql="insert into {$table_prefix}c_article(art_title,art_content,art_time,art_author,suid) values('$art_title','$art_content','$art_time','$art_author','$suid')";//插入采集表 <br>mysql_query($sql); <br>if(empty($t_catx_id))//如果无分类 <br>{ <br>$sql2="insert into {$t_table}({$t_art_title},{$t_art_content},{$t_art_time},{$t_artx_author}) values('$art_title','$art_content','$art_time','$art_author')"; <br>}else <br>{ <br>$sql2="insert into {$t_table}({$t_art_title},{$t_art_content},{$t_art_time},{$t_artx_author},{$t_catx_id}) values('$art_title','$art_content','$art_time','$art_author','$cat_id')"; <br>} <br>mysql_query($sql2);//插入文章表 <br>$jl++;//如果存放数据库中 则记录加1 <br>//处理数据库结束 <br>}else <br>{ <br>echo "长度不够"; <br>} <br>}else <br>{ <br>echo "主题不符合要求"; <br>} <br>}else <br>{ <br>echo "已经存在"; <br>} <br>$rs++; <br>//记录下本次采集 的状况 <br>$f_tt= urldecode($sp)."--页数".$pg." 记录数 ".$jl ; <br>file_put_contents("ss.txt",$f_tt); <br>echo ""; <br>exit(); <br>} <br>?> <br> <br><br><br><br><br><table width="700" border="0" align="center" cellspacing="1" bgcolor="#CCCCCC"> <tbody><tr> <td height="50" align="center" bgcolor="#00CC00"><h2>荐礼啦知道问问采集插件</h2></td> </tr> </tbody></table> <br><br><br><br><br><table width="700" border="0" align="center" cellspacing="1" bgcolor="#CCCCCC" style="margin-top:6px; margin-bottom:6px;"> <tbody><tr> <td height="30" align="center" bgcolor="#FFFFFF">采集设置 卸载采集 查看采集记录 采集帮助 知道采集 问问采集</td> </tr> </tbody></table> <br> <br></li><li><p></p></li><li><p align="left"><span id="url" itemprop="url">http://www.bkjia.com/PHPjc/322561.html</span><span id="indexUrl" itemprop="indexUrl">www.bkjia.com</span><span id="isOriginal" itemprop="isOriginal">true</span><span id="isBasedOnUrl" itemprop="isBasedOnUrl">http://www.bkjia.com/PHPjc/322561.html</span><span id="genre" itemprop="genre">TechArticle</span><span id="description" itemprop="description">最近发现知道和问问小偷的版本越来越多了!! 看过一个百度小偷的网站也达到了pr6。收录十万多!! 在经过 荐礼啦 四十天的实践之后...</span></p></li><li> </h3></ol></li></ol></pre></li></ol></pre></cn></ca>