当前位置:Gxlcms > PHP教程 > 基于Snoopy的PHP近似完美获取网站编码

基于Snoopy的PHP近似完美获取网站编码

时间:2021-07-01 10:21:17 帮助过:5人阅读

基于Snoopy的PHP近似完美获取网站编码
用于 PHP 爬虫,获取网站编码的准确率约为 99.9%;仍有少数网站的编码无法正确识别,欢迎各位高手完善。
代码来源: 站云网 www.siteyun.com
使用前需要先从网上下载 Snoopy.class.php,并在脚本中通过 require 引入该文件。
调用方法(注:示例开头的 <?php 标记及创建对象的语句在转载时丢失;$go 应为以目标网址作为构造参数创建的本类实例,类名请以原始代码为准): echo $go->getCharset(); ?>

[code]url=$url; } //打开网站 private function open($url) { if($this->request!==null) { if($this->request->status==200) { return true; } else { return false; } } else { $this->request=new Snoopy(); $this->request->fetch($url); if($this->request->status==200) { $this->request->results=strtolower($this->request->results); $charset=$this->getCharset(); if($charset!="utf-8") { if($charset=="windows-1252") { $this->request->results=$this->uni_decode($this->request->results); } else { $this->request->results=mb_convert_encoding($this->request->results,"UTF-8",$charset); } } return true; } else { return false; } } } //获取网站title,keywords,description public function getWebinfo() { $info=array( 'title'=>'', 'keywords'=>'', 'desc'=>'', 'ip'=>'' ); if(!$this->open($this->url)){return $info;exit;} // print_r($this->request->results);exit; preg_match('/([^>]*)<\/title>/si', $this->request->results, $titlematch ); if (isset($titlematch) && is_array($titlematch) && count($titlematch) > 0) { $info['title'] = strip_tags($titlematch[1]); } preg_match_all('/<[\s]*meta[\s]*name="?' . '([^>"]*)"?[\s]*' . 'content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si', $this->request->results, $match); $ft=0; foreach($match[1] as $mt) { if($mt=="keywords" || $mt=="description") { $ft=1; } } if($ft==0) { preg_match_all('/<[\s]*meta[\s]*content="?([^>"]*)"?[\s]*name="?' . 
'([^>"]*)"?[\s]*[\/]?[\s]*>/si', $this->request->results, $match); if (isset($match) && is_array($match) && count($match) == 3) { $originals = $match[0]; $names = $match[2]; $values = $match[1]; if (count($originals) == count($names) && count($names) == count($values)) { $metaTags = array(); for ($i=0, $limiti=count($names); $i < $limiti; $i++) { $metaTags[$names[$i]] = array ( 'html' => htmlentities($originals[$i]), 'value' => $values[$i] ); } } } } else { if (isset($match) && is_array($match) && count($match) == 3) { $originals = $match[0]; $names = $match[1]; $values = $match[2]; if (count($originals) == count($names) && count($names) == count($values)) { $metaTags = array(); for ($i=0, $limiti=count($names); $i < $limiti; $i++) { $metaTags[$names[$i]] = array ( 'html' => htmlentities($originals[$i]), 'value' => $values[$i] ); } } } } $result = array ( 'metaTags' => $metaTags ); if(isset($result['metaTags']['keywords']['value'])) { $info['keywords']=$result['metaTags']['keywords']['value']; } else { $info['keywords']=""; } if(isset($result['metaTags']['description']['value'])) { $info['desc']=$result['metaTags']['description']['value']; } else { $info['desc']=""; } $domain=preg_replace('/http\:\/\//si', '', $this->url); $ip=@gethostbyname($domain); $ip_arr=explode(".", $ip); if(count($ip_arr)==4) { $info['ip']=$ip; } return $info; } public function t($string,$o) { for($i=0;$i<strlen($string);$i++) { if(ord($string{$i})<128) continue; if((ord($string{$i})&224)==224) { //第一个字节判断通过 $char = $string{++$i}; if((ord($char)&128)==128) { //第二个字节判断通过 $char = $string{++$i}; if((ord($char)&128)==128) { $encoding = "UTF-8"; break; } } } if((ord($string{$i})&192)==192) { //第一个字节判断通过 $char = $string{++$i}; if((ord($char)&128)==128) { //第二个字节判断通过 $encoding = "GB2312"; break; } } } return strtolower($encoding); } function uni_decode ($str, $code = 'utf-8'){ $str = json_decode(preg_replace_callback('/&#(\d{5});/', create_function('$dec', 'return \'\\u\'.dechex($dec[1]);'), 
'"'.$str.'"')); if($code != 'utf-8'){ $str = iconv('utf-8', $code, $str); } return $str; } //获取网站编码 public function getCharset() { if(!$this->open($this->url)){return false;exit;} //首先从html获取编码 preg_match("/<meta.+?charset=[^\w]?([-\w]+)/i",$this->request->results,$temp) ? strtolower($temp[1]):""; if($temp[1]!="") { if(in_array($temp[1], $this->charset_arr)) { if($temp[1]=="gb2312") { $tmp_charset=$this->t($this->request->results,$temp[1]); if($tmp_charset==$temp[1]) { return $temp[1]; } } else { return $temp[1]; } } } if(!empty($this->request->headers)) { //从header中获取编码 $hstr=strtolower(implode("</td></tr></table> </div> <div class=""> <ul class="m-news-opt fix"> <li class="opt-item"> <a href='/PHPjiqiao-178816.html' target='_blank'><p>< 上一篇</p><p class="ellipsis">根据汉字取拼音,并附有编码功能</p></a> </li> <li class="opt-item ta-r"> <a href='/PHPjiqiao-178818.html' target='_blank'><p>下一篇 ></p><p class="ellipsis">php实现DOS攻击</p></a> </li> </ul> </div> </div> </div> <div class="g-title fix"> <h2 class="title-txt">人气教程排行</h2> </div> <div class="m-rank u-dashed mb40"> <ul> <li class="rank-item"> <a href="/PHPjiqiao-379253.html" title='php如何获取跳转前的url' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num top">1</span> php如何获取跳转前的url </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-379019.html" title='php格林威治时间转换成当前时间的方法' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num second">2</span> php格林威治时间转换成当前时间的方法 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-366629.html" title='为什么php不能做大型系统?' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num third">3</span> 为什么php不能做大型系统? 
</a> </li> <li class="rank-item"> <a href="/PHPjiqiao-207623.html" title='range函数怎么用' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">4</span> range函数怎么用 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-162433.html" title='php中计算页面加载时间几种方法总结_PHP教程' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">5</span> php中计算页面加载时间几种方法总结_PHP教程 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-140221.html" title='求帮助,关于paypal支付返回值修改订单状态' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">6</span> 求帮助,关于paypal支付返回值修改订单状态 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-103588.html" title='typecho怎么配置文章内容页?' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">7</span> typecho怎么配置文章内容页? </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-99213.html" title='PhpStorm左侧structure不显示文件的方法列表是这么回事?' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">8</span> PhpStorm左侧structure不显示文件的方法列表是这么回事? 
</a> </li> <li class="rank-item"> <a href="/PHPjiqiao-92208.html" title='查看PHP的环境变量_PHP' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">9</span> 查看PHP的环境变量_PHP </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-170.html" title='PHP Primary script unknown 解决方法总结' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">10</span> PHP Primary script unknown 解决方法总结 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-148.html" title='php的命名空间与自动加载实现方法' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">11</span> php的命名空间与自动加载实现方法 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-133.html" title='解决laravel 出现ajax请求419(unknown status)的问题' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">12</span> 解决laravel 出现ajax请求419(unknown status)的问题 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-462817.html" title='php 如何删除mysql记录' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">173次</span> <span class="g-sort-num ">13</span> php 如何删除mysql记录 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-388448.html" title='PHP如何替换数组中的指定元素' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">173次</span> <span class="g-sort-num ">14</span> PHP如何替换数组中的指定元素 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-124270.html" title='怎么去除字符串中非汉字、非字母、非数字的字符' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">173次</span> <span class="g-sort-num ">15</span> 怎么去除字符串中非汉字、非字母、非数字的字符 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-112291.html" title='mysql如何一次执行多条SQL语句' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">173次</span> <span class="g-sort-num ">16</span> mysql如何一次执行多条SQL语句 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-110669.html" 
title='修改header里面的Connection为close解决方法' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">173次</span> <span class="g-sort-num ">17</span> 修改header里面的Connection为close解决方法 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-153.html" title='PHP基于session.upload_progress 实现文件上传进度显示功能详解' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">173次</span> <span class="g-sort-num ">18</span> PHP基于session.upload_progress 实现文件上传进度显示功能详解 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-125.html" title='php5.6.x到php7.0.x特性小结' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">173次</span> <span class="g-sort-num ">19</span> php5.6.x到php7.0.x特性小结 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-378118.html" title='php为什么会出现504错误' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">172次</span> <span class="g-sort-num ">20</span> php为什么会出现504错误 </a> </li> </ul> </div> </div> </div> <!-- / 教程内容页 --> </div> </div> <!-- 页尾 --> <div class="footer"> 本站所有资源全部来源于网络,若本站发布的内容侵害到您的隐私或者利益,请联系我们删除!</div> <!-- / 页尾 --> <script type="text/javascript" src="/kan/js/read.js"></script> <div style="display:none"> <div class="login-box" id="login-dialog"> <div class="login-top"><a class="current" rel="nofollow" id="login1" onclick="setTab('login',1,2);" >登录</a></div> <div class="login-form" id="nav-signin"> <!-- <div class="login-ico"><a rel="nofollow" class="qq" id="qqlogin" target="_blank" href="/user-center-qqlogin.html"> QQ </a></div> --> <div class="login-box-form" id="con_login_1"> <form id="loginform" action="/user-center-login.html" method="post" onsubmit="return false;"> <p class="int-text"> <input class="email" id="username" name="username" type="text" value="用户名或Email" onfocus="if(this.value=='用户名或Email'){this.value='';}" onblur="if(this.value==''){this.value='用户名或Email';};" ></p> <p class="int-text"> <input class="password1" type="password" id="password" name="password" value="******" 
onBlur="if(this.value=='') this.value='******';" onFocus="if(this.value=='******') this.value='';" > </p> <p class="int-info"> <label class="ui-label"> </label> <label for="agreement" class="ui-label-checkbox"> <input type="checkbox" value="" name="cookietime" id="cookietime" checked="checked" value="2592000"> <input type="hidden" name="notforward" id="notforward" value="1"> <input type="hidden" name="dosubmit" id="dosubmit" value="1">记住我的登录 </label> <a rel="nofollow" class="aright" href="/user-center-forgetpwd.html" target="_blank"> 忘记密码? </a></p> <p class="int-btn"><a rel="nofollow" id="loginbt" class="loginbtn"><span>登录</span></a></p> </form> </div> <form id="regform" action="/user-center-reg.html" method="post"> <div class="login-reg" style="display: none;" id="con_login_2"> <input type="hidden" name="t" id="t"/> <p class="int-text"> <input id="email" name="email" type="text" value="Email" onfocus="if(this.value=='Email'){this.value='';}" onblur="if(this.value==''){this.value='Email';};"></p> <p class="int-text"> <input id="uname" name="username" type="text" value="用户名或昵称" onfocus="if(this.value=='用户名或昵称'){this.value='';}" onblur="if(this.value==''){this.value='用户名或昵称';};"></p> <p class="int-text"> <input type="password" id="pwd" name="password" value="******" onBlur="if(this.value=='') this.value='******';" onFocus="if(this.value=='******') this.value='';"> </p> <p class="int-text1"><span class="inputbox"> <input id="validate" name="validate" type="text" value="验证码" onfocus="if(this.value=='验证码'){this.value='';}" onblur="if(this.value==''){this.value='验证码';};"> </span><span class="yzm-img"><img src="/user-checkcode-index" alt="看不清楚换一张" id="indexlogin"></p> <p class="int-info"> <label> <input value="" name="agreement" id="agreement" CHECKED="checked" type="checkbox"> 我已阅读<a rel="nofollow" href="/user-center-agreement.html">用户协议</a>及<a rel="nofollow" href="/user-center-agreement.html">版权声明</a></label> </p> <p class="int-btn"><input type="hidden" 
name="dosubmit"/> <a rel="nofollow" class="loginbtn" id="register"><span>注册</span></a></p> </div> </form> </div> </div> </div> </div> <script type="text/javascript" src="/kan/js/foot_js.js"></script> <script> var _hmt = _hmt || []; (function() { var hm = document.createElement("script"); hm.src = "https://hm.baidu.com/hm.js?6dc1c3c5281cf70f49bc0bc860ec24f2"; var s = document.getElementsByTagName("script")[0]; s.parentNode.insertBefore(hm, s); })(); </script> <script type="text/javascript" src="/layui/layui.js"></script> <script> layui.use('code', function() { layui.code({ elem: 'pre', //默认值为.layui-code about: false, skin: 'notepad', title: 'php怎么实现数据库验证跳转代码块', encode: true //是否转义html标签。默认不开启 }); }); </script> </body> </html>