时间:2021-07-01 10:21:17 帮助过:46人阅读
百度dict 采集样本
本文分享一个采集百度 dict 词典翻译结果全部数据的程序,项目中还附带了 13.5 万条单词库和一个简单的采集示例。这里把主要的类 dict.class.php 放出来,项目地址:http://github.com/widuu/baidu_dict ,有需要的直接 fork 就可以了~么么哒。这东西用的人很少,所以觉得有用的兄弟尽管拿走哈~
<?php
/**
 * dict.class.php - scrapes the Baidu dictionary result page for a word.
 *
 * NOTE(review): this listing was recovered from a web page whose HTML
 * extraction collapsed the code onto a few lines and stripped tag-like text
 * out of several string literals. Literals that are visibly damaged are kept
 * byte-for-byte (including embedded line breaks) and are marked with
 * NOTE(review) comments below; they have to be restored from the upstream
 * project (http://github.com/widuu/baidu_dict) before getExplain(),
 * getSynonym() and getPhrase() can return data.
 *
 * @copyright (C) 2014 widuu
 * @license http://www.widuu.com
 * @lastmodify 2014-2-15
 */

header("content-type:text/html;charset=utf8");

class Dict{

    /** @var string Word currently being looked up; assigned by content(). */
    private $word;

    /** @var array Result pages already downloaded, keyed by word. */
    private $pageCache = array();

    // Maximum number of phrase entries kept by getPhrase().
    private static $num = 10;

    public function __construct(){}

    /**
     * Look up a word and return everything the extractors find on its page.
     *
     * The extractors run in the order of the keys below. They all read the
     * same page, which getContent() downloads once per word.
     *
     * @param string $word English word
     * @return array array(
     *     "symbol"  => phonetic symbols,
     *     "pro"     => pronunciation,
     *     "example" => example sentences,
     *     "explain" => concise definitions,
     *     "synonym" => synonyms and antonyms,
     *     "phrase"  => phrases,
     * )
     */
    public function content($word){
        $this->word = $word;

        return array(
            "symbol"  => $this->Pronounced(),
            "pro"     => $this->getSay(),
            "example" => $this->getExample(),
            "explain" => $this->getExplain(),
            "synonym" => $this->getSynonym(),
            "phrase"  => $this->getPhrase(),
        );
    }

    /**
     * Download the result page for the current word with cURL.
     *
     * A successfully downloaded page is cached per word, so one content()
     * call issues a single HTTP request instead of one per extractor.
     * On a transfer error the cURL message is echoed and an empty string is
     * returned; failures are not cached.
     *
     * @return string page body, or '' when the transfer failed
     */
    private function getContent(){
        $word = (string) $this->word;
        if (isset($this->pageCache[$word])) {
            return $this->pageCache[$word];
        }

        $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
        // The word is URL-encoded so spaces and non-ASCII characters cannot break the query string.
        $url = "http://dict.baidu.com/s?wd=".urlencode($word);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_HTTPGET, 1);
        curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        $result = curl_exec($ch);
        // The error check has to use the handle created above ($ch).
        if (curl_errno($ch)) {
            echo 'Errno'.curl_error($ch);
        }
        curl_close($ch);

        if (!is_string($result)) {
            return '';
        }
        $this->pageCache[$word] = $result;
        return $result;
    }

    /**
     * Return the first capture group of the first match, or '' if there is none.
     *
     * @param string $pattern PCRE pattern with at least one capture group
     * @param string $subject text to search
     * @return string
     */
    private function firstCapture($pattern, $subject){
        preg_match_all($pattern, $subject, $matches);
        return isset($matches[1][0]) ? $matches[1][0] : '';
    }

    /**
     * Map the first two captures of a preg_match_all() result to 'en' / 'us'.
     *
     * @param mixed $matches match array filled by preg_match_all()
     * @return array array('en' => string|null, 'us' => string|null)
     */
    private function firstTwoCaptures($matches){
        return array(
            'en' => isset($matches[1][0]) ? $matches[1][0] : null,
            'us' => isset($matches[1][1]) ? $matches[1][1] : null,
        );
    }

    /**
     * Extract the text of the first two elements marked "EN-US" on the page.
     *
     * content() exposes the result as "symbol" (phonetic symbols); per the
     * original comments the two entries are the British and American forms.
     *
     * @return array array('en' => string|null, 'us' => string|null)
     */
    private function Pronounced(){
        preg_match_all("/\"EN\-US\"\>(.*)\<\/b\>/Ui", $this->getContent(), $pronounced);
        return $this->firstTwoCaptures($pronounced);
    }

    /**
     * Extract the first two url="..." attribute values on the page.
     *
     * content() exposes the result as "pro" (pronunciation); per the original
     * comments the two entries are the British and American pronunciations.
     *
     * @return array array('en' => string|null, 'us' => string|null)
     */
    private function getSay(){
        preg_match_all("/url=\"(.*)\"/Ui", $this->getContent(), $pronounced);
        return $this->firstTwoCaptures($pronounced);
    }

    /**
     * Extract the example sentences from the page's "var example_data = ...;"
     * JavaScript literal.
     *
     * The literal is split on "[[[" and then on "[[". Within each piece the
     * first quoted string of every "[" element is collected and joined with
     * spaces into one sentence. Sentences are then grouped in pairs.
     *
     * @return array list of array(sentence, sentence) pairs; empty when the
     *               page has no example data
     */
    private function getExample(){
        $raw = $this->firstCapture("/var example_data = (.*)\]\;/Us", $this->getContent());

        $sentences = array();
        foreach (explode("[[[", "[[[".ltrim($raw, "[")) as $group) {
            foreach (explode("[[", "[[".$group) as $piece) {
                preg_match_all("/\[\"(.*)\",/Us", "[".$piece, $match);
                if (!empty($match[1])) {
                    // Separator first: the legacy (array, string) order is rejected by PHP 8.
                    $sentences[] = implode(" ", $match[1]);
                }
            }
        }

        return array_chunk($sentences, 2);
    }

    /**
     * Extract the concise definitions block (id="en-simple-means").
     *
     * NOTE(review): all three patterns below lost tag-like text during
     * extraction (the section terminator, the opening tags and the names of
     * the named groups). They are reproduced exactly as found. The code reads
     * the groups "adj"/"name" and "tag"/"word", so the restored patterns must
     * define those names. Until then this method returns empty arrays.
     *
     * @return array array("x" => array(adj => name), "b" => array(tag => word))
     */
    private function getExplain(){
        $data = $this->getContent();

        preg_match_all("/id\=\"en\-simple\-means\"\>(.*)\/Us",$data,$explain);
        $r_data = isset($explain[1][0]) ? $explain[1][0] : '';

        preg_match_all("/\\(?P.*)\<\/strong\>\(?P
.*)\<\/span\>\<\/p\>/Us", $r_data, $a_data);
        preg_match_all("/\(?P [^\>]+)\:\(?P .*)\<\/a\>\<\/span\>/Us", $r_data, $b_data);

        $result = array();
        if (isset($a_data["adj"], $a_data["name"])) {
            foreach ($a_data["adj"] as $i => $adj) {
                $result[$adj] = $a_data["name"][$i];
            }
        }

        $word_b = array();
        if (isset($b_data["tag"], $b_data["word"])) {
            foreach ($b_data["tag"] as $i => $tag) {
                $word_b[$tag] = strip_tags($b_data["word"][$i]);
            }
        }

        return array("x" => $result, "b" => $word_b);
    }

    /**
     * Extract the synonym / antonym block (id="en-syn-ant").
     *
     * NOTE(review): the section pattern, the section separator, both inner
     * patterns and the preg_replace() pattern lost tag-like text during
     * extraction and are reproduced exactly as found. The code reads the
     * groups "adj"/"content" and "title"/"value". Until the literals are
     * restored this method returns an empty array.
     *
     * @return array nested array: section index => adj => title => text
     */
    private function getSynonym(){
        $data = $this->getContent();

        preg_match_all("/id=\"en\-syn\-ant\"\>(.*)/Us",$data,$synonym);
        $content = isset($synonym[1][0]) ? $synonym[1][0] : '';

        // The separator literal arrived empty. explode() rejects an empty
        // separator (ValueError on PHP 8), so the block is treated as a
        // single section until the real separator is restored.
        $separator = "";
        $sections = ($separator === "") ? array($content) : explode($separator, $content);

        $result = array();
        foreach ($sections as $key => $section) {
            preg_match_all("/\(?P.*)\ \;\<\/strong\>\<\/div\>\\ (?
.*)\<\/ul\>/Us", $section, $r_data);
            $adjs  = isset($r_data["adj"]) ? $r_data["adj"] : array();
            $lists = isset($r_data["content"]) ? $r_data["content"] : array();

            foreach ($lists as $k => $list) {
                if (empty($list)) {
                    continue;
                }
                preg_match_all("/\ - \
(?P
.*)\<\/p\>(?P<value>.*)\<\/li>/Us", $list, $v_data);
                if (!isset($v_data['title'], $v_data['value'])) {
                    continue;
                }
                foreach ($v_data['title'] as $m => $title) {
                    $text = strip_tags(preg_replace("<>"," ", $v_data["value"][$m]));
                    $result[$key][$adjs[$k]][$title] = $text;
                }
            }
        }

        return $result;
    }

    /**
     * Build the array key for a phrase: tags and spaces removed, then trimmed.
     *
     * @param string $html phrase markup
     * @return string
     */
    private function phraseKey($html){
        return trim(str_replace(" ","",strip_tags($html)));
    }

    /**
     * Extract up to self::$num phrase entries from the block id="en-phrase".
     *
     * The block is split on "</dd>" into entries and each entry on "</p>".
     * The first part is the phrase and the second its definition. When an
     * entry has more than three parts, the remaining parts (minus the last)
     * are grouped in pairs and nested into the value as
     * array(previous value, pair).
     *
     * NOTE(review): the section pattern lost its terminator during extraction
     * and is reproduced exactly as found. Until it is restored this method
     * returns an empty array.
     *
     * @return array phrase => definition string, or phrase => nested array
     */
    private function getPhrase(){
        $data = $this->getContent();

        preg_match_all("/id=\"en\-phrase\"\>(.*)\/Us",$data,$phrase);
        $block = isset($phrase[1][0]) ? $phrase[1][0] : '';

        $entries = array_slice(explode("</dd>", $block), 0, self::$num);
        $result = array();
        foreach ($entries as $entry) {
            $parts = explode("</p>", $entry);
            if (!isset($parts[1])) {
                // Text after the last "</dd>" has no definition part; skip it.
                continue;
            }

            $n = count($parts);
            $phraseKey = $this->phraseKey($parts[0]);
            $result[$phraseKey] = strip_tags($parts[1]);
            if ($n <= 3) {
                continue;
            }

            // Everything except the head pair and the final part, by value.
            $head = array_slice($parts, 0, 2);
            $rest = array_diff(array_slice($parts, 0, $n - 1), $head);
            foreach (array_chunk($rest, 2) as $pair) {
                $pair = array_map('strip_tags', $pair);
                $result[$phraseKey] = array($result[$phraseKey], $pair);
            }
        }

        return $result;
    }

    /**
     * Convert an array to its var_export() string form with slashes added.
     *
     * @param array $data       data to convert
     * @param bool  $isformdata when 0, new_stripslashes() is not applied first;
     *                          optional, defaults to 1
     * @return string the exported string, or '' when $data compares equal to ''
     */
    private function array2string($data, $isformdata = 1) {
        if($data == '') return '';
        if($isformdata) $data = $this->new_stripslashes($data);
        return addslashes(var_export($data, TRUE));
    }

    /**
     * Apply stripslashes() to a string, or recursively to every array element.
     *
     * @param mixed $string string or array to process
     * @return mixed
     */
    private function new_stripslashes($string) {
        if(!is_array($string)) return stripslashes($string);
        foreach($string as $key => $val) $string[$key] = $this->new_stripslashes($val);
        return $string;
    }
}

// Usage sample left by the author. As declared above, the constructor takes
// no arguments and content() expects the word as its argument.
// $word = new dict("express"); // $word ->content();</pre> </p> <p>以上就是本文的全部内容了,非常实用的功能,希望小伙伴们能够喜欢。 </div> <div class=""> <ul class="m-news-opt fix"> <li class="opt-item"> <a 
href='/PHPjiqiao-85991.html' target='_blank'><p>< 上一篇</p><p class="ellipsis">php实现检查文章是否被百度收录_PHP</p></a> </li> <li class="opt-item ta-r"> <a href='/PHPjiqiao-85993.html' target='_blank'><p>下一篇 ></p><p class="ellipsis">php+mysqli批量查询多张表数据的方法_PHP</p></a> </li> </ul> </div> </div> </div> <div class="g-title fix"> <h2 class="title-txt">人气教程排行</h2> </div> <div class="m-rank u-dashed mb40"> <ul> <li class="rank-item"> <a href="/PHPjiqiao-379253.html" title='php如何获取跳转前的url' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num top">1</span> php如何获取跳转前的url </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-379019.html" title='php格林威治时间转换成当前时间的方法' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num second">2</span> php格林威治时间转换成当前时间的方法 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-366629.html" title='为什么php不能做大型系统?' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num third">3</span> 为什么php不能做大型系统? </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-207623.html" title='range函数怎么用' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">4</span> range函数怎么用 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-162433.html" title='php中计算页面加载时间几种方法总结_PHP教程' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">5</span> php中计算页面加载时间几种方法总结_PHP教程 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-140221.html" title='求帮助,关于paypal支付返回值修改订单状态' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">6</span> 求帮助,关于paypal支付返回值修改订单状态 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-103588.html" title='typecho怎么配置文章内容页?' 
class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">7</span> typecho怎么配置文章内容页? </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-99213.html" title='PhpStorm左侧structure不显示文件的方法列表是这么回事?' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">8</span> PhpStorm左侧structure不显示文件的方法列表是这么回事? </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-92208.html" title='查看PHP的环境变量_PHP' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">9</span> 查看PHP的环境变量_PHP </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-170.html" title='PHP Primary script unknown 解决方法总结' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">10</span> PHP Primary script unknown 解决方法总结 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-148.html" title='php的命名空间与自动加载实现方法' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">11</span> php的命名空间与自动加载实现方法 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-133.html" title='解决laravel 出现ajax请求419(unknown status)的问题' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">174次</span> <span class="g-sort-num ">12</span> 解决laravel 出现ajax请求419(unknown status)的问题 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-462817.html" title='php 如何删除mysql记录' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">173次</span> <span class="g-sort-num ">13</span> php 如何删除mysql记录 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-388448.html" title='PHP如何替换数组中的指定元素' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">173次</span> <span class="g-sort-num ">14</span> PHP如何替换数组中的指定元素 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-124270.html" title='怎么去除字符串中非汉字、非字母、非数字的字符' class="item-name ellipsis" target="_blank"> <span 
class="g-art-count fr">173次</span> <span class="g-sort-num ">15</span> 怎么去除字符串中非汉字、非字母、非数字的字符 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-112291.html" title='mysql如何一次执行多条SQL语句' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">173次</span> <span class="g-sort-num ">16</span> mysql如何一次执行多条SQL语句 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-110669.html" title='修改header里面的Connection为close解决方法' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">173次</span> <span class="g-sort-num ">17</span> 修改header里面的Connection为close解决方法 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-153.html" title='PHP基于session.upload_progress 实现文件上传进度显示功能详解' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">173次</span> <span class="g-sort-num ">18</span> PHP基于session.upload_progress 实现文件上传进度显示功能详解 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-125.html" title='php5.6.x到php7.0.x特性小结' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">173次</span> <span class="g-sort-num ">19</span> php5.6.x到php7.0.x特性小结 </a> </li> <li class="rank-item"> <a href="/PHPjiqiao-378118.html" title='php为什么会出现504错误' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">172次</span> <span class="g-sort-num ">20</span> php为什么会出现504错误 </a> </li> </ul> </div> </div> </div> <!-- / 教程内容页 --> </div> </div> <!-- 页尾 --> <div class="footer"> 本站所有资源全部来源于网络,若本站发布的内容侵害到您的隐私或者利益,请联系我们删除!</div> <!-- / 页尾 --> <script type="text/javascript" src="/kan/js/read.js"></script> <div style="display:none"> <div class="login-box" id="login-dialog"> <div class="login-top"><a class="current" rel="nofollow" id="login1" onclick="setTab('login',1,2);" >登录</a></div> <div class="login-form" id="nav-signin"> <!-- <div class="login-ico"><a rel="nofollow" class="qq" id="qqlogin" target="_blank" href="/user-center-qqlogin.html"> QQ </a></div> --> <div class="login-box-form" id="con_login_1"> <form id="loginform" 
action="/user-center-login.html" method="post" onsubmit="return false;"> <p class="int-text"> <input class="email" id="username" name="username" type="text" value="用户名或Email" onfocus="if(this.value=='用户名或Email'){this.value='';}" onblur="if(this.value==''){this.value='用户名或Email';};" ></p> <p class="int-text"> <input class="password1" type="password" id="password" name="password" value="******" onBlur="if(this.value=='') this.value='******';" onFocus="if(this.value=='******') this.value='';" > </p> <p class="int-info"> <label class="ui-label"> </label> <label for="agreement" class="ui-label-checkbox"> <input type="checkbox" value="" name="cookietime" id="cookietime" checked="checked" value="2592000"> <input type="hidden" name="notforward" id="notforward" value="1"> <input type="hidden" name="dosubmit" id="dosubmit" value="1">记住我的登录 </label> <a rel="nofollow" class="aright" href="/user-center-forgetpwd.html" target="_blank"> 忘记密码? </a></p> <p class="int-btn"><a rel="nofollow" id="loginbt" class="loginbtn"><span>登录</span></a></p> </form> </div> <form id="regform" action="/user-center-reg.html" method="post"> <div class="login-reg" style="display: none;" id="con_login_2"> <input type="hidden" name="t" id="t"/> <p class="int-text"> <input id="email" name="email" type="text" value="Email" onfocus="if(this.value=='Email'){this.value='';}" onblur="if(this.value==''){this.value='Email';};"></p> <p class="int-text"> <input id="uname" name="username" type="text" value="用户名或昵称" onfocus="if(this.value=='用户名或昵称'){this.value='';}" onblur="if(this.value==''){this.value='用户名或昵称';};"></p> <p class="int-text"> <input type="password" id="pwd" name="password" value="******" onBlur="if(this.value=='') this.value='******';" onFocus="if(this.value=='******') this.value='';"> </p> <p class="int-text1"><span class="inputbox"> <input id="validate" name="validate" type="text" value="验证码" onfocus="if(this.value=='验证码'){this.value='';}" onblur="if(this.value==''){this.value='验证码';};"> 
</span><span class="yzm-img"><img src="/user-checkcode-index" alt="看不清楚换一张" id="indexlogin"></p> <p class="int-info"> <label> <input value="" name="agreement" id="agreement" CHECKED="checked" type="checkbox"> 我已阅读<a rel="nofollow" href="/user-center-agreement.html">用户协议</a>及<a rel="nofollow" href="/user-center-agreement.html">版权声明</a></label> </p> <p class="int-btn"><input type="hidden" name="dosubmit"/> <a rel="nofollow" class="loginbtn" id="register"><span>注册</span></a></p> </div> </form> </div> </div> </div> </div> <script type="text/javascript" src="/kan/js/foot_js.js"></script> <script> var _hmt = _hmt || []; (function() { var hm = document.createElement("script"); hm.src = "https://hm.baidu.com/hm.js?6dc1c3c5281cf70f49bc0bc860ec24f2"; var s = document.getElementsByTagName("script")[0]; s.parentNode.insertBefore(hm, s); })(); </script> <script type="text/javascript" src="/layui/layui.js"></script> <script> layui.use('code', function() { layui.code({ elem: 'pre', //默认值为.layui-code about: false, skin: 'notepad', title: 'php怎么实现数据库验证跳转代码块', encode: true //是否转义html标签。默认不开启 }); }); </script> </body> </html>