当前位置:Gxlcms > PHP教程 > PHP制作百度词典查词采集器_php实例

PHP制作百度词典查词采集器_php实例

时间:2021-07-01 10:21:17 帮助过:13人阅读

百度dict 采集样本

我写了一个采集百度词典(dict)翻译结果全部数据的工具,并附带了 13.5 万条单词库和一个简单的采集示例。这里把主要的类 dict.class.php 放出来,项目地址:http://github.com/widuu/baidu_dict,有需要的直接 fork 就可以了~么么哒。这东西用的人很少,所以有用的兄弟拿走了哈~

<?php
/**
 * dict.class.php 采集百度词典翻译内容
 *
 * @copyright      (C) 2014 widuu
 * @license       http://www.widuu.com
 * @lastmodify     2014-2-15
 */
 
 
// Declare the response as UTF-8 HTML.
// NOTE(review): this is a top-level side effect, so it runs every time this class file is included.
header("content-type:text/html;charset=utf8");
class Dict{

	// Word currently being looked up; set by content().
	private $word;

	// Downloaded result pages keyed by word, so one content() call issues a
	// single HTTP request instead of one request per extractor.
	private $cache = array();

	// Maximum number of phrase entries returned by getPhrase().
	private static $num = 10;

	public function __construct(){}


	/**
	 * Look up a word on Baidu dict and return everything scraped from the result page.
	 *
	 * NOTE(review): getExplain(), getSynonym() and getPhrase() contain patterns that
	 * appear to have been damaged when this code was published (see the notes on
	 * those methods); they must be restored before this method can succeed.
	 *
	 * @param string $word English word to look up
	 * @return array array(
	 *				"symbol" => phonetic symbols
	 *				"pro"	 => pronunciation
	 *				"example"=> example sentences
	 *				"explain"=> concise definitions
	 *				"synonym"=> synonyms / antonyms
	 *				"phrase" => phrases
	 *			)
	 */
	public function content($word){
		$this->word = $word;
		// Forget any page kept from an earlier lookup so each call fetches fresh data.
		$this->cache = array();
		return array(
			"symbol" => $this->Pronounced(),
			"pro"	 => $this->getSay(),
			"example"=> $this->getExample(),
			"explain"=> $this->getExplain(),
			"synonym"=> $this->getSynonym(),
			"phrase" => $this->getPhrase()
		);
	}


	/**
	 * Download the Baidu dict result page for the current word using cURL.
	 *
	 * The page is kept in $this->cache, so repeated calls for the same word
	 * reuse the first response. On a transfer error the cURL message is
	 * echoed and an empty string is returned.
	 *
	 * @return string page HTML, or "" when the request failed
	 */
	private function getContent(){
		$word = $this->word;
		if (isset($this->cache[$word])) {
			return $this->cache[$word];
		}

		$useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
		// Encode the word so spaces, "&" and non-ASCII characters cannot break the query string.
		$url = "http://dict.baidu.com/s?wd=".urlencode($word);

		$ch = curl_init();
		curl_setopt($ch, CURLOPT_URL, $url);
		curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
		curl_setopt($ch, CURLOPT_HTTPGET, 1);
		curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
		curl_setopt($ch, CURLOPT_HEADER, 0);
		curl_setopt($ch, CURLOPT_TIMEOUT, 30);
		$result = curl_exec($ch);
		// Query the handle that was actually used for the transfer.
		if (curl_errno($ch)) {
			echo 'Errno'.curl_error($ch);
		}
		curl_close($ch);

		// curl_exec() returns false on failure; normalise so the extractors always get a string.
		if ($result === false) {
			$result = "";
		}
		$this->cache[$word] = $result;
		return $result;
	}


	/**
	 * Extract the text of the first two `"EN-US">...</b>` elements on the page.
	 *
	 * @return array array('en' => first match, 'us' => second match); null where the page has no match
	 */
	private function Pronounced(){
		$data = $this->getContent();
		preg_match_all("/\"EN\-US\"\>(.*)\<\/b\>/Ui", $data, $pronounced);
		return array(
			'en' => isset($pronounced[1][0]) ? $pronounced[1][0] : null,
			'us' => isset($pronounced[1][1]) ? $pronounced[1][1] : null
		);
	}

	/**
	 * Extract the first two `url="..."` attribute values on the page
	 * (presumably the pronunciation audio URLs - confirm against the live markup).
	 *
	 * @return array array('en' => first match, 'us' => second match); null where the page has no match
	 */
	private function getSay(){
		$data = $this->getContent();
		preg_match_all("/url=\"(.*)\"/Ui", $data, $pronounced);
		return array(
			'en' => isset($pronounced[1][0]) ? $pronounced[1][0] : null,
			'us' => isset($pronounced[1][1]) ? $pronounced[1][1] : null
		);
	}

	/**
	 * Extract example sentences from the page's `var example_data = [...]` script block.
	 *
	 * The nested array literal is split on "[[[" and "[[" boundaries; inside each
	 * fragment the first string of every `["word", ...]` entry is collected and the
	 * words are joined with spaces to rebuild one sentence.
	 *
	 * @return array list of chunks of two consecutive sentences (presumably a
	 *               sentence and its translation); empty when the page has no example data
	 */
	private function getExample(){
		$data = $this->getContent();
		preg_match_all("/var example_data = (.*)\]\;/Us", $data, $example);
		if (!isset($example[1][0])) {
			return array();
		}

		$str = "";
		$data1 = "[[[".ltrim($example[1][0], "[");
		foreach (explode("[[[", $data1) as $value) {
			foreach (explode("[[", "[[".$value) as $v) {
				preg_match_all("/\[\"(.*)\",/Us", "[".$v, $match);
				if (!empty($match[1])) {
					// Separator first: the legacy (array, separator) order is rejected by PHP 8.
					$str .= implode(" ", $match[1])."@";
				}
			}
		}

		$data4 = trim($str, "@");
		if ($data4 === "") {
			return array();
		}
		return array_chunk(explode("@", $data4), 2);
	}

	/**
	 * Extract the concise definitions block (`id="en-simple-means"`).
	 *
	 * NOTE(review): the three patterns below appear to have lost their HTML tag
	 * fragments and named-group names when this code was published. As written,
	 * the first pattern ends in an escaped "/" (so it has no closing delimiter)
	 * and the other two no longer define the groups "adj"/"name" and
	 * "tag"/"word" that the loops read. Restore them from the upstream
	 * repository before relying on this method.
	 *
	 * @return array array("x" => map built from groups "adj" => "name",
	 *                     "b" => map built from groups "tag" => tag-stripped "word")
	 **/
	private function getExplain(){
		$data = $this -> getContent();
		preg_match_all("/id\=\"en\-simple\-means\"\>(.*)\/Us",$data,$explain);
		$r_data = $explain[1][0];
		preg_match_all("/\\(?P.*)\<\/strong\>\(?P.*)\<\/span\>\<\/p\>/Us", $r_data, $a_data);
		preg_match_all("/\(?P[^\>]+)\:\(?P.*)\<\/a\>\<\/span\>/Us", $r_data, $b_data);

		$result = array();
		foreach ($a_data["adj"] as $key => $value) {
			$result[$value] = $a_data["name"][$key];
		}

		$word_b = array();
		foreach ($b_data["tag"] as $key => $value) {
			$word_b[$value] = strip_tags($b_data["word"][$key]);
		}

		$result_data = array("x" => $result,"b" => $word_b);

		return $result_data;
	}


	/**
	 * Extract the synonym / antonym block (`id="en-syn-ant"`).
	 *
	 * NOTE(review): this method is damaged in the same way as getExplain():
	 *  - the explode() delimiter is an empty string, which explode() rejects
	 *    (warning on PHP 5/7, ValueError on PHP 8);
	 *  - the patterns no longer define the groups "adj", "content" and "title"
	 *    that the code reads;
	 *  - preg_replace("<>", ...) uses "<" and ">" as delimiters, leaving an empty
	 *    pattern - presumably an HTML tag was stripped from it.
	 * Restore the originals from the upstream repository.
	 *
	 * @return array nested array indexed by section, then group "adj", then group "title",
	 *               holding the tag-stripped "value" text
	 */
	private function getSynonym(){
		$data = $this -> getContent();
		preg_match_all("/id=\"en\-syn\-ant\"\>(.*)/Us",$data,$synonym);
		$content = $synonym[1][0];
		$data1 = explode("", $content);
		$result = array();
		$data2 = array();
		foreach ($data1 as $key => $value) {
			preg_match_all("/\(?P.*)\ \;\<\/strong\>\<\/div\>\\(?.*)\<\/ul\>/Us", $value, $r_data);
			$data2[$key]["adj"] = $r_data["adj"];
			$data2[$key]["content"] = $r_data["content"];
		}

		foreach ($data2 as $key => $value) {
			foreach ($value["content"] as $k => $v) {
				if(!empty($v)){
					preg_match_all("/\\(?P.*)\<\/p\>(?P<value>.*)\<\/li>/Us", $v, $v_data);
					foreach ($v_data['title'] as $m => $d) {
						// Separate local name so the downloaded page in $data is not overwritten.
						$text = strip_tags(preg_replace("<>"," ", $v_data["value"][$m]));
						$result[$key][$value["adj"][$k]][$d] = $text;
					}
				}
			}
		}
		return $result;
	}

	/**
	 * Extract up to self::$num phrase entries from the phrase block (`id="en-phrase"`).
	 *
	 * Each entry is split on "</p>": the first fragment (tags and spaces removed)
	 * becomes the key and the second its value. Entries with more than three
	 * fragments get their remaining fragments attached in pairs.
	 *
	 * NOTE(review): the pattern below ends in an escaped "/" and therefore has no
	 * closing delimiter as written; the original terminator appears to have been
	 * stripped when this code was published. Restore it from the upstream repository.
	 *
	 * @return array map of phrase => meaning, or phrase => array(meaning, extra fragments)
	 */
	private function getPhrase(){
		$num = self::$num;
		$data = $this -> getContent();
		preg_match_all("/id=\"en\-phrase\"\>(.*)\/Us",$data,$phrase);
		$data = explode("</dd>",$phrase[1][0]);
		$data1 = array_slice($data,0,$num);
		$result = array();
		foreach ($data1 as $key => $value) {
			$data2 = explode("</p>", $value);
			$n = count($data2);
			if($n<=3){
				$result[str_replace(" ","",strip_tags($data2[0]))] = strip_tags($data2[1]);
			}else{
				// Drop the trailing fragment, then remove the leading key/value pair
				// (by value) to get the extra fragments.
				$data3 = array_slice($data2,0,$n-1);
				$data4 = array_slice($data2,0,2);
				$res = array_diff($data3,$data4);
				$data5 = array_chunk($res,2);
				$key_value = trim(str_replace(" ","",strip_tags($data4[0])));
				$result[$key_value] = strip_tags($data4[1]);
				// Own loop variables so the outer $key/$value are not shadowed.
				foreach ($data5 as $pair) {
					foreach ($pair as $k => $v) {
						$pair[$k] = strip_tags($v);
					}
					$array = array($result[$key_value],$pair);
					if (array_key_exists($key_value, $result)){
						$result[$key_value] = $array;
					}
				}

			}
		}
		return $result;
	}

	/**
	 * Convert an array to its var_export() string form, with slashes added.
	 *
	 * @param  array  $data    array to convert
	 * @param  bool  $isformdata when truthy (default), run new_stripslashes() on $data first
	 * @return  string exported string, or '' when $data is empty
	 */
	private function array2string($data, $isformdata = 1) {
	  if($data == '') return '';
	  if($isformdata) $data = $this->new_stripslashes($data);
	  return addslashes(var_export($data, TRUE));
	}

	/**
	 * Apply stripslashes() to a string, or recursively to every element of an array.
	 *
	 * @param mixed $string string or (nested) array to process
	 * @return mixed value of the same shape with slashes stripped
	 */
	private function new_stripslashes($string) {
	  if(!is_array($string)) return stripslashes($string);
	  foreach($string as $key => $val) $string[$key] = $this->new_stripslashes($val);
	  return $string;
	}

}

// $dict = new Dict();
// $result = $dict->content("express");</pre>

<p>以上就是本文的全部内容了,非常实用的功能,希望小伙伴们能够喜欢。</p>                    </div>

                  

	 	
                    <div class="">
                        <ul class="m-news-opt fix">
                            <li class="opt-item">
                                <a href='/PHPjiqiao-107008.html' target='_blank'><p>< 上一篇</p><p class="ellipsis">symfony表单与页面实现技巧_php实例</p></a>
                            </li>
                            <li class="opt-item ta-r">
                                 <a href='/PHPjiqiao-107010.html' target='_blank'><p>下一篇 ></p><p class="ellipsis">推荐几个开源的微信开发项目_php实例</p></a>
                            </li>
                        </ul>
                    </div>
                    
                    
                    
                    
                </div>
              
                    </div>
                
                  

                    <div class="g-title fix">
                        <h2 class="title-txt">人气教程排行</h2>
                    </div>
                    <div class="m-rank u-dashed mb40">
			
                        <ul>
						
 <li class="rank-item">
                                <a href="/PHPjiqiao-379253.html" title='php如何获取跳转前的url' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">174次</span>
                                    <span class="g-sort-num top">1</span>
                                   php如何获取跳转前的url                                </a>
                            </li>							  								  														  <li class="rank-item">
                                <a href="/PHPjiqiao-379019.html" title='php格林威治时间转换成当前时间的方法' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">174次</span>
                                    <span class="g-sort-num second">2</span>
                                   php格林威治时间转换成当前时间的方法                                </a>
                            </li>								  														  								  <li class="rank-item">
                                <a href="/PHPjiqiao-366629.html" title='为什么php不能做大型系统?' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">174次</span>
                                    <span class="g-sort-num third">3</span>
                                   为什么php不能做大型系统?                                </a>
                            </li>														  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-207623.html" title='range函数怎么用' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">174次</span>
                                    <span class="g-sort-num ">4</span>
                                   range函数怎么用                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-162433.html" title='php中计算页面加载时间几种方法总结_PHP教程' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">174次</span>
                                    <span class="g-sort-num ">5</span>
                                   php中计算页面加载时间几种方法总结_PHP教程                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-140221.html" title='求帮助,关于paypal支付返回值修改订单状态' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">174次</span>
                                    <span class="g-sort-num ">6</span>
                                   求帮助,关于paypal支付返回值修改订单状态                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-103588.html" title='typecho怎么配置文章内容页?' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">174次</span>
                                    <span class="g-sort-num ">7</span>
                                   typecho怎么配置文章内容页?                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-99213.html" title='PhpStorm左侧structure不显示文件的方法列表是这么回事?' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">174次</span>
                                    <span class="g-sort-num ">8</span>
                                   PhpStorm左侧structure不显示文件的方法列表是这么回事?                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-92208.html" title='查看PHP的环境变量_PHP' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">174次</span>
                                    <span class="g-sort-num ">9</span>
                                   查看PHP的环境变量_PHP                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-170.html" title='PHP Primary script unknown 解决方法总结' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">174次</span>
                                    <span class="g-sort-num ">10</span>
                                   PHP Primary script unknown 解决方法总结                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-148.html" title='php的命名空间与自动加载实现方法' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">174次</span>
                                    <span class="g-sort-num ">11</span>
                                   php的命名空间与自动加载实现方法                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-133.html" title='解决laravel 出现ajax请求419(unknown status)的问题' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">174次</span>
                                    <span class="g-sort-num ">12</span>
                                   解决laravel 出现ajax请求419(unknown status)的问题                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-462817.html" title='php 如何删除mysql记录' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">173次</span>
                                    <span class="g-sort-num ">13</span>
                                   php 如何删除mysql记录                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-388448.html" title='PHP如何替换数组中的指定元素' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">173次</span>
                                    <span class="g-sort-num ">14</span>
                                   PHP如何替换数组中的指定元素                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-124270.html" title='怎么去除字符串中非汉字、非字母、非数字的字符' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">173次</span>
                                    <span class="g-sort-num ">15</span>
                                   怎么去除字符串中非汉字、非字母、非数字的字符                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-112291.html" title='mysql如何一次执行多条SQL语句' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">173次</span>
                                    <span class="g-sort-num ">16</span>
                                   mysql如何一次执行多条SQL语句                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-110669.html" title='修改header里面的Connection为close解决方法' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">173次</span>
                                    <span class="g-sort-num ">17</span>
                                   修改header里面的Connection为close解决方法                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-153.html" title='PHP基于session.upload_progress 实现文件上传进度显示功能详解' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">173次</span>
                                    <span class="g-sort-num ">18</span>
                                   PHP基于session.upload_progress 实现文件上传进度显示功能详解                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-125.html" title='php5.6.x到php7.0.x特性小结' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">173次</span>
                                    <span class="g-sort-num ">19</span>
                                   php5.6.x到php7.0.x特性小结                                </a>
                            </li>							  								  							<li class="rank-item">
                                <a href="/PHPjiqiao-378118.html" title='php为什么会出现504错误' class="item-name ellipsis" target="_blank">
                                    <span class="g-art-count fr">172次</span>
                                    <span class="g-sort-num ">20</span>
                                   php为什么会出现504错误                                </a>
                            </li>

                        </ul>
                    </div>
                </div>
            </div>
            <!-- / 教程内容页 -->
        </div>
    </div>
  
<!-- 页尾 -->
<div class="footer">
   本站所有资源全部来源于网络,若本站发布的内容侵害到您的隐私或者利益,请联系我们删除!</div>
<!-- / 页尾 -->

 <script type="text/javascript" src="/kan/js/read.js"></script>

<div style="display:none">
<div class="login-box" id="login-dialog">
<div class="login-top"><a class="current" rel="nofollow" id="login1" onclick="setTab('login',1,2);" >登录</a></div>
<div class="login-form" id="nav-signin">
 <!-- <div class="login-ico"><a rel="nofollow" class="qq" id="qqlogin" target="_blank" href="/user-center-qqlogin.html"> QQ </a></div>  -->


<div class="login-box-form" id="con_login_1">
<form id="loginform" action="/user-center-login.html" method="post" onsubmit="return false;">
<p class="int-text">
<input class="email" id="username" name="username" type="text" value="用户名或Email" onfocus="if(this.value=='用户名或Email'){this.value='';}" onblur="if(this.value==''){this.value='用户名或Email';};" ></p>
<p class="int-text">
<input class="password1" type="password" id="password" name="password"  value="******"  onBlur="if(this.value=='') this.value='******';" onFocus="if(this.value=='******') this.value='';" >
</p>
<p class="int-info">
                <label class="ui-label"> </label>
                <label for="agreement" class="ui-label-checkbox">
                <input type="checkbox" name="cookietime" id="cookietime" checked="checked" value="2592000">
                <input type="hidden" name="notforward" id="notforward" value="1">
                <input  type="hidden" name="dosubmit" id="dosubmit" value="1">记住我的登录 </label>                           
       <a rel="nofollow" class="aright" href="/user-center-forgetpwd.html" target="_blank"> 忘记密码? </a></p>
  <p class="int-btn"><a rel="nofollow" id="loginbt"  class="loginbtn"><span>登录</span></a></p> 
  </form>
</div>
<form id="regform" action="/user-center-reg.html" method="post">
<div  class="login-reg" style="display: none;" id="con_login_2">
<input type="hidden" name="t" id="t"/>
  <p class="int-text">
    <input  id="email" name="email" type="text" value="Email" onfocus="if(this.value=='Email'){this.value='';}" onblur="if(this.value==''){this.value='Email';};"></p>
    <p class="int-text">
    <input id="uname" name="username" type="text" value="用户名或昵称" onfocus="if(this.value=='用户名或昵称'){this.value='';}" onblur="if(this.value==''){this.value='用户名或昵称';};"></p>
  <p class="int-text">
  <input  type="password" id="pwd" name="password" value="******"  onBlur="if(this.value=='') this.value='******';" onFocus="if(this.value=='******') this.value='';"> </p>
  <p class="int-text1"><span class="inputbox">
    <input id="validate" name="validate" type="text" value="验证码" onfocus="if(this.value=='验证码'){this.value='';}" onblur="if(this.value==''){this.value='验证码';};">
    </span><span class="yzm-img"><img src="/user-checkcode-index" alt="看不清楚换一张"  id="indexlogin"></span></p>
  <p class="int-info">
    <label>
      <input value="" name="agreement" id="agreement" CHECKED="checked" type="checkbox">
      我已阅读<a rel="nofollow" href="/user-center-agreement.html">用户协议</a>及<a rel="nofollow" href="/user-center-agreement.html">版权声明</a></label>
  </p>
  <p class="int-btn"><input type="hidden" name="dosubmit"/>
<a rel="nofollow" class="loginbtn"  id="register"><span>注册</span></a></p>
</div>
 </form>
</div>
</div>

</div>















</div>
 
<script type="text/javascript" src="/kan/js/foot_js.js"></script>   
<script>
// Baidu analytics queue; reuse an existing queue if one was already created.
var _hmt = _hmt || [];
(function () {
  // Build the tracker <script> element and insert it before the first script on the page.
  var firstScript = document.getElementsByTagName("script")[0];
  var tracker = document.createElement("script");
  tracker.src = "https://hm.baidu.com/hm.js?6dc1c3c5281cf70f49bc0bc860ec24f2";
  firstScript.parentNode.insertBefore(tracker, firstScript);
})();
</script>
 <script type="text/javascript" src="/layui/layui.js"></script>
    <script>
    // Load layui's "code" module and apply it to every <pre> element on the page.
    layui.use('code', function () {
        var codeOptions = {
            elem: 'pre',      // target selector (layui's default is .layui-code)
            about: false,
            skin: 'notepad',
            title: 'php怎么实现数据库验证跳转代码块',
            encode: true      // escape HTML tags inside the block (off by default)
        };
        layui.code(codeOptions);
    });
    </script>

</body>

</html>