当前位置:Gxlcms >
PHP教程 >
DedeHttpDownPHP远程下载网页的类,增强版2013-1-17修改_PHP教程
DedeHttpDownPHP远程下载网页的类,增强版2013-1-17修改_PHP教程
时间:2021-07-01 10:21:17
帮助过:9人阅读
(1)新增远程主机判断,节约服务器资源:避免远程主机不存在时仍旧调用fsockopen,导致进程卡死并占用服务器CPU。
(2)新增响应401的判断和支持。
(3)增加对json返回文本的支持。
(4)日志的生成:如果定义了常量DEBUG_LEVEL且其值为TRUE,则会为每次远程下载记录日志。
(5)获取字节的限定dataLimit,节约服务器资源。
(6)修改日期:2013-1-17
如果有更好的方法或建议,可以随时联系我本人:admin@zbphp.com
[php]
m_url = $url;
// Copy the components produced by URL parsing into the request fields.
// A component that is absent or empty leaves the corresponding field untouched.
// (The scraped listing read "emptyempty(...)", a syntax-highlighter artifact for
// "empty(...)"; as written it called an undefined function.)
if (is_array($urls)) {
    // A URL without a host yields an empty host, which PrivateOpenHost() rejects.
    $host = isset($urls["host"]) ? $urls["host"] : "";
    $rawPath = isset($urls["path"]) ? $urls["path"] : "";

    $this->m_host = $host;

    // URL component => field that receives it when the component is non-empty.
    $optional = array(
        "scheme" => "m_scheme",
        "user"   => "m_user",
        "pass"   => "m_pass",
        "port"   => "m_port",
        "path"   => "m_path",
    );
    foreach ($optional as $part => $field) {
        if (!empty($urls[$part])) {
            $this->$field = $urls[$part];
        }
    }

    // Request target: the path, plus "?query" when a query string is present.
    $this->m_urlpath = $this->m_path;
    if (!empty($urls["query"])) {
        $this->m_query = $urls["query"];
        $this->m_urlpath .= "?".$this->m_query;
    }

    // HomeUrl / BaseUrlPath are what FillUrl() uses to resolve relative links:
    // BaseUrlPath is host + path with a trailing "/name.ext" segment and any
    // trailing slash removed.
    $this->HomeUrl = $host;
    $this->BaseUrlPath = $this->HomeUrl.$rawPath;
    $this->BaseUrlPath = preg_replace("/\/([^\/]*)\.(.*)$/","/",$this->BaseUrlPath);
    $this->BaseUrlPath = preg_replace("/\/$/","",$this->BaseUrlPath);
}
}
/**
 * Reset the URL-related fields and the error message to their defaults.
 *
 * Defaults are scheme "http", port "80" and path "/"; every other field
 * listed below becomes an empty string.
 *
 * @access public
 * @return void
 */
function ResetAny()
{
    $defaults = array(
        'm_url'     => "",
        'm_urlpath' => "",
        'm_scheme'  => "http",
        'm_host'    => "",
        'm_port'    => "80",
        'm_user'    => "",
        'm_pass'    => "",
        'm_path'    => "/",
        'm_query'   => "",
        'm_error'   => "",
    );
    foreach ($defaults as $field => $value) {
        $this->$field = $value;
    }
}
/**
 * Start a fresh request for the given URL.
 *
 * Clears all per-request state (URL fields, response headers, cached body,
 * redirect and retry counters, data limit), closes any previous connection,
 * then parses the URL and sends the request. Nothing is returned; callers
 * inspect the result through IsGetOK(), IsText(), GetHtml() and friends.
 *
 * Note that dataLimit is reset to 0 here, so a caller that wants a limit
 * has to assign it after this call and before GetHtml().
 *
 * @access public
 * @param  string  $url          URL to fetch
 * @param  string  $requestType  request method, "GET" by default
 * @return void
 */
function OpenUrl($url,$requestType="GET")
{
    // Per-request state.
    $this->ResetAny();
    $this->m_httphead = Array() ;
    $this->m_html = '';

    // Counters and limits.
    $this->JumpCount = 0;
    $this->reTry = 0;
    $this->dataLimit = 0;

    $this->Close();

    // Parse the URL, then talk to the server.
    $this->PrivateInit($url);
    $this->PrivateStartSession($requestType);
}
/**
 * Follow a redirect to another URL.
 *
 * Works like OpenUrl() but increments the redirect counter instead of
 * resetting it, leaves the retry counter and data limit alone, and always
 * issues a GET request.
 *
 * @access public
 * @param  string  $url  redirect target
 * @return void
 */
function JumpOpenUrl($url)
{
    $this->ResetAny();
    $this->JumpCount++;

    // Forget the headers and body of the response that redirected us.
    $this->m_html = "";
    $this->m_httphead = Array() ;

    $this->Close();

    // Parse the new URL and request it.
    $this->PrivateInit($url);
    $this->PrivateStartSession('GET');
}
/**
 * Print the accumulated error message followed by every response header.
 *
 * Output goes straight to the response via echo. The line breaks inside the
 * string literals below are part of the emitted text.
 *
 * @access public
 * @return void
 */
function printError()
{
    echo "错误信息:".$this->m_error;
    echo "
具体返回头:
";
    foreach ($this->m_httphead as $name => $value) {
        echo "$name => $value
\r\n";
    }
}
/**
 * Tell whether the response status is a success (starts with "2").
 *
 * On any other status, "<status> - <reason>" is appended to the error message.
 *
 * @access public
 * @return bool
 */
function IsGetOK()
{
    $state = $this->GetHead("http-state");
    if (preg_match("/^2/", $state)) {
        return TRUE;
    }

    $this->m_error .= $state." - ".$this->GetHead("http-describe")."
";
    return FALSE;
}
/**
 * Tell whether the response carries textual content.
 *
 * True when the status starts with "2" or "401" and the Content-Type header
 * contains "text", "xml" or "json" (case-insensitive). Otherwise a message is
 * appended to the error text and FALSE is returned.
 *
 * @access public
 * @return bool
 */
function IsText()
{
    $statusOk = preg_match("/^(2|401)/", $this->GetHead("http-state"));
    if ($statusOk && preg_match("/text|xml|json/i", $this->GetHead("content-type"))) {
        return TRUE;
    }

    $this->m_error .= "内容为非文本类型或网址重定向
";
    return FALSE;
}
/**
 * Tell whether the response succeeded and has exactly the given content type.
 *
 * The comparison is case-insensitive on both sides; media type names are
 * case-insensitive, and previously only the argument was lowercased, so a
 * header such as "Text/HTML" never matched. The whole header value is
 * compared, so a value carrying parameters (e.g. "; charset=...") only
 * matches when $ctype includes them too.
 *
 * On mismatch a message naming the actual content type is appended to the
 * error text.
 *
 * @access public
 * @param  string  $ctype  expected content type, e.g. "image/gif"
 * @return bool
 */
function IsContentType($ctype)
{
    $actual = $this->GetHead("content-type");

    if (preg_match("/^2/", $this->GetHead("http-state"))
        && strtolower($actual) == strtolower($ctype)) {
        return TRUE;
    }

    $this->m_error .= "类型不对 ".$actual."
";
    return FALSE;
}
/**
 * Stream the response body into a local file.
 *
 * Requires a successful (2xx) response and a connection that is still open
 * and unread. The connection is closed once the body has been copied.
 *
 * @access public
 * @param  string  $savefilename  path of the file to write (truncated if it exists)
 * @return bool  TRUE when the body was copied; FALSE on a non-2xx status, a
 *               closed connection, an unwritable target file or a read error
 *               (m_error holds the reason)
 */
function SaveToBin($savefilename)
{
    if (!$this->IsGetOK()) {
        return FALSE;
    }

    // A missing or already-closed handle must be rejected before feof():
    // feof() on a non-stream never reports EOF on older PHP versions (endless
    // loop below) and throws a TypeError on PHP 8.
    if (!isset($this->m_fp) || !is_resource($this->m_fp) || feof($this->m_fp)) {
        $this->m_error = "连接已经关闭!";
        return FALSE;
    }

    // Binary mode so the payload is written unmodified on every platform.
    $fp = fopen($savefilename, "wb");
    if ($fp === FALSE) {
        $this->m_error = "无法写入文件:".$savefilename;
        return FALSE;
    }

    $ok = TRUE;
    while (!feof($this->m_fp)) {
        $chunk = fread($this->m_fp, 1024);
        if ($chunk === FALSE) {
            // Read failure: stop instead of spinning on a broken stream.
            $this->m_error = "读取远程数据出错!";
            $ok = FALSE;
            break;
        }
        fwrite($fp, $chunk);
    }

    fclose($this->m_fp);
    fclose($fp);
    return $ok;
}
/**
 * Save the response body to a file, but only when it is textual.
 *
 * The original body called SaveBinFile(), which this class does not define;
 * the writer method is SaveToBin(). Its result is now returned so callers
 * can tell whether the file was written.
 *
 * @access public
 * @param  string  $savefilename  path of the file to write
 * @return bool|string  result of SaveToBin() for a text response; "" when the
 *                      response is not text (IsText() records the reason)
 */
function SaveToText($savefilename)
{
    if (!$this->IsText()) {
        return "";
    }
    return $this->SaveToBin($savefilename);
}
/**
 * Read and return the body of a textual response.
 *
 * Returns '' when the response is not text (see IsText()) or when there is
 * no open, unread connection. A body that was already read is returned from
 * the m_html cache. Otherwise the connection is read line by line until EOF,
 * or until the body grows past dataLimit bytes when dataLimit is positive;
 * the connection is then closed and the elapsed time plus the body are
 * handed to log_write().
 *
 * @access public
 * @return string
 */
function GetHtml()
{
    $startedAt = microtime(true);

    if (!$this->IsText()) {
        return '';
    }
    if ($this->m_html != '') {
        // Body already fetched for this response.
        return $this->m_html;
    }
    if (!$this->m_fp || @feof($this->m_fp)) {
        return '';
    }

    $limit = $this->dataLimit;
    while (!feof($this->m_fp)) {
        $this->m_html .= fgets($this->m_fp, 256);
        if ($limit > 0 && strlen($this->m_html) > $limit) {
            break;
        }
    }
    @fclose($this->m_fp);

    $finishedAt = microtime(true);
    $log = "\ntm2-tm1 = ".($finishedAt - $startedAt)."\n".$this->m_html;
    $this->log_write('GetHtml', $log);

    return $this->m_html;
}
/**
 * Open the connection, send the HTTP request and read the response headers.
 *
 * The response status line and headers are stored in m_httphead (keys
 * lowercased, plus "http-edition", "http-state" and "http-describe"); the
 * body is left unread on m_fp. A 3xx response with a Location header is
 * followed through JumpOpenUrl() while JumpCount is at most 3.
 *
 * NOTE(review): three loops in this method were truncated in the scraped
 * listing (everything between a "<" and the next ">" was lost). They are
 * reconstructed here from the surrounding code: the POST body assembly, the
 * status-reason loop and the header-line parser. Confirm against a pristine
 * copy of the class.
 *
 * Other fixes in this version:
 *  - "Connection: Close" is always sent and the contradictory
 *    "Connection:keep-alive" header is gone; the body readers rely on feof(),
 *    which needs the server to close the connection.
 *  - The referrer is emitted under the real header name "Referer". The
 *    legacy m_puthead key "Refer" is still honoured as its source.
 *  - The request is re-sent only when the connection closed before any
 *    status code arrived, instead of whenever EOF is seen after the headers
 *    (which also re-sent requests whose response simply had no body).
 *  - A missing or short status line no longer reads an undefined index.
 *
 * @access public
 * @param  string  $requestType  "GET" requests the full path and query; any
 *                               other value produces a POST request line
 *                               without the query, and "POST" additionally
 *                               sends the text after "?" as the request body
 * @return mixed  FALSE when the host cannot be opened or the retries are
 *                exhausted; no meaningful value otherwise
 */
function PrivateStartSession($requestType="GET")
{
    if (!$this->PrivateOpenHost()) {
        $this->m_error .= "打开远程主机出错!";
        return FALSE;
    }
    $this->reTry++;

    // Speak HTTP/1.1 only when a previous response on this object did.
    if ($this->GetHead("http-edition") == "HTTP/1.1") {
        $httpv = "HTTP/1.1";
    } else {
        $httpv = "HTTP/1.0";
    }

    // $ps[0] is the path, the remaining pieces are the query string.
    $ps = explode('?', $this->m_urlpath);

    // Request line.
    $headString = '';
    if ($requestType == "GET") {
        $headString .= "GET ".$this->m_urlpath." $httpv\r\n";
    } else {
        $headString .= "POST ".$ps[0]." $httpv\r\n";
    }

    if ($this->m_user || $this->m_pass) {
        $headString .= "Authorization: Basic ".base64_encode($this->m_user.":".$this->m_pass)."\r\n";
    }

    // Defaults for headers the caller did not set.
    $this->m_puthead["Host"] = $this->m_host;
    if (!isset($this->m_puthead["User-Agent"])) {
        $this->m_puthead["User-Agent"] = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2)";
    }
    if (!isset($this->m_puthead["Refer"]) && !isset($this->m_puthead["Referer"])) {
        $this->m_puthead["Refer"] = "http://".$this->m_puthead["Host"];
    }

    $headString .= "Accept-Language:zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3\r\n";
    $headString .= "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n";

    // Caller-supplied headers; empty names or values are skipped.
    foreach ($this->m_puthead as $k => $v) {
        $k = trim($k);
        $v = trim($v);
        if ($k == "" || $v == "") {
            continue;
        }
        if ($k == "Refer") {
            // Legacy key: send it as "Referer" unless the caller set that too.
            if (isset($this->m_puthead["Referer"])) {
                continue;
            }
            $k = "Referer";
        }
        $headString .= "$k: $v\r\n";
    }
    fputs($this->m_fp, $headString);

    // POST body: everything after the first "?" of the URL, or "OK" when
    // the URL has no query (reconstructed, see NOTE above).
    if ($requestType == "POST") {
        $postdata = "";
        if (count($ps) > 1) {
            for ($i = 1; $i < count($ps); $i++) {
                $postdata .= $ps[$i];
            }
        } else {
            $postdata = "OK";
        }
        $plen = strlen($postdata);
        fputs($this->m_fp, "Content-Type: application/x-www-form-urlencoded\r\n");
        fputs($this->m_fp, "Content-Length: $plen\r\n");
    }

    // End of headers. The server is always asked to close the connection so
    // the end of the body can be detected with feof().
    fputs($this->m_fp, "Connection: Close\r\n\r\n");

    if ($requestType == "POST") {
        fputs($this->m_fp, $postdata);
    }

    // Status line: "<version> <code> <reason...>".
    $statusLine = fgets($this->m_fp, 256);
    $httpstas = explode(" ", $statusLine === FALSE ? "" : $statusLine);
    $this->m_httphead["http-edition"] = trim($httpstas[0]);
    $this->m_httphead["http-state"] = isset($httpstas[1]) ? trim($httpstas[1]) : "";
    $this->m_httphead["http-describe"] = "";
    for ($i = 2; $i < count($httpstas); $i++) {
        $this->m_httphead["http-describe"] .= " ".trim($httpstas[$i]);
    }

    // Response headers, up to the first blank line.
    while (!feof($this->m_fp)) {
        $line = trim(fgets($this->m_fp, 256));
        if ($line == "") {
            break;
        }
        // Name is the text before the first ":", value is everything after it.
        $colon = strpos($line, ":");
        if ($colon === FALSE) {
            $hkey = $line;
            $hvalue = "";
        } else {
            $hkey = substr($line, 0, $colon);
            $hvalue = (string)substr($line, $colon + 1);
        }
        $hkey = trim($hkey);
        if ($hkey != "") {
            $this->m_httphead[strtolower($hkey)] = trim($hvalue);
        }
    }

    // Connection dropped before a status code arrived: try again, giving
    // up once more than 10 attempts have been made.
    if (feof($this->m_fp) && $this->m_httphead["http-state"] == "") {
        if ($this->reTry > 10) {
            return FALSE;
        }
        return $this->PrivateStartSession($requestType);
    }

    // Redirects.
    if (preg_match("/^3/", $this->m_httphead["http-state"])) {
        if ($this->JumpCount > 3) {
            return;
        }
        if (isset($this->m_httphead["location"])) {
            $newurl = $this->m_httphead["location"];
            if (!preg_match("/^http/i", $newurl)) {
                // Relative target: resolve against the current URL.
                $newurl = $this->FillUrl($newurl);
            }
            $this->JumpOpenUrl($newurl);
        } else {
            $this->m_error = "无法识别的答复!";
        }
    }
}
/**
 * Return the value of a response header.
 *
 * The lookup is case-insensitive on the header name; '' is returned when
 * the header is not present.
 *
 * @access public
 * @param  string  $headname  header name
 * @return string
 */
function GetHead($headname)
{
    $key = strtolower($headname);
    if (isset($this->m_httphead[$key])) {
        return $this->m_httphead[$key];
    }
    return '';
}
/**
 * Set a request header that will be sent with subsequent requests.
 *
 * The name is stored exactly as given and replaces any value previously
 * stored under the same name.
 *
 * @access public
 * @param  string  $skey    header name
 * @param  string  $svalue  header value
 * @return void
 */
function SetHead($skey,$svalue)
{
    $this->m_puthead[$skey] = $svalue;
}
/**
 * Open a TCP connection to m_host:m_port and store it in m_fp.
 *
 * Host names are first checked for an A or CNAME record (when checkdnsrr()
 * is available) so that non-existent hosts fail fast. Hosts given as IP
 * literals skip that check: they have no such DNS records, so the lookup
 * rejected every URL that addressed a server by IP.
 *
 * @access public
 * @return bool  TRUE when connected; FALSE otherwise, with m_error set
 *               unless the host is empty
 */
function PrivateOpenHost()
{
    if ($this->m_host == "") {
        return FALSE;
    }

    // IPv6 literals may be written as "[addr]"; validate without the brackets.
    $isIpLiteral = filter_var(trim($this->m_host, "[]"), FILTER_VALIDATE_IP) !== FALSE;

    if (!$isIpLiteral
        && function_exists('checkdnsrr')
        && !checkdnsrr($this->m_host, 'A')
        && !checkdnsrr($this->m_host, 'CNAME')) {
        $this->m_error = '远程主机'.$this->m_host.'不存在!checkdnsrr !';
        return FALSE;
    }

    $errno = "";
    $errstr = "";
    // The 10 second timeout covers connecting only, not later reads.
    $this->m_fp = @fsockopen($this->m_host, $this->m_port, $errno, $errstr, 10);
    if (!$this->m_fp) {
        $this->m_error = $errstr;
        return FALSE;
    }
    return TRUE;
}
/**
 * Close the current connection, if there is one.
 *
 * Safe to call when no connection was ever opened or when it has already
 * been closed. The previous "@fclose($this->m_fp)" relied on error
 * suppression, which does not stop the TypeError PHP 8 throws for a
 * non-stream argument, and OpenUrl() calls this before connecting.
 *
 * @access public
 * @return void
 */
function Close()
{
    // is_resource() is false for null and for streams that were already closed.
    if (isset($this->m_fp) && is_resource($this->m_fp)) {
        fclose($this->m_fp);
    }
}
/**
 * Turn a relative URL into an absolute "http://" URL.
 *
 * Resolution is done against HomeUrl (the host) and BaseUrlPath (host plus
 * directory) of the most recently opened URL:
 *   - "/path"        is appended to the host;
 *   - "./path"       is appended to the base directory;
 *   - "../path"      climbs one base directory level per ".." segment;
 *   - "http://..."   is kept as-is;
 *   - anything else  is appended to the base directory.
 * A fragment ("#...") that is not at the very start is dropped, and runs of
 * slashes in the result are collapsed to one.
 *
 * NOTE(review): two spans of this method were truncated in the scraped
 * listing (the ".." segment loop and the start of the final branch). They
 * are reconstructed here from the surrounding code; confirm against a
 * pristine copy of the class.
 *
 * @access public
 * @param  string  $surl  URL to complete
 * @return string  absolute URL; "" for empty input, a lone ".", or more ".."
 *                 segments than the base path has levels
 */
function FillUrl($surl)
{
    $surl = trim($surl);
    if ($surl == "") {
        return "";
    }

    // Strip the fragment, unless the whole URL is a fragment.
    $pos = strpos($surl, "#");
    if ($pos > 0) {
        $surl = substr($surl, 0, $pos);
    }

    if ($surl[0] == "/") {
        // Host-relative.
        $okurl = "http://".$this->HomeUrl.$surl;
    } else if ($surl[0] == ".") {
        if (strlen($surl) <= 1) {
            return "";
        }
        if ($surl[1] == "/") {
            // "./x" is relative to the current directory.
            $okurl = "http://".$this->BaseUrlPath."/".substr($surl, 2);
        } else {
            // Count the ".." segments and collect the remaining ones.
            $pathStep = 0;
            $dstr = "";
            $segments = explode("/", $surl);
            $lastIndex = count($segments) - 1;
            foreach ($segments as $i => $u) {
                if ($u == "..") {
                    $pathStep++;
                } else if ($i < $lastIndex) {
                    $dstr .= $u."/";
                } else {
                    $dstr .= $u;
                }
            }

            // Drop one trailing level of the base path per ".." segment.
            $base = explode("/", $this->BaseUrlPath);
            if (count($base) <= $pathStep) {
                return "";
            }
            $pstr = "http://";
            $keep = count($base) - $pathStep;
            for ($i = 0; $i < $keep; $i++) {
                $pstr .= $base[$i]."/";
            }
            $okurl = $pstr.$dstr;
        }
    } else if (strtolower(substr($surl, 0, 7)) == "http://") {
        // Already absolute.
        $okurl = $surl;
    } else {
        // Plain relative name.
        $okurl = "http://".$this->BaseUrlPath."/".$surl;
    }

    // Normalise: collapse repeated slashes everywhere except in the scheme.
    $okurl = preg_replace("/^(http:\/\/)/i","",$okurl);
    $okurl = preg_replace("/\/{1,}/", "/", $okurl);
    return "http://".$okurl;
}
/**
 * Append a debug record for the current URL to a per-request log file.
 *
 * Does nothing unless the constant DEBUG_LEVEL is defined and loosely equal
 * to TRUE. Log files go to DEDEDATA.'/httpdownlog' (DEDEDATA is assumed to
 * be defined by the including application); the script exits when that
 * directory cannot be created. The file name is built from the function
 * name, the current time and the URL with every run of non-word characters
 * replaced by a space, cut to 250 bytes.
 *
 * @access public
 * @param  string  $funcname  name of the calling function, used in the file name
 * @param  string  $message   text to record
 * @return bool|null  TRUE when the record was written, FALSE when the log
 *                    file could not be opened, null when logging is disabled
 */
function log_write($funcname,$message)
{
    if (!(defined('DEBUG_LEVEL') && DEBUG_LEVEL == TRUE)) {
        return;
    }

    $log = "\n".date("Y-M-d H:i:s ").get_current_user()."[".getmypid()."]";
    $log .= "\n".$this->m_url."\n".str_repeat('------', 10)."\n".$message;

    $path = $funcname.' '.date('Y m d H i s ').preg_replace('/([\W]+|\s+)/i', ' ', $this->m_url);
    if (strlen($path) > 250) {
        $path = substr($path, 0, 250);
    }

    $dir = DEDEDATA.'/httpdownlog';
    if (!is_dir($dir) && !mkdir($dir)) {
        exit('Can not make dir '.$dir);
    }
    $path = $dir.'/'.$path;

    // 'a+' creates the file when it is missing, so no separate touch() is
    // needed; realpath() below runs after the file exists.
    $fp = fopen($path, 'a+');
    if ($fp === FALSE) {
        // Logging is best-effort: never pass a failed handle to flock()/fputs().
        return FALSE;
    }

    flock($fp, LOCK_EX);
    fputs($fp, "PATH:".$path."\nREAL:".realpath($path)."\nMSSG:".$log);
    flock($fp, LOCK_UN);
    fclose($fp);
    return TRUE;
}
}//End Class
使用方法:
[php]
$dhd = new DeDeHttpDown();
// Request headers have to be set before OpenUrl(): it sends the request
// immediately, so a header assigned afterwards is never transmitted.
$dhd->SetHead("Refer", $Rs['wurl']);
$dhd->OpenUrl($Rs['wurl']);
// OpenUrl() resets dataLimit to 0, so the limit is assigned after it and
// before the body is read.
$dhd->dataLimit = 5120;
$filecnt = trim($dhd->GetHtml());
存在未解决的问题:
(1)假如域名绑定了A记录或CNAME并有IP指向,但该IP地址并不存在或是虚假的,程序仍旧会继续尝试获取。
(2)PHP的fsockopen中的超时参数(timeout)似乎没有起作用:设置了10秒超时,但实际会一直执行到脚本超时为止。(注:fsockopen的超时参数只作用于建立连接阶段,读写数据的超时需要另外通过stream_set_timeout设置。)
如果有好的方法或建议,可以随时联系我本人:admin@zbphp.com
http://www.bkjia.com/PHPjc/477811.htmlwww.bkjia.comtruehttp://www.bkjia.com/PHPjc/477811.htmlTechArticle1)新增远程主机判断,节约服务器资源。避免远程主机不存在的时候仍旧fsockopen,导致的死机占用服务器CPU (2)新增响应401的判断和支持...