时间:2021-07-01 10:21:17 帮助过:6人阅读
set_time_limit(0);// GET测试淘宝详情页抓取$url = "http://item.taobao.com/item.htm?id=37530539791";$url = "http://item.taobao.com/item.htm?id=14861616067";//curl 伪造useragent$useragent = array( 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', 'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1', 'Opera/9.27 (Windows NT 5.2; U; zh-cn)', 'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)', 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13 ', 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13');header("Content-type: text/html; charset=utf-8");$cookiefile = realpath("./")."/Application/Runtime/Temp/cookie.txt";//创建一个用于存放cookie信息的临时文件,if (!file_exists($cookiefile)){ $file = @file_put_contents($cookiefile, "");}$ch = curl_init();//设置选项,包括URLcurl_setopt($ch, CURLOPT_URL, $url);curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);curl_setopt($ch, CURLOPT_HEADER, 0);curl_setopt($ch, CURLOPT_NOBODY,0);curl_setopt($ch, CURLOPT_MAXREDIRS, 300);curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); //获取数据返回流形式curl_setopt($ch, CURLOPT_AUTOREFERER, true); //重定向时,自动设置header中的Referer:信息curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); //启用时会将服务器服务器返回的"Location: "放在header中递归的返回给服务器,使用CURLOPT_MAXREDIRS可以限定递归返回的数量// 设置iP和useragent/*curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)');curl_setopt($ch, CURLOPT_HTTPHEADER, array('X-FORWARDED-FOR:28.58.88.'.$r, 'CLIENT-IP:225.28.58.'.$r)); //构造IP curl_setopt($ch, CURLOPT_REFERER, "http://www.baidu.com"); //构造来路 curl_setopt($ch, CURLOPT_USERAGENT, array_rand($useragent));*/// 设置代理/*curl_setopt($ch, CURLOPT_HTTPPROXYTUNNEL, 1);curl_setopt($ch, CURLOPT_PROXY, '218.213.168.131:80');*///curl_setopt($ch, CURLOPT_PROXYUSERPWD, 'user:password');// 对于cookie保存curl_setopt($ch, CURLOPT_COOKIESESSION, true);curl_setopt($ch, CURLOPT_COOKIEFILE, $cookiefile);//关闭连接时,将服务器端返回的cookie保存在以下文件中curl_setopt($ch, CURLOPT_COOKIEJAR, $cookiefile);//执行并获取HTML文档内容for ($i=0;$i<=5;$i++){ curl_setopt($ch, CURLOPT_USERAGENT, $useragent[$i]); $output = curl_exec($ch); if (!empty($output)){ break; }}//释放curl句柄$info = curl_getinfo($ch);curl_close($ch);echo "";print_r($info);die($output);
打印出的curl信息如下:
Array
(
[url] => http://item.taobao.com/item.htm?id=14861616067
[content_type] => text/html
[http_code] => 200
[header_size] => 197
[request_size] => 156
[filetime] => -1
[ssl_verify_result] => 0
[redirect_count] => 0
[total_time] => 0.562
[namelookup_time] => 0
[connect_time] => 0
[pretransfer_time] => 0.015
[size_upload] => 0
[size_download] => 20
[speed_download] => 35
[speed_upload] => 0
[download_content_length] => -1
[upload_content_length] => 0
[starttransfer_time] => 0.562
[redirect_time] => 0
[redirect_url] =>
[primary_ip] => 58.63.255.240
[certinfo] => Array
(
)
[primary_port] => 80
[local_ip] => 192.168.1.102
[local_port] => 65328
)
但是die返回结果却一直为空,用第一个url的时候是可以的。(cookie,伪造ip,设置浏览器信息都试过了,第二个url还是不行)。
求个大神指教下,还有什么设置没对吗?
回复讨论(解决方案)
$url = "http://item.taobao.com/item.htm?id=14861616067";echo curl_get($url);
得到的数据中有
T浦力顿500g 成犬幼犬狗粮泰迪贵宾比熊金毛萨摩耶博美 批发散装-淘宝网
T浦力顿500g 成犬幼犬狗粮泰迪贵宾比熊金毛萨摩耶博美 批发散装
证明访问是没有问题的
function curl_get($durl, $data=array()) { $cookiejar = realpath('cookie.txt'); $t = parse_url($durl); $ch = curl_init(); curl_setopt($ch, CURLOPT_URL,$durl); curl_setopt($ch, CURLOPT_TIMEOUT,5); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0); curl_setopt($ch, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']); curl_setopt($ch, CURLOPT_REFERER, "http://$t[host]/"); curl_setopt($ch, CURLOPT_COOKIEFILE, $cookiejar); curl_setopt($ch, CURLOPT_COOKIEJAR, $cookiejar); curl_setopt($ch, CURLOPT_RETURNTRANSFER,1); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); if($data) { curl_setopt($ch, CURLOPT_POST, 1); curl_setopt($ch, CURLOPT_POSTFIELDS, $data); } $r = curl_exec($ch); curl_close($ch); return $r;}谢谢,是可以用的,我的不可以是因为 来源模拟 错了吗?
你可以在我的代码中逐个注释掉来判断少了什么
估计是 少了 CURLOPT_REFERER好的,谢谢。