时间:2021-07-01 10:21:17 帮助过:13人阅读
数据库:mysql 账号:root 密码:123456
建表语句:CREATE TABLE dy2008_url (id int(9) NOT NULL AUTO_INCREMENT, url varchar(2000) NOT NULL, status tinyint(2) NOT NULL, PRIMARY KEY(id));
代码:
array('method' => 'POST', 'header' => 'Content-type:application/x-www-form-urlencoded'.' Content-Length:'.strlen($data).'"', 'content' => $data)
);
$context = stream_context_create($opts);
$content = file_get_contents($url,false,$context);
return $content;
}
/**
 * Main crawl flow for dy2018: fork one worker process per entry URL and wait
 * until every worker has finished.
 *
 * Each child opens its own MySQL link (a link must not be shared across a
 * fork), crawls exactly one entry URL via crawl_process(), closes the link
 * and exits. The parent records the child PIDs in the global $crawlers_pid
 * and polls until all of them are done, then prints the finish banner and
 * terminates the script.
 *
 * This function never returns: it ends the process with exit().
 *
 * Globals:
 *   $crawlers_pid  - map of URL index => child PID, filled in by the parent.
 *   $finish_count  - number of finished children. It may also be advanced by
 *                    code outside this function (presumably a SIGCHLD
 *                    handler - verify against the rest of the file); this
 *                    function only ever raises it.
 *
 * NOTE(review): the mysql_* extension was removed in PHP 7. It is kept here
 * because crawl_process() relies on the implicit default link it creates.
 */
function run_dy2018()
{
    global $crawlers_pid;
    global $finish_count;

    if (!is_array($crawlers_pid)) {
        $crawlers_pid = array();
    }

    // TV listing entry pages.
    $crawl_urls = array(
        "http://www.dy2018.com/html/tv/hytv/",
        "http://www.dy2018.com/html/tv/hepai/",
        "http://www.dy2018.com/html/tv/gangtai/",
        "http://www.dy2018.com/html/tv/oumeitv/",
        "http://www.dy2018.com/html/tv/rihantv/",
        "http://www.dy2018.com/html/tv/tvzz/",
    );
    // Numbered category entry pages /0/ .. /20/.
    for ($category = 0; $category <= 20; $category++) {
        $crawl_urls[] = "http://www.dy2018.com/".$category."/";
    }

    foreach ($crawl_urls as $i => $url) {
        $pid = pcntl_fork();
        if ($pid == -1) {
            echo "system error. check it now!";
            exit();
        }
        if ($pid > 0) {
            // Parent: remember the worker and go on to fork the next one.
            $crawlers_pid[$i] = $pid;
            continue;
        }

        // Child: crawl a single entry URL, then terminate. Without the
        // exit() the child would fall through, keep iterating and fork
        // workers of its own for every remaining URL.
        $con = mysql_connect("localhost", "root", "123456");
        if (!$con) {
            die('Could not connect: '.mysql_error());
        }
        mysql_select_db("mysql", $con);
        crawl_process($url);
        mysql_close($con);
        exit(0);
    }

    // A blocking pcntl_waitpid() could prevent signal handling, so poll with
    // WNOHANG once per second instead. Reaping here also avoids zombies.
    $reaped = array();
    $status = 0;
    while (true) {
        foreach ($crawlers_pid as $idx => $child_pid) {
            if (isset($reaped[$idx])) {
                continue;
            }
            // 0 means "still running". The child's PID means it just exited.
            // -1 means it is no longer our child to wait for (e.g. it was
            // already reaped elsewhere); either way it is finished.
            if (pcntl_waitpid($child_pid, $status, WNOHANG) != 0) {
                $reaped[$idx] = true;
            }
        }
        // A child's own memory is separate, so completion has to be counted
        // in the parent. Only ever raise the counter so that increments made
        // elsewhere are not lost.
        if ($finish_count < count($reaped)) {
            $finish_count = count($reaped);
        }
        if ($finish_count >= count($crawlers_pid)) {
            echo "---------- crawl task finish ----------";
            // The parent opens no link in this function; the call is kept in
            // case the caller opened a default link before invoking us.
            mysql_close();
            exit();
        }
        sleep(1);
    }
}
//从入口链接到其下所有下载页链接抓取过程
function crawl_process($url)
{
echo "start handle url:".$url;
$page_idx = 1;
$valid_tag = true;
$info_url_pattern = '/\/i\/\d+.html/';
$ftp_url_pattern = '/ftp:\/\/.*?.(swf|avi|flv|mpg|rm|mov|wav|asf|3gp|mkv|rmvb)/i';//^$两个符号不起作用
while($valid_tag) {
$page_url = get_page_index_url($url, $page_idx);
printf("start crawl url:".$page_url."\n");
$page_content = get_page_content($page_url);
$valid_tag = is_valid_page($page_content);
if($valid_tag) {
$matches_urls = array();
preg_match_all($info_url_pattern, $page_content, $matches_urls);
$page_content = mb_convert_encoding($page_content, "UTF-8", "GBK");
for($i=0; $i 1){
$idx_url = $idx_url.'index_'.$idx.'.html';
}
return $idx_url;
}
/**
 * Decide from the fetched page content whether the link was valid.
 *
 * Any truthy value counts as a valid page. Falsy values - an empty string,
 * the string '0', false or null - are treated as an invalid page.
 *
 * @param mixed $content Page body as produced by the page fetcher.
 * @return bool True when $content is truthy, false otherwise.
 */
function is_valid_page($content)
{
    $has_content = (bool) $content;

    return $has_content;
}
// Script entry point: start the crawl. run_dy2018() ends in an endless wait
// loop whose only way out is exit(), so it does not return to this point.
run_dy2018();
// Effectively unreachable because run_dy2018() terminates the process itself.
mysql_close();
?> 结果:

以上就介绍了爬虫_电影ftp下载地址,包括了相关方面的内容,希望对PHP教程有兴趣的朋友有所帮助。