时间:2021-07-01 10:21:17 帮助过:13人阅读
数据库:mysql 账号:root 密码:123456
建表语句:
-- Table of URLs with a per-row status flag.
-- NOTE(review): presumably written by the crawler code below; the meaning of
-- the status values is not shown in this page -- confirm before relying on it.
CREATE TABLE dy2008_url (
    id     INT(9)        NOT NULL AUTO_INCREMENT,
    url    VARCHAR(2000) NOT NULL,
    status TINYINT(2)    NOT NULL,
    PRIMARY KEY (id)
);
代码:
array('method' => 'POST', 'header' => 'Content-type:application/x-www-form-urlencoded'.' Content-Length:'.strlen($data).'"', 'content' => $data) ); $context = stream_context_create($opts); $content = file_get_contents($url,false,$context); return $content; } //dy2018抓取主流程 function run_dy2018() { global $crawlers_pid; global $finish_count; $crawl_urls = array("http://www.dy2018.com/html/tv/hytv/", "http://www.dy2018.com/html/tv/hepai/", "http://www.dy2018.com/html/tv/gangtai/", "http://www.dy2018.com/html/tv/oumeitv/", "http://www.dy2018.com/html/tv/rihantv/", "http://www.dy2018.com/html/tv/tvzz/", "http://www.dy2018.com/0/", "http://www.dy2018.com/1/", "http://www.dy2018.com/2/", "http://www.dy2018.com/3/", "http://www.dy2018.com/4/", "http://www.dy2018.com/5/", "http://www.dy2018.com/6/", "http://www.dy2018.com/7/", "http://www.dy2018.com/8/", "http://www.dy2018.com/9/", "http://www.dy2018.com/10/", "http://www.dy2018.com/11/", "http://www.dy2018.com/12/", "http://www.dy2018.com/13/", "http://www.dy2018.com/14/", "http://www.dy2018.com/15/", "http://www.dy2018.com/16/", "http://www.dy2018.com/17/", "http://www.dy2018.com/18/", "http://www.dy2018.com/19/", "http://www.dy2018.com/20/"); $i = 0; while($i < count($crawl_urls)) { $pid = pcntl_fork(); if($pid == -1) { echo "system error. 
check it now!"; exit(); } else if($pid > 0){ $crawlers_pid[$i] = $pid; } else { $url = $crawl_urls[$i]; $con = mysql_connect("localhost", "root", "123456"); if(!$con) { die('Count not connect: '.mysql_error()); } mysql_select_db("mysql", $con); crawl_process($url); $finish_count++; } $i++; } //pcntl_waitpid可能会导致信号监听失败 while (true) { if($finish_count == count($crawlers_pid)) { echo "---------- crawl task finish ----------"; mysql_close(); exit(); } sleep(1); } } //从入口链接到其下所有下载页链接抓取过程 function crawl_process($url) { echo "start handle url:".$url; $page_idx = 1; $valid_tag = true; $info_url_pattern = '/\/i\/\d+.html/'; $ftp_url_pattern = '/ftp:\/\/.*?.(swf|avi|flv|mpg|rm|mov|wav|asf|3gp|mkv|rmvb)/i';//^$两个符号不起作用 while($valid_tag) { $page_url = get_page_index_url($url, $page_idx); printf("start crawl url:".$page_url."\n"); $page_content = get_page_content($page_url); $valid_tag = is_valid_page($page_content); if($valid_tag) { $matches_urls = array(); preg_match_all($info_url_pattern, $page_content, $matches_urls); $page_content = mb_convert_encoding($page_content, "UTF-8", "GBK"); for($i=0; $i1){ $idx_url = $idx_url.'index_'.$idx.'.html'; } return $idx_url; } //根据页面内容判断链接是否有效 function is_valid_page($content) { return $content?true:false; } run_dy2018(); mysql_close(); ?>
结果:
以上就介绍了爬虫_电影ftp下载地址,包括了建表语句和PHP抓取代码方面的内容,希望对PHP教程有兴趣的朋友有所帮助。