当前位置:Gxlcms > PHP教程 > php与python实现的线程池多线程爬虫功能示例

php与python实现的线程池多线程爬虫功能示例

时间:2021-07-01 10:21:17 帮助过:10人阅读

本文实例讲述了php与python实现的线程池多线程爬虫功能。分享给大家供大家参考,具体如下:

多线程爬虫可以并发抓取内容,从而提升抓取性能。下面分别给出 php 与 python 的线程池多线程爬虫例子,代码如下:

php例子

  1. <?php
  2. class Connect extends Worker //worker模式
  3. {
  4. public function __construct()
  5. {
  6. }
  7. public function getConnection()
  8. {
  9. if (!self::$ch)
  10. {
  11. self::$ch = curl_init();
  12. curl_setopt(self::$ch, CURLOPT_TIMEOUT, 2);
  13. curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, 1);
  14. curl_setopt(self::$ch, CURLOPT_HEADER, 0);
  15. curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
  16. curl_setopt(self::$ch, CURLOPT_USERAGENT, "Firefox");
  17. curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, 1);
  18. }
  19. /* do some exception/error stuff here maybe */
  20. return self::$ch;
  21. }
  22. public function closeConnection()
  23. {
  24. curl_close(self::$ch);
  25. }
  26. /**
  27. * Note that the link is stored statically, which for pthreads, means thread local
  28. * */
  29. protected static $ch;
  30. }
  31. class Query extends Threaded
  32. {
  33. public function __construct($url)
  34. {
  35. $this->url = $url;
  36. }
  37. public function run()
  38. {
  39. $ch = $this->worker->getConnection();
  40. curl_setopt($ch, CURLOPT_URL, $this->url);
  41. $page = curl_exec($ch);
  42. $info = curl_getinfo($ch);
  43. $error = curl_error($ch);
  44. $this->deal_data($this->url, $page, $info, $error);
  45. $this->result = $page;
  46. }
  47. function deal_data($url, $page, $info, $error)
  48. {
  49. $parts = explode(".", $url);
  50. $id = $parts[1];
  51. if ($info['http_code'] != 200)
  52. {
  53. $this->show_msg($id, $error);
  54. } else
  55. {
  56. $this->show_msg($id, "OK");
  57. }
  58. }
  59. function show_msg($id, $msg)
  60. {
  61. echo $id."\t$msg\n";
  62. }
  63. public function getResult()
  64. {
  65. return $this->result;
  66. }
  67. protected $url;
  68. protected $result;
  69. }
  70. function check_urls_multi_pthreads()
  71. {
  72. global $check_urls; //定义抓取的连接
  73. $check_urls = array( 'http://xxx.com' => "xx网",);
  74. $pool = new Pool(10, "Connect", array()); //建立10个线程池
  75. foreach ($check_urls as $url => $name)
  76. {
  77. $pool->submit(new Query($url));
  78. }
  79. $pool->shutdown();
  80. }
  81. check_urls_multi_pthreads();
python 多线程例子:
  83. def handle(sid)://这个方法内执行爬虫数据处理
  84. pass
  85. class MyThread(Thread):
  86. """docstring for ClassName"""
  87. def __init__(self, sid):
  88. Thread.__init__(self)
  89. self.sid = sid
  90. def run():
  91. handle(self.sid)
  92. threads = []
  93. for i in xrange(1,11):
  94. t = MyThread(i)
  95. threads.append(t)
  96. t.start()
  97. for t in threads:
  98. t.join()

python 线程池爬虫:

  1. from queue import Queue
  2. from threading import Thread, Lock
  3. import urllib.parse
  4. import socket
  5. import re
  6. import time
  7. seen_urls = set(['/'])
  8. lock = Lock()
  9. class Fetcher(Thread):
  10. def __init__(self, tasks):
  11. Thread.__init__(self)
  12. self.tasks = tasks
  13. self.daemon = True
  14. self.start()
  15. def run(self):
  16. while True:
  17. url = self.tasks.get()
  18. print(url)
  19. sock = socket.socket()
  20. sock.connect(('localhost', 3000))
  21. get = 'GET {} HTTP/1.0\r\nHost: localhost\r\n\r\n'.format(url)
  22. sock.send(get.encode('ascii'))
  23. response = b''
  24. chunk = sock.recv(4096)
  25. while chunk:
  26. response += chunk
  27. chunk = sock.recv(4096)
  28. links = self.parse_links(url, response)
  29. lock.acquire()
  30. for link in links.difference(seen_urls):
  31. self.tasks.put(link)
  32. seen_urls.update(links)
  33. lock.release()
  34. self.tasks.task_done()
  35. def parse_links(self, fetched_url, response):
  36. if not response:
  37. print('error: {}'.format(fetched_url))
  38. return set()
  39. if not self._is_html(response):
  40. return set()
  41. urls = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''',
  42. self.body(response)))
  43. links = set()
  44. for url in urls:
  45. normalized = urllib.parse.urljoin(fetched_url, url)
  46. parts = urllib.parse.urlparse(normalized)
  47. if parts.scheme not in ('', 'http', 'https'):
  48. continue
  49. host, port = urllib.parse.splitport(parts.netloc)
  50. if host and host.lower() not in ('localhost'):
  51. continue
  52. defragmented, frag = urllib.parse.urldefrag(parts.path)
  53. links.add(defragmented)
  54. return links
  55. def body(self, response):
  56. body = response.split(b'\r\n\r\n', 1)[1]
  57. return body.decode('utf-8')
  58. def _is_html(self, response):
  59. head, body = response.split(b'\r\n\r\n', 1)
  60. headers = dict(h.split(': ') for h in head.decode().split('\r\n')[1:])
  61. return headers.get('Content-Type', '').startswith('text/html')
  62. class ThreadPool:
  63. def __init__(self, num_threads):
  64. self.tasks = Queue()
  65. for _ in range(num_threads):
  66. Fetcher(self.tasks)
  67. def add_task(self, url):
  68. self.tasks.put(url)
  69. def wait_completion(self):
  70. self.tasks.join()
  71. if __name__ == '__main__':
  72. start = time.time()
  73. pool = ThreadPool(4)
  74. pool.add_task("/")
  75. pool.wait_completion()
  76. print('{} URLs fetched in {:.1f} seconds'.format(len(seen_urls),time.time() - start))

更多关于PHP相关内容感兴趣的读者可查看本站专题:《php curl用法总结》、《PHP数组(Array)操作技巧大全》、《php排序算法总结》、《PHP常用遍历算法与技巧总结》、《PHP数据结构与算法教程》、《php程序设计算法总结》、《PHP数学运算技巧总结》、《php正则表达式用法总结》、《PHP运算与运算符用法总结》、《php字符串(string)用法总结》及《php常见数据库操作技巧汇总》

希望本文所述对大家PHP程序设计有所帮助。

人气教程排行