  1. #grab ip proxies from xicidaili
  2. import sys, time, re, requests
  3. from multiprocessing.dummy import Pool as ThreadPool
  4. from lxml import etree
  # Default output file used when no file name is given on the command line.
  IP_POOL = 'ip_pool.py'
  # Listing page to scrape: high-anonymity proxies.
  URL = 'http://www.xicidaili.com/nn/'
  # Alternative listing (plain HTTP proxies): 'http://www.xicidaili.com/wt/'
  # Time of this run; written into the header of the output file.
  RUN_TIME = time.strftime("%Y-%m-%d %H:%M", time.localtime())
  # Proxies that passed the liveness check, grouped by scheme.
  alive_ip = {'http': [], 'https': []}
  # Thread pool used to check the candidate proxies concurrently.
  pool = ThreadPool(20)
  def get_html(url, timeout=10):
      """Download *url* and return the response body as UTF-8 decoded text.

      Args:
          url: Address of the page to fetch.
          timeout: Seconds to wait for the server before giving up. Without
              a timeout ``requests.get`` can block indefinitely.

      Returns:
          The page content as ``str``.

      Raises:
          requests.RequestException: On connection errors, timeouts, or an
              HTTP error status (4xx/5xx).
      """
      # Browser-like request headers (presumably so the site serves the
      # regular page to the script).
      headers = {
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0",
          "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
          "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
          "Accept-Encoding": "gzip, deflate",
          "Referer": "https://www.xicidaili.com/",
          "Connection": "keep-alive",
          "Upgrade-Insecure-Requests": "1",
      }
      r = requests.get(url, headers=headers, timeout=timeout)
      # Fail here with a clear HTTP error instead of handing an error page
      # to the parser.
      r.raise_for_status()
      r.encoding = 'utf-8'
      return r.text
  def test_alive(proxy):
      """Probe *proxy* and, if it answers, record it in ``alive_ip``.

      Args:
          proxy: Proxy URL of the form ``scheme://ip:port``.

      A proxy whose URL starts with ``https`` is stored in
      ``alive_ip['https']``; any other proxy goes to ``alive_ip['http']``.
      Proxies that fail with a request error are reported on stdout; a
      non-200 answer is dropped silently.
      """
      scheme = 'https' if proxy.startswith('https') else 'http'
      # requests chooses the proxy by the scheme of the URL being fetched, so
      # the probe URL must use the same scheme as the proxies key. Mapping the
      # proxy under 'http' while fetching an https URL would bypass the proxy
      # entirely and make every candidate look alive.
      probe_url = '%s://www.baidu.com' % scheme
      try:
          r = requests.get(probe_url, proxies={scheme: proxy}, timeout=3)
      except requests.RequestException:
          print("%s无效!" % proxy)
          return
      if r.status_code == 200:
          alive_ip[scheme].append(proxy)
  def get_alive_ip_address():
      """Scrape the proxy table at ``URL`` and check the promising entries.

      Rows are kept only when both timing values are at most 1, the
      anonymity column reads '高匿' (high anonymity) and the type is http or
      https. The surviving ``scheme://ip:port`` strings are passed to
      ``test_alive`` through the thread pool.

      Raises:
          IndexError: If the page contains no table with id ``ip_list``.
      """
      iplist = []
      html = get_html(URL)
      selector = etree.HTML(html)
      tables = selector.xpath('//table[@id="ip_list"]')
      if not tables:
          # Same exception type as a bare `[0]` lookup, but with an
          # explanation of what went wrong.
          raise IndexError('table "ip_list" not found in %s' % URL)
      # The first row is skipped (header row).
      for row in tables[0].xpath('./tr')[1:]:
          try:
              speed, connect_time = row.xpath('.//div/@title')
              cells = row.xpath('./td')
              ip = cells[1].xpath('./text()')[0]
              port = cells[2].xpath('./text()')[0]
              anonymous = cells[4].xpath('./text()')[0]
              ip_type = cells[5].xpath('./text()')[0].strip().lower()
              # The last character of each timing value is dropped before
              # conversion (presumably a one-character unit suffix).
              too_slow = float(speed[:-1]) > 1 or float(connect_time[:-1]) > 1
          except (IndexError, ValueError):
              # Malformed row: skip it instead of aborting the whole scrape.
              continue
          # Filter out slow and non-high-anonymity proxies.
          if too_slow or anonymous != '高匿':
              continue
          # alive_ip only has http and https pools; other proxy types would
          # otherwise be filed under http.
          if ip_type not in ('http', 'https'):
              continue
          iplist.append(ip_type + '://' + ip + ':' + port)
      pool.map(test_alive, iplist)
  def _render_pool(name, proxies):
      """Return ``name = \\`` followed by *proxies* as a list literal, one entry per line."""
      # Each entry is rendered on its own, so a comma inside a proxy string
      # cannot split the literal across lines.
      return '%s = \\\n[%s]' % (name, ',\n '.join(repr(p) for p in proxies))


  def write_txt(output_file):
      """Write the live proxies to *output_file* as an importable Python module.

      The file starts with a ``#create time`` comment holding ``RUN_TIME`` and
      defines two lists, ``http_ip_pool`` and ``https_ip_pool``, taken from
      ``alive_ip``. An existing file is overwritten.

      Args:
          output_file: Path of the file to write.
      """
      content = '#create time: %s\n\n%s\n\n%s' % (
          RUN_TIME,
          _render_pool('http_ip_pool', alive_ip['http']),
          _render_pool('https_ip_pool', alive_ip['https']),
      )
      with open(output_file, 'w', encoding='utf-8') as f:
          f.write(content)
      print('write successful: %s' % output_file)
  def main(output_file=None):
      """Collect live proxies and write them to a file.

      Args:
          output_file: Path of the file to write. When omitted, the first
              command-line argument is used, falling back to ``IP_POOL``.
      """
      if output_file is None:
          # Resolve the target here rather than through a global that exists
          # only when the module is run as a script.
          output_file = sys.argv[1] if len(sys.argv) > 1 else IP_POOL
      get_alive_ip_address()
      write_txt(output_file)


  if __name__ == '__main__':
      main()


  1. root@c:test$ python get_ip_proxies.py
  2. write successful: ip_pool.py


  1. root@c:test$ vim ip_pool.py

  1. #create time: 2019-03-14 19:53
  2. http_ip_pool = \
  3. ['',
  4. '',
  5. '',
  6. '',
  7. '',
  8. '',
  9. '',
  10. '',
  11. '',
  12. '',
  13. '',
  14. '',
  15. '',
  16. '',
  17. '',
  18. '',
  19. '',
  20. '',
  21. '',
  22. '',
  23. '',
  24. '',
  25. '',
  26. '',
  27. '',
  28. '',
  29. '',
  30. '',
  31. '',
  32. '',
  33. '',
  34. '',
  35. '',
  36. '',
  37. '']
  38. https_ip_pool = \
  39. ['',
  40. '',
  41. '',
  42. '',
  43. '',
  44. '',
  45. '',
  46. '',
  47. '',
  48. '',
  49. '',
  50. '',
  51. '',
  52. '',
  53. '',
  54. '',
  55. '',
  56. '',
  57. '',
  58. '',
  59. '',
  60. '',
  61. '',
  62. '',
  63. '',
  64. '',
  65. '',
  66. '',
  67. '',
  68. '',
  69. '',
  70. '',
  71. '',
  72. '',
  73. '',
  74. '',
  75. '']


  1. from ip_pool import http_ip_pool, https_ip_pool

