当前位置:Gxlcms > Python > python实现的一个火车票转让信息采集器

python实现的一个火车票转让信息采集器

时间:2021-07-01 10:21:17 帮助过:48人阅读

好吧,我承认我是对晚上看到一张合适的票转让但打过电话去说已经被搞走了这件事情感到蛋疼。直接上文件吧。

  1. #coding: utf-8
  2. '''
  3. 春运查询火车票转让信息
  4. Author: piglei2007@gmail.com
  5. Date: 2011.01.25
  6. '''
  7. import re
  8. import os
  9. import time
  10. import urlparse
  11. import datetime
  12. import traceback
  13. import urllib2
  14. import socket
  15. socket.setdefaulttimeout(20)
  16. BLANK_RE = re.compile(r"\s+")
  17. opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
  18. opener.addheaders = [
  19. ("User-agent", "Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.1) Gecko/20090704 Firefox/3.5"),
  20. ("Accept", "*/*"),
  21. ]
  22. urllib2.install_opener(opener)
  23. from BeautifulSoup import BeautifulSoup
# Search-page URL templates, keyed by the source name that parse_content()
# dispatches on.  main() fills in %(train)s and %(date)s for every
# configured train/date pair.
SOURCE = {
    "58": "http://bj.58.com/huochepiao/?Num=%(train)s&StartTime=%(date)s00",
    "ganji": "http://bj.ganji.com/piao/cc_%(train)s/%(date)s/",
}
# Text file of listing URLs that have already been seen, one per line
# (read by parse_record(), rewritten by flush_record()).
# NOTE(review): fixed name in a shared /tmp directory - confirm this is
# acceptable on multi-user hosts.
RECORD_FILE = "/tmp/ticket_records.txt"
  29. def parse_record():
  30. try:
  31. return set([x.strip() for x in open(RECORD_FILE, "r").readlines()])
  32. except IOError:
  33. open(RECORD_FILE, "w")
  34. return set()
  35. def flush_record(records):
  36. open(RECORD_FILE, "w").write("\n".join(records))
  37. def main(config):
  38. """
  39. 开始抓取
  40. """
  41. existed = parse_record()
  42. to_email = []
  43. for train in config["trains"]:
  44. for date in config["dates"]:
  45. for type, _url in SOURCE.items():
  46. url = _url % dict(train=train, date=date)
  47. content = urllib2.urlopen(url).read()
  48. soup = BeautifulSoup(content)
  49. result = parse_content(type, soup, train)
  50. for url, text in result:
  51. url = urlparse.urljoin(_url, url)
  52. # 只要卧铺!
  53. if url not in existed and u"卧" in text:
  54. to_email.append([text, url])
  55. existed.add(url)
  56. if to_email:
  57. content = "".join(
  58. [x for x in [" | ".join(y) for y in to_email]]
  59. ).encode("utf-8")
  60. simple_mail(config["people"], content)
  61. flush_record(existed)
  62. def parse_content(type, soup, train):
  63. """
  64. 获得车次信息
  65. """
  66. result = []
  67. if type == "58":
  68. info_table = soup.find("table", id="infolist")
  69. if info_table:
  70. for x in info_table.findAll("tr", text=re.compile(ur"%s(?!时刻表)" % train, re.I)):
  71. a = x.parent
  72. _text = BLANK_RE.sub("", a.text)
  73. result.append([a["href"], _text])
  74. if type == "ganji":
  75. for x in soup.findAll("dl", {"class": "list_piao"}):
  76. a = x.dt.a
  77. result.append([a["href"], a.text])
  78. return result
# SMTP settings used by simple_mail().  The user name and password look
# like placeholders - replace them with a real account before running.
EMAIL_HOST = 'smtp.sohu.com'
# Login name; simple_mail() also uses it as the From address.
EMAIL_HOST_USER = 'yourname@sohu.com'
EMAIL_HOST_PASSWORD = 'yourpassword'
EMAIL_PORT = 25
  83. def simple_mail(to, content):
  84. """
  85. 发送邮件
  86. """
  87. import smtplib
  88. from email.mime.text import MIMEText
  89. msgRoot = MIMEText(content, 'html', 'UTF-8')
  90. msgRoot['Subject'] = "[%s]有票来啦!!!!" % datetime.datetime.today().isoformat(" ")
  91. msgRoot['From'] = EMAIL_HOST_USER
  92. msgRoot['To'] = ", ".join(to)
  93. s = smtplib.SMTP(EMAIL_HOST, EMAIL_PORT)
  94. s.login(EMAIL_HOST_USER, EMAIL_HOST_PASSWORD)
  95. s.sendmail(EMAIL_HOST_USER, to, msgRoot.as_string())
  96. s.close()
  97. def switch_time_zone():
  98. """
  99. 切换时区
  100. """
  101. os.environ["TZ"] = "Asia/Shanghai"
  102. time.tzset()
  103. switch_time_zone()
  104. if __name__ == '__main__':
  105. config = {
  106. "trains": ("k471",),
  107. "dates": ("20110129",),
  108. "people": (
  109. "youremail@sohu.com",
  110. )
  111. }
  112. try:
  113. main(config)
  114. print "%s: ok" % datetime.datetime.today()
  115. except Exception, e:
  116. print traceback.format_exc()

然后放入cron,你懂的。

人气教程排行