时间:2021-07-01 10:21:17 帮助过:48人阅读
- #coding: utf-8
- '''
- 春运查询火车票转让信息
- Author: piglei2007@gmail.com
- Date: 2011.01.25
- '''
- import re
- import os
- import time
- import urlparse
- import datetime
- import traceback
- import urllib2
- import socket
# Matches any run of whitespace.
BLANK_RE = re.compile(r"\s+")

# Abort any socket operation that blocks for longer than 20 seconds.
socket.setdefaulttimeout(20)

# Build a cookie-aware opener that sends browser-like headers and install it
# globally, so that plain urllib2.urlopen() calls go through it.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
opener.addheaders = [
    (
        "User-agent",
        "Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.1) Gecko/20090704 Firefox/3.5",
    ),
    ("Accept", "*/*"),
]
urllib2.install_opener(opener)
- from BeautifulSoup import BeautifulSoup
# Search-page URL templates, keyed by the source name that parse_content()
# dispatches on.  main() fills in %(train)s and %(date)s.
SOURCE = dict([
    ("58", "http://bj.58.com/huochepiao/?Num=%(train)s&StartTime=%(date)s00"),
    ("ganji", "http://bj.ganji.com/piao/cc_%(train)s/%(date)s/"),
])

# File holding the listing URLs that have already been seen, one per line.
RECORD_FILE = "/tmp/ticket_records.txt"
def parse_record():
    """Load the set of listing URLs that have already been recorded.

    Reads RECORD_FILE line by line, strips surrounding whitespace and drops
    blank lines.  If the file cannot be opened for reading, an empty
    RECORD_FILE is created and an empty set is returned.

    Returns:
        set: the non-empty, stripped lines of RECORD_FILE.

    Raises:
        IOError: if RECORD_FILE can neither be read nor created.
    """
    try:
        # "with" guarantees the handle is closed even if reading fails.
        with open(RECORD_FILE, "r") as record_file:
            stripped = (line.strip() for line in record_file)
            return set(entry for entry in stripped if entry)
    except IOError:
        # First run (or unreadable file): create an empty record file.
        with open(RECORD_FILE, "w"):
            pass
        return set()
def flush_record(records):
    """Overwrite RECORD_FILE with *records*, one entry per line.

    Args:
        records: iterable of strings (main() passes the set of seen URLs).

    Raises:
        IOError: if RECORD_FILE cannot be opened for writing.
    """
    # Close the file explicitly so the data is flushed to disk before the
    # function returns instead of whenever the handle is garbage-collected.
    with open(RECORD_FILE, "w") as record_file:
        record_file.write("\n".join(records))
def main(config):
    """Crawl every configured source and mail the newly seen sleeper tickets.

    For each train/date combination every URL template in SOURCE is fetched
    and parsed with parse_content().  Listings whose text contains u"卧"
    (sleeper) and whose URL is not yet recorded are collected, mailed with
    simple_mail() and finally persisted with flush_record().

    Args:
        config: dict with the keys "trains" (train numbers), "dates"
            (date strings substituted into the URL templates) and
            "people" (recipient addresses passed to simple_mail()).
    """
    existed = parse_record()
    to_email = []
    for train in config["trains"]:
        for date in config["dates"]:
            for source_name, url_template in SOURCE.items():
                page_url = url_template % dict(train=train, date=date)
                response = urllib2.urlopen(page_url)
                try:
                    content = response.read()
                finally:
                    # Do not leak the connection when read() fails.
                    response.close()
                soup = BeautifulSoup(content)
                for href, text in parse_content(source_name, soup, train):
                    # Resolve relative links against the page that was
                    # actually fetched; the raw template still contains the
                    # %(train)s / %(date)s placeholders.
                    link = urlparse.urljoin(page_url, href)
                    # Sleeper berths only!
                    if link not in existed and u"卧" in text:
                        to_email.append([text, link])
                        existed.add(link)
    if to_email:
        # The mail is sent as HTML, so put every listing on its own line
        # instead of running them all together.
        content = "<br />\n".join(
            " | ".join(entry) for entry in to_email
        ).encode("utf-8")
        simple_mail(config["people"], content)
    flush_record(existed)
def parse_content(type, soup, train):
    """Extract the ticket listings from a parsed search-result page.

    Args:
        type: source name; "58" and "ganji" are recognised, any other value
            yields an empty list.
        soup: the parsed page (main() passes a BeautifulSoup object).
        train: train number that was searched for; only used to filter the
            "58" page.

    Returns:
        list: one ``[href, text]`` pair per listing.  Entries without a
        link are skipped instead of raising.
    """
    result = []
    if type == "58":
        info_table = soup.find("table", id="infolist")
        if info_table is not None:
            # Match the train number literally (case-insensitive) unless it
            # is directly followed by "时刻表" (timetable).
            pattern = re.compile(u"%s(?!时刻表)" % re.escape(train), re.I)
            for x in info_table.findAll("tr", text=pattern):
                a = x.parent
                href = a.get("href") if a is not None else None
                if not href:
                    # The match is not inside a link; nothing to report.
                    continue
                _text = BLANK_RE.sub("", a.text)
                result.append([href, _text])
    elif type == "ganji":
        for x in soup.findAll("dl", {"class": "list_piao"}):
            dt = x.dt
            a = dt.a if dt is not None else None
            href = a.get("href") if a is not None else None
            if not href:
                # Malformed entry without <dt>/<a>/href; skip it rather
                # than abort the whole run.
                continue
            result.append([href, a.text])
    return result
- EMAIL_HOST = 'smtp.sohu.com'
- EMAIL_HOST_USER = 'yourname@sohu.com'
- EMAIL_HOST_PASSWORD = 'yourpassword'
- EMAIL_PORT = 25
def simple_mail(to, content):
    """Send *content* as an HTML e-mail to every address in *to*.

    The message is sent from EMAIL_HOST_USER through EMAIL_HOST:EMAIL_PORT
    after logging in with EMAIL_HOST_USER / EMAIL_HOST_PASSWORD.

    Args:
        to: sequence of recipient addresses.
        content: UTF-8 encoded HTML body.

    Raises:
        smtplib.SMTPException: if login or delivery fails.
    """
    import smtplib
    from email.header import Header
    from email.mime.text import MIMEText

    msgRoot = MIMEText(content, 'html', 'UTF-8')
    subject = "[%s]有票来啦!!!!" % datetime.datetime.today().isoformat(" ")
    # The subject contains non-ASCII text, so it must be RFC 2047 encoded
    # rather than written into the header as raw 8-bit bytes.
    msgRoot['Subject'] = Header(subject, 'UTF-8')
    msgRoot['From'] = EMAIL_HOST_USER
    msgRoot['To'] = ", ".join(to)

    s = smtplib.SMTP(EMAIL_HOST, EMAIL_PORT)
    try:
        s.login(EMAIL_HOST_USER, EMAIL_HOST_PASSWORD)
        s.sendmail(EMAIL_HOST_USER, to, msgRoot.as_string())
        # End the SMTP session properly on success.
        s.quit()
    finally:
        # Always release the socket, even when login or sending failed.
        s.close()
def switch_time_zone():
    """Switch the process time zone to Asia/Shanghai.

    Sets the TZ environment variable and, where the platform provides
    time.tzset(), calls it so the change takes effect for the time module.
    """
    os.environ["TZ"] = "Asia/Shanghai"
    # time.tzset() only exists on Unix; skip it elsewhere so that merely
    # importing this module does not raise AttributeError.
    if hasattr(time, "tzset"):
        time.tzset()
# Apply the time zone as soon as the module is loaded.
switch_time_zone()

if __name__ == '__main__':
    # What to look for and whom to notify.
    config = dict(
        trains=("k471",),
        dates=("20110129",),
        people=("youremail@sohu.com",),
    )
    try:
        main(config)
        print("%s: ok" % datetime.datetime.today())
    except Exception:
        # Top-level boundary: report the failure and let the job finish.
        print(traceback.format_exc())
然后把这个脚本加入 cron 定时运行即可,你懂的。