# 这篇文章主要介绍了用 Python 实现的一个火车票转让信息采集器，采集信息来源是 58 同城或者赶集网，需要的朋友可以参考下。
#coding: utf-8
'''
春运查询火车票转让信息
author: piglei2007@gmail.com
date: 2011.01.25
'''
import re
import os
import time
import urlparse
import datetime
import traceback
import urllib2
import socket
# Give every socket created from here on a 20-second timeout so that a stalled
# connection cannot hang the script indefinitely.
socket.setdefaulttimeout(20)

# Matches any run of whitespace; used to squeeze scraped listing text.
blank_re = re.compile(r'\s+')
# Cookie-aware opener that sends browser-like request headers.  It is installed
# globally so that plain urllib2.urlopen() calls go through it.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
opener.addheaders = [
    ('User-Agent',
     'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.1) '
     'Gecko/20090704 Firefox/3.5'),
    ('Accept', '*/*'),
]
urllib2.install_opener(opener)
from BeautifulSoup import BeautifulSoup
# Listing-page URL templates keyed by site name.  Each template is filled in
# with a ``train`` number and a ``date`` through %-formatting with a mapping.
source = {
    '58': 'http://bj.58.com/huochepiao/?num=%(train)s&starttime=%(date)s00',
    'ganji': 'http://bj.ganji.com/piao/cc_%(train)s/%(date)s/',
}

# Text file that stores one already-reported listing URL per line.
record_file = '/tmp/ticket_records.txt'
def parse_record(path=None):
    """Load the set of listing URLs that have already been reported.

    Args:
        path: File to read, one URL per line.  Defaults to the module-level
            ``record_file``.

    Returns:
        A set holding the whitespace-stripped, non-blank lines of the file.
        If the file cannot be opened for reading it is created empty and an
        empty set is returned.
    """
    if path is None:
        path = record_file
    try:
        handle = open(path, 'r')
    except IOError:
        # Typically the first run: create the record file so later runs find it.
        open(path, 'w').close()
        return set()
    try:
        stripped = (line.strip() for line in handle)
        return set(line for line in stripped if line)
    finally:
        handle.close()
def flush_record(records, path=None):
    """Overwrite the record file with the given URLs, one per line.

    Args:
        records: Iterable of URL strings to persist.
        path: File to overwrite.  Defaults to the module-level ``record_file``.
    """
    if path is None:
        path = record_file
    # The context manager closes (and therefore flushes) the handle even when
    # the write fails; sorting keeps the file contents stable between runs.
    with open(path, 'w') as handle:
        handle.write('\n'.join(sorted(records)))
def main(config):
    """Scrape each configured train/date on every site and mail new listings.

    Args:
        config: Mapping with the keys ``'trains'`` (train numbers),
            ``'dates'`` (dates in the form the site URLs expect) and
            ``'people'`` (recipient e-mail addresses).

    A listing is reported only when its URL is not yet in the record file and
    its text contains the character u'卧' (sleeper berth).  If anything new is
    found, one e-mail listing all new entries is sent.  The record file is
    rewritten with every known URL at the end.
    """
    existed = parse_record()
    to_email = []
    for train in config['trains']:
        for date in config['dates']:
            for site, template in source.items():
                page_url = template % dict(train=train, date=date)
                response = urllib2.urlopen(page_url)
                try:
                    content = response.read()
                finally:
                    response.close()
                soup = BeautifulSoup(content)
                for href, text in parse_content(site, soup, train):
                    # Resolve relative links against the page that was actually
                    # fetched rather than the unformatted URL template.
                    link = urlparse.urljoin(page_url, href)
                    # Sleeper berths only.
                    if link not in existed and u'卧' in text:
                        to_email.append([text, link])
                        existed.add(link)
    if to_email:
        # One "text | url" entry per listing.
        # NOTE(review): the separator between entries is not visible in the
        # source as received; '<br />' is used because simple_mail() sends the
        # body as HTML -- confirm against the intended mail layout.
        body = u'<br />'.join(u' | '.join(entry) for entry in to_email)
        simple_mail(config['people'], body.encode('utf-8'))
    flush_record(existed)
def parse_content(type, soup, train):
    """Extract ``[href, text]`` pairs for listings found on a parsed page.

    Args:
        type: Site key, ``'58'`` or ``'ganji'``.  Any other value yields ``[]``.
        soup: Parsed page exposing ``find`` / ``findAll`` (presumably a
            BeautifulSoup 3 object -- confirm against the installed version).
        train: Train number searched for on 58 pages, case-insensitively.

    Returns:
        A list of ``[href, text]`` lists.  ``href`` is returned as found in
        the page and may be relative.
    """
    result = []
    if type == '58':
        info_table = soup.find('table', id='infolist')
        if info_table:
            # Match the train number unless it is directly followed by
            # "时刻表" (timetable).  The number is escaped so that it is always
            # treated as literal text.
            pattern = re.compile(u'%s(?!时刻表)' % re.escape(train), re.I)
            for x in info_table.findAll('tr', text=pattern):
                # The match is a text node; its parent is expected to be the
                # listing's <a> element.
                a = x.parent
                href = a.get('href') if a is not None else None
                if not href:
                    # Matching text outside a link: nothing to report.
                    continue
                _text = blank_re.sub('', a.text)
                result.append([href, _text])
    elif type == 'ganji':
        for x in soup.findAll('dl', {'class': 'list_piao'}):
            dt = x.dt
            a = dt.a if dt is not None else None
            href = a.get('href') if a is not None else None
            if not href:
                # Entry without a title link: skip instead of raising.
                continue
            result.append([href, a.text])
    return result
# SMTP account used to send notifications.  Every value can be overridden with
# an environment variable so that real credentials do not have to be written
# into this file; the literals are the fallbacks used when a variable is unset.
email_host = os.environ.get('TICKET_EMAIL_HOST', 'smtp.sohu.com')
email_host_user = os.environ.get('TICKET_EMAIL_USER', 'yourname@sohu.com')
email_host_password = os.environ.get('TICKET_EMAIL_PASSWORD', 'yourpassword')
email_port = int(os.environ.get('TICKET_EMAIL_PORT', 25))
def simple_mail(to, content):
    """Send ``content`` as an HTML e-mail to every address in ``to``.

    Args:
        to: Iterable of recipient e-mail addresses.
        content: HTML body, declared to the mail library as UTF-8.

    The SMTP connection uses the module-level ``email_host``, ``email_port``,
    ``email_host_user`` and ``email_host_password`` settings.  Errors raised
    by ``smtplib`` propagate to the caller.
    """
    import smtplib
    from email.header import Header
    from email.mime.text import MIMEText

    recipients = list(to)
    msgroot = MIMEText(content, 'html', 'utf-8')
    # The subject contains non-ASCII characters, so it is wrapped in a Header
    # with an explicit charset instead of being placed in the message raw.
    subject = u'[%s]有票来啦!!!!' % datetime.datetime.today().isoformat()
    msgroot['Subject'] = Header(subject, 'utf-8')
    msgroot['From'] = email_host_user
    msgroot['To'] = ', '.join(recipients)

    s = smtplib.SMTP(email_host, email_port)
    try:
        s.login(email_host_user, email_host_password)
        s.sendmail(email_host_user, recipients, msgroot.as_string())
    finally:
        # Release the connection even if login or delivery fails.
        s.close()
def switch_time_zone():
    """Switch the process's local time zone to Asia/Shanghai."""
    # Both the variable name and the zone name are written in their canonical
    # case; lower-cased forms are not reliably recognised.
    os.environ['TZ'] = 'Asia/Shanghai'
    # time.tzset() is only available on Unix.  Where it is missing, the TZ
    # variable is still set but the time module is not re-initialised.
    if hasattr(time, 'tzset'):
        time.tzset()


switch_time_zone()
if __name__ == '__main__':
    # Meant to be run periodically, e.g. from a cron job.
    config = {
        'trains': ('k471',),
        'dates': ('20110129',),
        'people': (
            'youremail@sohu.com',
        ),
    }
    try:
        main(config)
        print("%s: ok" % datetime.datetime.today())
    except Exception:
        # Top-level boundary: print the traceback so it ends up in the job's
        # output instead of stopping with an unhandled exception.
        print(traceback.format_exc())