Python3.5爬取豆瓣电视剧数据并且同步到mysql中
时间:2021-07-01 10:21:17
帮助过:4人阅读
!/usr/local/bin/python
# -*- coding: utf-8 -*-
# Python: 3.5
# Author: zhenghai.zhang@xxx.com
# Program: 爬取豆瓣网站上所有电视剧的名称并写入数据库。
# Version: 0.1
# History: 2017.11.01
import requests,time, pymysql, re, datetime, json
from exchangelib
import DELEGATE, Account, Credentials, Message, Mailbox, HTMLBody
host =
‘xxx‘
user =
‘xxx‘
passwd =
‘xxx‘
dbme =
‘crawl‘
dbtarget =
‘back_brace‘
table =
‘tv_hotwords‘
tabledelta =
‘tv_hotwords_delta‘
tablesync =
‘slot_value‘
port = 3306
tolist = [
‘zhenghai.zhang@xxx.com‘]
def get_tvs(urlbase, page):
try:
url = urlbase +
str(page)
print(url)
try:
result =
requests.get(url).text
jresult =
json.loads(result)
tvs = jresult.get(
‘subjects‘)
except:
print(
‘爬取‘ + urlbase + str(page) +
‘失败!‘)
time.sleep(2
)
return tvs
except:
print(
‘获取第%s页电影列表失败‘ %
page)
def tv_insert(host, user, passwd, dbme, port, table, tvs_list):
conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset=
"utf8")
cur =
conn.cursor()
new_tvs =
[]
punc =
"!??"#$%&'()*+,-/:;<=>@[\]^_`{|}~?????、〃》「」『』【】〔〕〖〗?????〝〞????–—‘’?“”??…?﹏.()::。· "
punctuation =
punc
for tv
in tvs_list:
try:
tv[‘title‘] = re.sub(r
"[%s]+" % punctuation,
"", tv.get(
‘title‘))
cmd =
‘insert into %s(tv_id, tv_name) values("%s", "%s")‘ %
(
table, tv.get(‘id‘), tv.get(
‘title‘))
cur.execute(cmd)
new_tvs.append(tv)
except:
print(
" " * 20, tv.get(
‘title‘),
"already exists, skip……")
cur.close()
conn.commit()
conn.close()
return new_tvs
def tv_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, tvs_list, tablesync):
conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset=
"utf8")
cur =
conn.cursor()
cur.execute("delete from %s " % dbme+
"."+
tabledelta)
for tv
in tvs_list:
try:
cmd =
‘insert into %s(tv_id, tv_name) values("%s", "%s")‘ % (tabledelta, tv[
‘id‘], tv[
‘title‘])
cmdsync =
‘insert into %s(slot_type_id, slot_value, create_by, modify_by, gmt_create, gmt_modify, out_value) values("%s", "%s", "%s", "%s", "%s", "%s", "%s")‘ % (dbtarget+
"."+tablesync,
"xxxxxx", tv[
‘title‘],
"system",
"system", datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"),
"")
cur.execute(cmd)
cur.execute(cmdsync)
except pymysql.Error:
print(
" " * 20, tv[
‘title‘],
"already exists, skip……")
cur.close()
conn.commit()
conn.close()
def tv_new_to_release(host, user, passwd, dbtarget, port):
conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbtarget, port=port, charset=
"utf8")
cur =
conn.cursor()
try:
cmdbacktoskill =
‘insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("BACKBRACE","testpass","SLOT","xxxxxx","init","SLOT_BACKBRACE_TESTPASS" ,"zhenghai.zhang","zhenghai.zhang","%s","%s")‘ % (datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"))
cmdskilltoskillpro =
‘insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("SKILL","deploy","SLOT","xxxxxx","init","SLOT_SKILL_DEPLOY" ,"zhenghai.zhang","zhenghai.zhang","%s","%s")‘ % (datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"))
print(cmdbacktoskill)
cur.execute(cmdbacktoskill)
print(cmdskilltoskillpro)
cur.execute(cmdskilltoskillpro)
except pymysql.Error:
print(
"write into back_brace.release_task error!!!")
cur.close()
conn.commit()
conn.close()
def Email(to, subject, body):
creds =
Credentials(
username=
‘xxxxxx‘,
password=
‘xxxxxx‘)
account =
Account(
primary_smtp_address=
‘xxx@xxx.com‘,
credentials=
creds,
autodiscover=
True,
access_type=
DELEGATE)
m =
Message(
account=
account,
subject=
subject,
body=
HTMLBody(body),
to_recipients=[Mailbox(email_address=
to)])
m.send_and_save()
if __name__ ==
‘__main__‘:
update_tvs =
[]
pages = 25
# 豆瓣每项电视剧只有前500部
# 美剧 英剧 韩剧 日剧 国产剧 港剧 日本动画 综艺
urlbaselist = [
‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%BE%8E%E5%89%A7&sort=recommend&page_limit=20&page_start=‘,
‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E8%8B%B1%E5%89%A7&sort=recommend&page_limit=20&page_start=‘,
‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E9%9F%A9%E5%89%A7&sort=recommend&page_limit=20&page_start=‘,
‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E6%97%A5%E5%89%A7&sort=recommend&page_limit=20&page_start=‘,
‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E5%9B%BD%E4%BA%A7%E5%89%A7&sort=recommend&page_limit=20&page_start=‘,
‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E6%B8%AF%E5%89%A7&sort=recommend&page_limit=20&page_start=‘,
‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E6%97%A5%E6%9C%AC%E5%8A%A8%E7%94%BB&sort=recommend&page_limit=20&page_start=‘,
‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%BB%BC%E8%89%BA&sort=recommend&page_limit=20&page_start=‘]
for urlbase
in urlbaselist:
for i
in range(pages):
print(
"*"*30, i,
"*"*30
)
tvs_list = get_tvs(urlbase, i * 20
)
new_tvs =
tv_insert(host, user, passwd, dbme, port, table, tvs_list)
for tv
in new_tvs:
print(tv[
‘title‘],
"Added")
onetv =
{}
onetv["id"] = tv[
"id"]
onetv["title"] = tv[
"title"]
update_tvs.append(onetv)
time.sleep(1
)
print(update_tvs)
try:
tv_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, update_tvs, tablesync) # 将增加的电影写入movie_hotwords_delta表中
except:
print(
"tv update and sync Error!")
try:
tv_new_to_release(host, user, passwd, dbtarget, port)
except:
print(
"tv_new_to_release error!!!")
subject =
‘本次新增电视剧名称‘
body =
"本次新增的电影名称为:<hr>"
for movie
in update_tvs:
body += movie[
‘title‘] +
"<br>"
for to
in tolist:
Email(to, subject, body)
还请各位大侠指点
Python3.5爬取豆瓣电视剧数据并且同步到mysql中
标签:app sys import cts test strftime amp pass 失败