Python3.5爬取cbooo.cn数据并且同步到mysql中
时间:2021-07-01 10:21:17
帮助过:4人阅读
!/usr/local/bin/python
# -*- coding: utf-8 -*-
# Python: 3.5
# Author: wucl(),zhenghai.zhang
# Program: 爬取CBO网站上所有电影的名称并写入数据库。
# Version: 0.1
# History: 2017.10.25
import requests,time, pymysql, re, datetime
from exchangelib
import DELEGATE, Account, Credentials, Message, Mailbox, HTMLBody
host =
‘xxx‘
user =
‘xxx‘
passwd =
‘xxx‘
dbme =
‘crawl‘
dbtarget =
‘back_brace‘
table =
‘movie_hotwords‘
tabledelta =
‘movie_hotwords_delta‘
tablesync =
‘slot_value‘
port = 3306
tolist = [
‘xxx@xxx.com‘]
def get_info():
try:
url =
‘http://www.cbooo.cn/Mdata/getMdata_movie?area=50&type=0&year=0&initial=%E5%85%A8%E9%83%A8&pIndex=1‘
pData =
requests.get(url).json()
return pData[
‘tPage‘], pData[
‘tCount‘]
except:
print(
"获取总页数和总电影数失败")
def get_movies(page):
try:
url =
‘http://www.cbooo.cn/Mdata/getMdata_movie?area=50&type=0&year=0&initial=%E5%85%A8%E9%83%A8&pIndex=‘ +
str(page)
pData =
requests.get(url).json()
movies_list = pData[
‘pData‘]
return movies_list
except:
print(
‘获取第%s页电影列表失败‘ %
page)
def Movie_insert(host, user, passwd, dbme, port, table, movies_list):
conn=pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset=
"utf8")
cur=
conn.cursor()
new_movies =
[]
punc =
"!??"#$%&'()*+,-/:;<=>@[\]^_`{|}~?????、〃》「」『』【】〔〕〖〗?????〝〞????–—‘’?“”??…?﹏.()::。·"
punctuation =
punc
for movie
in movies_list:
try:
movie[‘MovieName‘] = re.sub(r
"[%s]+" % punctuation,
"", movie[
"MovieName"])
cmd =
‘insert into %s(movie_id, movie_name) values("%s", "%s")‘ % (table, movie[
‘ID‘], movie[
‘MovieName‘])
cur.execute(cmd)
new_movies.append(movie)
except pymysql.Error:
print(
" "*20, movie[
‘MovieName‘],
"already exists, skip……")
cur.close()
conn.commit()
conn.close()
return new_movies
def Movie_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, movies_list, tablesync):
conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset=
"utf8")
cur =
conn.cursor()
cur.execute("delete from %s " % dbme+
"."+
tabledelta)
for movie
in movies_list:
try:
cmd =
‘insert into %s(movie_id, movie_name) values("%s", "%s")‘ % (tabledelta, movie[
‘ID‘], movie[
‘MovieName‘])
cmdsync =
‘insert into %s(slot_type_id, slot_value, create_by, modify_by, gmt_create, gmt_modify, out_value) values("%s", "%s", "%s", "%s", "%s", "%s", "%s")‘ % (dbtarget+
"."+tablesync,
"xxxxxx", movie[
‘MovieName‘],
"system",
"system", datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"),
"")
cur.execute(cmd)
cur.execute(cmdsync)
except pymysql.Error:
print(
" " * 20, movie[
‘MovieName‘],
"already exists, skip……")
try:
cmdbacktoskill =
‘insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("BACKBRACE","testpass","SLOT","xxxxxx","init","SLOT_BACKBRACE_TESTPASS" ,"zhenghai.zhang","zhenghai.zhang","%s","%s")‘ % (datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"))
cmdskilltoskillpro =
‘insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("SKILL","deploy","SLOT","xxxxxx","init","SLOT_SKILL_DEPLOY" ,"zhenghai.zhang","zhenghai.zhang","%s","%s")‘ % (datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"))
print(cmdbacktoskill)
cur.execute(cmdbacktoskill)
print(cmdskilltoskillpro)
cur.execute(cmdskilltoskillpro)
except pymysql.Error:
print(
"write into back_brace.release_task error!!!")
cur.close()
conn.commit()
conn.close()
def Email(to, subject, body):
creds =
Credentials(
username=
‘xxxxxx‘,
password=
‘xxxxxx‘)
account =
Account(
primary_smtp_address=
‘xxx@xxx.com‘,
credentials=
creds,
autodiscover=
True,
access_type=
DELEGATE)
m =
Message(
account=
account,
subject=
subject,
body=
HTMLBody(body),
to_recipients=[Mailbox(email_address=
to)])
m.send_and_save()
if __name__ ==
‘__main__‘:
update_movies =
[]
pages, counts =
get_info()
pages = 1
for i
in range(1,pages + 1
):
print(
"*"*30,i,
"*"*30
)
movies_list =
get_movies(i)
new_movies =
Movie_insert(host, user, passwd, dbme, port, table, movies_list)
for new_movie
in new_movies:
print(new_movie[
‘MovieName‘],
"Added")
onemovie =
{}
onemovie["ID"] = new_movie[
"ID"]
onemovie["MovieName"] = new_movie[
"MovieName"]
update_movies.append(onemovie)
time.sleep(1
)
print(update_movies)
try:
Movie_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, update_movies, tablesync) # 将增加的电影写入movie_hotwords_delta表中
except:
print(
"Movie update and sync Error!")
subject =
‘本次新增电影名称‘
body =
"本次新增的电影名称为:<hr>"
for movie
in update_movies:
body += movie[
"MovieName"] +
"<br>"
for to
in tolist:
Email(to, subject, body)
欢迎大侠指点
Python3.5爬取cbooo.cn数据并且同步到mysql中
标签:add auto try code str def ror address hang