当前位置:Gxlcms > 数据库问题 > Python3.5爬取cbooo.cn数据并且同步到mysql中

Python3.5爬取cbooo.cn数据并且同步到mysql中

时间:2021-07-01 10:21:17 帮助过:4人阅读

!/usr/local/bin/python # -*- coding: utf-8 -*- # Python: 3.5 # Author: wucl(),zhenghai.zhang # Program: 爬取CBO网站上所有电影的名称并写入数据库。 # Version: 0.1 # History: 2017.10.25 import requests,time, pymysql, re, datetime from exchangelib import DELEGATE, Account, Credentials, Message, Mailbox, HTMLBody host = xxx user = ‘xxx passwd = ‘xxx dbme = crawl dbtarget = back_brace table = movie_hotwords tabledelta = movie_hotwords_delta tablesync = slot_value port = 3306 tolist = [‘xxx@xxx.com] def get_info(): try: url = http://www.cbooo.cn/Mdata/getMdata_movie?area=50&type=0&year=0&initial=%E5%85%A8%E9%83%A8&pIndex=1 pData = requests.get(url).json() return pData[tPage], pData[tCount] except: print("获取总页数和总电影数失败") def get_movies(page): try: url = http://www.cbooo.cn/Mdata/getMdata_movie?area=50&type=0&year=0&initial=%E5%85%A8%E9%83%A8&pIndex= + str(page) pData = requests.get(url).json() movies_list = pData[pData] return movies_list except: print(获取第%s页电影列表失败 % page) def Movie_insert(host, user, passwd, dbme, port, table, movies_list): conn=pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset="utf8") cur=conn.cursor() new_movies = [] punc = "!??"#$%&'()*+,-/:;<=>@[\]^_`{|}~?????、〃》「」『』【】〔〕〖〗?????〝〞????–—‘’?“”??…?﹏.()::。·" punctuation = punc for movie in movies_list: try: movie[MovieName] = re.sub(r"[%s]+" % punctuation, "", movie["MovieName"]) cmd = insert into %s(movie_id, movie_name) values("%s", "%s") % (table, movie[ID], movie[MovieName]) cur.execute(cmd) new_movies.append(movie) except pymysql.Error: print(" "*20, movie[MovieName], "already exists, skip……") cur.close() conn.commit() conn.close() return new_movies def Movie_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, movies_list, tablesync): conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset="utf8") cur = conn.cursor() cur.execute("delete from %s " % dbme+"."+tabledelta) for movie in movies_list: try: cmd = insert into %s(movie_id, movie_name) values("%s", "%s") % (tabledelta, movie[ID], movie[MovieName]) cmdsync = insert into %s(slot_type_id, slot_value, create_by, modify_by, gmt_create, gmt_modify, out_value) values("%s", "%s", "%s", "%s", "%s", "%s", "%s") % (dbtarget+"."+tablesync, "xxxxxx", movie[MovieName], "system", "system", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),"") cur.execute(cmd) cur.execute(cmdsync) except pymysql.Error: print(" " * 20, movie[MovieName], "already exists, skip……") try: cmdbacktoskill = insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("BACKBRACE","testpass","SLOT","xxxxxx","init","SLOT_BACKBRACE_TESTPASS" ,"zhenghai.zhang","zhenghai.zhang","%s","%s") % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) cmdskilltoskillpro = insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("SKILL","deploy","SLOT","xxxxxx","init","SLOT_SKILL_DEPLOY" ,"zhenghai.zhang","zhenghai.zhang","%s","%s") % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) print(cmdbacktoskill) cur.execute(cmdbacktoskill) print(cmdskilltoskillpro) cur.execute(cmdskilltoskillpro) except pymysql.Error: print("write into back_brace.release_task error!!!") cur.close() conn.commit() conn.close() def Email(to, subject, body): creds = Credentials( username=xxxxxx, password=‘xxxxxx) account = Account( primary_smtp_address=‘xxx@xxx.com, credentials=creds, autodiscover=True, access_type=DELEGATE) m = Message( account=account, subject=subject, body=HTMLBody(body), to_recipients=[Mailbox(email_address=to)]) m.send_and_save() if __name__ == __main__: update_movies = [] pages, counts = get_info() pages = 1 for i in range(1,pages + 1): print("*"*30,i,"*"*30) movies_list = get_movies(i) new_movies = Movie_insert(host, user, passwd, dbme, port, table, movies_list) for new_movie in new_movies: print(new_movie[MovieName],"Added") onemovie = {} onemovie["ID"] = new_movie["ID"] onemovie["MovieName"] = new_movie["MovieName"] update_movies.append(onemovie) time.sleep(1) print(update_movies) try: Movie_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, update_movies, tablesync) # 将增加的电影写入movie_hotwords_delta表中 except: print("Movie update and sync Error!") subject = 本次新增电影名称 body = "本次新增的电影名称为:<hr>" for movie in update_movies: body += movie["MovieName"] + "<br>" for to in tolist: Email(to, subject, body)

 

欢迎大侠指点

 

Python3.5爬取cbooo.cn数据并且同步到mysql中

标签:add   auto   try   code   str   def   ror   address   hang   

人气教程排行