'''
************************************************
*Made by 1120162015 李博
*        1120161966 张嘉熙
*Time:2017.9.11
*Target:All movies' information of IMDB TOP_250
*Resources:http://www.imdb.cn/IMDB250/
*纯原创 转载请注明作者:李博,张嘉熙
************************************************
'''
11
import re

import matplotlib.pyplot as plt
import numpy as np
import requests
from bs4 import BeautifulSoup
17
# Running 1-based counter of the movie currently being processed.
num = 1
# Accumulated output text for every movie, written to disk at the end.
All_txt = []
# Request headers carrying a desktop-browser User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
}
def getHTMLText(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text, or the sentinel string "错误" when the request
        fails (connection problem, timeout, or an HTTP error status).
        The sentinel is kept because callers compare against it.
    """
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx responses as failures instead of parsing error pages.
        r.raise_for_status()
    except requests.RequestException:
        return "错误"
    r.encoding = 'utf-8'
    return r.text
30
# Patterns applied to the serialized header markup of a movie page.
_H3_BLOCK = re.compile(r'<h3>[\s\S]*?</h3>')
_I_BLOCK = re.compile(r'<i>[\s\S]*?</i>')
# A four-digit year wrapped in ASCII or full-width parentheses.
_YEAR_IN_PARENS = re.compile(r'[(（]\s*(\d{4})\s*[)）]')


def _remove_all(text, *fragments):
    """Return *text* with every literal occurrence of each fragment removed."""
    for fragment in fragments:
        text = text.replace(fragment, '')
    return text


def get_all_information(url, page):
    """Scrape one movie page and append its details to ``All_txt``.

    Side effects: prints progress, appends the release year (or ``0`` when
    no year is found in the title) as one line to ``time.txt``, extends the
    module-level ``All_txt`` list and increments the module-level ``num``.
    Pages that cannot be fetched or that lack the expected containers are
    reported and skipped without touching any of that state.

    Args:
        url: Address of the movie's detail page.
        page: Number of the list page the movie came from (used in output).
    """
    global num

    txt = getHTMLText(url)
    if txt == "错误":
        print('page' + str(page) + ' NO.' + str(num) + ' Failed!')
        return
    print('page' + str(page) + ' NO.' + str(num) + ' Get it!')
    if num == 247:
        print('Finished!!!')

    soup = BeautifulSoup(txt, "html.parser")
    header = soup.find('div', class_='hdd')
    details = soup.find('div', class_='bdd clear')
    synopsis = soup.find('div', class_='fk-4 clear')
    if header is None or details is None or synopsis is None:
        print('page' + str(page) + ' NO.' + str(num) + ' Skipped: unexpected page layout')
        return

    # Chinese title and score both live in the header block.
    header_html = str(header)
    Cname = _remove_all(''.join(_H3_BLOCK.findall(header_html)), '<h3>', '</h3>')
    Score = _remove_all(''.join(_I_BLOCK.findall(header_html)), '<i>', '</i>')

    # Record the release year; "0" marks a movie without year information,
    # which the year statistics count as "no information".
    year = _YEAR_IN_PARENS.search(Cname)
    with open('time.txt', 'a', encoding='utf-8') as t:
        t.write((year.group(1) if year else '0') + '\n')

    # The detail list: item 0 is the English name, 2 the director,
    # 3 the cast, 4-6 further facts.
    many_infor = details.find_all('li')
    Ename, Actor, Starring = '', '', ''
    if many_infor:
        Ename = _remove_all(
            str(many_infor[0]),
            '<li>', '<i>', '</i>', '</li>', '<a>', '</a>',
        )
    if len(many_infor) > 2:
        director_links = many_infor[2].find_all('a')
        if director_links:
            Actor = director_links[0].get_text().replace('导演:', '')
    if len(many_infor) > 3:
        Starring = ''.join(
            link.get_text().replace(' ', '') + ' '
            for link in many_infor[3].find_all('a')
        )
    # Slicing tolerates pages that have fewer than seven list items.
    Infor = ''.join(
        item.get_text().replace(' ', '') + ' ' + '\n'
        for item in many_infor[4:7]
    )

    # Synopsis heading and body; <br/> becomes a line break.
    title_div = synopsis.find('div', class_='hdd')
    title = ''
    if title_div is not None:
        title = str(title_div).replace('<div class="hdd">', '').replace('</div>', '\n')
    body_div = synopsis.find('div', class_='bdd clear')
    content_1 = ''
    if body_div is not None:
        content_1 = str(body_div).replace(
            '<div class="bdd clear" style="font-size:15px">', ''
        ).replace('</div>', '')
        content_1 = content_1.replace(
            '<!-- <p><a href="#">更多剧情 >></a></p> -->', ''
        ).replace('<br/>', '\n')

    # Store everything collected for this movie.
    All_txt.extend([
        '第' + str(num) + '部' + '\n',
        Cname + '\n',
        '【英文名】' + Ename + '\n',
        '【评分】' + Score + '\n',
        '【导演】' + Actor + '\n',
        '【主演】' + Starring + '\n',
        Infor + '\n',
        title + '\n' + content_1 + '\n',
        '\n',
    ])
    num += 1
110
# Captures the href value of anchors whose first attribute is href; stopping
# at the closing quote keeps any following attributes out of the URL.
_LEADING_HREF = re.compile(r'<a href="([^"]*)"')


def getin_one(url, page):
    """Collect every movie link on one list page and scrape each movie.

    Args:
        url: Address of the list page.
        page: Number of that list page, passed through for progress output.

    A page that cannot be fetched, or that has no ``ss-3 clear`` container,
    is reported and skipped.
    """
    txt = getHTMLText(url)
    if txt == "错误":
        print('page' + str(page) + ' Failed!')
        return

    soup = BeautifulSoup(txt, "html.parser")
    listing = soup.find('div', class_="ss-3 clear")
    if listing is None:
        print('page' + str(page) + ' Skipped: unexpected page layout')
        return

    for href in _LEADING_HREF.findall(str(listing)):
        get_all_information('http://www.imdb.cn' + href, page)
124
def Analyze_some_infor():
    """Count the years stored in ``time.txt`` per era and draw a bar chart.

    Each line of ``time.txt`` is read as one year. A value of 0, a blank
    line, or a line that is not an integer counts as "no information".
    The chart is saved to ``time_pic.jpg`` and then shown.
    """
    plt.rc('font', family='SimHei', size=13)  # font family and size

    # Order matches range_time below: four 20-year eras, 2000 onward, no info.
    times = [0, 0, 0, 0, 0, 0]
    with open('time.txt', encoding='utf-8') as file:
        for line in file:
            # int() replaces the original eval(): the file content comes from
            # scraped web pages and must never be executed as code.
            try:
                year = int(line)
            except ValueError:
                year = 0
            if year == 0:
                times[5] += 1
            elif 1920 <= year < 1940:
                times[0] += 1
            elif 1940 <= year < 1960:
                times[1] += 1
            elif 1960 <= year < 1980:
                times[2] += 1
            elif 1980 <= year < 2000:
                times[3] += 1
            else:
                # NOTE(review): as in the original, every other value lands
                # here, including years before 1920 - confirm that is intended.
                times[4] += 1

    range_time = ['1920-1940', '1940-1960', '1960-1980', '1980-2000', '2000-现在', '无信息']
    idx = np.arange(len(range_time))
    width = 0.5
    # Centre the bars on idx explicitly and put the ticks there too, so the
    # labels line up regardless of the library's default bar alignment.
    plt.bar(idx, times, width, color='green', align='center')
    plt.xticks(idx, range_time, rotation=40)
    plt.xlabel('电影年代')
    plt.ylabel('数目')
    plt.savefig('time_pic.jpg')
    plt.show()
155
def main():
    """Scrape list pages 1-9, write the collected text, then plot the years."""
    # time.txt is only ever appended to while scraping; empty it first so
    # years left over from an earlier run are not counted again.
    with open('time.txt', 'w', encoding='utf-8'):
        pass

    getin_one('http://www.imdb.cn/IMDB250/', 1)
    for i in range(2, 10):
        getin_one('http://www.imdb.cn/imdb250/' + str(i), i)

    # Mode 'w' discards any previous content before the new text is written.
    with open('All_infor.txt', 'w', encoding='utf-8') as x:
        x.writelines(All_txt)

    Analyze_some_infor()
168
if __name__ == "__main__":
    # Start the scrape only when this file is run as a script.
    main()
# 作者: LB919
# 出处:http://www.cnblogs.com/L1B0/
# 该文章为LB919投入了时间和精力的原创;
# 如有转载,荣幸之至!请随手标明出处;
# IMDB TOP 250爬虫
# 标签:time com mozilla core print 作业 enc sts ***