Python 3 原生爬虫
博主最近学习了Python 3的有关爬虫方面的知识,只是限于简单的原生爬虫,技术性并不高,求各位大佬轻喷,多多给予我这个小菜鸟一些支持,谢谢大家。
本次项目用到的模块:
matplotlib
:用于画图
wordcloud
:分析数据,画出词云
requests
:爬取数据
time
:减速请求速度,防止ip被封
re
:匹配正则表达式
1.爬取猫眼电影TOP100
目标URL
:https://maoyan.com/board/4
GET请求
:通过分析,url通过一个请求参数offset分页,每页10部电影,offset的取值规律为页数x10
正则表达式
:<p class="name">.*?>(.*?)</a></p>.*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>.*?<i class="integer">(.*?)</i><i class="fraction">(.*?)</i>
#!/usr/bin/env python
# coding=utf-8
''' 导入库 '''
import requests
import re
import time
# Accumulates one dict per scraped movie, across every requested page.
movies = []
# Global rank counter; handed to main() so ranks continue across pages.
number = 1
''' 主方法 '''
def main(offet,number):
    """Fetch one page of the Maoyan TOP100 board and append parsed movies.

    offet  -- value for the ``offset`` query parameter (page index * 10)
    number -- rank assigned to the first movie on this page
    """
    # Build the board URL for the requested offset.
    url = 'https://maoyan.com/board/4?offset=' + str(offet)
    # Pretend to be a real browser so the site serves the normal page.
    headers = {
        'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
    }
    # Fetch the page.
    r = requests.get(url,headers=headers)
    # One tuple per movie:
    # (title, cast, release date, score integer part, score fraction part).
    results = re.findall('<p class="name">.*?>(.*?)</a></p>.*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>.*?<i class="integer">(.*?)</i><i class="fraction">(.*?)</i>',r.text,re.S)
    # Convert each match into a record and store it in the shared list.
    for title, star, release, score_int, score_frac in results:
        movies.append({
            '排名': number,
            '电影名称': title,
            '演员': star.strip(),
            '上映时间': release,
            '分数': score_int + score_frac,
        })
        number += 1
# The board has 10 pages; request each one, throttled to avoid an IP ban.
for i in range(10):
    main(i * 10, number)
    # Each page holds exactly 10 movies, so advance the global rank counter.
    number += 10
    time.sleep(1)
# Persist the results. UTF-8 is specified explicitly: the record keys are
# Chinese, and the platform default encoding (e.g. GBK on Windows) would
# raise UnicodeEncodeError.
with open('result.txt', 'w', encoding='utf-8') as file:
    for x in movies:
        for key, value in x.items():
            file.write(key + ":" + str(value) + " ")
        file.write('\n')
2.爬取热门电视剧电影豆瓣评论,分析出观众对于该部影视的关键词
① 电影 "毒液"
目标URL
:https://movie.douban.com/subject/3168101/comments?start=' + page_number + '&limit=20&sort=new_score&status=P
GET请求分析
:start参数代表了页面评论是从多少条开始,第一页是0,第二页是20,以此类推,获得url规律为页数x20
正则表达式
:<span class="short">(.*?)</span>
#!/usr/bin/env python
# coding=utf-8
''' 导入库'''
import requests
import time
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
''' 主方法 '''
def main(page_number):
    """Fetch one page of Douban comments for "Venom" and append them to 毒液.txt.

    page_number -- the ``start`` query parameter as a string (page index * 20)
    """
    url = 'https://movie.douban.com/subject/3168101/comments?start=' + page_number + '&limit=20&sort=new_score&status=P'
    # Douban requires a logged-in session for most comment pages;
    # fill in your own cookie value.
    headers = {
        'User-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Cookie':'*****'
    }
    # Fetch the page.
    r = requests.get(url,headers=headers)
    # re.S lets '.' span newlines inside multi-line comments.
    pattern = re.compile('<span class="short">(.*?)</span>',re.S)
    results = re.findall(pattern,r.text)
    # Append this page's comments, one per line, with a blank separator line
    # after each page. UTF-8 is stated explicitly so the Chinese text cannot
    # hit a platform-default-encoding failure (fixes the original open()).
    with open('毒液.txt','a',encoding='utf-8') as file:
        for result in results:
            file.write(result)
            file.write('\n')
        file.write('\n')
# The comment section spans 25 pages (start = 0, 20, ..., 480);
# throttle to one request per second to avoid an IP ban.
for i in range(25):
    main(str(i * 20))
    time.sleep(1)
# Read everything back; UTF-8 is explicit so the Chinese text decodes
# correctly regardless of the platform default encoding.
with open('毒液.txt', encoding='utf-8') as f:
    text = f.read()
# Build the word cloud; font_path must point at a font with CJK glyphs,
# otherwise Chinese words render as boxes.
wordcloud = WordCloud(width=3500,height=2000,background_color='white',font_path='./simfang.ttf').generate(text)
# High DPI for a crisp export; hide the axes since this is an image.
plt.figure(dpi=256)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.savefig("毒液.png")
- 分析结果如下图所示:

② 电视剧 "如懿传"
目标URL
:https://movie.douban.com/subject/25812730/comments?start=' + page_number + '&limit=20&sort=new_score&status=P
GET请求分析
:start参数代表了页面评论是从多少条开始,第一页是0,第二页是20,以此类推,获得url规律为页数x20
正则表达式
:<span class="short">(.*?)</span>
#!/usr/bin/env python
# coding=utf-8
''' 导入库 '''
import requests
import time
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud,STOPWORDS
# Words excluded from the cloud: HTML-escape residue ("gt") plus show
# titles that would otherwise dominate the picture.
stopwords = set(STOPWORDS)
stopwords.update(["gt", "如懿传", "如懿", "甄嬛传", "金枝欲孽", "延禧攻略"])
''' 主方法 '''
def main(page_number):
    """Fetch one page of Douban comments for "Ruyi's Royal Love" into 如懿传.txt.

    page_number -- the ``start`` query parameter as a string (page index * 20)
    """
    url = 'https://movie.douban.com/subject/25812730/comments?start=' + page_number + '&limit=20&sort=new_score&status=P'
    # Douban requires a logged-in session for most comment pages;
    # fill in your own cookie value.
    headers = {
        'User-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Cookie':'*****',
        'Host':'movie.douban.com'
    }
    # Fetch the page.
    r = requests.get(url,headers=headers)
    # re.S lets '.' span newlines inside multi-line comments.
    pattern = re.compile('<span class="short">(.*?)</span>',re.S)
    results = re.findall(pattern,r.text)
    # Append this page's comments, one per line, with a blank separator line
    # after each page. UTF-8 is stated explicitly so the Chinese text cannot
    # hit a platform-default-encoding failure (fixes the original open()).
    with open('如懿传.txt','a',encoding='utf-8') as file:
        for result in results:
            file.write(result)
            file.write('\n')
        file.write('\n')
# 25 pages of comments, throttled to one request per second.
for i in range(25):
    main(str(i * 20))
    time.sleep(1)
# Read the collected comments back with an explicit encoding.
with open('如懿传.txt', encoding='utf-8') as f:
    text = f.read()
# BUG FIX: the original passed stopwords='如懿传' — a plain string, which
# WordCloud iterates as single characters — so the stopword set built above
# was silently ignored. Pass the actual set so "gt" and the show titles are
# filtered out of the cloud.
wordcloud = WordCloud(stopwords=stopwords,width=3500,height=2000,background_color='white',font_path='./simfang.ttf').generate(text)
plt.figure(dpi=256)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.savefig('./如懿传.png')

③ 电视剧 "延禧攻略"
目标URL
:https://movie.douban.com/subject/26999852/comments?start=' + page_number + '&limit=20&sort=new_score&status=P
GET请求分析
:start参数代表了页面评论是从多少条开始,第一页是0,第二页是20,以此类推,获得url规律为页数x20
正则表达式
:<span class="short">(.*?)</span>
代码思路与前两个一致,故不标注释了
#!/usr/bin/env python
# coding=utf-8
import requests
import time
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud,STOPWORDS
# Filter HTML-escape residue ("gt") and competing show titles out of the cloud.
stopwords = set(STOPWORDS) | {"gt", "如懿传", "甄嬛传", "金枝欲孽", "延禧攻略"}
def main(page_number):
    """Fetch one page of Douban comments for "Story of Yanxi Palace" into 延禧攻略.txt.

    page_number -- the ``start`` query parameter as a string (page index * 20)
    """
    url = 'https://movie.douban.com/subject/26999852/comments?start=' + page_number + '&limit=20&sort=new_score&status=P'
    # Fill in your own Douban cookie; most comment pages need a session.
    headers = {
        'User-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Cookie':'*****',
        'Host':'movie.douban.com'
    }
    # Fetch the page.
    r = requests.get(url,headers=headers)
    # re.S lets '.' span newlines inside multi-line comments.
    pattern = re.compile('<span class="short">(.*?)</span>',re.S)
    results = re.findall(pattern,r.text)
    # Append this page's comments, one per line, with a blank separator line
    # after each page. UTF-8 is stated explicitly so the Chinese text cannot
    # hit a platform-default-encoding failure (fixes the original open()).
    with open('延禧攻略.txt','a',encoding='utf-8') as file:
        for result in results:
            file.write(result)
            file.write('\n')
        file.write('\n')
# 25 pages of comments, throttled to one request per second.
for i in range(25):
    main(str(i * 20))
    time.sleep(1)
# Read the collected comments back; UTF-8 is explicit so the Chinese text
# decodes correctly regardless of the platform default encoding.
with open('延禧攻略.txt', encoding='utf-8') as f:
    text = f.read()
# Build and save the word cloud; font_path must point at a CJK-capable font.
wordcloud = WordCloud(stopwords=stopwords,width=3500,height=2000,background_color='white',font_path='./simfang.ttf').generate(text)
plt.figure(dpi=256)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.savefig('./延禧攻略.png')

3.结论
毒液
的关键词有:loser
寄生兽
等
如懿传
的关键词有:周迅
霍建华
演技在线
等
延禧攻略
的关键词有:皇后
好看
等
Tips
源码以及数据已经分享到Github:Python3 原生爬虫
有需要的朋友可以自行下载观看