首先新建一个.py脚本文件
[root@VM_0_13_centos ~]# vim top250.py
编写Python3爬虫代码
#!/bin/python3
# -*- coding: utf-8 -*-
# author:ujslxw time:2020/10/24
import re,json
def getPage(url):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If ``urlopen`` cannot complete the request.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    from urllib.request import urlopen, Request

    # Present a desktop-browser User-Agent instead of urllib's default one.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
    request = Request(url, headers=headers)
    # The context manager closes the response (and its connection) once the
    # body has been read, even if read() or decode() raises.
    with urlopen(request, timeout=10) as response:
        return response.read().decode('utf-8')
# One match per movie entry. Raw strings keep ``\d`` a regex escape rather
# than an (invalid) string escape. The trailing group makes the one-line
# quote optional, and its look-ahead stops the search for the quote at the
# start of the next ``<div class="item">`` so that an entry without a quote
# cannot borrow the quote of the following entry.
_MOVIE_ITEM_RE = re.compile(
    r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+)'
    r'.*?<span class="title">(?P<title>.*?)</span>'
    r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>'
    r'.*?<span>(?P<comment_num>.*?)评价</span>'
    r'(?:(?:(?!<div class="item">).)*?<span class="inq">(?P<inq>.*?)</span>)?',
    re.S,
)


def parsePage(s):
    """Extract the movie entries from the HTML of one list page.

    Args:
        s: HTML source of the page.

    Returns:
        A list with one tuple of strings per entry found, in page order:
        ``(id, title, rating_num, comment_num, inq)``. ``inq`` is the empty
        string when the entry has no ``<span class="inq">`` element. The
        list is empty when nothing matches.
    """
    return _MOVIE_ITEM_RE.findall(s)
def main(num):
    """Fetch one page of the list and append its entries to the output file.

    The parsed entries are printed to stdout (first the whole list, then
    each entry) and appended to ``Douban_top250.txt`` in the current
    directory, one JSON array per line.

    Args:
        num: Value substituted into the ``start`` query parameter of the
            list URL.
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)
    ret = parsePage(response_html)
    print(ret)
    # ``with`` guarantees the file is flushed and closed after every page,
    # including when a write or json.dumps raises part-way through.
    with open("Douban_top250.txt", "a", encoding="utf8") as f:
        for obj in ret:
            print(obj)
            # ensure_ascii=False keeps non-ASCII text readable in the file
            # instead of writing \uXXXX escapes.
            data = json.dumps(obj, ensure_ascii=False)
            f.write(data + '\n')
if __name__ == '__main__':
    # 10 pages: the start offset advances by 25 each time (0, 25, ..., 225).
    for start in range(0, 250, 25):
        main(start)
为top250.py添加执行权限
[root@VM_0_13_centos ~]# chmod a+x top250.py
执行脚本
[root@VM_0_13_centos ~]# ./top250.py &>/dev/null
# &>/dev/null 将命令的标准输出和标准错误都重定向到 /dev/null(终端上不显示脚本的任何输出,包括报错信息)
查看文件Douban_top250.txt
[root@VM_0_13_centos ~]# ls
Douban_top250.txt top250.py
[root@VM_0_13_centos ~]# vim Douban_top250.txt