# Douban subject IDs of the movies whose comments are fetched.
movies = [
    1291546,
    6973376,
]
# 导入xpath模块 用于解析页面数据
from lxml import etree
# 导入系统模块 用于自动创建目录
import os
# 导入requests模块用于发送http请求
import requests
def get_content(url, data=None, ispost=0, timeout=10):
    """Send an HTTP request to *url* and return the response body as text.

    Args:
        url: Address to request.
        data: Payload passed to ``requests.post``; only used when *ispost*
            is truthy. Defaults to ``None`` (no body) instead of a shared
            mutable list.
        ispost: Truthy to send a POST request, falsy (default) to send a GET.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Without it a stalled connection blocks forever.

    Returns:
        The response text, decoded with the encoding that ``requests``
        detects from the body (``apparent_encoding``).
    """
    # Send a browser-like User-Agent header with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    }
    if ispost:
        response = requests.post(url=url, headers=headers, data=data, timeout=timeout)
    else:
        response = requests.get(url=url, headers=headers, timeout=timeout)
    # Use the encoding detected from the content rather than the one
    # declared in the HTTP headers before decoding the body.
    response.encoding = response.apparent_encoding
    return response.text
def _safe_filename(name):
    """Return *name* trimmed of whitespace, with path separators replaced by "_"."""
    return name.strip().replace("/", "_").replace("\\", "_")


# Value sent as the `limit` query parameter of the comments URL.
limit = 100
# Directory that receives one text file of comments per movie.
moviePath = "./movieCommentData/"
# Character repeated to draw the banner lines printed around each movie.
s = "="

# Walk the movie list; each id is interpolated into the comments URL.
for movieId in movies:
    url = f"https://movie.douban.com/subject/{movieId}/comments?percent_type=h&limit={limit}&status=P&sort=new_score"
    # Fetch the page source and parse it into an element tree.
    result = get_content(url)
    etree1 = etree.HTML(result)

    # Title: a page without any <h1> text (e.g. an error page) would make
    # `[0]` raise IndexError, so skip that movie instead of aborting the run.
    titles = etree1.xpath("//h1/text()")
    if not titles:
        print(f"电影 {movieId} 页面未找到标题,已跳过")
        continue
    title = titles[0]

    # Comment authors
    authors = etree1.xpath("//span[@class='comment-info']/a/text()")
    # Comment timestamps (the class value matched here ends with a space)
    times = etree1.xpath("//span[@class='comment-info']/span[@class='comment-time ']/text()")
    print(times)
    # Comment bodies
    contents = etree1.xpath("//span[@class='short']/text()")

    # exist_ok makes a separate existence check unnecessary.
    os.makedirs(moviePath, exist_ok=True)

    print(f"{s*30}正在获取电影【{title}】{s*30}")

    # zip() stops at the shortest list, so a missing timestamp or body can
    # no longer raise IndexError part-way through the output.
    rows = list(zip(authors, contents, times))
    if rows:
        # Fall back to the movie id if the title leaves nothing usable.
        filename = os.path.join(moviePath, (_safe_filename(title) or str(movieId)) + ".txt")
        # Open the file once per movie (append mode) rather than once per
        # comment; the with-statement closes it.
        with open(filename, "a", encoding="utf-8") as f:
            for author, content, time_text in rows:
                # One formatted line is shared by the console and the file so
                # the two outputs cannot drift apart.
                line = f"评论人:'{author}'---评论内容:'{content}'---评论时间:'{time_text.strip()}'"
                print(line)
                f.write(line + "\n")

    print(f"{s*30}电影【{title}】获取完毕{s*30}")
# 版权属于:公子初心
# 作品采用:《署名-非商业性使用-相同方式共享 4.0 国际 (CC BY-NC-SA 4.0)》许可协议授权
# 评论 (0)