# 导入xpath模块 用于解析页面数据
from lxml import etree
# 导入系统模块 用于自动创建目录
import os
# 导入requests模块用于发送http请求
import requests
import json
# Fetch a URL and return the decoded response body.
def get_content(url, data=None, ispost=0, timeout=30):
    """Send an HTTP request to ``url`` and return the response body as text.

    Args:
        url: Address to request.
        data: Payload forwarded to ``requests.post`` when ``ispost`` is
            truthy; it is not used for GET requests. ``None`` (the default)
            sends no payload.
        ispost: Truthy to issue a POST, falsy (the default) to issue a GET.
        timeout: Seconds passed to ``requests`` as the request timeout, so a
            stalled connection cannot block the caller forever. Pass ``None``
            to wait indefinitely.

    Returns:
        The response body decoded with the encoding that ``requests`` detects
        from the content (``apparent_encoding``). The HTTP status code is not
        checked, so error pages are returned as text too.

    Raises:
        requests.RequestException: Propagated from ``requests`` when the
            request cannot be completed (connection failure, timeout, ...).
    """
    # Send a browser-like User-Agent so the request looks like a normal visit.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    }
    if ispost:
        response = requests.post(url=url, headers=headers, data=data, timeout=timeout)
    else:
        response = requests.get(url=url, headers=headers, timeout=timeout)
    # Decode with the encoding guessed from the body rather than the one
    # declared in the response headers.
    response.encoding = response.apparent_encoding
    return response.text
# Fetch the short comments of one movie and append them to a text file.
def getCommentByMovieId(typeName, movieId, limit=100):
    """Scrape the short comments of a Douban movie, print and save them.

    Every comment is printed and appended, one per line, to
    ``./movieCommentData/<typeName>/<title>.txt`` where ``<title>`` is the
    text of the page's first ``<h1>``. The function is best-effort: if any
    step fails, the error is reported on stdout and the movie is skipped.

    Args:
        typeName: Category name; used as the sub-directory for the output.
        movieId: Movie id interpolated into the comments URL.
        limit: Value of the ``limit`` query parameter of the comments URL.

    Returns:
        None.
    """
    try:
        url = f"https://movie.douban.com/subject/{movieId}/comments?percent_type=h&limit={limit}&status=P&sort=new_score"
        # Download the page and parse it into an element tree.
        page = etree.HTML(get_content(url))

        # Page heading, used for the progress messages and the file name.
        title = page.xpath("//h1/text()")[0].strip()
        # Comment authors.
        authors = page.xpath("//span[@class='comment-info']/a/text()")
        # Comment timestamps (the class value in the query ends with a space).
        times = page.xpath("//span[@class='comment-info']/span[@class='comment-time ']/text()")
        # Comment bodies.
        contents = page.xpath("//span[@class='short']/text()")

        moviePath = f"./movieCommentData/{typeName}"
        os.makedirs(moviePath, exist_ok=True)
        # A path separator inside the title would make the file name point
        # into a directory that does not exist, so neutralise it.
        safeTitle = title.replace("/", "_").replace("\\", "_")
        filename = moviePath + '/' + safeTitle + '.txt'

        s = "="
        print(f"{s * 30}正在获取电影【{title}】{s * 30}")
        # zip() stops at the shortest list, so a page where the three queries
        # return different numbers of nodes cannot raise IndexError midway.
        lines = [
            f"评论人:'{author}'---评论内容:'{content}'---评论时间:'{commentTime.strip()}'"
            for author, content, commentTime in zip(authors, contents, times)
        ]
        for line in lines:
            print(line)
        # Open the file once for the whole batch; skip it entirely when there
        # is nothing to write so no empty file is created.
        if lines:
            with open(filename, 'a', encoding="utf-8") as f:
                f.writelines(f"{line}\n" for line in lines)
        print(f"{s * 30}电影【{title}】获取完毕{s * 30}")
    except Exception as err:
        # Keep the crawl going, but say which movie failed and why instead of
        # hiding the problem. KeyboardInterrupt is deliberately not caught.
        print(f"获取电影【{movieId}】的评论失败:{err}")
# Driver script: read the category links from the Douban chart page, then for
# every category fetch its ranked movies one by one and save their comments.
url = "https://movie.douban.com/chart"
# Download the chart page.
result = get_content(url)
# Parse it into an element tree.
etree1 = etree.HTML(result)
# Links of all movie categories listed on the page.
typeUrl = etree1.xpath("//div[@class='types']/span/a/@href")
for typeHref in typeUrl:
    # NOTE(review): assumes every href carries the category name as its first
    # query parameter and the category id as its second one
    # ("...?<key>=<name>&<key>=<id>&...") - confirm against the live page.
    params = typeHref.split("&")
    typeName = params[0].split('=')[1]
    print(f"正在获取【{typeName}】分类下的电影信息......")
    typeId = params[1].split("=")[1]
    # Ask how many movies this category holds.
    totalUrl = f"https://movie.douban.com/j/chart/top_list_count?type={typeId}&interval_id=100%3A90&action="
    movieTotal = json.loads(get_content(totalUrl))['total']
    print(f"{typeName}分类下的电影一共有{movieTotal}部")
    # Fetch the movies one at a time. `start` is a zero-based offset, so the
    # valid offsets are 0 .. movieTotal - 1.
    for i in range(movieTotal):
        baseUrl = f"https://movie.douban.com/j/chart/top_list?type={typeId}&interval_id=100%3A90&action=&start={i}&limit=1"
        movies = json.loads(get_content(baseUrl))
        if not movies:
            # Nothing was returned for this offset; there is no movie to scrape.
            continue
        # Id of the movie at this offset.
        movieId = movies[0]['id']
        getCommentByMovieId(typeName, movieId)
# 版权属于:公子初心
# 作品采用:《署名-非商业性使用-相同方式共享 4.0 国际 (CC BY-NC-SA 4.0)》许可协议授权
# 评论 (0)