根据电影id获取豆瓣电影下的前100条好评

公子初心
2024-04-26 / 0 评论 / 22 阅读 / 正在检测是否收录...
# Douban subject ids of the movies to process; each id is inserted into the
# comments URL that the main loop requests.
movies = [1291546,6973376]

# lxml's etree parses the page HTML and evaluates the XPath queries
from lxml import etree
# 导入系统模块 用于自动创建目录
import os
# 导入requests模块用于发送http请求
import requests
def get_content(url, data=None, ispost=0, timeout=10):
    """Request ``url`` and return the response body as text.

    Args:
        url: Address to request.
        data: Payload passed to ``requests.post``; only used when ``ispost``
            is truthy. Defaults to ``None`` (no payload) rather than a shared
            mutable list.
        ispost: Truthy to send a POST request, falsy (default) to send a GET.
        timeout: Seconds ``requests`` waits for the server before giving up,
            so a stalled connection cannot hang the script indefinitely.

    Returns:
        The response body, decoded with the encoding ``requests`` detects
        from the content (``apparent_encoding``).

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests`` (connection failures, timeouts, ...).
    """
    # Browser-like User-Agent header sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    }
    if ispost:
        response = requests.post(
            url=url, headers=headers, data=data, timeout=timeout
        )
    else:
        response = requests.get(url=url, headers=headers, timeout=timeout)
    # Decode with the encoding detected from the body instead of the one
    # guessed from the HTTP headers.
    response.encoding = response.apparent_encoding
    return response.text

# Value sent as the ``limit`` query parameter of every comments request.
limit = 100
# Output directory; created once up front instead of being re-checked for
# every movie.
moviePath = "./movieCommentData/"
os.makedirs(moviePath, exist_ok=True)
# Character repeated to draw the console banners.
s = "="
# Maps characters that are invalid in file names on common platforms to "_",
# so a page title cannot break open() or point outside the output directory.
unsafe_filename_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

# Fetch, print and store the comments of every movie id in ``movies``.
for movieId in movies:
    url = f"https://movie.douban.com/subject/{movieId}/comments?percent_type=h&limit={limit}&status=P&sort=new_score"
    # Download the page source and parse it into an element tree.
    result = get_content(url)
    etree1 = etree.HTML(result)
    # Page heading, used in the banners and as the output file name.
    titles = etree1.xpath("//h1/text()") if etree1 is not None else []
    if not titles:
        # Without a heading the response is not the expected comments page;
        # skip this movie instead of crashing with an IndexError.
        print(f"电影【{movieId}】的页面中未找到标题,已跳过")
        continue
    title = titles[0]
    # Comment authors
    authors = etree1.xpath("//span[@class='comment-info']/a/text()")
    # Comment times
    times = etree1.xpath("//span[@class='comment-info']/span[@class='comment-time ']/text()")
    # Comment bodies
    contents = etree1.xpath("//span[@class='short']/text()")

    print(f"{s*30}正在获取电影【{title}】{s*30}")
    # zip() stops at the shortest list, so XPath results of different lengths
    # cannot raise an IndexError. The same text is used for the console and
    # for the file so the two outputs always agree.
    lines = [
        f"评论人:'{author}'---评论内容:'{content}'---评论时间:'{posted.strip()}'"
        for author, content, posted in zip(authors, contents, times)
    ]
    for line in lines:
        print(line)
    if lines:
        filename = os.path.join(
            moviePath, title.translate(unsafe_filename_chars) + ".txt"
        )
        # Append mode keeps the output of earlier runs; the file is opened
        # once per movie rather than once per comment.
        with open(filename, "a", encoding="utf-8") as f:
            f.writelines(line + "\n" for line in lines)
    print(f"{s*30}电影【{title}】获取完毕{s*30}")

0

评论 (0)

取消