【Python3 爬虫】U09_爬取hao6v电影网
目录
1.需求描述
爬取 hao6v 电影网的数据:先通过 XPath 解析第一个页面(列表页),获取每部电影详情页的 URL,然后请求并解析各个详情页,提取出所需的数据。
页面如下:
2.实现代码
# Author:Logan
import re
from urllib.parse import urljoin

import requests
from lxml import etree
# Headers sent with every HTTP request in this script: a desktop Chrome
# User-Agent string.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
def get_detail_urls(url):
    """Return the detail-page URLs linked from one movie list page.

    Args:
        url: Address of the list page to download.

    Returns:
        A list with one URL per ``<ul class="list">/<li>/<a>`` link, in
        document order. Relative hrefs are resolved against ``url``;
        absolute hrefs are returned unchanged.

    Raises:
        requests.RequestException: If the page cannot be downloaded,
            including when the server does not answer within the timeout.
    """
    # Without a timeout a stalled server would block the script forever.
    response = requests.get(url, headers=HEADERS, timeout=10)
    html = etree.HTML(response.text)
    hrefs = html.xpath("//ul[@class='list']/li/a/@href")
    # urljoin leaves absolute URLs alone and makes relative ones fetchable.
    return [urljoin(url, href) for href in hrefs]
# Single-line fields of the info paragraph: (key stored in the movie dict,
# pattern for the label that starts the line). ``\s*`` accepts the label
# with an ASCII space, an ideographic space (U+3000) or no space at all.
_INFO_FIELDS = (
    ('year', re.compile(r'◎年\s*代')),
    ('IMDBscore', re.compile(r'◎IMDb评分')),
    ('duration', re.compile(r'◎片\s*长')),
    ('direction', re.compile(r'◎导\s*演')),
)
# The cast label; its value continues on the lines that follow it.
_ACTORS_LABEL = re.compile(r'◎主\s*演')


def _parse_info_lines(infos):
    """Convert the text lines of the info paragraph into movie fields."""
    lines = [line.strip() for line in infos]
    fields = dict()
    for index, line in enumerate(lines):
        actors_match = _ACTORS_LABEL.match(line)
        if actors_match:
            # The first name shares the label's line; the rest follow one
            # per line until the next "◎" label begins.
            actors = [line[actors_match.end():].strip()]
            for follower in lines[index + 1:]:
                if follower.startswith('◎'):
                    break
                actors.append(follower)
            # Blank text nodes between names are not actors.
            fields['actors'] = [actor for actor in actors if actor]
            continue
        for key, label in _INFO_FIELDS:
            label_match = label.match(line)
            if label_match:
                fields[key] = line[label_match.end():].strip()
                break
    return fields


def parse_detail_page(detail_url):
    """Download one movie detail page and extract its metadata.

    Args:
        detail_url: Address of the detail page.

    Returns:
        A dict that always has ``'title'`` (``None`` when the page has no
        title link) and, for each label found in the ``endText`` block,
        ``'year'``, ``'IMDBscore'``, ``'duration'``, ``'direction'`` (all
        strings) and ``'actors'`` (a list of strings).

    Raises:
        requests.RequestException: If the page cannot be downloaded,
            including when the server does not answer within the timeout.
    """
    response = requests.get(detail_url, headers=HEADERS, timeout=10)
    # The body is decoded as GBK explicitly; errors='replace' keeps a single
    # undecodable byte from aborting the whole page.
    html_str = response.content.decode('GBK', errors='replace')
    html = etree.HTML(html_str)

    # Dict holding the movie's information.
    movie = dict()
    titles = html.xpath("//div[@id='endText']/strong/a/text()")
    movie['title'] = titles[0] if titles else None
    movie.update(_parse_info_lines(html.xpath("//div[@id='endText']/p/text()")))
    return movie
def main():
    """Scrape the movie list page(s) and print the parsed movie details.

    For every list page in the range, the detail URLs are collected with
    ``get_detail_urls``, each one is parsed with ``parse_detail_page``, and
    the resulting list of movie dicts is printed.
    """
    # 1. Build the list-page URL. The first page has no "_<n>" suffix.
    base_url = 'http://www.hao6v.net/dy/index_{}.html'
    for i in range(1, 2):
        if i == 1:
            url = base_url.replace('_{}', '')
        else:
            url = base_url.format(i)

        # 2. Collect the detail-page addresses found on this list page.
        urls_list = get_detail_urls(url)

        # 3. Parse every detail page and print this page's movies.
        movie_detail_info = [
            parse_detail_page(detail_url) for detail_url in urls_list
        ]
        print(movie_detail_info)
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()

# (Article text that was fused onto the line above) The output of a run
# is shown below.
相关推荐
chuckchen 2020-10-31
Dreamhome 2020-10-09
xirongxudlut 2020-09-28
星辰大海的路上 2020-09-13
chaochao 2020-08-31
猪猪侠喜欢躲猫猫 2020-08-17
快递小可 2020-08-16
shengge0 2020-07-26
巩庆奎 2020-07-21
张文倩数据库学生 2020-07-19
xirongxudlut 2020-07-18
Ericbig 2020-07-18
kyelu 2020-07-09
liangzhouqu 2020-07-07
GuoSir 2020-06-28
chaigang 2020-06-27
pythonxuexi 2020-06-25