头条美女千千万,利用Python抓抓看,所见即所获
本次练习用到的知识点有
* Requests 库的使用
* BeautifulSoup 库的使用
* 正则表达式的使用
* pymongo 库的使用
有需要Python学习资料的小伙伴吗?小编整理【一套Python资料、源码和PDF】,感兴趣者可以关注小编后私信学习资料(是关注后私信哦)反正闲着也是闲着呢,不如学点东西啦
1、项目流程分析
2、中心调度
# Central dispatcher: process one list-page offset end to end.
def main(offset):
    """Fetch the list page at *offset*, then fetch, parse and store
    every detail page it links to.

    offset: integer paging offset passed to the search API.
    """
    # Fetch the list page.
    index_data = get_page_index(offset, KEYWORDS)
    if index_data is None:
        # Bug fix: offset is an int, so the original bare '+' raised
        # TypeError when this branch was hit.
        print('offset:' + str(offset) + '异常')
        return
    # Walk every detail-page url found on the list page.
    for url in parse_page_index(index_data):
        # Fetch the detail page.
        detail_data = get_page_detail(url)
        if detail_data is None:
            # Bug fix: '%s'.format(url) never substituted the url;
            # and `pass` fell through to parse_page_detail(None, url).
            print('url:{}异常'.format(url))
            continue
        # Parse the detail page; skip pages with no gallery data.
        data = parse_page_detail(detail_data, url)
        if data is None:
            continue
        save_to_mongo(data)
3、请求和解析列表页
# Request one list page from the Toutiao search API.
def get_page_index(offset, keywords):
    """Return the response body (JSON text) for one list page, or None
    on a non-200 status or a network error.

    offset:   paging offset for the API.
    keywords: search keyword. Bug fix: the original ignored this
              parameter and read the global KEYWORDS instead.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keywords,
        'cur_tab': 3,
        'autoload': 'true',
        'count': 20,
    }
    try:
        response = requests.get('http://www.toutiao.com/search_content/',
                                params=params)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


# Parse the list-page JSON and yield every detail-page url.
def parse_page_index(text):
    """Generator over the 'article_url' of each item in the payload.

    Yields nothing when *text* is not valid JSON or lacks a 'data' key.
    """
    try:
        data = json.loads(text)
        if data and 'data' in data:
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        print('解析异常')
        # Plain `return` ends the generator (the original's `return []`
        # inside a generator was misleading — the list was discarded).
        return
4、请求和解析详情页
# Parse one detail page: extract the title and the gallery image urls.
def parse_page_detail(html, url):
    """Return {'title', 'url', 'images'} for a gallery page, else None.

    Side effect: downloads every image via download_image.
    The gallery data lives in an inline "var gallery = {...};" script.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Bug fix: pages without a <title> tag made soup.title None and
    # the original crashed with AttributeError on .string.
    title = soup.title.string if soup.title else ''
    image_pattern = re.compile('var gallery = (.*?);', re.S)
    result = image_pattern.search(html)
    if result is None:
        return None
    try:
        data = json.loads(result.group(1))
    except JSONDecodeError:
        return None
    if not data or 'sub_images' not in data:
        return None
    # Collect every image url, downloading each as we go.
    images = [item.get('url') for item in data.get('sub_images')]
    for image in images:
        download_image(image)
    return {'title': title, 'url': url, 'images': images}
5、下载图片和保存至Mongodb
# Download one image's bytes and hand them to save_image.
def download_image(url):
    """Best-effort download; network failures are logged and swallowed."""
    try:
        print('图片' + url + '正在下载')
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
    except RequestException:
        print('异常image:' + url)


# Persist an image to disk, named by the md5 of its content
# (identical bytes are therefore written only once).
def save_image(content):
    """Write *content* to cwd/images/<md5>.jpg if not already present."""
    file_path = '{0}/images/{1}.{2}'.format(
        os.getcwd(), md5(content).hexdigest(), 'jpg')
    # Bug fix: the original open() raised FileNotFoundError on first run
    # because the images/ directory was never created.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    if not os.path.exists(file_path):
        # `with` closes the file; the original's file.close() was redundant.
        with open(file_path, 'wb') as file:
            file.write(content)


# Insert one gallery record into MongoDB.
def save_to_mongo(data):
    """Return True when the insert succeeds, False otherwise."""
    if db[MONGO_TABLE].insert(data):
        print('成功保存' + data['title'])
        return True
    return False
6、完整代码
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Toutiao gallery-image spider.

Fetches search list pages from the Toutiao search API, follows every
detail page, extracts the inline "var gallery = ...;" JSON, downloads
the images and records each gallery in MongoDB.

Configuration (MONGO_URL, MONGO_DB, MONGO_TABLE, KEYWORDS,
GROUP_START, GROUP_END) is supplied by setting.py.
"""
import os
import re
import json
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

from setting import *

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


# Request one list page from the Toutiao search API.
def get_page_index(offset, keywords):
    """Return the response body for one list page, or None on failure.

    Bug fix: the original ignored the *keywords* parameter and read
    the global KEYWORDS instead.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keywords,
        'cur_tab': 3,
        'autoload': 'true',
        'count': 20,
    }
    try:
        response = requests.get('http://www.toutiao.com/search_content/',
                                params=params)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


# Parse the list-page JSON and yield every detail-page url.
def parse_page_index(text):
    """Generator over the 'article_url' of each item in the payload."""
    try:
        data = json.loads(text)
        if data and 'data' in data:
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        print('解析异常')
        return


# Fetch the HTML of one detail page.
def get_page_detail(url):
    """Return the page HTML or None on a non-200 status or network error."""
    try:
        # Bug fix: requests.get was outside the try in the original,
        # so RequestException was never actually caught.
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


# Parse one detail page: extract the title and the gallery image urls.
def parse_page_detail(html, url):
    """Return {'title', 'url', 'images'} for a gallery page, else None.

    Side effect: downloads every image via download_image.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Bug fix: pages without a <title> made soup.title None and crashed.
    title = soup.title.string if soup.title else ''
    image_pattern = re.compile('var gallery = (.*?);', re.S)
    result = image_pattern.search(html)
    if result is None:
        return None
    try:
        data = json.loads(result.group(1))
    except JSONDecodeError:
        return None
    if not data or 'sub_images' not in data:
        return None
    images = [item.get('url') for item in data.get('sub_images')]
    for image in images:
        download_image(image)
    return {'title': title, 'url': url, 'images': images}


# Download one image's bytes and hand them to save_image.
def download_image(url):
    """Best-effort download; network failures are logged and swallowed."""
    try:
        print('图片' + url + '正在下载')
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
    except RequestException:
        print('异常image:' + url)


# Persist an image to disk, named by the md5 of its content.
def save_image(content):
    """Write *content* to cwd/images/<md5>.jpg if not already present."""
    file_path = '{0}/images/{1}.{2}'.format(
        os.getcwd(), md5(content).hexdigest(), 'jpg')
    # Bug fix: ensure the images/ directory exists before open().
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as file:
            file.write(content)


# Insert one gallery record into MongoDB.
def save_to_mongo(data):
    """Return True when the insert succeeds, False otherwise."""
    if db[MONGO_TABLE].insert(data):
        print('成功保存' + data['title'])
        return True
    return False


# Central dispatcher: process one list-page offset end to end.
def main(offset):
    """Fetch the list page at *offset*, then fetch, parse and store
    every detail page it links to.
    """
    index_data = get_page_index(offset, KEYWORDS)
    if index_data is None:
        # Bug fix: offset is an int; bare '+' raised TypeError.
        print('offset:' + str(offset) + '异常')
        return
    for url in parse_page_index(index_data):
        detail_data = get_page_detail(url)
        if detail_data is None:
            # Bug fixes: '%s'.format(url) never substituted the url,
            # and `pass` fell through to parse a None page.
            print('url:{}异常'.format(url))
            continue
        data = parse_page_detail(detail_data, url)
        if data is None:
            continue
        save_to_mongo(data)


if __name__ == '__main__':
    # One task per list-page offset, fanned out over a process pool.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
7、运行结果
有需要Python学习资料的小伙伴吗?小编整理【一套Python资料、源码和PDF】,感兴趣者可以关注小编后私信学习资料(是关注后私信哦)反正闲着也是闲着呢,不如学点东西啦
相关推荐
85206537 2015-08-20
89266339 2015-08-01
wilylcyu 2019-06-06
赚钱生意 2015-05-26
leeyhCoding 2017-11-14
点击看视频直播 2017-03-22
专看视频直播 2017-10-25
listep 2019-03-04
Sophisticated 2018-11-10
TURBOTX 2017-08-28
TomRobot 2017-08-26
JamesRayMurphy 2018-10-20
minger0 2018-09-11
qiagu 2016-11-03
走过程序员的路 2018-09-06
zhuoxun 2015-08-23
teamvx 2015-08-25
WangPing 2015-04-17