Scraping Sina Weibo User Profiles with Scrapy
The full code is available in the Knowsmore project.
The data comes from Sina Weibo's mobile (H5) pages.
Profile API: https://m.weibo.cn/profile/info?uid=[user ID]
Statuses API: https://m.weibo.cn/api/container/getIndex?containerid=230413[user ID]_-_WEIBO_SECOND_PROFILE_WEIBO&page_type=03&page=[page number, starting at 1]
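Before building the spider, it helps to confirm what these endpoints return. The sketch below (not part of Knowsmore) fetches the profile API with the requests library and prints the fields the spider reads; in practice m.weibo.cn may also require a mobile User-Agent header or cookies, which are omitted here.

# Quick sanity check (illustrative, not part of the spider):
# fetch the profile API directly and inspect the JSON it returns.
import json
import requests  # assumed available; any HTTP client works

BASE_URL = 'https://m.weibo.cn'
uid = '6883966016'  # same example user ID as in the spider config

resp = requests.get('%s/profile/info?uid=%s' % (BASE_URL, uid))
user_data = json.loads(resp.text)

# These are the same fields the spider's parse() reads below
print(user_data['data']['user'])    # the user's profile dict
print(user_data['data']['fans'])    # URL of the fans list
print(user_data['data']['follow'])  # URL of the follow list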
# -*- coding: utf-8 -*-
import scrapy
import re
import json
import os, sys
from scrapy import Selector, Request
from knowsmore.items import WeiboUserItem, WeiboStatusItem
from ..common import *
from ..model.mongodb import *

WEIBO_USER_CONFIG = {
    'BASE_URL': 'https://m.weibo.cn',
    'USER_IDS': ['6883966016']  # list of user IDs to crawl
}

class WeiboUserSpider(scrapy.Spider):
    name = "weibo_user"

    def start_requests(self):
        for uid in WEIBO_USER_CONFIG['USER_IDS']:
            # Profile API; handled by the default parse() callback
            url = '%s/profile/info?uid=%s' % (WEIBO_USER_CONFIG['BASE_URL'], uid)
            yield Request(url)
            # Define your statuses implementation here, just a demo below:
            # range(1, 2) fetches page 1 only; widen the range to paginate
            for i in range(1, 2):
                status_url = '%s/api/container/getIndex?containerid=230413%s_-_WEIBO_SECOND_PROFILE_WEIBO&page_type=03&page=%d' % (
                    WEIBO_USER_CONFIG['BASE_URL'], uid, i)
                yield Request(status_url, callback=self.parse_status)

    # e.g. https://m.weibo.cn/profile/1784537661
    def parse(self, response):
        user_data = json.loads(response.text)
        yield WeiboUserItem(
            fans_url = user_data['data']['fans'],
            follow_url = user_data['data']['follow'],
            more_url = user_data['data']['more'],
            user = user_data['data']['user']
        )

    # e.g. https://m.weibo.cn/api/container/getIndex?containerid=2304131784537661_-_WEIBO_SECOND_PROFILE_WEIBO&page_type=03&page=2
    def parse_status(self, response):
        status_data = json.loads(response.text)
        yield WeiboStatusItem(
            cards = status_data['data']['cards']
        )
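The spider imports WeiboUserItem and WeiboStatusItem from knowsmore.items, but their definitions are not shown in this post; check the Knowsmore repo for the real ones. A minimal sketch, assuming plain scrapy.Item classes with one Field per keyword used above:

# knowsmore/items.py -- hypothetical sketch; the actual Knowsmore
# definitions may carry more fields or custom serialization.
import scrapy

class WeiboUserItem(scrapy.Item):
    # One Field per keyword passed in parse()
    fans_url = scrapy.Field()
    follow_url = scrapy.Field()
    more_url = scrapy.Field()
    user = scrapy.Field()

class WeiboStatusItem(scrapy.Item):
    # Raw 'cards' array from the statuses API; each card wraps one weibo
    cards = scrapy.Field()

With the items defined, the spider runs like any other Scrapy spider from the project root, e.g. scrapy crawl weibo_user -o weibo_users.json.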