Ajax数据爬取
Ajax 即“Asynchronous JavaScript And XML”(异步 JavaScript 和 XML),是一种用于创建交互式、快速动态网页应用的网页开发技术,能够在无需重新加载整个网页的情况下更新部分网页内容。
通过在后台与服务器进行少量数据交换,Ajax 可以使网页实现异步更新。这意味着可以在不重新加载整个网页的情况下,对网页的某部分进行更新。
1. 爬取微博页面Ajax数据
# Scrape the AJAX-loaded comments of one Weibo post and store them in MongoDB.
import json
from urllib.parse import urlencode

import pymongo
import requests
from pyquery import PyQuery as pq

# MongoDB server used by main() and by the __main__ demo below.
MONGO_HOST = '10.0.0.100'

# Seconds to wait for the Weibo endpoint before giving up on a page.
REQUEST_TIMEOUT = 10


def get_ajax_page(page):
    """Fetch one page of comments from the Weibo comment AJAX endpoint.

    Args:
        page: Page number, sent as the ``page`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, network error, or a body that
        is not valid JSON).
    """
    headers = {
        'Host': 'weibo.com',
        'Referer': 'https://weibo.com/1461280777/Iz3Iqx2wG?ref=feedsdk&type=comment',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        # NOTE(review): cookie captured from a browser session; presumably it
        # expires and must be refreshed -- confirm before relying on it.
        'Cookie': 'SINAGLOBAL=7735058780719.93.1582184597719; '
                  '_s_tentry=zz.253.com; '
                  'Apache=6823476113810.396.1584424118910; '
                  'ULV=1584424118929:5:1:1:6823476113810.396.1584424118910:1582854530521; '
                  'SUB=_2AkMpLTzuf8NxqwJRmP8dzGLgbIxxywvEieKfcc01JRMxHRl-yT92qnAFtRB6Aq0SASvP3fxjV-YYDUSQSyRek7uE3A6b; '
                  'SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WF39BLl0OFppYFLW7GUd5Zl; '
                  'login_sid_t=8687f896b60dd07aaa80e41d83159a89; '
                  'cross_origin_proto=SSL; '
                  'Ugrow-G0=6fd5dedc9d0f894fec342d051b79679e; '
                  'YF-V5-G0=2583080cfb7221db1341f7a137b6762e; '
                  'wb_view_log=1366*7681; '
                  'UOR=zz.253.com,widget.weibo.com,www.baidu.com; '
                  'YF-Page-G0=d30fd7265234f674761ebc75febc3a9f|1584511608|1584511567',
    }
    url = 'https://weibo.com/aj/v6/comment/big'
    params = {
        'ajwvr': '6',
        'id': '4483557667874538',
        'root_comment_max_id_type': '0',
        'page': page,
    }
    try:
        response = requests.get(
            url=url, headers=headers, params=params, timeout=REQUEST_TIMEOUT
        )
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as e:
        # Best-effort scraping: report the failure and let the caller skip
        # this page. ValueError covers a 200 response whose body is not JSON.
        print('error', e.args)
    return None


def parse_page(js):
    """Yield one dict per comment found in an AJAX response.

    Args:
        js: The value returned by ``get_ajax_page``; may be ``None``.

    Yields:
        Dicts with the keys ``name``, ``content`` and ``datetime``.
        Nothing is yielded when ``js`` is empty or carries no HTML fragment.
    """
    if not js:
        return
    html = (js.get('data') or {}).get('html')
    if not html:
        return
    doc = pq(html)
    for item in doc('div.list_con').items():
        text = item('.WB_text').text()
        # Split on the first colon only, so comment text that itself contains
        # colons is kept whole and text without a colon yields empty content
        # instead of raising IndexError.
        # NOTE(review): Weibo may render a full-width colon here -- verify
        # against a live response.
        name, _, content = text.partition(':')
        yield {
            'name': name,
            'content': content,
            'datetime': item('div.WB_from.S_txt2').text(),
        }


def collection_mongo(host='localhost', port=27017):
    """Return a ``pymongo.MongoClient`` connected to ``host``:``port``."""
    return pymongo.MongoClient(host=host, port=port)


def save_mongo(client, data):
    """Insert one comment document into the ``weibo.weibo`` collection.

    Args:
        client: A ``pymongo.MongoClient``.
        data: The document (dict) to store.
    """
    collection = client.weibo.weibo
    # Collection.insert() was removed in PyMongo 4; insert_one() replaces it.
    if collection.insert_one(data).acknowledged:
        print('Save to mongo')


def search_mongo(client):
    """Return a cursor over every document in the ``weibo.weibo`` collection."""
    return client.weibo.weibo.find()


def main():
    """Scrape comment pages 1-10 and store every parsed comment in MongoDB."""
    # One client for the whole run instead of a new connection per comment.
    client = collection_mongo(MONGO_HOST)
    for i in range(1, 11):
        js = get_ajax_page(str(i))
        for result in parse_page(js):
            save_mongo(client, result)


if __name__ == '__main__':
    # Run main() once to populate the collection; the demo below only prints
    # what is already stored.
    # main()
    client = collection_mongo(MONGO_HOST)
    for item in search_mongo(client):
        print(item)
2. Ajax爬取头条街拍图片
# NOTE(review): this listing sits under the "Toutiao street-snap images"
# heading, but the code requests the Weibo comment endpoint and stores Weibo
# comments -- it does not fetch Toutiao images. Confirm whether the intended
# listing was lost when the article was published.
import json
from urllib.parse import urlencode

import pymongo
import requests
from pyquery import PyQuery as pq

# MongoDB server used by main() and by the __main__ demo below.
MONGO_HOST = '10.0.0.100'

# Seconds to wait for the Weibo endpoint before giving up on a page.
REQUEST_TIMEOUT = 10


def get_ajax_page(page):
    """Fetch one page of comments from the Weibo comment AJAX endpoint.

    Args:
        page: Page number, sent as the ``page`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, network error, or a body that
        is not valid JSON).
    """
    headers = {
        'Host': 'weibo.com',
        'Referer': 'https://weibo.com/1461280777/Iz3Iqx2wG?ref=feedsdk&type=comment',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        # NOTE(review): cookie captured from a browser session; presumably it
        # expires and must be refreshed -- confirm before relying on it.
        'Cookie': 'SINAGLOBAL=7735058780719.93.1582184597719; '
                  '_s_tentry=zz.253.com; '
                  'Apache=6823476113810.396.1584424118910; '
                  'ULV=1584424118929:5:1:1:6823476113810.396.1584424118910:1582854530521; '
                  'SUB=_2AkMpLTzuf8NxqwJRmP8dzGLgbIxxywvEieKfcc01JRMxHRl-yT92qnAFtRB6Aq0SASvP3fxjV-YYDUSQSyRek7uE3A6b; '
                  'SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WF39BLl0OFppYFLW7GUd5Zl; '
                  'login_sid_t=8687f896b60dd07aaa80e41d83159a89; '
                  'cross_origin_proto=SSL; '
                  'Ugrow-G0=6fd5dedc9d0f894fec342d051b79679e; '
                  'YF-V5-G0=2583080cfb7221db1341f7a137b6762e; '
                  'wb_view_log=1366*7681; '
                  'UOR=zz.253.com,widget.weibo.com,www.baidu.com; '
                  'YF-Page-G0=d30fd7265234f674761ebc75febc3a9f|1584511608|1584511567',
    }
    url = 'https://weibo.com/aj/v6/comment/big'
    params = {
        'ajwvr': '6',
        'id': '4483557667874538',
        'root_comment_max_id_type': '0',
        'page': page,
    }
    try:
        response = requests.get(
            url=url, headers=headers, params=params, timeout=REQUEST_TIMEOUT
        )
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as e:
        # Best-effort scraping: report the failure and let the caller skip
        # this page. ValueError covers a 200 response whose body is not JSON.
        print('error', e.args)
    return None


def parse_page(js):
    """Yield one dict per comment found in an AJAX response.

    Args:
        js: The value returned by ``get_ajax_page``; may be ``None``.

    Yields:
        Dicts with the keys ``name``, ``content`` and ``datetime``.
        Nothing is yielded when ``js`` is empty or carries no HTML fragment.
    """
    if not js:
        return
    html = (js.get('data') or {}).get('html')
    if not html:
        return
    doc = pq(html)
    for item in doc('div.list_con').items():
        text = item('.WB_text').text()
        # Split on the first colon only, so comment text that itself contains
        # colons is kept whole and text without a colon yields empty content
        # instead of raising IndexError.
        # NOTE(review): Weibo may render a full-width colon here -- verify
        # against a live response.
        name, _, content = text.partition(':')
        yield {
            'name': name,
            'content': content,
            'datetime': item('div.WB_from.S_txt2').text(),
        }


def collection_mongo(host='localhost', port=27017):
    """Return a ``pymongo.MongoClient`` connected to ``host``:``port``."""
    return pymongo.MongoClient(host=host, port=port)


def save_mongo(client, data):
    """Insert one comment document into the ``weibo.weibo`` collection.

    Args:
        client: A ``pymongo.MongoClient``.
        data: The document (dict) to store.
    """
    collection = client.weibo.weibo
    # Collection.insert() was removed in PyMongo 4; insert_one() replaces it.
    if collection.insert_one(data).acknowledged:
        print('Save to mongo')


def search_mongo(client):
    """Return a cursor over every document in the ``weibo.weibo`` collection."""
    return client.weibo.weibo.find()


def main():
    """Scrape comment pages 1-10 and store every parsed comment in MongoDB."""
    # One client for the whole run instead of a new connection per comment.
    client = collection_mongo(MONGO_HOST)
    for i in range(1, 11):
        js = get_ajax_page(str(i))
        for result in parse_page(js):
            save_mongo(client, result)


if __name__ == '__main__':
    # Run main() once to populate the collection; the demo below only prints
    # what is already stored.
    # main()
    client = collection_mongo(MONGO_HOST)
    for item in search_mongo(client):
        print(item)
相关推荐
kentrl 2020-11-10
结束数据方法的参数,该如何定义?-- 集合为自定义实体类中的集合属性,有几个实体类,改变下标就行了。<input id="add" type="button" value="新增visitor"…
ajaxyan 2020-11-09
zndy0 2020-11-03
学留痕 2020-09-20
Richardxx 2020-11-09
learningever 2020-09-19
chongxiaocheng 2020-08-16
ajaxhe 2020-08-16
lyqdanang 2020-08-16
curiousL 2020-08-03
TONIYH 2020-07-22
时光如瑾雨微凉 2020-07-19
83510998 2020-07-18
坚持着执着 2020-07-16
jiaguoquan00 2020-07-07
李永毅 2020-07-05
坚持着执着 2020-07-05