Python学习爬虫 requests库
# Download an image with requests and save it to disk.
import requests

response = requests.get(
    "http://www.51gis.com.cn/static/upload/3e223daf9df6216f/f3e187dfc0e4143a.jpg"
)
# Fail fast on HTTP errors (4xx/5xx) instead of silently saving an error page.
response.raise_for_status()
# Binary mode: response.content is raw bytes, exactly what an image file needs.
with open("51gis.jpg", "wb") as f:
    f.write(response.content)
==================================
import requests


class TiebaSpider(object):
    """Crawl Baidu Tieba list pages for one forum and save each page as HTML.

    Usage: TiebaSpider(name).run() fetches the first N list pages of the
    forum and writes each one to 《name》-第N页.htm in the working directory.
    """

    def __init__(self, tieba_name):
        # Forum (tieba) name, used both in the request URL and in file names.
        self.tieba_name = tieba_name
        # {} is filled with the pagination offset; tieba paginates in steps of 50.
        self.url_temp = "http://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        # Browser-like User-Agent so the site serves the normal desktop page.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
        }

    def parse_url(self, url):
        """Fetch *url* with the spider's headers and return the body as text."""
        response = requests.get(url, headers=self.headers)
        return response.text

    def save_html(self, url_html, page_num):
        """Write one page's HTML to 《forum》-第page_num页.htm.

        encoding='utf-8' matches the page text we requested (ie=utf-8);
        without it, open() uses the platform default and can raise
        UnicodeEncodeError or corrupt the output on Windows.
        """
        file_path = "《{}》-第{}页".format(self.tieba_name, page_num)
        with open(file_path + '.htm', 'w', encoding='utf-8') as f:
            f.write(url_html)

    def get_url_list(self, pages=10):
        """Build the list of forum page URLs (offsets 0, 50, ... per page).

        *pages* defaults to 10 to keep the original behavior, but can now
        be overridden by callers.
        """
        return [self.url_temp.format(i * 50) for i in range(pages)]

    def run(self):
        """Main flow: build the URL list, fetch each page, save it.

        enumerate() gives the 1-based page number directly — the original
        url_list.index(url) was an O(n) scan per page and returns the wrong
        page for duplicate URLs.
        """
        for page_num, url in enumerate(self.get_url_list(), start=1):
            url_html = self.parse_url(url)
            self.save_html(url_html, page_num)


if __name__ == '__main__':
    name = input('请输入你想要爬取的论坛名称:')
    tb_spider = TiebaSpider(name)
    tb_spider.run()
相关推荐
sunzhihaofuture 2020-07-19
sunzhihaofuture 2020-06-06
oXiaoChong 2020-06-05
ARCXIANG 2020-06-05
ZHANGRENXIANG00 2020-06-28
kikaylee 2020-05-05
夜斗不是神 2020-11-17
染血白衣 2020-11-16
ARCXIANG 2020-11-02
ARCXIANG 2020-10-28
CycloneKid 2020-10-27
荒谬小孩 2020-10-26
逍遥友 2020-10-26
snakeson 2020-10-09
meylovezn 2020-08-28
囧芝麻 2020-08-17
数据挖掘工人 2020-08-15
cxcxrs 2020-07-28