Learning Python web scraping: the requests library
Downloading an image

import requests

response = requests.get('http://www.51gis.com.cn/static/upload/3e223daf9df6216f/f3e187dfc0e4143a.jpg')
with open('51gis.jpg', 'wb') as f:
    f.write(response.content)
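For larger files it can also help to check the response status and stream the body instead of holding it all in memory. A minimal sketch of that variant, assuming the same image URL (the chunk size and timeout values are arbitrary choices, not from the original):

import requests

url = 'http://www.51gis.com.cn/static/upload/3e223daf9df6216f/f3e187dfc0e4143a.jpg'
response = requests.get(url, stream=True, timeout=10)  # stream=True avoids loading the whole file at once
response.raise_for_status()                            # raise an exception on 4xx/5xx responses
with open('51gis.jpg', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):  # write the file piece by piece
        f.write(chunk)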
Crawling Baidu Tieba pages

import requests

class TiebaSpider(object):
    def __init__(self, tieba_name):
        self.tieba_name = tieba_name
        self.url_temp = "http://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
        }

    def parse_url(self, url):
        '''Fetch the page at url and return its HTML text.'''
        response = requests.get(url, headers=self.headers)
        return response.text

    def save_html(self, url_html, page_num):
        '''Save one page of HTML to a local .htm file.'''
        file_path = "《{}》-第{}页".format(self.tieba_name, page_num)
        # write as UTF-8 so non-ASCII page content is preserved on any platform
        with open(file_path + '.htm', 'w', encoding='utf-8') as f:
            f.write(url_html)

    def get_url_list(self):
        '''Build the list of page URLs (50 posts per page).'''
        # Method 1
        url_list = []
        for i in range(10):
            url_list.append(self.url_temp.format(i * 50))
        return url_list
        # Method 2
        # return [self.url_temp.format(i * 50) for i in range(10)]

    def run(self):
        '''Main logic.'''
        # 1. Build the URL list
        url_list = self.get_url_list()
        # 2. Fetch each URL
        for url in url_list:
            url_html = self.parse_url(url)
            # 3. Save the page
            page_num = url_list.index(url) + 1  # current page number
            self.save_html(url_html, page_num)

if __name__ == '__main__':
    name = input('请输入你想要爬取的论坛名称:')
    tb_spider = TiebaSpider(name)
    tb_spider.run()
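Running the script prompts for a forum name and writes ten .htm files, one per page, into the current directory. One small refinement worth noting: the run() loop looks up each URL's position with url_list.index(url), which rescans the list on every iteration, while enumerate yields the page number directly. A minimal sketch of that variant (only run() changes; start=1 is an assumption so page numbers begin at 1, matching the original):

    def run(self):
        '''Main logic: build the URL list, fetch each page, save it.'''
        # enumerate gives the page number alongside each URL,
        # avoiding a linear index() lookup per iteration
        for page_num, url in enumerate(self.get_url_list(), start=1):
            url_html = self.parse_url(url)
            self.save_html(url_html, page_num)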