Python爬虫-百度贴吧
百度贴吧爬虫实现
使用 GET 请求按页抓取百度贴吧的搜索结果页面,并将每页 HTML 保存到本地文件
from urllib import request
import urllib
import time
# https://tieba.baidu.com/f?kw=python&fr=ala0&tpl=5 #第一页
# https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50 #第二页 (2-1)*50
# https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100 #第三页 (3-1)*50
# https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=150 #第四页 (4-1)*50
# 第n页 (n-1)*50
# 推测第一页:https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0
headers={
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36"
}
# Send a GET request to *url* and return the server's raw response body.
def loadPage(url, filename):
    """Download *url* and return the response bytes.

    *filename* is only used for the progress message; nothing is
    written to disk here.
    """
    print("正在下载" + filename)
    req = request.Request(url, headers=headers)
    # Close the response when done — the original leaked the connection
    # by never closing the object returned by urlopen().
    with request.urlopen(req) as resp:
        return resp.read()
# Persist downloaded HTML bytes to a local file.
def writePage(html, filename):
    """Write the raw *html* bytes to *filename* (binary mode)."""
    print("正在保存" + filename)
    with open(filename, "wb") as out_file:
        out_file.write(html)
    print("---------------------------")
def tiebaSpider(url, begin, end):
    """Crawl tieba result pages *begin* through *end* (inclusive).

    Each page is fetched via loadPage() and saved to disk via
    writePage().
    """
    for page in range(begin, end + 1):
        # Tieba paginates 50 posts per page: pn = (page - 1) * 50.
        offset = (page - 1) * 50
        page_url = url + "&pn=" + str(offset)
        # NOTE(review): the save directory is hard-coded and must
        # already exist — confirm D:/贴吧/ is present before running.
        save_path = "D:/贴吧/第" + str(page) + "页.html"
        page_html = loadPage(page_url, save_path)
        writePage(page_html, save_path)
# Script entry point. The original used typographic quotes around
# __main__ (a syntax error); fixed to ordinary string quotes. Trailing
# scraped article text fused to the last line has been removed.
if __name__ == "__main__":
    while True:
        kw = input("请输入字条:")
        begin = int(input("请输入起始页:"))
        end = int(input("请输入结束页:"))
        url = "http://tieba.baidu.com/f?"
        # URL-encode the keyword so non-ASCII search terms are valid
        # in the query string.
        key = urllib.parse.urlencode({"kw": kw})
        url = url + key
        tiebaSpider(url, begin, end)
sunzhihaofuture 2020-07-19
sunzhihaofuture 2020-06-06
oXiaoChong 2020-06-05
ARCXIANG 2020-06-05
夜斗不是神 2020-11-17
染血白衣 2020-11-16
ARCXIANG 2020-11-02
CycloneKid 2020-10-27
meylovezn 2020-08-28
囧芝麻 2020-08-17
数据挖掘工人 2020-08-15
cxcxrs 2020-07-28
dashoumeixi 2020-07-20