Web Crawler Practice: A Web Novel
This exercise fetches the chapter catalog of a novel from www.bjkgjlu.com, then downloads each chapter's text into its own .txt file.
import requests
import bs4

# Fetch a page and return its decoded HTML source.
def gethtml(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        # Use the encoding sniffed from the body so the Chinese text decodes correctly.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        return "Crawling this site is not allowed"
# Extract the chapter text from one page and save it to <name>.txt.
def chapters(url, name):
    html = gethtml("http://www.bjkgjlu.com" + url)
    soup = bs4.BeautifulSoup(html, "html.parser")
    # Open the file once; re-opening it with "wb" inside the loop would
    # truncate it on every matching div and keep only the last one.
    with open(name + ".txt", "wb") as f:
        for i in soup.find_all("div", attrs={"class": "chapter_content"}):
            f.write(i.text.split("<")[0].encode("utf-8"))
    print(name + " scraped and saved to file")
if __name__ == "__main__":
    url = "http://www.bjkgjlu.com/303618kyi/catalog"
    chapter_name_list = []
    chapter_url_list = []
    html = gethtml(url)
    soup = bs4.BeautifulSoup(html, "html.parser")
    # Each catalog grid cell holds one chapter link; iterate only the <a> tags,
    # since bare-text children have no href attribute and would break .get().
    for i in soup.find_all("div", attrs={"class": "col-xs-120 col-sm-60 col-md-40 col-lg-30"}):
        for j in i.find_all("a"):
            chapter_name_list.append(j.text)
            chapter_url_list.append(j.get("href"))
    print(chapter_name_list)
    for chapter_url, chapter_name in zip(chapter_url_list, chapter_name_list):
        chapters(chapter_url, chapter_name)
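One practical refinement, not in the original: the loop above issues one request per chapter back to back. If the site throttles rapid requests, a short pause between calls keeps the crawler polite. A minimal sketch of the same loop with a one-second delay (the interval is an assumption, adjust as needed):

import time

for chapter_url, chapter_name in zip(chapter_url_list, chapter_name_list):
    chapters(chapter_url, chapter_name)
    time.sleep(1)  # pause between chapter downloads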