# 网络爬虫练习之网络小说 (Web crawler exercise: scraping an online novel)

import requests
import bs4

# Fetch the HTML source of a page
def gethtml(url):
    """Download *url* and return the response body as text.

    The body is decoded with the encoding detected from its content
    (``response.apparent_encoding``) rather than the one announced by the
    HTTP headers.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The decoded page text. If the request fails (connection error,
        timeout, or an HTTP error status raised by ``raise_for_status``),
        the fixed message "禁止爬取本网站" is returned instead, so callers
        always receive a string.
    """
    try:
        # A timeout keeps one unresponsive server from hanging the crawl.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        # Only network/HTTP failures are turned into the fallback message;
        # KeyboardInterrupt and programming errors now propagate.
        return "禁止爬取本网站"

# Extract and save the text of one chapter page
def chapters(url, name):
    """Download one chapter page and save its text to ``<name>.txt``.

    The page is fetched from ``"http://www.bjkgjlu.com" + url`` and every
    ``<div class="chapter_content">`` element is written, in document
    order, to a single UTF-8 encoded file. No file is created when the
    page contains no such element.

    Args:
        url: Site-relative path of the chapter page (appended to the
            site root as-is).
        name: Chapter title; used as the output file name without the
            ``.txt`` extension.
    """
    html = gethtml("http://www.bjkgjlu.com" + url)
    soup = bs4.BeautifulSoup(html, "html.parser")
    content_blocks = soup.find_all("div", attrs={"class": "chapter_content"})
    if not content_blocks:
        return

    # Open the file once: reopening it in "wb" mode for every block would
    # truncate whatever the previous block had written.
    with open(name + ".txt", "wb") as f:
        for block in content_blocks:
            # Keep only the text before the first literal "&lt"
            # (presumably trailing markup residue on this site -- confirm).
            f.write(block.text.split("&lt")[0].encode("utf-8"))
    print(name + "爬取结束,并存入文件")

# Script entry point: read the catalog page, then download every chapter.
if __name__ == "__main__":
    url = "http://www.bjkgjlu.com/303618kyi/catalog"
    chapter_name_list = []
    chapter_url_list = []
    html = gethtml(url)
    soup = bs4.BeautifulSoup(html, "html.parser")

    # Each catalog cell is a div with this exact class string; its direct
    # children that carry an href are taken as chapter links.
    for cell in soup.find_all("div", attrs={"class": "col-xs-120 col-sm-60 col-md-40 col-lg-30"}):
        for child in cell.children:
            # .children also yields text nodes (e.g. whitespace between
            # tags), which have no attributes; skip them.
            if not isinstance(child, bs4.Tag):
                continue
            href = child.get("href")
            # Without an href there is nothing to download, and a None
            # URL would fail when concatenated with the site root.
            if not href:
                continue
            chapter_name_list.append(child.text)
            chapter_url_list.append(href)
    print(chapter_name_list)

    for chapter_url, chapter_name in zip(chapter_url_list, chapter_name_list):
        chapters(chapter_url, chapter_name)

# 相关推荐 (Related recommendations -- leftover footer text from the source web page)