python3 爬虫内涵段子

import re
from urllib import request


class Sprder:
    """Interactive scraper for the joke list pages on neihan8.com.

    Each call to ``loadPage`` downloads list page number ``self.page``,
    extracts every joke container from the HTML, strips a fixed set of
    markup fragments and appends the resulting text to ``段子.txt`` in the
    current working directory.

    Attributes:
        page: Number of the list page that the next ``loadPage`` call fetches.
        switch: Loop flag for ``startWork``; set to False to stop crawling.
    """

    # Compiled once for the whole class instead of on every page load.
    # re.S lets '.' match newlines so a joke spanning several lines is
    # captured as a single match.
    _CONTENT_PATTERN = re.compile(r'<div\sclass="f18 mb20">(.*?)</div>', re.S)

    # Markup fragments removed from each joke, applied in this order.
    _STRIP_TOKENS = ("<p>", "</p>", "<br>", "<br />", "&ldquo;")

    def __init__(self):
        self.page = 1
        self.switch = True

    def loadPage(self):
        """Download the current list page and hand its jokes to ``dealPage``.

        The response body is decoded as GBK before the joke pattern is
        applied.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the body is not valid GBK.
        """
        url = "http://www.neihan8.com/article/list_5_" + str(self.page) + ".html"
        user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT6.1; Trident / 5.0'
        headers = {'User-Agent': user_agent}
        page_request = request.Request(url, headers=headers)

        # The context manager closes the HTTP response even if reading or
        # decoding raises, so the connection is never leaked.
        with request.urlopen(page_request) as response:
            html = response.read().decode("gbk")

        content_list = self._CONTENT_PATTERN.findall(html)
        self.dealPage(content_list)

    def dealPage(self, content_list):
        """Strip markup from every joke on a page and save each one.

        Args:
            content_list: Iterable of raw HTML snippets, one per joke.
        """
        for item in content_list:
            for token in self._STRIP_TOKENS:
                item = item.replace(token, "")
            self.writePage(item)

    def writePage(self, item):
        """Append a single joke to the output file.

        Args:
            item: Cleaned joke text to append.
        """
        # An explicit encoding keeps the Chinese text writable regardless of
        # the platform's default locale encoding.
        with open("段子.txt", "a", encoding="utf-8") as f:
            f.write(item)

    def startWork(self):
        """Run the crawl loop until the user types ``quit``.

        One page is fetched per iteration; afterwards the user is prompted
        to continue (Enter) or stop (``quit``), and the page counter is
        advanced.
        """
        while self.switch:
            self.loadPage()
            command = input("如果继续按回车(退出输入quit)")
            if command == "quit":
                self.switch = False

            self.page += 1


if __name__ == '__main__':
    duanziSpider = Sprder()
    duanziSpider.startWork()