import re
from urllib import request


class Spider:
    def __init__(self):
        self.page = 1
        self.switch = True

    def loadPage(self):
        """
        Download one listing page and extract the joke blocks.
        """
        url = "http://www.neihan8.com/article/list_5_" + str(self.page) + ".html"
        user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
        headers = {'User-Agent': user_agent}
        req = request.Request(url, headers=headers)
        response = request.urlopen(req)
        # The site serves GBK-encoded pages.
        html = response.read().decode("gbk")
        # Each joke sits inside a <div class="f18 mb20"> block; re.S lets . match newlines.
        pattern = re.compile(r'<div\sclass="f18 mb20">(.*?)</div>', re.S)
        content_list = pattern.findall(html)

        self.dealPage(content_list)

    def dealPage(self, content_list):
        """
        Process the jokes on each page.
        """
        for item in content_list:
            # Strip the HTML tags and stray quote marks left inside each matched block.
            item = (item.replace("<p>", "").replace("</p>", "")
                        .replace("<br>", "").replace("<br />", "")
                        .replace("“", ""))
            self.writePage(item)

    def writePage(self, item):
        """
        Append each joke to the output file.
        """
        with open("duanzi.txt", "a", encoding="utf-8") as f:
            f.write(item + "\n")

    def startWork(self):
        """
        Control the spider's run loop.
        """
        while self.switch:
            self.loadPage()
            command = input("Press Enter to continue (type quit to exit): ")
            if command == "quit":
                self.switch = False

            self.page += 1


if __name__ == '__main__':
    duanziSpider = Spider()
    # duanziSpider.loadPage()
    duanziSpider.startWork()