python 爬小说
# coding=utf-8
"""Download a novel chapter by chapter and write the text to ``note.txt``.

The index page lists the chapters between the ``ChapterList_HengFu_1`` and
``ChapterList_HengFu_2`` markers.  Every chapter linked from that region is
fetched, its content block is cut out and written to the output file.

Pages are decoded as GBK and handled internally as UTF-8 byte strings.
The module runs on Python 2 (``urllib2``) and Python 3 (``urllib.request``).
"""
import datetime
import time
import sys
import os
import re
import urllib
from contextlib import closing

try:  # Python 2
    import urllib2
    from urllib2 import URLError
except ImportError:  # Python 3
    import urllib.request as urllib2
    from urllib.error import URLError

# Placeholder (the literal reads "novel site URL"): replace it with the real
# address.  It is used both as the index page and as the prefix that chapter
# links are appended to.
sx = '小说站网址'
fs_encoding = sys.getfilesystemencoding()
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
OUTPUT_PATH = "note.txt"

# Matches one quoted link plus its anchor text: href="<url>" ...>title</a>
_CHAPTER_LINK_RE = re.compile(br'href=["\']([^"\']*)["\'][^>]*>(.*?)</a>', re.S)


def _to_text(value):
    """Return *value* as the interpreter's native ``str`` type."""
    if isinstance(value, str):
        return value
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value.encode('utf-8')  # Python 2 ``unicode``


def _to_bytes(value):
    """Return *value* as a UTF-8 byte string."""
    if isinstance(value, bytes):
        return value
    return value.encode('utf-8')


def getHtml(url):
    """Fetch *url* and return the page re-encoded from GBK to UTF-8 bytes.

    Prints the length of the converted page.  When the request fails with
    ``URLError`` the error code and/or reason are printed and ``None`` is
    returned, so callers must check the result before using it.
    """
    request = urllib2.Request(url, headers=headers)
    try:
        # closing() releases the connection on both Python 2 and Python 3.
        with closing(urllib2.urlopen(request)) as response:
            raw = response.read()
    except URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return None
    data = raw.decode('gbk').encode('utf-8')
    print(len(data))
    return data


def _extract_content(data):
    """Cut the chapter text out of a chapter page, or return ``None``.

    The text starts 15 bytes after ``name="content"`` (the 14-byte attribute
    plus the character that follows it) and ends at the first ``</div>``
    found from 50 bytes before ``yuedu_bottom`` onwards.
    """
    marker = data.find(b'name="content"', 10)
    if marker == -1:
        return None
    bgnpos = marker + 15
    bottom = data.find(b'yuedu_bottom', bgnpos)
    if bottom == -1:
        return None
    # Clamp the search start: an unclamped ``bottom - 50`` can be negative
    # (counted from the end of the page) or point before the content start.
    endpos = data.find(b'</div>', max(bottom - 50, bgnpos))
    if endpos == -1:
        return None
    content = data[bgnpos:endpos]
    # NOTE(review): the published source showed replace(' ', ' '), a no-op.
    # Presumably the first literal was '&nbsp;' before the hosting page
    # rendered the entity -- confirm against a real chapter page.
    content = content.replace(b'&nbsp;', b' ')
    content = content.replace(b'<br />', b' ')
    return content


def dealIndex(url, base_url=None, out=None):
    """Download the index page at *url* and save every chapter it links to.

    Links are taken from the region that starts 10 bytes after the start of
    ``ChapterList_HengFu_1`` and ends 10 bytes before ``ChapterList_HengFu_2``;
    both offsets are printed.

    Args:
        url: address of the index page.
        base_url: prefix put in front of every chapter link; defaults to
            the module-level ``sx``.
        out: binary file object that receives the chapters; when omitted,
            ``OUTPUT_PATH`` is created (truncated) and used.

    Raises:
        ValueError: the page was downloaded but lacks one of the markers.
    """
    if out is None:
        with open(OUTPUT_PATH, "wb") as handle:
            return dealIndex(url, base_url, handle)

    data = getHtml(url)
    if data is None:
        print("index page could not be downloaded: " + _to_text(url))
        return

    first = data.find(b'ChapterList_HengFu_1')
    last = data.find(b'ChapterList_HengFu_2')
    if first == -1 or last == -1:
        raise ValueError("chapter list markers not found in " + _to_text(url))
    bgnpos = first + 10
    endpos = last - 10
    print(bgnpos)
    print(endpos)

    prefix = _to_text(sx if base_url is None else base_url)
    # Scan only the chapter-list region; pos/endpos are offsets into ``data``.
    for link in _CHAPTER_LINK_RE.finditer(data, bgnpos, endpos):
        indexurl, titlename = link.group(1), link.group(2)
        dealContext(prefix + _to_text(indexurl), titlename, out)


def dealContext(url, title, out=None):
    """Download one chapter and write ``title + " " + text`` to the output.

    Prints *url* and *title* first.  A chapter that cannot be downloaded, or
    whose content markers are missing, is reported and skipped.

    Args:
        url: address of the chapter page.
        title: chapter title (text or UTF-8 bytes).
        out: binary file object to write to; when omitted the chapter is
            appended to ``OUTPUT_PATH``.
    """
    print(_to_text(url))
    print(_to_text(title))
    data = getHtml(url)
    if data is None:
        print("chapter skipped, download failed: " + _to_text(url))
        return
    content = _extract_content(data)
    if content is None:
        print("chapter skipped, content markers not found: " + _to_text(url))
        return

    record = _to_bytes(title) + b" " + content
    if out is None:
        with open(OUTPUT_PATH, "ab") as handle:
            handle.write(record)
    else:
        out.write(record)


def main():
    """Scrape the novel whose index page is ``sx`` into ``OUTPUT_PATH``."""
    dealIndex(sx)


if __name__ == "__main__":
    main()
相关推荐
YENCSDN 2020-11-17
lsjweiyi 2020-11-17
houmenghu 2020-11-17
Erick 2020-11-17
HeyShHeyou 2020-11-17
以梦为马不负韶华 2020-10-20
lhtzbj 2020-11-17
夜斗不是神 2020-11-17
pythonjw 2020-11-17
dingwun 2020-11-16
lhxxhl 2020-11-16
坚持是一种品质 2020-11-16
染血白衣 2020-11-16
huavhuahua 2020-11-20
meylovezn 2020-11-20
逍遥友 2020-11-20
weiiron 2020-11-16