python 爬小说

#coding=utf-8
import datetime
import time
import sys
import os 

import urllib2
import urllib

# Base address of the novel site; it is fetched as the chapter index and
# is also prepended to every chapter link found there.
# NOTE(review): the literal reads "novel site URL" -- presumably a
# placeholder to be replaced with a real address; confirm before running.
sx = '小说站网址'

# Name of the encoding used for file names on this system.
# NOTE: this binding shadows the builtin `type` for the rest of the module.
type = sys.getfilesystemencoding()

# HTTP headers attached to every request built by getHtml.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}


# Output file shared by the whole script, opened for binary writing.
fo = open("note.txt", "wb")

def getHtml(url):
    """Fetch *url* and return the page body re-encoded as UTF-8.

    The request carries the module-level ``headers``.  The raw response is
    decoded as GBK and encoded back to UTF-8; the length of the result is
    printed before it is returned.

    Returns:
        The UTF-8 encoded page, or None when urllib2 raised URLError (the
        HTTP status code and/or the failure reason are printed in that
        case).

    Raises:
        UnicodeDecodeError: if the response body is not valid GBK.
    """
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            data = response.read()
        finally:
            # Release the connection even when reading fails part-way.
            response.close()
    except urllib2.URLError as e:
        # HTTPError (a URLError subclass) carries a status code; a plain
        # URLError only carries a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return None

    data = data.decode('gbk').encode('utf-8')
    print(len(data))
    return data

def dealIndex(url):
    """Fetch the chapter index page at *url* and process every chapter link.

    The link list is the part of the page lying between the
    ``ChapterList_HengFu_1`` and ``ChapterList_HengFu_2`` markers.  For each
    ``href=`` found in that part, the link target and the link text are cut
    out at fixed offsets and handed to dealContext, with the module-level
    base address ``sx`` prepended to the target.

    Nothing is done when the index page could not be fetched.

    Raises:
        ValueError: if either marker is missing from the page.
    """
    data = getHtml(url)
    if data is None:
        # getHtml gives None when the fetch failed; there is nothing to parse.
        return

    bgnpos = data.index('ChapterList_HengFu_1') + 10
    endpos = data.index('ChapterList_HengFu_2') - 10
    print(bgnpos)
    print(endpos)

    achfx = data[bgnpos:endpos]

    # Offsets below are relative to the slice `achfx`, so scanning starts at
    # 0 and simply runs until no further link is found.
    pos = 0
    while True:
        newpos = achfx.find('href=', pos)
        if newpos == -1:
            break

        titlepos = achfx.find('</a>', newpos + 20)
        if titlepos == -1:
            # Unterminated anchor: stop instead of rescanning from the start.
            break

        # NOTE(review): the fixed offsets assume anchors shaped like
        # href="<13-character path>">title</a> -- confirm against the page.
        indexurl = achfx[newpos + 6:newpos + 19]
        # The title ends where '</a>' begins (the '<' is not part of it).
        titlename = achfx[newpos + 21:titlepos]
        pos = titlepos + len('</a>')

        dealContext(sx + indexurl, titlename)


def dealContext(url, title):
    """Download one chapter page and append its text to the output file.

    The chapter text is the part of the page that starts 15 characters after
    the ``name="content"`` marker and ends at the ``</div>`` found near the
    ``yuedu_bottom`` marker.  Every ``&nbsp;`` and ``<br />`` in it is
    replaced by a single space, and ``title + "  " + text`` is written to
    the module-level file ``fo``.

    A chapter is skipped, with a printed notice, when its page could not be
    fetched or does not contain the expected markers, so that one bad page
    does not abort the remaining chapters.

    Args:
        url: Address of the chapter page.
        title: Chapter title written in front of the chapter text.
    """
    print(url)
    print(title)

    data = getHtml(url)
    if data is None:
        # getHtml gives None when the fetch failed.
        print('chapter skipped, page not fetched: ' + url)
        return

    markerpos = data.find('name="content"', 10)
    bottompos = -1
    if markerpos != -1:
        bottompos = data.find('yuedu_bottom', markerpos + 15)
    if bottompos == -1:
        # Without both markers the slice bounds would be meaningless.
        print('chapter skipped, content markers not found: ' + url)
        return

    bgnpos = markerpos + 15
    # Look for the closing tag from slightly before the bottom marker, but
    # never from before the start of the chapter text.
    endpos = data.find('</div>', max(bottompos - 50, bgnpos))
    if endpos == -1:
        print('chapter skipped, closing </div> not found: ' + url)
        return

    sContent = data[bgnpos:endpos]
    sContent = sContent.replace('&nbsp;', ' ')
    sContent = sContent.replace('<br />', ' ')

    fo.write(title + "  " + sContent)

# Run the crawl starting from the site address; close the output file even
# if the crawl stops with an exception so that buffered text is flushed.
try:
    dealIndex(sx)
finally:
    fo.close()

相关推荐