Python 实现博客文章爬虫示例
代码如下:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# JCrawler
# Author: Jam <[email protected]>
#
# Small blog crawler: reads the category links from the target site's
# navigation block, then walks every category's paginated listing and prints
# each article link it finds.  Runs unchanged on Python 2 and Python 3.
import time

try:  # Python 2
    from urllib2 import HTTPError, Request, urlopen
except ImportError:  # Python 3
    from urllib.error import HTTPError
    from urllib.request import Request, urlopen

# Target site
TargetHost = "http://adirectory.blog.com"

# User Agent
UserAgent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
             '(KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36')

# Link collection rules.  Each rule list is applied in order by GetHtmlFind;
# every rule searches inside whatever the previous rule matched.
# Category link rules
CategoryFind = [
    {'findMode': 'find', 'findTag': 'div', 'rule': {'id': 'cat-nav'}},
    {'findMode': 'findAll', 'findTag': 'a', 'rule': {}},
]
# Article link rules
ArticleListFind = [
    {'findMode': 'find', 'findTag': 'div', 'rule': {'id': 'content'}},
    {'findMode': 'findAll', 'findTag': 'h2', 'rule': {'class': 'title'}},
    {'findMode': 'findAll', 'findTag': 'a', 'rule': {}},
]

# Pagination URL rule: "#page" is replaced by the current page number.
PageUrl = 'page/#page/'
PageStart = 1
PageStep = 1
# A page whose body contains this text marks the end of a listing.
PageStopHtml = '404: Page Not Found'

# How many times a page is requested while the server answers 504, and how
# many seconds to wait after each 504.
GatewayRetryLimit = 5
GatewayRetryWait = 5


def GetHtmlText(url):
    """Fetch ``url`` and return the raw response body.

    The body is returned exactly as read from the socket (a byte string).
    ``HTTPError`` and other urllib errors propagate to the caller.
    """
    request = Request(url)
    request.add_header(
        'Accept',
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp")
    request.add_header('Accept-Encoding', "*")
    request.add_header('User-Agent', UserAgent)
    response = urlopen(request)
    try:
        return response.read()
    finally:
        # Python 2 responses are not context managers, so close explicitly.
        response.close()


def ArrToStr(varArr):
    """Return the concatenation of ``str(item)`` for every item of ``varArr``."""
    return "".join(str(item) for item in varArr)


def _parse_html(markup):
    """Parse ``markup`` with BeautifulSoup using the built-in html.parser."""
    # Imported here so that the module's pure helpers stay importable on a
    # machine without bs4; only actual parsing needs the package.
    from bs4 import BeautifulSoup
    # Naming the parser keeps results identical across machines instead of
    # depending on which optional parser happens to be installed.
    return BeautifulSoup(markup, "html.parser")


def GetHtmlFind(htmltext, findRule):
    """Apply a chain of find rules to ``htmltext`` and return the last match.

    ``findRule`` is a list of dicts with the keys ``findMode`` (``'find'`` or
    ``'findAll'``), ``findTag`` (tag name) and ``rule`` (attribute filter).
    Each rule searches the descendants of what the previous rule matched.
    Rules with any other ``findMode`` are ignored.

    Returns the tag matched by a final ``'find'`` rule, the list of tags
    matched by a final ``'findAll'`` rule, the parsed document when
    ``findRule`` is empty, or an empty list as soon as a rule matches nothing.
    """
    result = _parse_html(htmltext)
    scopes = [result]
    for rule in findRule:
        mode = rule['findMode']
        if mode == 'find':
            result = None
            for scope in scopes:
                result = scope.find(rule['findTag'], rule['rule'])
                if result is not None:
                    break
            if result is None:
                return []
            scopes = [result]
        elif mode == 'findAll':
            result = []
            for scope in scopes:
                result.extend(scope.findAll(rule['findTag'], rule['rule']))
            if not result:
                return []
            scopes = result
    return result


def _link_entry(tag):
    """Return ``{'name', 'url'}`` for an anchor tag, or None if unusable.

    An anchor is unusable when it has no single text child (``tag.string`` is
    None) or no ``href`` attribute.
    """
    name = tag.string
    url = tag.get('href')
    if name is None or url is None:
        return None
    return {'name': name, 'url': url}


def _has_stop_marker(htmltext):
    """Return True when ``htmltext`` contains the ``PageStopHtml`` marker."""
    marker = PageStopHtml
    # On Python 3 the fetched body is bytes while the marker is text.
    if isinstance(htmltext, bytes) and not isinstance(marker, bytes):
        marker = marker.encode("utf-8")
    return marker in htmltext


def _fetch_page(url):
    """Return the body of ``url``, or None when the listing should stop.

    A 504 answer is retried up to ``GatewayRetryLimit`` times; any other HTTP
    error (404 included) ends the attempt immediately.
    """
    for _ in range(GatewayRetryLimit):
        try:
            return GetHtmlText(url)
        except HTTPError as e:
            print("[E]->HTTP Error:" + str(e.code))
            if e.code != 504:
                return None
            print("[E]->HTTP Error 504: Gateway Time-out, Wait")
            time.sleep(GatewayRetryWait)
    return None


def GetCategory():
    """Fetch the target site's front page and return its category links.

    Returns a list of ``{'name': ..., 'url': ...}`` dicts, one per anchor in
    the category navigation block.  Anchors without text or ``href`` are
    skipped.
    """
    categorys = []
    htmltext = GetHtmlText(TargetHost)
    for tag in GetHtmlFind(htmltext, CategoryFind):
        entry = _link_entry(tag)
        if entry is None:
            continue
        print("[G]->Category:" + entry['name'] + "|Url:" + entry['url'])
        categorys.append(entry)
    return categorys


def GetArticleList(categoryUrl):
    """Walk the paginated listing under ``categoryUrl`` and collect articles.

    Pages are requested starting at ``PageStart`` and advancing by
    ``PageStep``.  Paging stops when a page cannot be fetched, contains the
    ``PageStopHtml`` marker, or yields no links at all.  Only links whose
    URL contains ``TargetHost`` are kept.

    Returns a list of ``{'name': ..., 'url': ...}`` dicts.
    """
    articles = []
    page = PageStart
    while True:
        url = categoryUrl + PageUrl.replace("#page", str(page))
        print("[G]->PageUrl:" + url)
        htmltext = _fetch_page(url)
        if htmltext is None or _has_stop_marker(htmltext):
            print("End Page.")
            break
        tags = GetHtmlFind(htmltext, ArticleListFind)
        if not tags:
            # A page without any article link would otherwise keep the loop
            # running forever on sites that never answer with the stop page.
            print("End Page.")
            break
        for tag in tags:
            entry = _link_entry(tag)
            if entry is None or TargetHost not in entry['url']:
                continue
            print("[G]->Article:" + entry['name'] + "|Url:" + entry['url'])
            articles.append(entry)
        page += PageStep
    return articles


def main():
    """Crawl every category of the target site and print what is found."""
    print("[G]->GetCategory")
    Mycategorys = GetCategory()
    print("[G]->GetCategory->Success.")
    time.sleep(3)
    for category in Mycategorys:
        print("[G]->GetArticleList:" + category['name'])
        GetArticleList(category['url'])


if __name__ == "__main__":
    main()
相关推荐
夜斗不是神 2020-11-17
染血白衣 2020-11-16
ARCXIANG 2020-11-02
ARCXIANG 2020-10-28
CycloneKid 2020-10-27
荒谬小孩 2020-10-26
逍遥友 2020-10-26
snakeson 2020-10-09
meylovezn 2020-08-28
囧芝麻 2020-08-17
数据挖掘工人 2020-08-15
cxcxrs 2020-07-28
dashoumeixi 2020-07-20
sunzhihaofuture 2020-07-19
我欲疾风前行 2020-07-06
sunzhihaofuture 2020-07-04
Dimples 2020-06-28