Scrapy crawler section
items.py section
import scrapy
class App01Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    original_url = scrapy.Field()      # URL of the detail page
    management_info = scrapy.Field()   # announcement title from the list page
    com_name = scrapy.Field()          # company name
    punish_num = scrapy.Field()        # punishment document number
    mana_results = scrapy.Field()      # handling result
    law_depart = scrapy.Field()        # law-enforcement department
    get_date = scrapy.Field()          # publication date
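These Field declarations make App01Item behave like a dict whose keys are limited to the declared fields; assigning any other key raises a KeyError. A quick illustration with placeholder values (not data from the site):

item = App01Item()
item['com_name'] = 'example company'          # placeholder value
item['punish_num'] = 'example document no.'   # placeholder value
print(dict(item))        # plain dict, handy for logging or serialization
# item['other'] = 'x'    # would raise KeyError: undeclared field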
app_01.py spider section
# -*- coding: utf-8 -*-
import scrapy
import requests
from pyquery import PyQuery
from app01.items import *
class App01Spider(scrapy.Spider):
    name = 'app_01'
    # allowed_domains must hold domains, not full URLs, or the offsite filter drops requests
    allowed_domains = ['gtghj.wuhan.gov.cn']
    start_urls = ['http://gtghj.wuhan.gov.cn/pt-2256-7-1.html']
    def parse(self, response):
        count = 1
        s = PyQuery(response.text)
        # pager text in the form "current page/total pages"; the total sits after the '/'
        page_1 = s('#info > div > strong').text()
        page = page_1.split('/')[1]
        for i in range(int(page)):
            url_page = 'http://gtghj.wuhan.gov.cn/pt-2256-7-{}.html'.format(i + 1)
            original_r = requests.get(url_page)
            original_r.encoding = 'gbk'
            original_s = PyQuery(original_r.text)
            original_urls = original_s('#info > ul > li > a').items()
            get_dates = original_s('#info > ul > li > span').items()
            for j, dates in zip(original_urls, get_dates):
                original_url = 'http://gtghj.wuhan.gov.cn{}'.format(j.attr('href'))  # detail page URL
                management_info = j.attr('title')  # announcement title
                get_date = dates.text()            # publication date
                yield scrapy.Request(
                    url=original_url,
                    callback=self.parse_info,
                    dont_filter=True,  # skip the dupe filter so parse_info is always called
                    meta={'management_info': management_info,
                          'get_date': get_date,
                          'original_url': original_url})
                count += 1

    def parse_info(self, response):
        item = App01Item()  # instantiate the App01Item defined in items.py
        detail_s = PyQuery(response.text)
        com_name = detail_s('#show > table:nth-child(2) > tr > td > div > table > tr:nth-child(2) > td:nth-child(2)').text()
        mana_results = detail_s('#show > table:nth-child(2) > tr > td > div > table > tr:nth-child(8) > td:nth-child(2)').text()
        punish_num = detail_s('#show > table:nth-child(2) > tr > td > div > table > tr:nth-child(4) > td:nth-child(2)').text()
        law_depart = detail_s('#show > table:nth-child(2) > tr > td > div > table > tr:nth-child(9) > td:nth-child(2)').text()
        item['com_name'] = com_name
        item['mana_results'] = mana_results
        item['punish_num'] = punish_num
        item['law_depart'] = law_depart
        item['management_info'] = response.meta['management_info']
        item['get_date'] = response.meta['get_date']
        item['original_url'] = response.meta['original_url']
        print(item)
        yield item  # hand the item to the item pipeline
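One caveat with the spider above: it fetches every list page synchronously with requests inside parse, which bypasses Scrapy's scheduler, download delays, and retry middleware. Below is a sketch only, not the original project's code, of the same pagination driven by chained scrapy.Request calls; the subclass, the parse_list name, and the import path are assumptions, while the selectors are copied unchanged and parse_info is inherited as-is.

# Sketch: let Scrapy schedule the list pages itself instead of calling requests.get.
# Assumes the default project layout, i.e. the spider module is app01/spiders/app_01.py.
import scrapy
from pyquery import PyQuery
from app01.spiders.app_01 import App01Spider


class App01ChainedSpider(App01Spider):
    name = 'app_01_chained'   # hypothetical name so it does not clash with app_01

    def parse(self, response):
        s = PyQuery(response.text)
        total_pages = int(s('#info > div > strong').text().split('/')[1])
        for i in range(total_pages):
            url_page = 'http://gtghj.wuhan.gov.cn/pt-2256-7-{}.html'.format(i + 1)
            yield scrapy.Request(url=url_page, callback=self.parse_list, dont_filter=True)

    def parse_list(self, response):
        s = PyQuery(response.text)
        links = s('#info > ul > li > a').items()
        dates = s('#info > ul > li > span').items()
        for a, span in zip(links, dates):
            detail_url = 'http://gtghj.wuhan.gov.cn{}'.format(a.attr('href'))
            yield scrapy.Request(url=detail_url, callback=self.parse_info, dont_filter=True,
                                 meta={'management_info': a.attr('title'),
                                       'get_date': span.text(),
                                       'original_url': detail_url})

In this variant Scrapy decodes response.text from the charset the page declares, so the manual gbk handling should not be needed; the requests-based version still works, it just runs the list pages serially.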
main.py section
from scrapy.cmdline import execute
execute('scrapy crawl app_01'.split())
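Running main.py from the project root (the directory containing scrapy.cfg) is equivalent to typing scrapy crawl app_01 on the command line; it mainly exists so the spider can be started from an IDE. If you prefer not to go through cmdline, a programmatic equivalent looks roughly like this (the import path again assumes the default app01/spiders/app_01.py layout):

# Sketch: run the spider via CrawlerProcess instead of scrapy.cmdline.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from app01.spiders.app_01 import App01Spider

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
process.crawl(App01Spider)
process.start()  # blocks until the crawl finishes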