An example processor

package org.archive.crawler.extractor;

import java.util.regex.Matcher;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.extractor.Link;
import org.archive.util.TextUtils;

/**
 * A very simple extractor. It treats any string that matches a
 * configurable regular expression as a link.
 *
 * @author Kristinn Sigurdsson
 */
public class SimpleExtractor extends Processor
    implements CoreAttributeConstants
{
    public static final String ATTR_REGULAR_EXPRESSION = "input-param";
    public static final String DEFAULT_REGULAR_EXPRESSION = 
        "http://([a-zA-Z0-9]+\\.)+[a-zA-Z0-9]+/"; //Find domains
    
    int numberOfCURIsHandled = 0; 
    int numberOfLinksExtracted = 0;

    public SimpleExtractor(String name) {
        super(name, "A very simple link extractor. Doesn't do anything useful.");
        Type e;
        // Register a configurable (expert-level) setting that holds the
        // regular expression used to find links.
        e = addElementToDefinition(new SimpleType(ATTR_REGULAR_EXPRESSION,
            "The regular expression used to find links in the content.",
            DEFAULT_REGULAR_EXPRESSION));
        e.setExpertSetting(true);
    }

    protected void innerProcess(CrawlURI curi) {

        if (!curi.isHttpTransaction())
        {
            // We only handle HTTP at the moment.
            return;
        }
        
        numberOfCURIsHandled++;

        // Get the downloaded content as a character sequence.
        CharSequence cs = curi.getHttpRecorder().getReplayCharSequence();
        String regexpr = null;
        try {
            // Look up the regular expression; the settings framework may
            // return a value overridden for this URI's domain.
            regexpr = (String)getAttribute(ATTR_REGULAR_EXPRESSION, curi);
        } catch(AttributeNotFoundException e) {
            regexpr = DEFAULT_REGULAR_EXPRESSION;
        }

        // Get a matcher for the expression from the pool kept by TextUtils.
        Matcher match = TextUtils.getMatcher(regexpr, cs);
        
        while (match.find()) {
            // Treat each match as a speculative, navigational link and
            // add it to the CrawlURI for the frontier to consider.
            String link = cs.subSequence(match.start(), match.end()).toString();
            curi.createAndAddLink(link, Link.SPECULATIVE_MISC, Link.NAVLINK_HOP);
            numberOfLinksExtracted++;
            System.out.println("SimpleExtractor: " + link);
        }
        
        // Return the matcher to the pool for reuse.
        TextUtils.recycleMatcher(match);
    }

    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor." +
            "SimpleExtractor\n");
        ret.append("  Function:          Example extractor\n");
        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");

        return ret.toString();
    }
}
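
The extraction itself is ordinary java.util.regex matching over the downloaded content; TextUtils.getMatcher and TextUtils.recycleMatcher simply reuse pooled Matcher instances instead of compiling the pattern on every call. The following standalone sketch is an illustration only, not part of Heritrix: the RegexLinkFinder class name and the sample text are made up here, but the matching loop is the same technique SimpleExtractor uses.

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/** Standalone illustration of the matching loop used by SimpleExtractor. */
public class RegexLinkFinder {

    // Same default expression as SimpleExtractor: matches "http://host/" prefixes.
    private static final Pattern DEFAULT_PATTERN =
        Pattern.compile("http://([a-zA-Z0-9]+\\.)+[a-zA-Z0-9]+/");

    /** Returns every substring of the content that matches the pattern. */
    public static List<String> findLinks(CharSequence content) {
        List<String> links = new ArrayList<String>();
        Matcher matcher = DEFAULT_PATTERN.matcher(content);
        while (matcher.find()) {
            links.add(content.subSequence(matcher.start(), matcher.end()).toString());
        }
        return links;
    }

    public static void main(String[] args) {
        String sample = "See http://www.archive.org/ and http://crawler.archive.org/docs/";
        for (String link : findLinks(sample)) {
            System.out.println("Found: " + link);
        }
    }
}

Inside Heritrix the same loop runs against the ReplayCharSequence of the fetched document, and each match is handed to the CrawlURI rather than collected in a list.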
