An example processor
package org.archive.crawler.extractor;

import java.util.regex.Matcher;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.extractor.Link;
import org.archive.util.TextUtils;

/**
 * A very simple extractor. Will assume that any string that matches a
 * configurable regular expression is a link.
 *
 * @author Kristinn Sigurdsson
 */
public class SimpleExtractor extends Processor
        implements CoreAttributeConstants {

    public static final String ATTR_REGULAR_EXPRESSION = "input-param";
    public static final String DEFAULT_REGULAR_EXPRESSION =
            "http://([a-zA-Z0-9]+\\.)+[a-zA-Z0-9]+/"; // Find domains

    int numberOfCURIsHandled = 0;
    int numberOfLinksExtracted = 0;

    public SimpleExtractor(String name) {
        super(name, "A very simple link extractor. Doesn't do anything useful.");
        Type e;
        // Register the regular expression as a configurable, expert-level setting.
        e = addElementToDefinition(new SimpleType(ATTR_REGULAR_EXPRESSION,
                "Strings matching this regular expression are treated as links.",
                DEFAULT_REGULAR_EXPRESSION));
        e.setExpertSetting(true);
    }

    protected void innerProcess(CrawlURI curi) {
        if (!curi.isHttpTransaction()) {
            // We only handle HTTP at the moment.
            return;
        }

        numberOfCURIsHandled++;

        // Replay the downloaded content and look up the configured expression,
        // falling back to the default if the setting is missing.
        CharSequence cs = curi.getHttpRecorder().getReplayCharSequence();
        String regexpr = null;
        try {
            regexpr = (String) getAttribute(ATTR_REGULAR_EXPRESSION, curi);
        } catch (AttributeNotFoundException e) {
            regexpr = DEFAULT_REGULAR_EXPRESSION;
        }

        // Every match is recorded on the CrawlURI as a speculative link.
        Matcher match = TextUtils.getMatcher(regexpr, cs);
        while (match.find()) {
            String link = cs.subSequence(match.start(), match.end()).toString();
            curi.createAndAddLink(link, Link.SPECULATIVE_MISC, Link.NAVLINK_HOP);
            numberOfLinksExtracted++;
            System.out.println("SimpleExtractor: " + link);
        }
        TextUtils.recycleMatcher(match);
    }

    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor." +
                "SimpleExtractor\n");
        ret.append("  Function:          Example extractor\n");
        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
        return ret.toString();
    }
}
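To see what the default expression actually extracts, here is a minimal standalone sketch (the class name SimpleExtractorRegexDemo and the sample text are hypothetical, not part of Heritrix) that applies DEFAULT_REGULAR_EXPRESSION with plain java.util.regex, mirroring the match-and-subSequence loop in innerProcess().

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/** Hypothetical standalone demo of the default link-matching expression. */
public class SimpleExtractorRegexDemo {
    public static void main(String[] args) {
        // Same pattern as SimpleExtractor.DEFAULT_REGULAR_EXPRESSION.
        String regexpr = "http://([a-zA-Z0-9]+\\.)+[a-zA-Z0-9]+/";
        CharSequence cs = "See http://crawler.archive.org/ and "
                + "http://www.example.com/index.html for details.";

        Matcher match = Pattern.compile(regexpr).matcher(cs);
        while (match.find()) {
            // Same extraction step as innerProcess(): take the matched subsequence.
            String link = cs.subSequence(match.start(), match.end()).toString();
            System.out.println("SimpleExtractorRegexDemo: " + link);
        }
    }
}

Running this prints the two domain prefixes, http://crawler.archive.org/ and http://www.example.com/. Inside Heritrix the extractor goes through TextUtils.getMatcher() and recycleMatcher() instead of compiling a fresh Pattern on every call, so the Matcher objects can be reused across URIs.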