Heritrix 文档上的一个例子，放这备用
package mypackage;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.UURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.settings.ModuleType;

/**
 * A simple Frontier implementation for tutorial purposes.
 *
 * <p>The frontier hands out at most one URI at a time: {@link #next(int)}
 * returns a URI only while no other URI is in process, and
 * {@link #finished(CrawlURI)} releases that slot again. URIs that report
 * {@code needsImmediateScheduling()} are kept in a separate list that is
 * always drained before the regular pending list.
 *
 * <p>All methods that mutate the queues are {@code synchronized} on this
 * instance. Callers blocked in {@link #next(int)} are woken whenever a URI
 * is queued or a URI finishes processing.
 */
public class MyFrontier extends ModuleType implements Frontier, FetchStatusCodes {

    // Discovered URIs waiting to be crawled, in scheduling order.
    List pendingURIs = new ArrayList();

    // URIs flagged for immediate scheduling (e.g. DNS look-ups); these are
    // always handed out before anything in pendingURIs.
    List prerequisites = new ArrayList();

    // URI string -> CandidateURI for everything that has been scheduled, so
    // that every URI is queued only once.
    Map alreadyIncluded = new HashMap();

    // Reference to the CrawlController, set in initialize().
    CrawlController controller;

    // True while a URI handed out by next() has not yet been passed to finished().
    boolean uriInProcess = false;

    // Top-level stats.
    long successCount = 0;
    long failedCount = 0;
    long disregardedCount = 0;
    long totalProcessedBytes = 0;

    /**
     * Creates the frontier module.
     *
     * @param name not used; the module is always registered under
     *             {@code Frontier.ATTR_NAME}
     */
    public MyFrontier(String name) {
        super(Frontier.ATTR_NAME, "A simple frontier.");
    }

    /**
     * Stores the controller and schedules every seed of the crawl scope.
     *
     * @param controller the controller this frontier belongs to
     * @throws FatalConfigurationException declared by the interface
     * @throws IOException declared by the interface
     */
    public void initialize(CrawlController controller)
            throws FatalConfigurationException, IOException {
        this.controller = controller;

        // Initialize the pending queue with the seeds.
        this.controller.getScope().refreshSeeds();
        List seeds = this.controller.getScope().getSeedlist();
        synchronized (seeds) {
            for (Iterator i = seeds.iterator(); i.hasNext();) {
                UURI seed = (UURI) i.next();
                CandidateURI caUri = new CandidateURI(seed);
                caUri.setSeed();
                schedule(caUri);
            }
        }
    }

    /**
     * Returns the next URI to crawl, or waits when none can be handed out.
     *
     * <p>When another URI is still in process or both queues are empty, the
     * caller waits up to {@code timeout} milliseconds (or until it is woken by
     * {@link #schedule(CandidateURI)} or {@link #finished(CrawlURI)}) and then
     * gets {@code null}; it is expected to call this method again.
     *
     * @param timeout maximum time to wait in milliseconds, passed to
     *                {@link Object#wait(long)}
     * @return the next URI, or {@code null} if none was available
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    public synchronized CrawlURI next(int timeout) throws InterruptedException {
        if (uriInProcess || isEmpty()) {
            // Nothing can be handed out right now. wait() releases the monitor,
            // so schedule()/finished() can run and wake us before the timeout.
            wait(timeout);
            return null;
        }

        uriInProcess = true;
        // Prerequisites take precedence over ordinary pending URIs.
        List source = prerequisites.isEmpty() ? pendingURIs : prerequisites;
        CrawlURI curi = CrawlURI.from((CandidateURI) source.remove(0));
        curi.setServer(controller.getServerCache().getServerFor(curi));
        return curi;
    }

    /**
     * @return {@code true} when neither queue holds a URI
     */
    public boolean isEmpty() {
        return pendingURIs.isEmpty() && prerequisites.isEmpty();
    }

    /**
     * Queues a URI for crawling unless its URI string was scheduled before.
     *
     * @param caURI the candidate to schedule
     */
    public synchronized void schedule(CandidateURI caURI) {
        String key = caURI.getURIString();
        if (alreadyIncluded.containsKey(key)) {
            return;
        }

        if (caURI.needsImmediateScheduling()) {
            prerequisites.add(caURI);
        } else {
            pendingURIs.add(caURI);
        }
        alreadyIncluded.put(key, caURI);

        // Wake callers parked in next(); without this they would sleep for the
        // full timeout (or forever when the timeout is 0) despite new work.
        notifyAll();
    }

    /**
     * Schedules a single URI; this frontier does no batching.
     *
     * @param caURI the candidate to schedule
     */
    public void batchSchedule(CandidateURI caURI) {
        schedule(caURI);
    }

    /**
     * No-op: {@link #batchSchedule(CandidateURI)} schedules immediately.
     */
    public void batchFlush() {
    }

    /**
     * Records the outcome of a processed URI and frees the in-process slot.
     *
     * <p>Successful URIs update the success and byte counters; deferred URIs
     * are cleaned up and put back into the queues; URIs with one of the
     * "disregard" statuses and all remaining (failed) URIs update their
     * respective counters. The matching controller event is fired for every
     * outcome except deferral.
     *
     * @param cURI the URI whose processing has ended
     */
    public synchronized void finished(CrawlURI cURI) {
        uriInProcess = false;

        if (cURI.isSuccess()) {
            successCount++;
            totalProcessedBytes += cURI.getContentSize();
            controller.fireCrawledURISuccessfulEvent(cURI);
            cURI.stripToMinimal();
        } else if (cURI.getFetchStatus() == S_DEFERRED) {
            cURI.processingCleanup();
            // Forget the URI first, otherwise schedule() would reject it as a
            // duplicate.
            alreadyIncluded.remove(cURI.getURIString());
            schedule(cURI);
        } else if (isDisregarded(cURI)) {
            controller.fireCrawledURIDisregardEvent(cURI);
            disregardedCount++;
            cURI.stripToMinimal();
        } else {
            controller.fireCrawledURIFailureEvent(cURI);
            failedCount++;
            cURI.stripToMinimal();
        }
        cURI.processingCleanup();

        // The in-process slot is free again: wake callers parked in next().
        notifyAll();
    }

    // True when the URI's fetch status is one of those counted as disregarded.
    private boolean isDisregarded(CrawlURI cURI) {
        int status = cURI.getFetchStatus();
        return status == S_ROBOTS_PRECLUDED
                || status == S_OUT_OF_SCOPE
                || status == S_BLOCKED_BY_USER
                || status == S_TOO_MANY_EMBED_HOPS
                || status == S_TOO_MANY_LINK_HOPS
                || status == S_DELETED_BY_USER;
    }

    /**
     * @return the number of distinct URI strings currently recorded as scheduled
     */
    public long discoveredUriCount() {
        return alreadyIncluded.size();
    }

    /**
     * @return the number of URIs waiting in both queues
     */
    public long queuedUriCount() {
        return pendingURIs.size() + prerequisites.size();
    }

    /**
     * @return the sum of the success, failure and disregard counters
     */
    public long finishedUriCount() {
        return successCount + failedCount + disregardedCount;
    }

    /**
     * @return the number of URIs finished successfully
     */
    public long successfullyFetchedCount() {
        return successCount;
    }

    /**
     * @return the number of URIs finished as failures
     */
    public long failedFetchCount() {
        return failedCount;
    }

    /**
     * @return the number of URIs finished with a disregard status
     */
    public long disregardedFetchCount() {
        return disregardedCount;
    }

    /**
     * @return the accumulated content size of all successfully finished URIs
     */
    public long totalBytesWritten() {
        return totalProcessedBytes;
    }

    /**
     * @return a fixed message; this frontier produces no report
     */
    public String report() {
        return "This frontier does not return a report.";
    }

    /**
     * Not supported by this frontier.
     *
     * @param pathToLog ignored
     * @throws UnsupportedOperationException always
     */
    public void importRecoverLog(String pathToLog) throws IOException {
        throw new UnsupportedOperationException();
    }

    /**
     * Not implemented by this frontier.
     *
     * @param regexpr ignored
     * @param inCacheOnly ignored
     * @return always {@code null}
     */
    public FrontierMarker getInitialMarker(String regexpr, boolean inCacheOnly) {
        return null;
    }

    /**
     * Not implemented by this frontier.
     *
     * @param marker ignored
     * @param numberOfMatches ignored
     * @param verbose ignored
     * @return always {@code null}
     */
    public ArrayList getURIsList(FrontierMarker marker, int numberOfMatches,
            boolean verbose) throws InvalidFrontierMarkerException {
        return null;
    }

    /**
     * Not implemented by this frontier; nothing is deleted.
     *
     * @param match ignored
     * @return always {@code 0}
     */
    public long deleteURIs(String match) {
        return 0;
    }
}
相关推荐
zzcchunter 2007-11-16
冬冬阳光 2013-07-30
神码不是浮云 2010-12-04
woyanyouxin 2010-11-21
tonygsw 2010-11-21
zhonglinzhang 2010-11-21
lvbaolin 2010-11-21
wdeo0 2010-11-21
sunh 2010-11-21
bxqybxqy 2010-11-21
阿赞 2010-08-08
douyunqian 2010-07-13
heyeqingquan 2010-03-11