// NOTE(review): stray non-Java text ("Java tutorial") commented out — it broke compilation.
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.crawler.processor;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Iterator;
import java.util.SortedMap;
import java.util.TreeMap;

import org.archive.crawler.framework.Frontier;
import org.archive.modules.CrawlURI;
import org.archive.spring.ConfigPath;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexLineIterator;
import org.springframework.beans.factory.annotation.Autowired;

/**
 * A simple crawl splitter/mapper, dividing up CrawlURIs/CrawlURIs
 * between crawlers by diverting some range of URIs to local log files
 * (which can then be imported to other crawlers).
 *
 * May operate on a CrawlURI (typically early in the processing chain) or
 * its CrawlURI outlinks (late in the processing chain, after
 * LinksScoper), or both (if inserted and configured in both places).
 *
 * <p>Uses lexical comparisons of classKeys to map URIs to crawlers. The
 * 'map' is specified via either a local or HTTP-fetchable file. Each
 * line of this file should contain two space-separated tokens, the
 * first a key and the second a crawler node name (which should be
 * legal as part of a filename). All URIs will be mapped to the crawler
 * node name associated with the nearest mapping key equal or subsequent
 * to the URI's own classKey. If there are no mapping keys equal or
 * after the classKey, the mapping 'wraps around' to the first mapping key.
 *
 * <p>One crawler name is distinguished as the 'local name'; URIs mapped to
 * this name are not diverted, but continue to be processed normally.
 *
 * <p>For example, assume a SurtAuthorityQueueAssignmentPolicy and
 * a simple mapping file:
 *
 * <pre>
 *  d crawlerA
 *  ~ crawlerB
 * </pre>
 * <p>All URIs with "com," classKeys will find the 'd' key as the nearest
 * subsequent mapping key, and thus be mapped to 'crawlerA'. If that's
 * the 'local name', the URIs will be processed normally; otherwise, the
 * URI will be written to a diversion log aimed for 'crawlerA'.
 *
 * <p>If using the JMX importUris operation importing URLs dropped by
 * a {@link LexicalCrawlMapper} instance, use <code>recoveryLog</code> style.
 *
 * @author gojomo
 * @version $Date$, $Revision$
 */
public class LexicalCrawlMapper extends CrawlMapper {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 2L;

    /**
     * Path to map specification file. Each line should include 2
     * whitespace-separated tokens: the first a key indicating the end of a
     * range, the second the crawler node to which URIs in the key range should
     * be mapped. This setting is ignored if mapUri is specified.
     */
    protected ConfigPath mapPath =
        new ConfigPath("map specification file", "lexicalcrawlmapper.config");
    public ConfigPath getMapPath() {
        return this.mapPath;
    }
    public void setMapPath(ConfigPath path) {
        this.mapPath = path;
    }

    /**
     * URI to map specification file. Each line should include 2
     * whitespace-separated tokens: the first a key indicating the end of a
     * range, the second the crawler node to which URIs in the key range should
     * be mapped. This setting takes precedence over mapPath; if both are
     * specified, then mapPath is ignored.
     */
    protected String mapUri = "";
    public String getMapUri() {
        return this.mapUri;
    }
    public void setMapUri(String uri) {
        this.mapUri = uri;
    }

    /** Frontier used to obtain (and, if necessary, generate) classKeys. */
    protected Frontier frontier;
    public Frontier getFrontier() {
        return this.frontier;
    }
    @Autowired
    public void setFrontier(Frontier frontier) {
        this.frontier = frontier;
    }

    /**
     * Mapping of classKey ranges (as represented by their start) to
     * crawlers (by abstract name/filename)
     */
    protected TreeMap<String, String> map = new TreeMap<String, String>();

    /**
     * Constructor.
     */
    public LexicalCrawlMapper() {
        super();
    }

    /**
     * Look up the crawler node name to which the given CrawlURI
     * should be mapped.
     *
     * @param cauri CrawlURI to consider
     * @return String node name which should handle URI
     */
    protected String map(CrawlURI cauri) {
        // get classKey, via frontier to generate if necessary
        String classKey = frontier.getClassKey(cauri);
        SortedMap<String, String> tail = map.tailMap(classKey);
        if (tail.isEmpty()) {
            // wraparound: no key >= classKey, so use the first mapping key
            tail = map;
        }
        // target node is value of nearest subsequent key
        // (throws NoSuchElementException if the map is empty, as before)
        return tail.get(tail.firstKey());
    }

    @Override
    public void start() {
        super.start();
        try {
            loadMap();
        } catch (IOException e) {
            // Fail startup loudly; preserve the cause instead of
            // printStackTrace()+rethrow double-reporting.
            throw new RuntimeException("unable to load crawl map", e);
        }
    }

    /**
     * Retrieve and parse the mapping specification from a local path or
     * HTTP URL.
     *
     * @throws IOException if the source cannot be read or a line is malformed
     */
    protected void loadMap() throws IOException {
        map.clear();
        // try-with-resources so the reader is closed even if parsing throws
        try (BufferedReader reader = new BufferedReader(openMapSource())) {
            Iterator<String> iter = new RegexLineIterator(
                    new LineReadingIterator(reader),
                    RegexLineIterator.COMMENT_LINE,
                    RegexLineIterator.TRIMMED_ENTRY_TRAILING_COMMENT,
                    RegexLineIterator.ENTRY);
            while (iter.hasNext()) {
                String line = iter.next();
                String[] entry = line.split("\\s+");
                if (entry.length < 2) {
                    // previously an opaque ArrayIndexOutOfBoundsException
                    throw new IOException(
                        "malformed map line (expected '<key> <node>'): " + line);
                }
                map.put(entry[0], entry[1]);
            }
        }
    }

    /**
     * Open a Reader on the map source: the configured URI if non-blank,
     * otherwise the local map file.
     *
     * NOTE(review): uses the platform default charset, as the original did —
     * confirm whether map files should be read as UTF-8.
     *
     * @return open Reader on the map specification (caller must close)
     * @throws IOException if the file or URL cannot be opened
     */
    private Reader openMapSource() throws IOException {
        String uri = getMapUri();
        if (uri.trim().length() == 0) {
            File source = getMapPath().getFile();
            return new FileReader(source);
        }
        URLConnection conn = (new URL(uri)).openConnection();
        return new InputStreamReader(conn.getInputStream());
    }
}