Java tutorial
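This example walks through PreconditionEnforcer, a prefetch Processor from the Heritrix web crawler (shown here under the com.cyberway.issue repackaging). Before a URI is handed to the fetch stages, the processor verifies three preconditions in turn: a sufficiently fresh DNS lookup exists for the host, a valid robots.txt has been fetched and does not exclude the URI, and any login credentials configured for the server have been run. Whenever a precondition is unmet, processing of the URI is deferred and the prerequisite (a dns: lookup, a /robots.txt fetch, or a credential login) is queued first.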
/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * SimplePolitenessEnforcer.java
 * Created on May 22, 2003
 *
 * $Header$
 */
package com.cyberway.issue.crawler.prefetch;

import java.util.Iterator;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.URIException;

import com.cyberway.issue.crawler.datamodel.CoreAttributeConstants;
import com.cyberway.issue.crawler.datamodel.CrawlHost;
import com.cyberway.issue.crawler.datamodel.CrawlServer;
import com.cyberway.issue.crawler.datamodel.CrawlURI;
import com.cyberway.issue.crawler.datamodel.CredentialStore;
import com.cyberway.issue.crawler.datamodel.FetchStatusCodes;
import com.cyberway.issue.crawler.datamodel.credential.Credential;
import com.cyberway.issue.crawler.datamodel.credential.CredentialAvatar;
import com.cyberway.issue.crawler.framework.Processor;
import com.cyberway.issue.crawler.settings.SimpleType;
import com.cyberway.issue.crawler.settings.Type;
import com.cyberway.issue.net.UURI;

/**
 * Ensures the preconditions for a fetch -- such as DNS lookup
 * or acquiring and respecting a robots.txt policy -- are
 * satisfied before a URI is passed to subsequent stages.
 *
 * @author gojomo
 */
public class PreconditionEnforcer
        extends Processor
        implements CoreAttributeConstants, FetchStatusCodes {

    private static final long serialVersionUID = 4636474153589079615L;

    private static final Logger logger =
        Logger.getLogger(PreconditionEnforcer.class.getName());

    private final static Integer DEFAULT_IP_VALIDITY_DURATION =
        new Integer(60 * 60 * 6); // six hours
    private final static Integer DEFAULT_ROBOTS_VALIDITY_DURATION =
        new Integer(60 * 60 * 24); // one day

    /** seconds to keep IP information for */
    public final static String ATTR_IP_VALIDITY_DURATION =
        "ip-validity-duration-seconds";
    /** seconds to cache robots info */
    public final static String ATTR_ROBOTS_VALIDITY_DURATION =
        "robot-validity-duration-seconds";

    /** whether to calculate robots exclusion without applying */
    public final static Boolean DEFAULT_CALCULATE_ROBOTS_ONLY = Boolean.FALSE;
    public final static String ATTR_CALCULATE_ROBOTS_ONLY =
        "calculate-robots-only";

    public PreconditionEnforcer(String name) {
        super(name, "Precondition enforcer");

        Type e;
        e = addElementToDefinition(new SimpleType(ATTR_IP_VALIDITY_DURATION,
            "The minimum interval for which a dns-record will be considered "
            + "valid (in seconds). "
            + "If the record's DNS TTL is larger, that will be used instead.",
            DEFAULT_IP_VALIDITY_DURATION));
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_ROBOTS_VALIDITY_DURATION,
            "The time in seconds that fetched robots.txt information is "
            + "considered to be valid. "
" + "If the value is set to '0', then the robots.txt information" + " will never expire.", DEFAULT_ROBOTS_VALIDITY_DURATION)); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_CALCULATE_ROBOTS_ONLY, "Whether to only calculate the robots status of an URI, " + "without actually applying any exclusions found. If true, " + "exlcuded URIs will only be annotated in the crawl.log, but " + "still fetched. Default is false. ", DEFAULT_CALCULATE_ROBOTS_ONLY)); e.setExpertSetting(true); } protected void innerProcess(CrawlURI curi) { if (considerDnsPreconditions(curi)) { return; } // make sure we only process schemes we understand (i.e. not dns) String scheme = curi.getUURI().getScheme().toLowerCase(); if (!(scheme.equals("http") || scheme.equals("https"))) { logger.fine("PolitenessEnforcer doesn't understand uri's of type " + scheme + " (ignoring)"); return; } if (considerRobotsPreconditions(curi)) { return; } if (!curi.isPrerequisite() && credentialPrecondition(curi)) { return; } // OK, it's allowed // For all curis that will in fact be fetched, set appropriate delays. // TODO: SOMEDAY: allow per-host, per-protocol, etc. factors // curi.setDelayFactor(getDelayFactorFor(curi)); // curi.setMinimumDelay(getMinimumDelayFor(curi)); return; } /** * Consider the robots precondition. * * @param curi CrawlURI we're checking for any required preconditions. * @return True, if this <code>curi</code> has a precondition or processing * should be terminated for some other reason. False if * we can precede to process this url. */ private boolean considerRobotsPreconditions(CrawlURI curi) { //CMS??? // treat /robots.txt fetches specially UURI uuri = curi.getUURI(); try { if (uuri != null && uuri.getPath() != null && curi.getUURI().getPath().equals("/robots.txt")) { // allow processing to continue curi.setPrerequisite(true); return false; } } catch (URIException e) { logger.severe("Failed get of path for " + curi); } // require /robots.txt if not present if (isRobotsExpired(curi)) { // Need to get robots if (logger.isLoggable(Level.FINE)) { logger.fine("No valid robots for " + getController().getServerCache().getServerFor(curi) + "; deferring " + curi); } // Robots expired - should be refetched even though its already // crawled. 
            try {
                String prereq = curi.getUURI().resolve("/robots.txt").toString();
                curi.markPrerequisite(prereq,
                    getController().getPostprocessorChain());
            } catch (URIException e1) {
                logger.severe("Failed resolve using " + curi);
                throw new RuntimeException(e1); // shouldn't ever happen
            }
            return true;
        }
        // Test against robots.txt if available.
        CrawlServer cs = getController().getServerCache().getServerFor(curi);
        if (cs.isValidRobots()) {
            String ua = getController().getOrder().getUserAgent(curi);
            if (cs.getRobots().disallows(curi, ua)) {
                if (((Boolean) getUncheckedAttribute(curi,
                        ATTR_CALCULATE_ROBOTS_ONLY)).booleanValue()) {
                    // Annotate the URI as excluded, but continue to process
                    // it normally.
                    curi.addAnnotation("robotExcluded");
                    return false;
                }
                // Mark as precluded; in FetchHTTP, this will prevent fetching
                // and cause a skip to the end of processing (unless an
                // intervening processor overrules).
                curi.setFetchStatus(S_ROBOTS_PRECLUDED);
                curi.putString("error", "robots.txt exclusion");
                logger.fine("robots.txt precluded " + curi);
                return true;
            }
            return false;
        }
        // No valid robots found => the attempt to get robots.txt failed.
        curi.skipToProcessorChain(getController().getPostprocessorChain());
        curi.setFetchStatus(S_ROBOTS_PREREQUISITE_FAILURE);
        curi.putString("error", "robots.txt prerequisite failed");
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("robots.txt prerequisite failed " + curi);
        }
        return true;
    }

    /**
     * @param curi CrawlURI whose dns prerequisite we're to check.
     * @return true if no further processing in this module should occur
     */
    private boolean considerDnsPreconditions(CrawlURI curi) {
        if (curi.getUURI().getScheme().equals("dns")) {
            // DNS URIs never have a DNS precondition.
            curi.setPrerequisite(true);
            return false;
        }

        CrawlServer cs = getController().getServerCache().getServerFor(curi);
        if (cs == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            curi.skipToProcessorChain(getController().getPostprocessorChain());
            return true;
        }

        // If we've done a dns lookup and it didn't resolve a host, cancel
        // further fetch-processing of this URI: the domain is unresolvable.
        CrawlHost ch = getController().getServerCache().getHostFor(curi);
        if (ch == null || ch.hasBeenLookedUp() && ch.getIP() == null) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("no dns for " + ch
                    + " cancelling processing for CrawlURI " + curi.toString());
            }
            curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
            curi.skipToProcessorChain(getController().getPostprocessorChain());
            return true;
        }

        // If we haven't done a dns lookup and this isn't a dns uri, shoot
        // that off and defer further processing.
        if (isIpExpired(curi) && !curi.getUURI().getScheme().equals("dns")) {
            logger.fine("Deferring processing of CrawlURI " + curi.toString()
                + " for dns lookup.");
            String preq = "dns:" + ch.getHostName();
            try {
                curi.markPrerequisite(preq,
                    getController().getPostprocessorChain());
            } catch (URIException e) {
                throw new RuntimeException(e); // shouldn't ever happen
            }
            return true;
        }

        // DNS preconditions OK.
        return false;
    }

    /**
     * Get the maximum time a dns-record is valid.
     *
     * @param curi the uri this time is valid for.
     * @return the maximum time a dns-record is valid -- in seconds -- or
     *         negative if record's ttl should be used.
     */
    public long getIPValidityDuration(CrawlURI curi) {
        Integer d;
        try {
            d = (Integer) getAttribute(ATTR_IP_VALIDITY_DURATION, curi);
        } catch (AttributeNotFoundException e) {
            d = DEFAULT_IP_VALIDITY_DURATION;
        }
        return d.longValue();
    }

    /** Return true if ip should be looked up.
     *
     * @param curi the URI to check.
     * @return true if ip should be looked up.
     */
    public boolean isIpExpired(CrawlURI curi) {
        CrawlHost host = getController().getServerCache().getHostFor(curi);
        if (!host.hasBeenLookedUp()) {
            // IP has not been looked up yet.
            return true;
        }
        if (host.getIpTTL() == CrawlHost.IP_NEVER_EXPIRES) {
            // IP never expires (numeric IP).
            return false;
        }
        long duration = getIPValidityDuration(curi);
        if (duration == 0) {
            // Never expire the IP if duration is zero (set by the operator
            // or, more likely, set to zero in the case where we tried in
            // FetchDNS but failed).
            return false;
        }
        // Catch old "default" -1 settings that are now problematic;
        // convert to the new minimum.
        if (duration <= 0) {
            duration = DEFAULT_IP_VALIDITY_DURATION.intValue();
        }
        long ttl = host.getIpTTL();
        if (ttl > duration) {
            // Use the larger of the operator-set minimum duration
            // or the DNS record TTL.
            duration = ttl;
        }
        // Duration and ttl are in seconds. Convert to millis.
        if (duration > 0) {
            duration *= 1000;
        }
        return (duration + host.getIpFetched()) < System.currentTimeMillis();
    }

    /** Get the maximum time a robots.txt is valid.
     *
     * @param curi
     * @return the time a robots.txt is valid in milliseconds.
     */
    public long getRobotsValidityDuration(CrawlURI curi) {
        Integer d;
        try {
            d = (Integer) getAttribute(ATTR_ROBOTS_VALIDITY_DURATION, curi);
        } catch (AttributeNotFoundException e) {
            // This should never happen, but if it does, return the default.
            logger.severe(e.getLocalizedMessage());
            d = DEFAULT_ROBOTS_VALIDITY_DURATION;
        }
        // Convert from seconds to milliseconds.
        return d.longValue() * 1000;
    }

    /**
     * Is the robots policy expired?
     *
     * This method will also return true if we haven't tried to get the
     * robots.txt for this server.
     *
     * @param curi
     * @return true if the robots policy is expired.
     */
    public boolean isRobotsExpired(CrawlURI curi) {
        CrawlServer server =
            getController().getServerCache().getServerFor(curi);
        long robotsFetched = server.getRobotsFetchedTime();
        if (robotsFetched == CrawlServer.ROBOTS_NOT_FETCHED) {
            // Have not attempted to fetch robots.
            return true;
        }
        long duration = getRobotsValidityDuration(curi);
        if (duration == 0) {
            // When zero, robots should be valid forever.
            return false;
        }
        if (robotsFetched + duration < System.currentTimeMillis()) {
            // The validity window has elapsed; robots are expired.
            return true;
        }
        return false;
    }

    /**
     * Consider credential preconditions.
     *
     * Looks to see if there are any credential preconditions (e.g. html form
     * login credentials) for this <code>CrawlServer</code>. If there are,
     * have they been run already? If not, make the running of these logins a
     * precondition of accessing any other url on this
     * <code>CrawlServer</code>.
     *
     * <p>
     * One day, do optimization and avoid running the bulk of the code below.
     * The argument for running the code every time is that overrides and
     * refinements may change what comes back from the credential store.
     *
     * @param curi CrawlURI we're checking for any required preconditions.
     * @return True, if this <code>curi</code> has a precondition that needs
     *         to be met before we can proceed. False if we can proceed to
     *         process this url.
     */
    private boolean credentialPrecondition(final CrawlURI curi) {
        boolean result = false;
        CredentialStore cs =
            CredentialStore.getCredentialStore(getSettingsHandler());
        if (cs == null) {
            logger.severe("No credential store for " + curi);
            return result;
        }
        Iterator i = cs.iterator(curi);
        if (i == null) {
            return result;
        }
        while (i.hasNext()) {
            Credential c = (Credential) i.next();
            if (c.isPrerequisite(curi)) {
                // This credential has a prereq. and this curi is it. Let it
                // through.
                // Add its avatar to the curi as a mark. Also, does this curi
                // need to be POSTed? Note, we do the is-it-a-prereq test
                // BEFORE we check that the curi is of the credential domain,
                // because sites such as Yahoo have you go to another domain
                // altogether to log in.
                c.attach(curi);
                curi.setPost(c.isPost(curi));
                break;
            }
            if (!c.rootUriMatch(getController(), curi)) {
                continue;
            }
            if (!c.hasPrerequisite(curi)) {
                continue;
            }
            if (!authenticated(c, curi)) {
                // Hasn't been authenticated. Queue it and move on (the
                // assumption is that we can do one authentication at a time
                // -- usually one html form).
                String prereq = c.getPrerequisite(curi);
                if (prereq == null || prereq.length() <= 0) {
                    CrawlServer server =
                        getController().getServerCache().getServerFor(curi);
                    logger.severe(server.getName() + " has credential(s) of "
                        + "type " + c + " but prereq is null.");
                } else {
                    try {
                        curi.markPrerequisite(prereq,
                            getController().getPostprocessorChain());
                    } catch (URIException e) {
                        logger.severe("unable to set credentials prerequisite "
                            + prereq);
                        getController().logUriError(e, curi.getUURI(), prereq);
                        return false;
                    }
                    result = true;
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Queueing prereq " + prereq + " of type "
                            + c + " for " + curi);
                    }
                    break;
                }
            }
        }
        return result;
    }

    /**
     * Has the passed credential already been authenticated?
     *
     * @param credential Credential to test.
     * @param curi CrawlURI.
     * @return True if already run.
     */
    private boolean authenticated(final Credential credential,
            final CrawlURI curi) {
        boolean result = false;
        CrawlServer server =
            getController().getServerCache().getServerFor(curi);
        if (!server.hasCredentialAvatars()) {
            return result;
        }
        Set avatars = server.getCredentialAvatars();
        for (Iterator i = avatars.iterator(); i.hasNext();) {
            CredentialAvatar ca = (CredentialAvatar) i.next();
            String key = null;
            try {
                key = credential.getKey(curi);
            } catch (AttributeNotFoundException e) {
                logger.severe("Failed getting key for " + credential
                    + " for " + curi);
                continue;
            }
            if (ca.match(credential.getClass(), key)) {
                result = true;
            }
        }
        return result;
    }
}
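The heart of the two expiry checks above (isIpExpired and isRobotsExpired) is plain timestamp arithmetic: a record is stale once its fetch time plus a validity window falls behind the clock, where the window for DNS records is the larger of the operator-set minimum and the record's TTL, and a value of zero means "never expire". The following self-contained sketch restates that logic outside the crawler so it can be compiled and run directly. The names ExpiryDemo, isDnsRecordExpired, and isRobotsStale are hypothetical, chosen for this illustration only; they are not part of the Heritrix API.

// Hypothetical demo class -- not part of Heritrix. It mirrors the expiry
// arithmetic of PreconditionEnforcer.isIpExpired()/isRobotsExpired().
public class ExpiryDemo {

    // Mirrors isIpExpired(): the DNS record stays valid for the larger of
    // the operator-set minimum and the record's TTL; a zero minimum means
    // "never expire".
    static boolean isDnsRecordExpired(long fetchedMillis, long ttlSeconds,
            long minValiditySeconds, long nowMillis) {
        if (minValiditySeconds == 0) {
            return false;
        }
        long durationSeconds = Math.max(minValiditySeconds, ttlSeconds);
        return fetchedMillis + durationSeconds * 1000 < nowMillis;
    }

    // Mirrors isRobotsExpired(): robots.txt goes stale once the configured
    // validity window has elapsed; zero again means "never expire".
    static boolean isRobotsStale(long fetchedMillis, long validitySeconds,
            long nowMillis) {
        if (validitySeconds == 0) {
            return false;
        }
        return fetchedMillis + validitySeconds * 1000 < nowMillis;
    }

    public static void main(String[] args) {
        long now = System.currentTimeMillis();
        long eightHoursAgo = now - 8L * 60 * 60 * 1000;
        // DNS record fetched eight hours ago, one-hour TTL, six-hour
        // operator minimum: the six-hour window has elapsed, so expired.
        System.out.println(
            isDnsRecordExpired(eightHoursAgo, 3600, 6 * 60 * 60, now)); // true
        // robots.txt fetched eight hours ago with the default one-day
        // validity window: still fresh.
        System.out.println(
            isRobotsStale(eightHoursAgo, 24 * 60 * 60, now)); // false
    }
}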