org.archive.modules.CrawlMetadata.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.modules.CrawlMetadata.java

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;

import org.archive.modules.fetcher.UserAgentProvider;
import org.archive.modules.net.RobotsPolicy;
import org.archive.spring.BeanFieldsPatternValidator;
import org.archive.spring.HasKeyedProperties;
import org.archive.spring.HasValidator;
import org.archive.spring.KeyedProperties;
import org.archive.util.ArchiveUtils;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.validation.Validator;

/**
 * Basic crawl metadata, as consulted by functional modules and
 * recorded in ARCs/WARCs.
 * 
 * @contributor pjack
 */
public class CrawlMetadata
        implements UserAgentProvider, Serializable, HasKeyedProperties, HasValidator, InitializingBean {
    private static final long serialVersionUID = 1L;

    /**
     * Backing store for the properties that are read and written through
     * string keys ({@code robotsPolicyName}, {@code userAgentTemplate},
     * {@code operatorFrom}, {@code operatorContactUrl}).
     *
     * Must stay the first field: the instance-initializer blocks further
     * down write their defaults into it, and initializers run in textual
     * order.
     */
    protected KeyedProperties kp = new KeyedProperties();

    /** @return the keyed-property store backing this bean */
    public KeyedProperties getKeyedProperties() {
        return kp;
    }

    // Default robots policy name.
    {
        setRobotsPolicyName("obey");
    }

    /** @return the name of the robots policy currently configured */
    public String getRobotsPolicyName() {
        return (String) kp.get("robotsPolicyName");
    }

    /**
     * Sets the name of the robots policy to use; the name is resolved
     * against {@link #getAvailableRobotsPolicies()} by
     * {@link #getRobotsPolicy()}.
     *
     * NOTE(review): with {@code required = false} Spring is expected to
     * skip this injection point when no matching bean exists, instead of
     * failing bean creation -- confirm against the Spring documentation.
     *
     * @param policy policy name, e.g. "obey"
     */
    @Autowired(required = false)
    public void setRobotsPolicyName(String policy) {
        kp.put("robotsPolicyName", policy);
    }

    /** Map of all available RobotsPolicies, by name, to choose from. 
     * assembled from declared instances in configuration plus the standard
     * 'obey' (aka 'classic') and 'ignore' policies. */
    protected Map<String, RobotsPolicy> availableRobotsPolicies = new HashMap<String, RobotsPolicy>();

    /** @return the live (not copied) map of available policies, keyed by name */
    public Map<String, RobotsPolicy> getAvailableRobotsPolicies() {
        return availableRobotsPolicies;
    }

    /**
     * Replaces the map of available policies and then adds the standard
     * policies to it.
     *
     * The supplied map is stored as-is (not copied) and is modified by
     * the addition of the standard policies, so it must be mutable.
     *
     * @param policies policies keyed by name; {@code null} is treated as
     *                 "no custom policies" and yields a map holding only
     *                 the standard ones
     */
    @Autowired(required = false)
    public void setAvailableRobotsPolicies(Map<String, RobotsPolicy> policies) {
        // Guard against null so neither the putAll below nor a later
        // getRobotsPolicy() dereferences a null map.
        availableRobotsPolicies = (policies != null)
                ? policies
                : new HashMap<String, RobotsPolicy>();
        ensureStandardPoliciesAvailable();
    }

    /**
     * Adds every entry of {@code RobotsPolicy.STANDARD_POLICIES} to the
     * available-policy map. Because this uses {@code putAll}, a standard
     * entry replaces any existing entry registered under the same name.
     */
    protected void ensureStandardPoliciesAvailable() {
        availableRobotsPolicies.putAll(RobotsPolicy.STANDARD_POLICIES);
    }

    /**
     * Get the currently-effective RobotsPolicy, as specified by the
     * string name and chosen from the full available map. (Setting 
     * a different policy for some sites/URL patterns is best achieved
     * by establishing a setting overlay for the robotsPolicyName 
     * property.)
     *
     * @return the policy registered under {@link #getRobotsPolicyName()},
     *         or {@code null} if no policy is registered under that name
     */
    public RobotsPolicy getRobotsPolicy() {
        return availableRobotsPolicies.get(getRobotsPolicyName());
    }

    /** Name of the crawl operator; empty by default. */
    protected String operator = "";

    public String getOperator() {
        return operator;
    }

    public void setOperator(String operatorName) {
        this.operator = operatorName;
    }

    /** Free-text description of the crawl; empty by default. */
    protected String description = "";

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    // Default User-Agent template; the @...@ placeholders are filled in
    // by getUserAgent().
    {
        setUserAgentTemplate("Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)");
    }

    /** @return the raw User-Agent template, placeholders not yet substituted */
    public String getUserAgentTemplate() {
        return (String) kp.get("userAgentTemplate");
    }

    /**
     * Stores the User-Agent template. No format check is performed here;
     * the expected form (it must contain "+@OPERATOR_CONTACT_URL@") is
     * declared in {@link #VALIDATOR}.
     *
     * @param template template text, may contain the placeholders
     *                 {@code @VERSION@} and {@code @OPERATOR_CONTACT_URL@}
     */
    public void setUserAgentTemplate(String template) {
        kp.put("userAgentTemplate", template);
    }

    // Default: no operator "From" address.
    {
        setOperatorFrom("");
    }

    /** @return the operator's "From" value, possibly blank */
    public String getOperatorFrom() {
        return (String) kp.get("operatorFrom");
    }

    /**
     * Stores the operator's "From" value. No format check is performed
     * here; the expected form (blank or an email address) is declared in
     * {@link #VALIDATOR}.
     *
     * @param operatorFrom email address of the operator, or blank
     */
    public void setOperatorFrom(String operatorFrom) {
        kp.put("operatorFrom", operatorFrom);
    }

    {
        // set default to illegal value (written straight into kp rather
        // than through the setter) so that an unconfigured contact URL
        // does not satisfy the "^https?://" pattern in VALIDATOR
        kp.put("operatorContactUrl", "ENTER-A-CONTACT-HTTP-URL-FOR-CRAWL-OPERATOR");
    }

    /** @return the operator contact URL that is inserted into the User-Agent */
    public String getOperatorContactUrl() {
        return (String) kp.get("operatorContactUrl");
    }

    /**
     * Stores the operator contact URL. No format check is performed
     * here; the expected form (an http/https URL) is declared in
     * {@link #VALIDATOR}.
     *
     * @param operatorContactUrl URL describing the crawl and how to reach
     *                           its operator
     */
    public void setOperatorContactUrl(String operatorContactUrl) {
        kp.put("operatorContactUrl", operatorContactUrl);
    }

    /** Intended audience of the crawl; empty by default. */
    protected String audience = "";

    public String getAudience() {
        return audience;
    }

    public void setAudience(String audience) {
        this.audience = audience;
    }

    /** Organization running the crawl; empty by default. */
    protected String organization = "";

    public String getOrganization() {
        return organization;
    }

    public void setOrganization(String organization) {
        this.organization = organization;
    }

    /**
     * Builds the effective User-Agent string by substituting, in the
     * template, the first occurrence of {@code @OPERATOR_CONTACT_URL@}
     * with the operator contact URL and the first occurrence of
     * {@code @VERSION@} with {@code ArchiveUtils.VERSION}.
     *
     * Both replacement values are passed through
     * {@link Matcher#quoteReplacement(String)} so that any '$' or '\'
     * they contain is inserted literally instead of being interpreted as
     * a group reference or escape by {@code replaceFirst}.
     *
     * @return the User-Agent string with placeholders filled in
     */
    public String getUserAgent() {
        String userAgent = getUserAgentTemplate();
        String contactURL = getOperatorContactUrl();
        userAgent = userAgent.replaceFirst("@OPERATOR_CONTACT_URL@", Matcher.quoteReplacement(contactURL));
        userAgent = userAgent.replaceFirst("@VERSION@", Matcher.quoteReplacement(ArchiveUtils.VERSION));
        return userAgent;
    }

    /** Name of the crawl job; {@code null} until set. */
    protected String jobName;

    public String getJobName() {
        return jobName;
    }

    public void setJobName(String jobName) {
        this.jobName = jobName;
    }

    /** @return the same value as {@link #getOperatorFrom()} */
    public String getFrom() {
        return getOperatorFrom();
    }

    /**
     * Post-configuration hook: re-applies the contact URL through its
     * setter and makes sure the standard robots policies are present.
     */
    public void afterPropertiesSet() {
        // Pass the current value back through the setter so that any
        // validation the setter performs is applied. NOTE(review): the
        // setter in this class performs no check, so this only has an
        // effect where an overriding setter validates.
        setOperatorContactUrl(getOperatorContactUrl());
        ensureStandardPoliciesAvailable();
    }

    /**
     * Field-pattern validator for this bean. The arguments after the
     * class are (field name, regular expression, error message) triples.
     */
    protected static Validator VALIDATOR = new BeanFieldsPatternValidator(CrawlMetadata.class,
            "userAgentTemplate", "^.*\\+@OPERATOR_CONTACT_URL@.*$",
            "You must supply a userAgentTemplate value that includes "
                    + "the string \"@OPERATOR_CONTACT_URL@\" where your crawl "
                    + "contact URL will appear.",

            "operatorContactUrl", "^https?://.*$",
            "You must supply an HTTP(S) URL which will be included "
                    + "in your user-agent and should explain the purpose of your "
                    + "crawl and how to contact the crawl operator in the event "
                    + "of webmaster issues.",

            "operatorFrom", "^(\\s*|\\S+@[-\\w]+\\.[-\\w\\.]+)|()$",
            "If not blank, operatorFrom must be an email address.");

    /** @return the shared validator for {@code CrawlMetadata} beans */
    public Validator getValidator() {
        return VALIDATOR;
    }
}