dk.netarkivet.harvester.harvesting.frontier.FrontierReportLine.java Source code

Java tutorial

Introduction

Here is the source code for dk.netarkivet.harvester.harvesting.frontier.FrontierReportLine.java

Source

/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.frontier;

import java.io.Serializable;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.sleepycat.persist.model.Persistent;

import dk.netarkivet.common.exceptions.ArgumentNotValid;

/**
 * Wraps a line of the frontier report. As of Heritrix 1.14.4, the format of a frontier report line sequentially lists
 * the following tokens, separated by whitespace:
 * <p>
 * <ol>
 * <li>queue</li>
 * <li>currentSize</li>
 * <li>totalEnqueues</li>
 * <li>sessionBalance</li>
 * <li>lastCost(averageCost)</li>
 * <li>lastDequeueTime</li>
 * <li>wakeTime</li>
 * <li>totalSpend/totalBudget</li>
 * <li>errorCount</li>
 * <li>lastPeekUri</li>
 * <li>lastQueuedUri</li>
 * </ol>
 * <p>
 * This class implements a natural order; comparisons are made:
 * <ul>
 * <li>first by decreasing values of totalEnqueues,</li>
 * <li>secondly by domain name (string natural order).</li>
 * </ul>
 * The ordering itself is delegated to {@link FrontierReportLineNaturalOrder}.
 * <p>
 * Equality and hash code are based solely on the domain name, as there is one queue per domain.
 * <p>
 * Thanks to Gordon Mohr at Internet Archive for explaining the exact semantics of the frontier report fields.
 */
@Persistent
@SuppressWarnings({ "serial" })
public class FrontierReportLine
        implements Serializable, Comparable<FrontierReportLine>, FrontierReportLineOrderKey {

    /** The logger for this class. */
    private static final Log LOG = LogFactory.getLog(FrontierReportLine.class);

    /**
     * Expected size of string array when we split the line token across "\\s+".
     */
    private static final int EXPECTED_SPLIT_SEGMENTS = 11;

    /**
     * Token used to signify an empty value.
     */
    static final String EMPTY_VALUE_TOKEN = "-";

    /**
     * The queue name, in our case the domain, as we use per domain queues.
     */
    private String domainName;

    /** Number of URIs currently in the queue. */
    private long currentSize;

    /**
     * Count of total times a URI has been enqueued to this queue; a measure of the total number of URI instances ever
     * put on this queue. This can be a larger number than the unique URIs, as some URIs (most notably DNS/robots when
     * refetched, but possibly other things force-requeued under advanced usage) may be enqueued more than once.
     */
    private long totalEnqueues;

    /**
     * When using the 'budget/rotation' functionality (a non-zero URI cost policy), this is the running 'balance' of a
     * queue during its current 'active' session. This balance declines; when it hits zero, another queue (if any are
     * waiting 'inactive') gets a chance to enter active crawling (as fast as politeness allows).
     */
    private long sessionBalance;

    /**
     * The 'cost' of the last URI charged against the queue's budgets. If using a cost policy that makes some URIs more
     * costly than others, this may indicate the queue has reached more-costly URIs. (Such larger-cost URIs will be
     * inserted later in the queue, accelerate the depletion of the session balance, and accelerate progress towards the
     * total queue budget, which could send the queue into 'retirement'. Thus higher-cost URIs mean a queue over time
     * gets less of the crawler's cycles.)
     */
    private double lastCost;

    /** Average cost of a processed URI. */
    private double averageCost;

    /**
     * Timestamp of when the last URI came off this queue for processing. May give an indication of how long a queue has
     * been empty/inactive.
     */
    private String lastDequeueTime;

    /**
     * If the queue is in any sort of politeness- or connect-problem-'snooze' delay, this indicates when it will again
     * be eligible to offer URIs to waiting threads. (When it wakes, it gets in line -- so actual wait before next URI
     * is tried may be longer depending on the balance of threads and other active queues.)
     */
    private String wakeTime;

    /**
     * The total of all URI costs charged against this queue.
     */
    private long totalSpend;

    /**
     * The totalBudget above which the queue will be retired (made permanently inactive unless its totalBudget is
     * raised).
     */
    private long totalBudget;

    /**
     * The number of URIs from this queue that reached 'finished' status with an error code (non-retryable errors, or
     * exhausted retries, or other errors). When nonzero and rising there may be special problems with the site(s)
     * related to this queue.
     */
    private long errorCount;

    /**
     * The last URI peeked/dequeued from the head of this queue.
     */
    private String lastPeekUri;

    /**
     * The last URI enqueued to anywhere in this queue.
     */
    private String lastQueuedUri;

    /**
     * Default empty constructor. All numeric fields are left at zero and all string fields (including the domain
     * name) are left {@code null} until set through the setters.
     */
    public FrontierReportLine() {

    }

    /**
     * Builds a cloned line by copying every field of the given line.
     *
     * @param original the line to clone
     */
    protected FrontierReportLine(FrontierReportLine original) {
        this.averageCost = original.averageCost;
        this.currentSize = original.currentSize;
        this.domainName = original.domainName;
        this.errorCount = original.errorCount;
        this.lastCost = original.lastCost;
        this.lastDequeueTime = original.lastDequeueTime;
        this.lastPeekUri = original.lastPeekUri;
        this.lastQueuedUri = original.lastQueuedUri;
        this.sessionBalance = original.sessionBalance;
        this.totalBudget = original.totalBudget;
        this.totalEnqueues = original.totalEnqueues;
        this.totalSpend = original.totalSpend;
        this.wakeTime = original.wakeTime;
    }

    /**
     * Parses the given frontier report line.
     * <p>
     * The line is trimmed and split on whitespace; it must yield exactly {@value #EXPECTED_SPLIT_SEGMENTS} tokens.
     * Individual numeric tokens that are empty ({@value #EMPTY_VALUE_TOKEN}) or unparsable do not fail the parsing:
     * they are replaced by the default values documented on {@link #parseLong(String)} and
     * {@link #parseDouble(String)}. A malformed cost or budget token is logged as a warning and the corresponding
     * fields keep their initial value (zero).
     *
     * @param lineToken the string to parse.
     * @throws ArgumentNotValid if the line is null or does not contain the expected number of tokens.
     */
    FrontierReportLine(String lineToken) {

        if (lineToken == null) {
            throw new ArgumentNotValid("Frontier report line token must not be null!");
        }

        // Trim before splitting: String.split keeps a leading empty segment when the input starts with
        // whitespace, which would either reject a valid line or shift every field by one position.
        String[] split = lineToken.trim().split("\\s+");

        if (split.length != EXPECTED_SPLIT_SEGMENTS) {
            throw new ArgumentNotValid(
                    "Format of line token '" + lineToken + "' is not a valid frontier report line!");
        }

        this.domainName = split[0];
        // parseLong never throws: unparsable values are mapped to its default value.
        this.currentSize = parseLong(split[1]);
        this.totalEnqueues = parseLong(split[2]);
        this.sessionBalance = parseLong(split[3]);

        // Cost token is lastCost(averageCost)
        String costToken = split[4];
        int leftParenIdx = costToken.indexOf('(');
        int rightParenIdx = leftParenIdx < 0 ? -1 : costToken.indexOf(')', leftParenIdx + 1);
        if (leftParenIdx < 0 || rightParenIdx < 0) {
            // Same tolerance as for the budget token: warn and carry on instead of failing
            // with a StringIndexOutOfBoundsException on the substring calls.
            LOG.warn("Found incorrect cost token '" + costToken + "'");
        } else {
            this.lastCost = parseDouble(costToken.substring(0, leftParenIdx));
            this.averageCost = parseDouble(costToken.substring(leftParenIdx + 1, rightParenIdx));
        }
        this.lastDequeueTime = split[5];
        this.wakeTime = split[6];

        // Budget token is totalSpend/totalBudget
        String[] budgetTokens = split[7].split("/");
        if (budgetTokens.length != 2) {
            LOG.warn("Found incorrect budget token '" + split[7] + "'");
        } else {
            this.totalSpend = parseLong(budgetTokens[0]);
            this.totalBudget = parseLong(budgetTokens[1]);
        }

        this.errorCount = parseLong(split[8]);

        this.lastPeekUri = split[9];
        this.lastQueuedUri = split[10];

    }

    /**
     * @return the domainName (the queue name), possibly null if it was never set
     */
    public String getDomainName() {
        return domainName;
    }

    /**
     * @param domainName the domainName to set
     */
    public void setDomainName(String domainName) {
        this.domainName = domainName;
    }

    /**
     * @return the currentSize (number of URIs currently in the queue)
     */
    public long getCurrentSize() {
        return currentSize;
    }

    /**
     * @param currentSize the currentSize to set
     */
    public void setCurrentSize(long currentSize) {
        this.currentSize = currentSize;
    }

    /**
     * @return the totalEnqueues (total number of times a URI has been enqueued to this queue)
     */
    public long getTotalEnqueues() {
        return totalEnqueues;
    }

    /**
     * @param totalEnqueues the totalEnqueues to set
     */
    public void setTotalEnqueues(long totalEnqueues) {
        this.totalEnqueues = totalEnqueues;
    }

    /**
     * @return the sessionBalance
     */
    public long getSessionBalance() {
        return sessionBalance;
    }

    /**
     * @param sessionBalance the sessionBalance to set
     */
    public void setSessionBalance(long sessionBalance) {
        this.sessionBalance = sessionBalance;
    }

    /**
     * @return the lastCost (cost of the last URI charged against the queue's budgets)
     */
    public double getLastCost() {
        return lastCost;
    }

    /**
     * @param lastCost the lastCost to set
     */
    public void setLastCost(double lastCost) {
        this.lastCost = lastCost;
    }

    /**
     * @return the averageCost (average cost of a processed URI)
     */
    public double getAverageCost() {
        return averageCost;
    }

    /**
     * @param averageCost the averageCost to set
     */
    public void setAverageCost(double averageCost) {
        this.averageCost = averageCost;
    }

    /**
     * @return the lastDequeueTime, as the raw token read from the report
     */
    public String getLastDequeueTime() {
        return lastDequeueTime;
    }

    /**
     * @param lastDequeueTime the lastDequeueTime to set
     */
    public void setLastDequeueTime(String lastDequeueTime) {
        this.lastDequeueTime = lastDequeueTime;
    }

    /**
     * @return the wakeTime, as the raw token read from the report
     */
    public String getWakeTime() {
        return wakeTime;
    }

    /**
     * @param wakeTime the wakeTime to set
     */
    public void setWakeTime(String wakeTime) {
        this.wakeTime = wakeTime;
    }

    /**
     * @return the totalSpend (total of all URI costs charged against this queue)
     */
    public long getTotalSpend() {
        return totalSpend;
    }

    /**
     * @param totalSpend the totalSpend to set
     */
    public void setTotalSpend(long totalSpend) {
        this.totalSpend = totalSpend;
    }

    /**
     * @return the totalBudget (budget above which the queue will be retired)
     */
    public long getTotalBudget() {
        return totalBudget;
    }

    /**
     * @param totalBudget the totalBudget to set
     */
    public void setTotalBudget(long totalBudget) {
        this.totalBudget = totalBudget;
    }

    /**
     * @return the errorCount
     */
    public long getErrorCount() {
        return errorCount;
    }

    /**
     * @param errorCount the errorCount to set
     */
    public void setErrorCount(long errorCount) {
        this.errorCount = errorCount;
    }

    /**
     * @return the lastPeekUri
     */
    public String getLastPeekUri() {
        return lastPeekUri;
    }

    /**
     * @param lastPeekUri the lastPeekUri to set
     */
    public void setLastPeekUri(String lastPeekUri) {
        this.lastPeekUri = lastPeekUri;
    }

    /**
     * @return the lastQueuedUri
     */
    public String getLastQueuedUri() {
        return lastQueuedUri;
    }

    /**
     * @param lastQueuedUri the lastQueuedUri to set
     */
    public void setLastQueuedUri(String lastQueuedUri) {
        this.lastQueuedUri = lastQueuedUri;
    }

    /**
     * Default order relation is descending size of the queue (totalEnqueues). The comparison is delegated to the
     * {@link FrontierReportLineNaturalOrder} singleton.
     *
     * @param l the line to compare this line to.
     * @return the result of the natural order comparator for (this, l).
     */
    @Override
    public int compareTo(FrontierReportLine l) {
        return FrontierReportLineNaturalOrder.getInstance().compare(this, l);
    }

    /**
     * There is one queue per domain, so equality is based on the domain name. Two lines without a domain name (null)
     * are considered equal.
     *
     * @param obj the object to compare to.
     * @return true if obj is a FrontierReportLine with the same domain name.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof FrontierReportLine)) {
            return false;
        }
        String otherDomainName = ((FrontierReportLine) obj).getDomainName();
        // domainName is null on instances built with the default constructor, so guard the dereference.
        if (domainName == null) {
            return otherDomainName == null;
        }
        return domainName.equals(otherDomainName);
    }

    /**
     * There is one queue per domain, so hashcode is based on the domain name. A line without a domain name (null)
     * hashes to zero.
     *
     * @return the hash code of the domain name, or 0 if it is null.
     */
    @Override
    public int hashCode() {
        return domainName == null ? 0 : domainName.hashCode();
    }

    /**
     * @return the queue identifier, which is the domain name.
     */
    public String getQueueId() {
        return domainName;
    }

    /**
     * @return the queue size used for ordering, which is the totalEnqueues value.
     */
    public long getQueueSize() {
        return totalEnqueues;
    }

    /**
     * Parses the token.
     *
     * @param longToken token to parse.
     * @return parsed value, or {@link Long#MIN_VALUE} if the value is empty ({@value #EMPTY_VALUE_TOKEN}) or
     *         unparsable.
     */
    private static long parseLong(String longToken) {
        if (EMPTY_VALUE_TOKEN.equals(longToken)) {
            return Long.MIN_VALUE;
        }
        try {
            return Long.parseLong(longToken);
        } catch (NumberFormatException e) {
            // Strange data may occur here, but it's harmless
            return Long.MIN_VALUE;
        }
    }

    /**
     * Parses the token.
     * <p>
     * Note that the default value {@link Double#MIN_VALUE} is the smallest positive double, not a negative value; it
     * is kept as is because callers may compare against it.
     *
     * @param dblToken token to parse.
     * @return parsed value, or {@link Double#MIN_VALUE} if the value is empty ({@value #EMPTY_VALUE_TOKEN}) or
     *         unparsable.
     */
    private static double parseDouble(String dblToken) {
        if (EMPTY_VALUE_TOKEN.equals(dblToken)) {
            return Double.MIN_VALUE;
        }
        try {
            return Double.parseDouble(dblToken);
        } catch (NumberFormatException e) {
            // Strange data may occur here, but it's harmless
            return Double.MIN_VALUE;
        }
    }

}