dk.netarkivet.harvester.harvesting.frontier.FullFrontierReport.java Source code

Introduction

Here is the source code for dk.netarkivet.harvester.harvesting.frontier.FullFrontierReport.java

Source

/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.frontier;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.persist.EntityCursor;
import com.sleepycat.persist.EntityStore;
import com.sleepycat.persist.PrimaryIndex;
import com.sleepycat.persist.SecondaryIndex;
import com.sleepycat.persist.StoreConfig;
import com.sleepycat.persist.model.Entity;
import com.sleepycat.persist.model.KeyField;
import com.sleepycat.persist.model.Persistent;
import com.sleepycat.persist.model.PrimaryKey;
import com.sleepycat.persist.model.Relationship;
import com.sleepycat.persist.model.SecondaryKey;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;

/**
 * Wraps a Heritrix full frontier report. As these reports can be very large, this implementation relies on the
 * Berkeley DB Direct Persistence Layer to store the report lines, allowing them to be kept partly in memory and
 * partly on disk.
 */
@SuppressWarnings({ "serial" })
public class FullFrontierReport extends AbstractFrontierReport {

    @Persistent
    static class PersistentLineKey implements Comparable<PersistentLineKey>, FrontierReportLineOrderKey {

        @KeyField(1)
        long totalEnqueues;

        @KeyField(2)
        String domainName;

        // Default empty constructor for BDB.
        PersistentLineKey() {

        }

        public PersistentLineKey(FrontierReportLine l) {
            this.domainName = l.getDomainName();
            this.totalEnqueues = l.getTotalEnqueues();
        }

        public String getQueueId() {
            return domainName;
        }

        public long getQueueSize() {
            return totalEnqueues;
        }

        /**
         * Compares first by decreasing queue size, then by domain name.
         */
        @Override
        public int compareTo(PersistentLineKey k) {
            return FrontierReportLineNaturalOrder.getInstance().compare(this, k);
        }

        @Override
        public String toString() {
            return totalEnqueues + " " + domainName;
        }

    }

    @Entity
    static class PersistentLine extends FrontierReportLine {
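        // Descriptive note: the key fields below mirror values already carried by the
        // FrontierReportLine superclass; keeping dedicated copies on the entity class lets
        // BDB build the primary and secondary indexes declared here.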

        @PrimaryKey
        private PersistentLineKey primaryKey;

        @SecondaryKey(relate = Relationship.ONE_TO_ONE)
        private String domainNameKey;

        @SecondaryKey(relate = Relationship.MANY_TO_ONE)
        private Long totalSpendKey;

        @SecondaryKey(relate = Relationship.MANY_TO_ONE)
        private Long currentSizeKey;

        // Default empty constructor for BDB.
        PersistentLine() {

        }

        PersistentLine(FrontierReportLine reportLine) {
            super(reportLine);
            this.primaryKey = new PersistentLineKey(reportLine);
            this.domainNameKey = reportLine.getDomainName();
            this.currentSizeKey = reportLine.getCurrentSize();
            this.totalSpendKey = reportLine.getTotalSpend();
        }

    }

    public class ReportIterator implements Iterator<FrontierReportLine> {

        private final EntityCursor<PersistentLine> cursor;
        private final Iterator<PersistentLine> iter;

        /**
         * Builds an iterator over the given entity cursor.
         *
         * @param cursor The cursor (sort key) to iterate on.
         */
        ReportIterator(EntityCursor<PersistentLine> cursor) {
            this.cursor = cursor;
            iter = cursor.iterator();
        }

        @Override
        public boolean hasNext() {
            return iter.hasNext();
        }

        @Override
        public FrontierReportLine next() {
            return iter.next();
        }

        @Override
        public void remove() {
            throw new ArgumentNotValid("Remove is not supported!");
        }

        /**
         * This close method must be called explicitly to free the underlying resources!
         */
        public void close() {
            try {
                cursor.close();
            } catch (DatabaseException e) {
                LOG.error("Error closing entity cursor:\n" + e.getLocalizedMessage());
            }
        }

    }
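
    /*
     * Usage sketch: a ReportIterator should always be closed in a finally block so that
     * the underlying BDB entity cursor is released, e.g.:
     *
     *   ReportIterator it = report.iterateOnDomainName();
     *   try {
     *       while (it.hasNext()) {
     *           FrontierReportLine line = it.next();
     *           // process the line ...
     *       }
     *   } finally {
     *       it.close();
     *   }
     */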

    private static final String WORKING_DIR = FullFrontierReport.class.getSimpleName();

    /** The logger for this class. */
    private static final Log LOG = LogFactory.getLog(FullFrontierReport.class);

    /**
     * The Berkeley DB JE environment.
     */
    private final Environment dbEnvironment;

    /**
     * The BDB entity store.
     */
    private final EntityStore store;

    /**
     * Primary index.
     */
    private final PrimaryIndex<PersistentLineKey, PersistentLine> linesIndex;

    /**
     * Secondary index, per domain name.
     */
    private final SecondaryIndex<String, PersistentLineKey, PersistentLine> linesByDomain;

    /**
     * Secondary index, per current size.
     */
    private final SecondaryIndex<Long, PersistentLineKey, PersistentLine> linesByCurrentSize;

    /**
     * Secondary index, per spent budget.
     */
    private final SecondaryIndex<Long, PersistentLineKey, PersistentLine> linesBySpentBudget;

    /**
     * The directory where the BDB is stored.
     */
    private final File storageDir;

    /**
     * Builds an empty frontier report wrapper.
     *
     * @param jobName the Heritrix job name
     */
    private FullFrontierReport(String jobName) {
        super(jobName);

        File workingDir = new File(Settings.getFile(CommonSettings.CACHE_DIR), WORKING_DIR);

        this.storageDir = new File(workingDir, jobName);
        if (!storageDir.mkdirs()) {
            throw new IOFailure("Failed to create directory " + storageDir.getAbsolutePath());
        }

        try {
            EnvironmentConfig envConfig = new EnvironmentConfig();
            envConfig.setAllowCreate(true);
            dbEnvironment = new Environment(storageDir, envConfig);

            StoreConfig storeConfig = new StoreConfig();
            storeConfig.setAllowCreate(true);

            store = new EntityStore(dbEnvironment, FrontierReportLine.class.getSimpleName() + "-" + jobName,
                    storeConfig);
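
            // The primary index keys lines by (totalEnqueues, domainName); the three secondary
            // indexes below are built on the corresponding PersistentLine key fields and allow
            // lookup and ordered iteration by domain name, current queue size and spent budget.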

            linesIndex = store.getPrimaryIndex(PersistentLineKey.class, PersistentLine.class);

            linesByDomain = store.getSecondaryIndex(linesIndex, String.class, "domainNameKey");

            linesByCurrentSize = store.getSecondaryIndex(linesIndex, Long.class, "currentSizeKey");

            linesBySpentBudget = store.getSecondaryIndex(linesIndex, Long.class, "totalSpendKey");

        } catch (DatabaseException e) {
            throw new IOFailure("Failed to init frontier BDB for job " + jobName, e);
        }

    }

    /**
     * Releases all resources once this report is to be discarded. NB this method MUST be explicitly called!
     */
    public void dispose() {

        try {
            store.close();
            dbEnvironment.cleanLog();
            dbEnvironment.close();
        } catch (DatabaseException e) {
            throw new IOFailure("Failed to close frontier BDB for job " + getJobName(), e);
        }

        FileUtils.removeRecursively(storageDir);
    }

    @Override
    public void addLine(FrontierReportLine line) {
        try {
            linesIndex.put(new PersistentLine(line));
        } catch (DatabaseException e) {
            throw new IOFailure("Failed to store frontier report line for job " + getJobName(), e);
        }
    }

    @Override
    public FrontierReportLine getLineForDomain(String domainName) {
        try {
            return linesByDomain.get(domainName);
        } catch (DatabaseException e) {
            LOG.warn("Failed to get queue for domain " + domainName, e);
            return null;
        }
    }

    /**
     * Returns an iterator where lines are ordered by the primary key: first by decreasing totalEnqueues, then by
     * domain name in natural order.
     *
     * @return an iterator on the report lines.
     */
    public ReportIterator iterateOnTotalEnqueues() {
        try {
            return new ReportIterator(linesIndex.entities());
        } catch (DatabaseException e) {
            throw new IOFailure("Failed to read frontier BDB for job " + getJobName(), e);
        }
    }

    /**
     * Returns an iterator where lines are ordered by domain name natural order.
     *
     * @return an iterator on the report lines.
     */
    public ReportIterator iterateOnDomainName() {
        try {
            return new ReportIterator(linesByDomain.entities());
        } catch (DatabaseException e) {
            throw new IOFailure("Failed to read frontier BDB for job " + getJobName(), e);
        }
    }

    /**
     * Returns an iterator where lines are ordered by increasing currentSize.
     *
     * @return an iterator on the report lines.
     */
    public ReportIterator iterateOnCurrentSize() {
        try {
            return new ReportIterator(linesByCurrentSize.entities());
        } catch (DatabaseException e) {
            throw new IOFailure("Failed to read frontier BDB for job " + getJobName(), e);
        }
    }

    /**
     * Returns an iterator on lines having a given currentSize.
     *
     * @param dupValue the currentSize value shared by the returned lines.
     * @return an iterator on the report lines.
     */
    public ReportIterator iterateOnDuplicateCurrentSize(long dupValue) {
        try {
            return new ReportIterator(linesByCurrentSize.subIndex(dupValue).entities());
        } catch (DatabaseException e) {
            throw new IOFailure("Failed to read frontier BDB for job " + getJobName(), e);
        }
    }

    /**
     * Returns an iterator where lines are ordered by increasing totalSpend.
     *
     * @return an iterator on the report lines.
     */
    public ReportIterator iterateOnSpentBudget() {
        try {
            return new ReportIterator(linesBySpentBudget.entities());
        } catch (DatabaseException e) {
            throw new IOFailure("Failed to read frontier BDB for job " + getJobName(), e);
        }
    }

    /**
     * Returns an iterator on lines having a given totalSpend.
     *
     * @param dupValue the totalSpend value shared by the returned lines.
     * @return an iterator on the report lines.
     */
    public ReportIterator iterateOnDuplicateSpentBudget(long dupValue) {
        try {
            return new ReportIterator(linesBySpentBudget.subIndex(dupValue).entities());
        } catch (DatabaseException e) {
            throw new IOFailure("Failed to read frontier BDB for job " + getJobName(), e);
        }
    }

    /**
     * Generates a Heritrix frontier report wrapper object by parsing the frontier report returned by the JMX
     * controller as a string.
     *
     * @param jobName the Heritrix job name
     * @param contentsAsString the text returned by the JMX call
     * @return the report wrapper object
     */
    public static FullFrontierReport parseContentsAsString(String jobName, String contentsAsString) {

        FullFrontierReport report = new FullFrontierReport(jobName);

        // First dump this possibly huge string to a file
        File tmpDir = Settings.getFile(CommonSettings.CACHE_DIR);
        File tmpFile = new File(tmpDir, jobName + "-" + System.currentTimeMillis() + ".txt");
        try {
            tmpFile.createNewFile();
            BufferedWriter out = new BufferedWriter(new FileWriter(tmpFile));
            out.write(contentsAsString);
            out.close();
        } catch (IOException e) {
            LOG.error("Failed to create temporary file", e);
            return report;
        }

        BufferedReader br;
        try {
            br = new BufferedReader(new FileReader(tmpFile));
        } catch (FileNotFoundException e) {
            LOG.error("Failed to read temporary file", e);
            return report;
        }

        try {
            String lineToken = br.readLine(); // Discard header line
            while ((lineToken = br.readLine()) != null) {
                report.addLine(new FrontierReportLine(lineToken));
            }

            br.close();
        } catch (IOException e) {
            LOG.warn("Failed to close reader", e);
        } catch (Throwable t) {
            LOG.error(t);
            t.printStackTrace(System.err);
        } finally {
            FileUtils.remove(tmpFile);
        }

        return report;
    }

    /**
     * Return the directory where the BDB is stored.
     *
     * @return the storage directory.
     */
    File getStorageDir() {
        return storageDir;
    }

}
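
Below is a minimal usage sketch, assuming the frontier report text has already been fetched from the Heritrix JMX controller; the job name "job-1234", the domain "example.org" and the method name printFrontierSummary are illustrative placeholders. It parses the report, looks up one domain via the secondary index, iterates the queues in primary key order (largest first), and finally disposes of the Berkeley DB resources:

// A minimal sketch, assuming "frontierReportText" holds the frontier report string
// returned by the Heritrix JMX controller; the job name below is hypothetical.
public static void printFrontierSummary(String frontierReportText) {
    FullFrontierReport report = FullFrontierReport.parseContentsAsString("job-1234", frontierReportText);
    try {
        // Direct lookup of a single domain queue via the domain-name secondary index.
        FrontierReportLine forDomain = report.getLineForDomain("example.org");
        if (forDomain != null) {
            System.out.println("example.org current size: " + forDomain.getCurrentSize());
        }

        // Iterate queues by decreasing totalEnqueues (primary key order); always close the iterator.
        FullFrontierReport.ReportIterator it = report.iterateOnTotalEnqueues();
        try {
            while (it.hasNext()) {
                FrontierReportLine line = it.next();
                System.out.println(line.getDomainName() + ": " + line.getTotalEnqueues());
            }
        } finally {
            it.close();
        }
    } finally {
        // Releases the BDB environment and removes the on-disk storage directory.
        report.dispose();
    }
}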