dk.netarkivet.harvester.harvesting.ArchiveFilesReportGenerator.java Source code


Introduction

Here is the source code for dk.netarkivet.harvester.harvesting.ArchiveFilesReportGenerator.java, a NetarchiveSuite class that generates a report of the ARC/WARC files produced by a Heritrix crawl, listing each file's opening date, closing date, and size in bytes.

Source

/* File:        $Id$
 * Revision:    $Revision$
 * Author:      $Author$
 * Date:        $Date$
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2012 The Royal Danish Library, the Danish State and
 * University Library, the National Library of France and the Austrian
 * National Library.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
package dk.netarkivet.harvester.harvesting;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.MessageFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;

/**
 * This class generates a report that lists ARC/WARC files (depending on the configured archive
 * format) along with the opening date, closing date (if the file was properly closed),
 * and size in bytes.
 *
 * Here is a sample of such a file:
 *
 * [ARCHIVEFILE] [Opened] [Closed] [Size]
 *  5-1-20100720161253-00000-bnf_test.arc.gz "2010-07-20 16:12:53.698" "2010-07-20 16:14:31.792" 162928
 *
 * The file is named "archivefiles-report.txt" and is generated by parsing the
 * "heritrix.out" file located in the crawl directory. Useful lines match the
 * following examples:
 *
 * 2010-07-20 16:12:53.698 INFO thread-14 org.archive.io.WriterPoolMember.createFile() Opened /somepath/jobs/current/high/5_1279642368951/arcs/5-1-20100720161253-00000.arc.gz.open
 *
 * and
 *
 * 2010-07-20 16:14:31.792 INFO thread-29 org.archive.io.WriterPoolMember.close() Closed /somepath/jobs/current/high/5_1279642368951/arcs/5-1-20100720161253-00000-bnf_test.arc.gz, size 162928
 *
 * In order to have such messages output to heritrix.out,
 * the "heritrix.properties" file must contain the following, uncommented line:
 *
 * org.archive.io.arc.ARCWriter.level = INFO
 *
 * Note that these strings have changed between Heritrix version 1.14.3
 * and 1.14.4, so they might change again in the future.
 *
 */
class ArchiveFilesReportGenerator {

    private static final Log LOG = LogFactory.getLog(ArchiveFilesReportGenerator.class);

    private static final String ARCHIVE_FORMAT = Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT);

    private static final SimpleDateFormat ISO_8601_DATE_FORMAT = new SimpleDateFormat(
            "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");

    private static final SimpleDateFormat SOURCE_DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");

    /**
     * Stores the opening date, closing date and size of an ARC file.
     */
    static class ArchiveFileStatus {

        private static final String NOT_AVAILABLE = "-1";

        String openedDate = NOT_AVAILABLE;
        String closedDate = NOT_AVAILABLE;
        long size = 0;

        protected String getOpenedDate() {
            return openedDate;
        }

        protected void setOpenedDate(String openedDate) {
            this.openedDate = getIsoDateString(openedDate);
        }

        protected String getClosedDate() {
            return closedDate;
        }

        protected void setClosedDate(String closedDate) {
            this.closedDate = getIsoDateString(closedDate);
        }

        protected long getSize() {
            return size;
        }

        protected void setSize(long size) {
            this.size = size;
        }

        @Override
        public String toString() {
            return openedDate + " " + closedDate + " " + Long.toString(size);
        }

        private String getIsoDateString(String dateStr) {
            try {
                return ISO_8601_DATE_FORMAT.format(SOURCE_DATE_FORMAT.parse(dateStr));
            } catch (ParseException e) {
                return NOT_AVAILABLE;
            }
        }

    }

    /**
     * Format used to parse and extract values from lines of heritrix.out
     * pertaining to an ARC/WARC file opening.
     */
    public static final MessageFormat FILE_OPEN_FORMAT = new MessageFormat(
            "{0} INFO thread-{1} " + "org.archive.io.WriterPoolMember.createFile() Opened " + "{2}.open");

    /**
     * Format used to parse and extract values from lines of heritrix.out
     * pertaining to an ARC/WARC file closing.
     */
    public static final MessageFormat FILE_CLOSE_FORMAT = new MessageFormat(
            "{0} INFO thread-{1} " + "org.archive.io.WriterPoolMember.close() Closed {2}" + ", size {3}");

    /**
     * The name of the report file. It will be generated in the crawl directory.
     */
    public static final String REPORT_FILE_NAME = Settings
            .get(HarvesterSettings.METADATA_ARCHIVE_FILES_REPORT_NAME);

    /**
     * The header line of the report file.
     */
    public static final String REPORT_FILE_HEADER = Settings
            .get(HarvesterSettings.METADATA_ARCHIVE_FILES_REPORT_HEADER);

    /**
     * The Heritrix crawl directory.
     */
    private File crawlDir;

    /**
     * Builds an ARC files report generator, given the Heritrix crawl directory.
     * @param crawlDir the Heritrix crawl directory.
     */
    ArchiveFilesReportGenerator(File crawlDir) {
        this.crawlDir = crawlDir;
    }

    /**
     * Parses heritrix.out and generates the ARC/WARC files report.
     * @return the generated report file.
     */
    protected File generateReport() {

        Map<String, ArchiveFileStatus> reportContents = parseHeritrixOut();

        File reportFile = new File(crawlDir, REPORT_FILE_NAME);

        try {
            boolean created = reportFile.createNewFile();
            if (!created) {
                throw new IOException("Unable to create '" + reportFile.getAbsolutePath() + "'.");
            }
            PrintWriter out = new PrintWriter(reportFile);

            out.println(REPORT_FILE_HEADER);

            HashSet<String> arcFilesFromHeritrixOut = new HashSet<String>();
            for (Map.Entry<String, ArchiveFilesReportGenerator.ArchiveFileStatus> entry : reportContents
                    .entrySet()) {
                String arcFileName = entry.getKey();
                arcFilesFromHeritrixOut.add(arcFileName);
                ArchiveFileStatus afs = entry.getValue();
                out.println(arcFileName + " " + afs.toString());
            }

            // Inspect the contents of the local ARC folder

            //TODO check if this value is configurable
            File localArchiveFolder = new File(crawlDir, ARCHIVE_FORMAT + "s");
            if (localArchiveFolder.exists() && localArchiveFolder.isDirectory()) {
                File[] localArchiveFiles = localArchiveFolder.listFiles(new FileFilter() {
                    @Override
                    public boolean accept(File f) {
                        return f.isFile() && f.getName().contains("." + ARCHIVE_FORMAT);
                    }
                });
                for (File f : localArchiveFiles) {
                    String arcFileName = f.getName();
                    if (!arcFilesFromHeritrixOut.contains(arcFileName)) {
                        ArchiveFileStatus afs = new ArchiveFileStatus();
                        afs.setSize(f.length());
                        out.println(arcFileName + " " + afs.toString());
                    }
                }
            }

            out.close();
        } catch (IOException e) {
            throw new IOFailure("Failed to create " + reportFile.getName(), e);
        }

        return reportFile;
    }

    /**
     * Parses the heritrix.out file and maps every ARC/WARC file found to an
     * {@link ArchiveFileStatus} instance.
     * @return a map from each found ARC/WARC file name to its ArchiveFileStatus
     */
    protected Map<String, ArchiveFileStatus> parseHeritrixOut() {

        Map<String, ArchiveFileStatus> arcFiles = new LinkedHashMap<String, ArchiveFileStatus>();

        try {
            BufferedReader heritrixOut = new BufferedReader(new FileReader(new File(crawlDir, "heritrix.out")));

            String line = null;
            while ((line = heritrixOut.readLine()) != null) {

                try {
                    Object[] params = FILE_OPEN_FORMAT.parse(line);

                    String openedDate = (String) params[0];
                    String arcFileName = new File((String) params[2]).getName();

                    ArchiveFileStatus afs = new ArchiveFileStatus();
                    afs.setOpenedDate(openedDate);

                    arcFiles.put(arcFileName, afs);
                } catch (ParseException e) {
                    // NOP, that's not the line we're looking for.
                }

                try {
                    Object[] params = FILE_CLOSE_FORMAT.parse(line);

                    String closedDate = (String) params[0];
                    String arcFileName = new File((String) params[2]).getName();
                    Long size = Long.parseLong((String) params[3]);

                    ArchiveFileStatus afs = arcFiles.get(arcFileName);
                    if (afs == null) {
                        throw new ArgumentNotValid(
                                ARCHIVE_FORMAT + " file " + arcFileName + " has no previous Opened record!");
                    }

                    afs.setClosedDate(closedDate);
                    afs.setSize(size);

                } catch (ParseException e) {
                    // NOP, that's not the line we're looking for.
                }

            }
            heritrixOut.close();
        } catch (IOException e) {
            LOG.error(e);
            return arcFiles;
        }

        return arcFiles;
    }
}
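
Example usage

The class, its constructor and generateReport() are package-private or protected, so it can only be driven from code in the dk.netarkivet.harvester.harvesting package. Below is a minimal, hypothetical same-package caller (not part of the NetarchiveSuite sources) sketching how the generator might be invoked; it assumes the NetarchiveSuite settings are available so that Settings.get(...) can resolve the archive format and report file name, and that the supplied directory is a finished Heritrix crawl directory containing heritrix.out.

package dk.netarkivet.harvester.harvesting;

import java.io.File;

/** Hypothetical caller, for illustration only; not part of the NetarchiveSuite sources. */
public class ArchiveFilesReportDemo {

    public static void main(String[] args) {
        // Assumption: args[0] is a finished Heritrix crawl directory containing
        // heritrix.out and an "arcs"/"warcs" subfolder.
        File crawlDir = new File(args[0]);

        ArchiveFilesReportGenerator generator = new ArchiveFilesReportGenerator(crawlDir);

        // Parses heritrix.out and writes the report (by default
        // "archivefiles-report.txt", as configured via
        // HarvesterSettings.METADATA_ARCHIVE_FILES_REPORT_NAME) into crawlDir.
        File report = generator.generateReport();
        System.out.println("Report written to " + report.getAbsolutePath());
    }
}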
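
Parsing example

The heart of parseHeritrixOut() is the two MessageFormat patterns, which split each matching heritrix.out line into a timestamp, a thread number, a file path and (for close lines) a size; the timestamps are then rewritten to ISO 8601 by ArchiveFileStatus.getIsoDateString(). The standalone sketch below (again, not part of the sources) applies the same patterns and date formats to the two sample lines quoted in the class javadoc.

import java.text.MessageFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;

public class HeritrixOutParseDemo {

    public static void main(String[] args) throws ParseException {
        // Same patterns as FILE_OPEN_FORMAT and FILE_CLOSE_FORMAT above.
        MessageFormat openFormat = new MessageFormat(
                "{0} INFO thread-{1} org.archive.io.WriterPoolMember.createFile() Opened {2}.open");
        MessageFormat closeFormat = new MessageFormat(
                "{0} INFO thread-{1} org.archive.io.WriterPoolMember.close() Closed {2}, size {3}");

        String openedLine = "2010-07-20 16:12:53.698 INFO thread-14"
                + " org.archive.io.WriterPoolMember.createFile() Opened"
                + " /somepath/jobs/current/high/5_1279642368951/arcs/5-1-20100720161253-00000.arc.gz.open";
        String closedLine = "2010-07-20 16:14:31.792 INFO thread-29"
                + " org.archive.io.WriterPoolMember.close() Closed"
                + " /somepath/jobs/current/high/5_1279642368951/arcs/5-1-20100720161253-00000-bnf_test.arc.gz,"
                + " size 162928";

        // {0} = timestamp, {1} = thread number, {2} = path with the trailing ".open" stripped.
        Object[] opened = openFormat.parse(openedLine);
        System.out.println("Opened " + opened[2] + " at " + opened[0]);

        // {0} = timestamp, {1} = thread number, {2} = path, {3} = size in bytes.
        Object[] closed = closeFormat.parse(closedLine);
        System.out.println("Closed " + closed[2] + " at " + closed[0] + ", size " + closed[3]);

        // The report stores timestamps in ISO 8601, converted the same way as
        // ArchiveFileStatus.getIsoDateString().
        SimpleDateFormat sourceFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
        SimpleDateFormat isoFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
        System.out.println(isoFormat.format(sourceFormat.parse((String) opened[0])));
    }
}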