// Java tutorial
/* File: $Id$ * Revision: $Revision$ * Author: $Author$ * Date: $Date$ * * The Netarchive Suite - Software to harvest and preserve websites * Copyright 2004-2012 The Royal Danish Library, the Danish State and * University Library, the National Library of France and the Austrian * National Library. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ package dk.netarkivet.harvester.harvesting; import java.io.BufferedReader; import java.io.File; import java.io.FileFilter; import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; import java.text.MessageFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import dk.netarkivet.common.exceptions.ArgumentNotValid; import dk.netarkivet.common.exceptions.IOFailure; import dk.netarkivet.common.utils.Settings; import dk.netarkivet.harvester.HarvesterSettings; /** * This class generate a report that lists ARC/WARC files (depending on the configured archive * format) along with the opening date, closing date (if file was properly closed), * and size in bytes. 
* * Here is a sample of such a file: * * [ARCHIVEFILE] [Opened] [Closed] [Size] * 5-1-20100720161253-00000-bnf_test.arc.gz "2010-07-20 16:12:53.698" "2010-07-20 16:14:31.792" 162928 * * The file is named "archivefiles-report.txt" and is generated by parsing the * "heritrix.out" file located in the crawl directory. Useful lines match the * following examples: * * 2010-07-20 16:12:53.698 INFO thread-14 org.archive.io.WriterPoolMember.createFile() Opened /somepath/jobs/current/high/5_1279642368951/arcs/5-1-20100720161253-00000.arc.gz.open * * and * * 2010-07-20 16:14:31.792 INFO thread-29 org.archive.io.WriterPoolMember.close() Closed /somepath/jobs/current/high/5_1279642368951/arcs/5-1-20100720161253-00000-bnf_test.arc.gz, size 162928 * * In order to have such messages output to heritrix.out, * the "heritrix.properties" file must contain the following, uncommented line: * * org.archive.io.arc.ARCWriter.level = INFO * * Note that these strings have changed between Heritrix version 1.14.3 * and 1.14.4, so they might change again in the future. * */ class ArchiveFilesReportGenerator { private static final Log LOG = LogFactory.getLog(ArchiveFilesReportGenerator.class); private static final String ARCHIVE_FORMAT = Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT); private static final SimpleDateFormat ISO_8601_DATE_FORMAT = new SimpleDateFormat( "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"); private static final SimpleDateFormat SOURCE_DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"); /** * Stores the opening date, closing date and size of an ARC file. 
*/ static class ArchiveFileStatus { private static final String NOT_AVAILABLE = "-1"; String openedDate = NOT_AVAILABLE; String closedDate = NOT_AVAILABLE; long size = 0; protected String getOpenedDate() { return openedDate; } protected void setOpenedDate(String openedDate) { this.openedDate = getIsoDateString(openedDate); } protected String getClosedDate() { return closedDate; } protected void setClosedDate(String closedDate) { this.closedDate = getIsoDateString(closedDate); } protected long getSize() { return size; } protected void setSize(long size) { this.size = size; } @Override public String toString() { return openedDate + " " + closedDate + " " + Long.toString(size); } private String getIsoDateString(String dateStr) { try { return ISO_8601_DATE_FORMAT.format(SOURCE_DATE_FORMAT.parse(dateStr)); } catch (ParseException e) { return NOT_AVAILABLE; } } } /** * Format used to parse and extract values from lines of heritrix.out * pertaining to an ARC/WARC file opening. */ public static final MessageFormat FILE_OPEN_FORMAT = new MessageFormat( "{0} INFO thread-{1} " + "org.archive.io.WriterPoolMember.createFile() Opened " + "{2}.open"); /** * Format used to parse and extract values from lines of heritrix.out * pertaining to an ARC/WARC file closing. */ public static final MessageFormat FILE_CLOSE_FORMAT = new MessageFormat( "{0} INFO thread-{1} " + "org.archive.io.WriterPoolMember.close() Closed {2}" + ", size {3}"); /** * The name of the report file. It will be generated in the crawl directory. */ public static final String REPORT_FILE_NAME = Settings .get(HarvesterSettings.METADATA_ARCHIVE_FILES_REPORT_NAME); /** * The header line of the report file. */ public static final String REPORT_FILE_HEADER = Settings .get(HarvesterSettings.METADATA_ARCHIVE_FILES_REPORT_HEADER); /** * The Heritrix crawl directory. */ private File crawlDir; /** * Builds a ARC files report generator, given the Heritrix crawl directory. * @param crawlDir the Heritrix crawl directory. 
*/ ArchiveFilesReportGenerator(File crawlDir) { this.crawlDir = crawlDir; } /** * Parses heritrix.out and generates the ARC/WARC files report. * @return the generated report file. */ protected File generateReport() { Map<String, ArchiveFileStatus> reportContents = parseHeritrixOut(); File reportFile = new File(crawlDir, REPORT_FILE_NAME); try { boolean created = reportFile.createNewFile(); if (!created) { throw new IOException("Unable to create '" + reportFile.getAbsolutePath() + "'."); } PrintWriter out = new PrintWriter(reportFile); out.println(REPORT_FILE_HEADER); HashSet<String> arcFilesFromHeritrixOut = new HashSet<String>(); for (Map.Entry<String, ArchiveFilesReportGenerator.ArchiveFileStatus> entry : reportContents .entrySet()) { String arcFileName = entry.getKey(); arcFilesFromHeritrixOut.add(arcFileName); ArchiveFileStatus afs = entry.getValue(); out.println(arcFileName + " " + afs.toString()); } // Inspect the contents of the local ARC folder //TODO check if this value is configurable File localArchiveFolder = new File(crawlDir, ARCHIVE_FORMAT + "s"); if (localArchiveFolder.exists() && localArchiveFolder.isDirectory()) { File[] localArchiveFiles = localArchiveFolder.listFiles(new FileFilter() { @Override public boolean accept(File f) { return f.isFile() && f.getName().contains("." + ARCHIVE_FORMAT); } }); for (File f : localArchiveFiles) { String arcFileName = f.getName(); if (!arcFilesFromHeritrixOut.contains(arcFileName)) { ArchiveFileStatus afs = new ArchiveFileStatus(); afs.setSize(f.length()); out.println(arcFileName + " " + afs.toString()); } } } out.close(); } catch (IOException e) { throw new IOFailure("Failed to create " + reportFile.getName(), e); } return reportFile; } /** * Parses the heritrix.out file and maps to every found ARC file an * {@link ArchiveFileStatus} instance. 
* @return the map of found ARC/WARC files, and related ArchiveFileStatus */ protected Map<String, ArchiveFileStatus> parseHeritrixOut() { Map<String, ArchiveFileStatus> arcFiles = new LinkedHashMap<String, ArchiveFileStatus>(); try { BufferedReader heritrixOut = new BufferedReader(new FileReader(new File(crawlDir, "heritrix.out"))); String line = null; while ((line = heritrixOut.readLine()) != null) { try { Object[] params = FILE_OPEN_FORMAT.parse(line); String openedDate = (String) params[0]; String arcFileName = new File((String) params[2]).getName(); ArchiveFileStatus afs = new ArchiveFileStatus(); afs.setOpenedDate(openedDate); arcFiles.put(arcFileName, afs); } catch (ParseException e) { // NOP, that's not the line we're looking for. } try { Object[] params = FILE_CLOSE_FORMAT.parse(line); String closedDate = (String) params[0]; String arcFileName = new File((String) params[2]).getName(); Long size = Long.parseLong((String) params[3]); ArchiveFileStatus afs = arcFiles.get(arcFileName); if (afs == null) { throw new ArgumentNotValid( ARCHIVE_FORMAT + " file " + arcFileName + " has no previous Opened record!"); } afs.setClosedDate(closedDate); afs.setSize(size); } catch (ParseException e) { // NOP, that's not the line we're looking for. } } heritrixOut.close(); } catch (IOException e) { LOG.error(e); return arcFiles; } return arcFiles; } }