// Java tutorial
/* $Id: ArchiveReader.java 5369 2007-07-31 00:36:35Z gojomo $
 *
 * Created on August 21st, 2006
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package com.cyberway.issue.io;

import it.unimi.dsi.fastutil.io.RepositionableStream;

import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.EOFException;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;

import com.cyberway.issue.util.MimetypeUtils;

/**
 * Reader for an Archive file of Archive {@link ArchiveRecord}s.
 *
 * <p>Subclasses supply the record parsing ({@link #createArchiveRecord})
 * and the end-of-record skipping ({@link #gotoEOR}); this base class
 * provides positioning, iteration, validation and command-line style
 * output on top of them.
 *
 * @author stack
 * @version $Date: 2007-07-31 00:36:35 +0000 (Tue, 31 Jul 2007) $ $Version$
 */
public abstract class ArchiveReader implements ArchiveFileConstants {
    /**
     * Is this Archive file compressed?
     */
    private boolean compressed = false;

    /**
     * Should we digest as we read?
     */
    private boolean digest = true;

    /**
     * Should the parse be strict?
     */
    private boolean strict = false;

    /**
     * Archive file input stream.
     *
     * Keep it around so we can close it when done.
     *
     * <p>Set via {@link #setIn(InputStream)}. Must support the
     * {@link RepositionableStream} interface. The field is private;
     * subclasses reach it through {@link #getIn()} and
     * {@link #setIn(InputStream)}.
     */
    private InputStream in = null;

    /**
     * Maximum amount of recoverable exceptions in a row.
     * If more than this amount in a row, we'll let out the exception rather
     * than go back in for yet another retry.
     */
    public static final int MAX_ALLOWED_RECOVERABLES = 10;

    /**
     * The Record currently being read.
     *
     * Keep this ongoing reference so we'll close the record even if the caller
     * doesn't.
     */
    private ArchiveRecord currentRecord = null;

    /**
     * Descriptive string for the Archive file we're going against:
     * full path, url, etc. -- depends on context in which file was made.
     */
    private String identifier = null;

    /**
     * Archive file version.
     */
    private String version = null;

    protected ArchiveReader() {
        super();
    }

    /**
     * Convenience method used by subclass constructors.
     * @param i Identifier for Archive file this reader goes against.
     */
    protected void initialize(final String i) {
        setReaderIdentifier(i);
    }

    /**
     * Convenience method for constructors.
     *
     * @param f File to read.
     * @param offset Offset at which to start reading.
     * @return InputStream to read from.
     * @throws IOException If failed open or fail to get a memory
     * mapped byte buffer on file.
     */
    protected InputStream getInputStream(final File f, final long offset)
    throws IOException {
        return new RandomAccessBufferedInputStream(
            new RandomAccessInputStream(f, offset));
    }

    /**
     * @return True if this Archive file has been flagged as compressed.
     */
    public boolean isCompressed() {
        return this.compressed;
    }

    /**
     * Get record at passed <code>offset</code>.
     *
     * <p>Any record currently outstanding is cleaned up first, then the
     * stream is repositioned if it is not already at <code>offset</code>.
     *
     * @param offset Byte index into file at which a record starts.
     * @return An Archive Record reference.
     * @throws IOException
     */
    public ArchiveRecord get(long offset) throws IOException {
        cleanupCurrentRecord();
        RepositionableStream ps = (RepositionableStream) this.in;
        long currentOffset = ps.position();
        if (currentOffset != offset) {
            currentOffset = offset;
            ps.position(offset);
        }
        return createArchiveRecord(this.in, currentOffset);
    }

    /**
     * @return Return Archive Record created against current offset.
     * @throws IOException
     */
    public ArchiveRecord get() throws IOException {
        return createArchiveRecord(this.in,
            ((RepositionableStream) this.in).position());
    }

    /**
     * Closes the underlying stream, if there is one, and drops the
     * reference to it.
     * @throws IOException
     */
    public void close() throws IOException {
        if (this.in != null) {
            this.in.close();
            this.in = null;
        }
    }

    /**
     * Rewinds stream to start of the Archive file.
     *
     * <p>Note that a failure of the seek itself surfaces as a
     * {@link RuntimeException} wrapping the IOException; this is kept
     * as-is because callers may rely on it.
     *
     * @throws IOException if stream is not resettable.
     */
    protected void rewind() throws IOException {
        cleanupCurrentRecord();
        if (this.in instanceof RepositionableStream) {
            try {
                ((RepositionableStream) this.in).position(0);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        } else {
            throw new IOException("Stream is not resettable.");
        }
    }

    /**
     * Cleanout the current record if there is one.
     *
     * <p>The reference to the current record is only cleared once both
     * the close and the skip to end-of-record have succeeded.
     *
     * @throws IOException
     */
    protected void cleanupCurrentRecord() throws IOException {
        if (this.currentRecord != null) {
            this.currentRecord.close();
            gotoEOR(this.currentRecord);
            this.currentRecord = null;
        }
    }

    /**
     * Return an Archive Record homed on <code>offset</code> into
     * <code>is</code>.
     * @param is Stream to read Record from.
     * @param offset Offset to find Record at.
     * @return ArchiveRecord instance.
     * @throws IOException
     */
    protected abstract ArchiveRecord createArchiveRecord(InputStream is,
        long offset)
    throws IOException;

    /**
     * Skip over any trailing new lines at end of the record so we're lined up
     * ready to read the next.
     * @param record
     * @throws IOException
     */
    protected abstract void gotoEOR(ArchiveRecord record) throws IOException;

    public abstract String getFileExtension();

    public abstract String getDotFileExtension();

    /**
     * @return Version of this Archive file.
     */
    public String getVersion() {
        return this.version;
    }

    /**
     * Validate the Archive file.
     *
     * This method iterates over the file throwing exception if it fails
     * to successfully parse any record.
     *
     * <p>Assumes the stream is at the start of the file.
     * @return List of all read Archive Headers.
     *
     * @throws IOException
     */
    public List validate() throws IOException {
        return validate(-1);
    }

    /**
     * Validate the Archive file.
     *
     * This method iterates over the file throwing exception if it fails
     * to successfully parse. Strict parsing is switched on as a side
     * effect and is left on afterwards.
     *
     * <p>Iteration is obtained from {@link #iterator()}, which rewinds
     * the stream to the start of the file.
     *
     * @param noRecords Number of records expected.  Pass -1 if number is
     * unknown.
     *
     * @return List of all read metadatas. As we validate records, we add
     * a reference to the read metadata.
     *
     * @throws IOException If a record is empty and untyped, or if the
     * number of records read differs from <code>noRecords</code>.
     */
    public List validate(int noRecords) throws IOException {
        List<ArchiveRecordHeader> hs = new ArrayList<ArchiveRecordHeader>();
        int count = 0;
        setStrict(true);
        for (Iterator<ArchiveRecord> i = iterator(); i.hasNext();) {
            count++;
            ArchiveRecord r = i.next();
            if (r.getHeader().getLength() <= 0
                    && r.getHeader().getMimetype()
                        .equals(MimetypeUtils.NO_TYPE_MIMETYPE)) {
                throw new IOException("ARCRecord content is empty.");
            }
            r.close();
            // Add reference to metadata into a list of metadatas.
            hs.add(r.getHeader());
        }

        // The check is for inequality, so the count may be above as well
        // as below the expected number; word the message accordingly.
        if (noRecords != -1 && count != noRecords) {
            throw new IOException("Count of records, " + count
                + " does not match expected " + noRecords);
        }

        return hs;
    }

    /**
     * Test Archive file is valid.
     * Assumes the stream is at the start of the file.  Be aware that this
     * method makes a pass over the whole file.
     * @return True if file can be successfully parsed.
     */
    public boolean isValid() {
        boolean valid = false;
        try {
            validate();
            valid = true;
        } catch (Exception e) {
            // File is not valid if exception thrown parsing.
            valid = false;
        }
        return valid;
    }

    /**
     * @return Returns the strict.
     */
    public boolean isStrict() {
        return this.strict;
    }

    /**
     * @param s The strict to set.
     */
    public void setStrict(boolean s) {
        this.strict = s;
    }

    /**
     * @param d True if we're to digest.
     */
    public void setDigest(boolean d) {
        this.digest = d;
    }

    /**
     * @return True if we're digesting as we read.
     */
    public boolean isDigest() {
        return this.digest;
    }

    protected Logger getLogger() {
        return Logger.getLogger(this.getClass().getName());
    }

    protected InputStream getInputStream() {
        return this.in;
    }

    /**
     * Returns an ArchiveRecord iterator.
     * Of note, on IOException, especially if ZipException reading compressed
     * ARCs, rather than fail the iteration, try moving to the next record.
     * If {@link ArchiveReader#strict} is not set, this will usually succeed.
     * @return An iterator over ARC records.
     */
    public Iterator<ArchiveRecord> iterator() {
        try {
            // Eat up any record outstanding.
            cleanupCurrentRecord();
            // Now reset stream to the start of the arc file.
            rewind();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return new ArchiveRecordIterator();
    }

    protected void setCompressed(boolean compressed) {
        this.compressed = compressed;
    }

    /**
     * @return The current ARC record or null if none.
     * After construction has the arcfile header record.
     * @see #get()
     */
    protected ArchiveRecord getCurrentRecord() {
        return this.currentRecord;
    }

    /**
     * Sets the current record.
     * @param currentRecord Record to remember as the current one.
     * @return The passed record, for call chaining.
     */
    protected ArchiveRecord currentRecord(final ArchiveRecord currentRecord) {
        this.currentRecord = currentRecord;
        return currentRecord;
    }

    protected InputStream getIn() {
        return in;
    }

    protected void setIn(InputStream in) {
        this.in = in;
    }

    protected void setVersion(String version) {
        this.version = version;
    }

    public String getReaderIdentifier() {
        return this.identifier;
    }

    protected void setReaderIdentifier(final String i) {
        this.identifier = i;
    }

    /**
     * Log on stderr.
     * Logging should go via the logging system.  This method
     * bypasses the logging system going direct to stderr.
     * Should not generally be used.  Its used for rare messages
     * that come of cmdline usage of ARCReader ERRORs and WARNINGs.
     * Override if using ARCReader in a context where no stderr or
     * where you'd like to redirect stderr to other than System.err.
     * @param level Level to log message at.
     * @param message Message to log.
     */
    public void logStdErr(Level level, String message) {
        System.err.println(level.toString() + " " + message);
    }

    /**
     * Add buffering to RandomAccessInputStream.
     */
    protected class RandomAccessBufferedInputStream
    extends BufferedInputStream implements RepositionableStream {

        public RandomAccessBufferedInputStream(RandomAccessInputStream is)
        throws IOException {
            super(is);
        }

        public RandomAccessBufferedInputStream(RandomAccessInputStream is,
            int size)
        throws IOException {
            super(is, size);
        }

        public long position() throws IOException {
            // Current position is the underlying files position
            // minus the amount thats in the buffer yet to be read.
            return ((RandomAccessInputStream) this.in).position()
                - (this.count - this.pos);
        }

        public void position(long position) throws IOException {
            // Force refill of buffer whenever there's been a seek.
            this.pos = 0;
            this.count = 0;
            ((RandomAccessInputStream) this.in).position(position);
        }

        public int available() throws IOException {
            // Avoid overflow on large datastreams
            long amount = (long) in.available() + (long) (count - pos);
            return (amount >= Integer.MAX_VALUE)
                ? Integer.MAX_VALUE : (int) amount;
        }
    }

    /**
     * Inner ArchiveRecord Iterator class.
     * Throws RuntimeExceptions in {@link #hasNext()} and {@link #next()} if
     * trouble pulling record from underlying stream.
     * @author stack
     */
    protected class ArchiveRecordIterator implements Iterator<ArchiveRecord> {
        private final Logger logger =
            Logger.getLogger(this.getClass().getName());

        /**
         * @return True if we have more records to read.
         * @exception RuntimeException Can throw an IOException wrapped in a
         * RuntimeException if a problem reading underlying stream (Corrupted
         * gzip, etc.).
         */
        public boolean hasNext() {
            // Call close on any extant record.  This will scoot us past
            // any content not yet read.
            try {
                cleanupCurrentRecord();
            } catch (IOException e) {
                if (isStrict()) {
                    throw new RuntimeException(e);
                }
                if (e instanceof EOFException) {
                    logger.warning("Premature EOF cleaning up "
                        + currentRecord.getHeader().toString() + ": "
                        + e.getMessage());
                    return false;
                }
                // If not strict, try going again.  We might be able to skip
                // over the bad record.
                logger.warning("Trying skip of failed record cleanup of "
                    + currentRecord.getHeader().toString() + ": "
                    + e.getMessage());
            }
            return innerHasNext();
        }

        /**
         * @return True if the underlying stream reports bytes available.
         * @exception RuntimeException Wraps any IOException from the
         * stream; the message carries the offset if it was obtained.
         */
        protected boolean innerHasNext() {
            long offset = -1;
            try {
                offset = ((RepositionableStream) getInputStream()).position();
                return getInputStream().available() > 0;
            } catch (IOException e) {
                throw new RuntimeException("Offset " + offset, e);
            }
        }

        /**
         * Tries to move to next record if we get
         * {@link RecoverableIOException}. If not <code>strict</code>
         * tries to move to next record if we get an
         * {@link IOException}.
         * @return Next object.
         * @exception RuntimeException Throws a runtime exception,
         * usually a wrapping of an IOException, if trouble getting
         * a record (Throws exception rather than return null).
         */
        public ArchiveRecord next() {
            long offset = -1;
            try {
                offset = ((RepositionableStream) getInputStream()).position();
                return exceptionNext();
            } catch (IOException e) {
                if (!isStrict()) {
                    // Retry though an IOE.  Maybe we will succeed reading
                    // subsequent record.
                    try {
                        if (hasNext()) {
                            getLogger().warning("Bad Record. Trying skip "
                                + "(Current offset " + offset + "): "
                                + e.getMessage());
                            return exceptionNext();
                        }
                        // Else we are at last record.  Iterator#next is
                        // expecting value. We do not have one. Throw exception.
                        throw new RuntimeException("Retried but no next "
                            + "record (Offset " + offset + ")", e);
                    } catch (IOException e1) {
                        throw new RuntimeException("After retry (Offset "
                            + offset + ")", e1);
                    }
                }
                throw new RuntimeException("(Offset " + offset + ")", e);
            }
        }

        /**
         * A next that throws exceptions and has handling of
         * recoverable exceptions moving us to next record. Can call
         * hasNext which itself may throw exceptions.
         * @return Next record.
         * @throws IOException
         * @throws RuntimeException Thrown when we've reached maximum
         * retries.
         */
        protected ArchiveRecord exceptionNext()
        throws IOException, RuntimeException {
            ArchiveRecord result = null;
            IOException ioe = null;
            for (int i = MAX_ALLOWED_RECOVERABLES;
                    i > 0 && result == null; i--) {
                ioe = null;
                try {
                    result = innerNext();
                } catch (RecoverableIOException e) {
                    ioe = e;
                    getLogger().warning(e.getMessage());
                    if (hasNext()) {
                        continue;
                    }
                    // No records left.  Throw exception rather than
                    // return null.  The caller is expecting to get
                    // back a record since they've just called
                    // hasNext.
                    break;
                }
            }
            if (ioe != null) {
                // Then we did MAX_ALLOWED_RECOVERABLES retries.  Throw
                // the recoverable ioe wrapped in a RuntimeException so
                // it goes out pass checks for IOE.
                throw new RuntimeException("Retried "
                    + MAX_ALLOWED_RECOVERABLES + " times in a row", ioe);
            }
            return result;
        }

        /**
         * @return Record at the stream's current position.
         * @throws IOException
         */
        protected ArchiveRecord innerNext() throws IOException {
            return get(((RepositionableStream) getInputStream()).position());
        }

        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * @param name Name to strip.
     * @param ext Extension to remove.
     * @return <code>name</code> without the trailing <code>ext</code>, or
     * <code>name</code> unchanged if it does not end with <code>ext</code>.
     */
    protected static String stripExtension(final String name,
            final String ext) {
        return (!name.endsWith(ext))
            ? name : name.substring(0, name.length() - ext.length());
    }

    /**
     * @return short name of Archive file.
     */
    public String getFileName() {
        return (new File(getReaderIdentifier())).getName();
    }

    /**
     * @return short name of Archive file with the compressed-file and
     * archive-file extensions removed.
     */
    public String getStrippedFileName() {
        return getStrippedFileName(getFileName(), getDotFileExtension());
    }

    /**
     * @param name Name of ARCFile.
     * @param dotFileExtension '.arc' or '.warc', etc.
     * @return short name of Archive file.
     */
    public static String getStrippedFileName(String name,
            final String dotFileExtension) {
        name = stripExtension(name,
            ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION);
        return stripExtension(name, dotFileExtension);
    }

    /**
     * @param value Value to test.
     * @return True if value is 'true' (in any letter case), else false.
     * Null and empty values give false.
     */
    protected static boolean getTrueOrFalse(final String value) {
        // parseBoolean is null-safe and compares case-insensitively
        // against "true".
        return Boolean.parseBoolean(value);
    }

    /**
     * @param format Format to use outputting.
     * @throws IOException
     * @throws java.text.ParseException
     * @return True if handled.
     */
    protected boolean output(final String format)
    throws IOException, java.text.ParseException {
        boolean result = true;
        // Write output as pseudo-CDX file.  See
        // http://www.archive.org/web/researcher/cdx_legend.php
        // and http://www.archive.org/web/researcher/example_cdx.php.
        // Hash is hard-coded straight SHA-1 hash of content.
        if (format.equals(DUMP)) {
            // No point digesting dumping.
            setDigest(false);
            dump(false);
        } else if (format.equals(GZIP_DUMP)) {
            // No point digesting dumping.
            setDigest(false);
            dump(true);
        } else if (format.equals(CDX)) {
            cdxOutput(false);
        } else if (format.equals(CDX_FILE)) {
            cdxOutput(true);
        } else {
            result = false;
        }
        return result;
    }

    /**
     * Writes a CDX header line followed by one CDX line per record.
     *
     * @param toFile True to write to a file named after the reader
     * identifier with its extensions replaced by the CDX extension;
     * false to write to stdout.
     * @throws IOException
     */
    protected void cdxOutput(boolean toFile)
    throws IOException {
        BufferedWriter cdxWriter = null;
        if (toFile) {
            String cdxFilename = stripExtension(getReaderIdentifier(),
                DOT_COMPRESSED_FILE_EXTENSION);
            cdxFilename = stripExtension(cdxFilename, getDotFileExtension());
            cdxFilename += ('.' + CDX);
            cdxWriter = new BufferedWriter(new FileWriter(cdxFilename));
        }

        // Everything after the writer is opened runs under the finally so
        // the file is closed even if the header write fails.
        try {
            String header = "CDX b e a m s c "
                + ((isCompressed()) ? "V" : "v") + " n g";
            writeCdxLine(cdxWriter, header);

            String strippedFileName = getStrippedFileName();
            for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
                ArchiveRecord r = ii.next();
                writeCdxLine(cdxWriter, r.outputCdx(strippedFileName));
            }
        } finally {
            if (cdxWriter != null) {
                cdxWriter.close();
            }
        }
    }

    /**
     * Writes one line to the passed writer, or to stdout if the writer
     * is null.
     */
    private static void writeCdxLine(final BufferedWriter writer,
            final String line)
    throws IOException {
        if (writer != null) {
            writer.write(line);
            writer.newLine();
        } else {
            System.out.println(line);
        }
    }

    /**
     * Output passed record using passed format specifier.
     * @param format What format to use outputting.
     * @throws IOException
     * @return True if handled.
     */
    public boolean outputRecord(final String format)
    throws IOException {
        boolean result = true;
        if (format.equals(CDX)) {
            System.out.println(get().outputCdx(getStrippedFileName()));
        } else if (format.equals(ArchiveFileConstants.DUMP)) {
            // No point digesting if dumping content.
            setDigest(false);
            get().dump();
        } else {
            result = false;
        }
        return result;
    }

    /**
     * Dump this file on STDOUT
     * @param compress True if dumped output is compressed.
     * @throws IOException
     * @throws java.text.ParseException
     */
    public abstract void dump(final boolean compress)
    throws IOException, java.text.ParseException;

    /**
     * @param f Local file the returned reader goes against.
     * @return an ArchiveReader that will delete a local file on close.  Used
     * when we bring Archive files local and need to clean up afterward.
     */
    public abstract ArchiveReader getDeleteFileOnCloseReader(final File f);

    /**
     * Output passed record using passed format specifier.
     * @param r ARCReader instance to output.
     * @param format What format to use outputting.
     * @throws IOException If the reader does not handle the format.
     */
    protected static void outputRecord(final ArchiveReader r,
        final String format)
    throws IOException {
        if (!r.outputRecord(format)) {
            throw new IOException("Unsupported format"
                + " (or unsupported on a single record): " + format);
        }
    }

    /**
     * @return Base Options object filled out with help, digest, strict, etc.
     * options.
     */
    protected static Options getOptions() {
        Options options = new Options();
        options.addOption(new Option("h", "help", false,
            "Prints this message and exits."));
        options.addOption(new Option("o", "offset", true,
            "Outputs record at this offset into file."));
        options.addOption(new Option("d", "digest", true,
            "Pass true|false. Expensive. Default: true (SHA-1)."));
        options.addOption(new Option("s", "strict", false,
            "Strict mode. Fails parse if incorrectly formatted file."));
        options.addOption(new Option("f", "format", true,
            "Output options: 'cdx', 'cdxfile', 'dump', 'gzipdump', "
            + "or 'nohead'. Default: 'cdx'."));
        return options;
    }
}