/* $Id: ARCReader.java 5039 2007-04-06 00:29:39Z gojomo $
 *
 * Created on May 1, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package com.cyberway.issue.io.arc;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;

import com.cyberway.issue.io.ArchiveReader;
import com.cyberway.issue.io.ArchiveRecord;
import com.cyberway.issue.io.ArchiveRecordHeader;
import com.cyberway.issue.io.RecoverableIOException;
import com.cyberway.issue.io.WriterPoolMember;
import com.cyberway.issue.util.ArchiveUtils;
import com.cyberway.issue.util.InetAddressUtil;
import com.cyberway.issue.util.TextUtils;

/**
 * Get an iterator on an ARC file or get a record by absolute position.
 *
 * ARC files are described here:
 * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">Arc
 * File Format</a>.
 *
 * <p>This class knows how to parse an ARC file.  Pass it a file path
 * or a URL to an ARC.  It can parse ARC Versions 1 and 2.
 *
 * <p>The iterator returns <code>ARCRecord</code>s, though
 * {@link Iterator#next()} is typed to return the more general
 * ArchiveRecord.  Cast the return.
 *
 * <p>Profiling java.io vs. memory-mapped ByteBufferInputStream shows the
 * latter slightly slower -- but not by much.  TODO: Test more.  Just
 * change {@link #getInputStream(File, long)}.
 *
 * @author stack
 * @version $Date: 2007-04-06 00:29:39 +0000 (Fri, 06 Apr 2007) $ $Revision: 5039 $
 */
public abstract class ARCReader extends ArchiveReader
implements ARCConstants {
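    // A minimal usage sketch (added for illustration; the path is
    // hypothetical).  Obtain a reader from the ARCReaderFactory used by
    // main() below, iterate the records, and print each record's URL:
    //
    //   ARCReader reader = ARCReaderFactory.get("/tmp/sample.arc.gz");
    //   reader.setDigest(false);
    //   for (Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) {
    //       ARCRecord rec = (ARCRecord)i.next();
    //       System.out.println(rec.getMetaData().getUrl());
    //   }
    //   reader.close();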
    Logger logger = Logger.getLogger(ARCReader.class.getName());

    /**
     * Set to true if we are aligned on the first record of the Archive file.
     * We used to depend on offset: if offset was zero, we were aligned on
     * the first record.  That is no longer necessarily the case when a
     * Reader is created at an offset into an Archive file: the offset is
     * zero but it's relative to where we started reading.
     */
    private boolean alignedOnFirstRecord = true;

    /**
     * Assumed maximum size of a record meta header line.
     *
     * This is 100k, which seems massive, but it's the same as LINE_LENGTH
     * from <code>alexa/include/a_arcio.h</code>:
     * <pre>
     * #define LINE_LENGTH (100*1024)
     * </pre>
     */
    private static final int MAX_HEADER_LINE_LENGTH = 1024 * 100;

    /**
     * Array of field names.
     *
     * Used to initialize <code>headerFieldNameKeys</code>.
     */
    private final String[] headerFieldNameKeysArray = {
        URL_FIELD_KEY,
        IP_HEADER_FIELD_KEY,
        DATE_FIELD_KEY,
        MIMETYPE_FIELD_KEY,
        LENGTH_FIELD_KEY
    };

    /**
     * An array of the header field names found in the ARC file header on
     * the 3rd line.
     *
     * We used to read these from the ARC file's first record, 3rd line, but
     * now we hardcode them for the sake of improved performance.
     */
    private final List<String> headerFieldNameKeys =
        Arrays.asList(this.headerFieldNameKeysArray);

    private boolean parseHttpHeaders = true;

    ARCReader() {
        super();
    }

    /**
     * Skip over any trailing new lines at end of the record so we're lined
     * up ready to read the next.
     * @param record
     * @throws IOException
     */
    protected void gotoEOR(ArchiveRecord record) throws IOException {
        if (getIn().available() <= 0) {
            return;
        }

        // Remove any trailing LINE_SEPARATOR.
        int c = -1;
        while (getIn().available() > 0) {
            if (getIn().markSupported()) {
                getIn().mark(1);
            }
            c = getIn().read();
            if (c != -1) {
                if (c == LINE_SEPARATOR) {
                    continue;
                }
                if (getIn().markSupported()) {
                    // We've overread.  We're probably in the next record.
                    // There is no way of telling for sure.  It may be dross
                    // at the end of the current record.  Back up.
                    getIn().reset();
                    break;
                }
                ArchiveRecordHeader h = (getCurrentRecord() != null)?
                    record.getHeader(): null;
                throw new IOException("Read " + (char)c +
                    " when only " + LINE_SEPARATOR + " expected. " +
                    getReaderIdentifier() + ((h != null)?
                        h.getHeaderFields().toString(): ""));
            }
        }
    }

    /**
     * Create new arc record.
     *
     * Encapsulate housekeeping that has to do w/ creating a new record.
     *
     * <p>Call this method at end of constructor to read in the
     * arcfile header.  There will be problems reading subsequent arc records
     * if you don't, since the arcfile header has the list of metadata fields
     * for all records that follow.
     *
     * <p>When parsing through ARCs writing out CDX info, we spend about
     * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine
     * -- of which 16% is reading.
     *
     * @param is InputStream to use.
     * @param offset Absolute offset into arc file.
     * @return An arc record.
     * @throws IOException
     */
    protected ARCRecord createArchiveRecord(InputStream is, long offset)
    throws IOException {
        ArrayList<String> firstLineValues = new ArrayList<String>(20);
        getTokenizedHeaderLine(is, firstLineValues);
        int bodyOffset = 0;
        if (offset == 0 && isAlignedOnFirstRecord()) {
            // If offset is zero and we were aligned at first record on
            // creation (See #alignedOnFirstRecord for more on this), then no
            // records have been read yet and we're reading our first one,
            // the record of ARC file meta info.  It's special.  In ARC
            // versions 1.x, the first record has three lines of meta info.
            // We've just read the first line.  There are two more.  The
            // second line has misc. info.  We're only interested in the
            // first field, the version number.  The third line is the list
            // of field names.  Here's what ARC file version 1.x meta content
            // looks like:
            //
            // filedesc://testIsBoundary-JunitIAH200401070157520.arc 0.0.0.0 \\
            //      20040107015752 text/plain 77
            // 1 0 InternetArchive
            // URL IP-address Archive-date Content-type Archive-length
            //
            ArrayList<String> secondLineValues = new ArrayList<String>(20);
            bodyOffset += getTokenizedHeaderLine(is, secondLineValues);
            setVersion((String)secondLineValues.get(0) + "." +
                (String)secondLineValues.get(1));
            // Just read over the 3rd line.  We used to parse it and use
            // values found here but now we just hardcode them to avoid
            // having to read this 3rd line even for random arc file
            // accesses.
            bodyOffset += getTokenizedHeaderLine(is, null);
        }

        try {
            currentRecord(new ARCRecord(is,
                (ArchiveRecordHeader)computeMetaData(this.headerFieldNameKeys,
                    firstLineValues, getVersion(), offset),
                bodyOffset, isDigest(), isStrict(), isParseHttpHeaders()));
        } catch (IOException e) {
            if (e instanceof RecoverableIOException) {
                // Don't mess with RecoverableIOExceptions.  Let them out.
                throw e;
            }
            IOException newE = new IOException(e.getMessage() + " (Offset " +
                offset + ").");
            newE.setStackTrace(e.getStackTrace());
            throw newE;
        }
        return (ARCRecord)getCurrentRecord();
    }

    /**
     * Returns version of this ARC file.  Usually read from first record of
     * ARC.  If we're reading without having first read the first record --
     * e.g. random access into middle of an ARC -- then version will not have
     * been set.  For now, we return a default, version 1.1.  Later, if more
     * than just one version of ARC, we could look at such as the meta line
     * to see what version of ARC this is.
     * @return Version of this ARC file.
     */
    public String getVersion() {
        return (super.getVersion() == null)? "1.1": super.getVersion();
    }

    /**
     * Get a record header line as list of tokens.
     *
     * We keep reading till we find a LINE_SEPARATOR or we reach the end
     * of file w/o finding a LINE_SEPARATOR or the line length is crazy.
     *
     * @param stream InputStream to read from.
     * @param list Empty list that gets filled w/ string tokens.
     * @return Count of characters read.
     * @exception IOException If problem reading stream or no line separator
     * found or EOF before EOL or we didn't get minimum header fields.
     */
    private int getTokenizedHeaderLine(final InputStream stream,
            List<String> list) throws IOException {
        // Preallocate usual line size.
        StringBuilder buffer = new StringBuilder(2048 + 20);
        int read = 0;
        int previous = -1;
        for (int c = -1; true;) {
            previous = c;
            c = stream.read();
            if (c == -1) {
                throw new RecoverableIOException("Hit EOF before header EOL.");
            }
            c &= 0xff;
            read++;
            if (read > MAX_HEADER_LINE_LENGTH) {
                throw new IOException("Header line longer than max allowed " +
                    " -- " + String.valueOf(MAX_HEADER_LINE_LENGTH) +
                    " -- or passed buffer doesn't contain a line (Read: " +
                    buffer.length() + ").  Here's" +
                    " some of what was read: " +
                    buffer.substring(0, Math.min(buffer.length(), 256)));
            }

            if (c == LINE_SEPARATOR) {
                if (buffer.length() == 0) {
                    // Empty line at start of buffer.  Skip it and try again.
                    continue;
                }

                if (list != null) {
                    list.add(buffer.toString());
                }
                // LOOP TERMINATION.
                break;
            } else if (c == HEADER_FIELD_SEPARATOR) {
                if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) {
                    // Early ARCs sometimes had multiple spaces between
                    // fields.
                    continue;
                }
                if (list != null) {
                    list.add(buffer.toString());
                }
                // Reset to empty.
                buffer.setLength(0);
            } else {
                buffer.append((char)c);
            }
        }

        // List must have at least 3 elements in it and no more than 100.
        // If it has other than this, then bogus parse.
        if (list != null && (list.size() < 3 || list.size() > 100)) {
            throw new IOException("Unparseable header line: " + list);
        }

        return read;
    }
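    // Illustrative input for getTokenizedHeaderLine (assumed values, shaped
    // after the v1.x field names shown in createArchiveRecord above): a
    // record header line such as
    //
    //   http://www.archive.org/ 207.241.224.2 20040101000000 text/html 2345
    //
    // tokenizes to five strings -- [URL, IP-address, Archive-date,
    // Content-type, Archive-length] -- which computeMetaData() below pairs
    // with headerFieldNameKeys.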
    /**
     * Compute metadata fields.
     *
     * Here we check the meta field has the right number of items in it.
     *
     * @param keys Keys to use composing headerFields map.
     * @param values Values to set into the headerFields map.
     * @param v The version of this ARC file.
     * @param offset Offset into arc file.
     *
     * @return Metadata structure for this record.
     *
     * @exception IOException If no. of keys doesn't match no. of values.
     */
    private ARCRecordMetaData computeMetaData(List<String> keys,
            List<String> values, String v, long offset)
    throws IOException {
        if (keys.size() != values.size()) {
            List<String> originalValues = values;
            if (!isStrict()) {
                values = fixSpaceInURL(values, keys.size());
                // If values still doesn't match key size, try and do
                // further repair.
                if (keys.size() != values.size()) {
                    // Early ARCs had a space in the mimetype.
                    if (values.size() == (keys.size() + 1) &&
                            values.get(4).toLowerCase().
                                startsWith("charset=")) {
                        List<String> nuvalues =
                            new ArrayList<String>(keys.size());
                        nuvalues.add(0, values.get(0));
                        nuvalues.add(1, values.get(1));
                        nuvalues.add(2, values.get(2));
                        nuvalues.add(3, values.get(3) + values.get(4));
                        nuvalues.add(4, values.get(5));
                        values = nuvalues;
                    } else if ((values.size() + 1) == keys.size() &&
                            isLegitimateIPValue(values.get(1)) &&
                            isDate(values.get(2)) &&
                            isNumber(values.get(3))) {
                        // Mimetype is empty.
                        List<String> nuvalues =
                            new ArrayList<String>(keys.size());
                        nuvalues.add(0, values.get(0));
                        nuvalues.add(1, values.get(1));
                        nuvalues.add(2, values.get(2));
                        nuvalues.add(3, "-");
                        nuvalues.add(4, values.get(3));
                        values = nuvalues;
                    }
                }
            }
            if (keys.size() != values.size()) {
                throw new IOException("Size of field name keys does" +
                    " not match count of field values: " + values);
            }
            // Note on stderr that the field was fixed.
            logStdErr(Level.WARNING, "Fixed spaces in metadata line at " +
                "offset " + offset + " Original: " + originalValues +
                ", New: " + values);
        }

        Map<Object, Object> headerFields =
            new HashMap<Object, Object>(keys.size() + 2);
        for (int i = 0; i < keys.size(); i++) {
            headerFields.put(keys.get(i), values.get(i));
        }

        // Add a check for tabs in URLs.  If any, replace with '%09'.
        // See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966,
        // [ 1010966 ] crawl.log has URIs with spaces in them.
        String url = (String)headerFields.get(URL_FIELD_KEY);
        if (url != null && url.indexOf('\t') >= 0) {
            headerFields.put(URL_FIELD_KEY,
                TextUtils.replaceAll("\t", url, "%09"));
        }

        headerFields.put(VERSION_FIELD_KEY, v);
        headerFields.put(ABSOLUTE_OFFSET_KEY, new Long(offset));

        return new ARCRecordMetaData(getReaderIdentifier(), headerFields);
    }
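    // Illustrative repair performed above (the values are assumed): an early
    // ARC that wrote a space inside the mimetype yields six tokens,
    //   [url, ip, "20040101000000", "text/html;", "charset=iso-8859-1", "2345"]
    // which the charset branch collapses to the expected five,
    //   [url, ip, "20040101000000", "text/html;charset=iso-8859-1", "2345"]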
    protected boolean isDate(final String date) {
        if (date.length() != 14) {
            return false;
        }
        return isNumber(date);
    }

    protected boolean isNumber(final String n) {
        for (int i = 0; i < n.length(); i++) {
            if (!Character.isDigit(n.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    protected boolean isLegitimateIPValue(final String ip) {
        if ("-".equals(ip)) {
            return true;
        }
        Matcher m = InetAddressUtil.IPV4_QUADS.matcher(ip);
        return m != null && m.matches();
    }

    /**
     * Fix space in URLs.
     * The ARCWriter used to write URLs with spaces in them into the ARC.
     * See <a
     * href="https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966">[ 1010966 ]
     * crawl.log has URIs with spaces in them</a>.
     * This method fixes up such headers, converting all spaces found
     * to '%20'.
     * @param values List of metadata values.
     * @param requiredSize Expected size of resultant values list.
     * @return New list if we successfully fixed up values or original if
     * fixup failed.
     */
    protected List<String> fixSpaceInURL(List<String> values,
            int requiredSize) {
        // Do validity check.  3rd from last is a date of 14 numeric
        // characters.  The 4th from last is the IP; all before the IP
        // should be concatenated together with a '%20' joiner.
        // In the below, '4' is the 4th field from the end, which has the IP.
        if (!(values.size() > requiredSize) || values.size() < 4) {
            return values;
        }
        // Test 3rd field from the end is a valid date.
        if (!isDate((String)values.get(values.size() - 3))) {
            return values;
        }
        // Test 4th field from the end is a valid IP.
        if (!isLegitimateIPValue((String)values.get(values.size() - 4))) {
            return values;
        }
        List<String> newValues = new ArrayList<String>(requiredSize);
        StringBuffer url = new StringBuffer();
        for (int i = 0; i < (values.size() - 4); i++) {
            if (i > 0) {
                url.append("%20");
            }
            url.append(values.get(i));
        }
        newValues.add(url.toString());
        for (int i = values.size() - 4; i < values.size(); i++) {
            newValues.add(values.get(i));
        }
        return newValues;
    }
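    // Illustrative fixSpaceInURL repair (assumed values): a six-token line
    // whose URL contained a single space,
    //   ["http://a/b", "c.gif", "1.2.3.4", "20040101000000", "image/gif", "1234"]
    // becomes the expected five tokens with the URL parts rejoined by '%20',
    //   ["http://a/b%20c.gif", "1.2.3.4", "20040101000000", "image/gif", "1234"]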
    protected boolean isAlignedOnFirstRecord() {
        return alignedOnFirstRecord;
    }

    protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {
        this.alignedOnFirstRecord = alignedOnFirstRecord;
    }

    /**
     * @return Returns the parseHttpHeaders.
     */
    public boolean isParseHttpHeaders() {
        return this.parseHttpHeaders;
    }

    /**
     * @param parse The parseHttpHeaders to set.
     */
    public void setParseHttpHeaders(boolean parse) {
        this.parseHttpHeaders = parse;
    }

    public String getFileExtension() {
        return ARC_FILE_EXTENSION;
    }

    public String getDotFileExtension() {
        return DOT_ARC_FILE_EXTENSION;
    }

    protected boolean output(final String format)
    throws IOException, java.text.ParseException {
        boolean result = super.output(format);
        if (!result && (format.equals(NOHEAD) || format.equals(HEADER))) {
            throw new IOException(format +
                " format only supported for single Records");
        }
        return result;
    }

    public boolean outputRecord(final String format) throws IOException {
        boolean result = super.outputRecord(format);
        if (result) {
            return result;
        }
        if (format.equals(NOHEAD)) {
            // No point digesting if dumping content.
            setDigest(false);
            ARCRecord r = (ARCRecord)get();
            r.skipHttpHeader();
            r.dump();
            result = true;
        } else if (format.equals(HEADER)) {
            // No point digesting if dumping content.
            setDigest(false);
            ARCRecord r = (ARCRecord)get();
            r.dumpHttpHeader();
            result = true;
        }
        return result;
    }

    public void dump(final boolean compress)
    throws IOException, java.text.ParseException {
        // No point digesting if we're doing a dump.
        setDigest(false);
        boolean firstRecord = true;
        ARCWriter writer = null;
        for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
            ARCRecord r = (ARCRecord)ii.next();
            // We're to dump the arc on stdout.
            // Get the first record's data if any.
            ARCRecordMetaData meta = r.getMetaData();
            if (firstRecord) {
                firstRecord = false;
                // Get an ARCWriter.
                ByteArrayOutputStream baos =
                    new ByteArrayOutputStream(r.available());
                // This is slow but done only once at top of ARC.
                while (r.available() > 0) {
                    baos.write(r.read());
                }
                List<String> listOfMetadata = new ArrayList<String>();
                listOfMetadata.add(baos.toString(WriterPoolMember.UTF8));
                // Assume getArc returns full path to file.  ARCWriter
                // or new File will complain if it is otherwise.
                writer = new ARCWriter(new AtomicInteger(), System.out,
                    new File(meta.getArc()), compress, meta.getDate(),
                    listOfMetadata);
                continue;
            }
            writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),
                ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(),
                (int)meta.getLength(), r);
        }
        // System.out.println(System.currentTimeMillis() - start);
    }
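    // Usage sketch for dump() (the path is hypothetical): rewrite every
    // record of an ARC to stdout as a new, uncompressed ARC.
    //
    //   ARCReader reader = ARCReaderFactory.get("/tmp/sample.arc.gz");
    //   reader.dump(false);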
    /**
     * @return an ArchiveReader that will delete a local file on close.  Used
     * when we bring Archive files local and need to clean up afterward.
     */
    public ARCReader getDeleteFileOnCloseReader(final File f) {
        final ARCReader d = this;
        return new ARCReader() {
            private final ARCReader delegate = d;
            private File archiveFile = f;

            public void close() throws IOException {
                this.delegate.close();
                if (this.archiveFile != null) {
                    if (archiveFile.exists()) {
                        archiveFile.delete();
                    }
                    this.archiveFile = null;
                }
            }

            public ArchiveRecord get(long o) throws IOException {
                return this.delegate.get(o);
            }

            public boolean isDigest() {
                return this.delegate.isDigest();
            }

            public boolean isStrict() {
                return this.delegate.isStrict();
            }

            public Iterator<ArchiveRecord> iterator() {
                return this.delegate.iterator();
            }

            public void setDigest(boolean d) {
                this.delegate.setDigest(d);
            }

            public void setStrict(boolean s) {
                this.delegate.setStrict(s);
            }

            public List validate() throws IOException {
                return this.delegate.validate();
            }

            @Override
            public ArchiveRecord get() throws IOException {
                return this.delegate.get();
            }

            @Override
            public String getVersion() {
                return this.delegate.getVersion();
            }

            @Override
            public List validate(int noRecords) throws IOException {
                return this.delegate.validate(noRecords);
            }

            @Override
            protected ARCRecord createArchiveRecord(InputStream is,
                    long offset) throws IOException {
                return this.delegate.createArchiveRecord(is, offset);
            }

            @Override
            protected void gotoEOR(ArchiveRecord record) throws IOException {
                this.delegate.gotoEOR(record);
            }

            @Override
            public void dump(boolean compress)
            throws IOException, java.text.ParseException {
                this.delegate.dump(compress);
            }

            @Override
            public String getDotFileExtension() {
                return this.delegate.getDotFileExtension();
            }

            @Override
            public String getFileExtension() {
                return this.delegate.getFileExtension();
            }
        };
    }

    // Static methods follow.

    /**
     * @param formatter Help formatter instance.
     * @param options Usage options.
     * @param exitCode Exit code.
     */
    private static void usage(HelpFormatter formatter, Options options,
            int exitCode) {
        formatter.printHelp("java com.cyberway.issue.io.arc.ARCReader" +
            " [--digest=true|false] \\\n" +
            " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]" +
            " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL",
            options);
        System.exit(exitCode);
    }

    /**
     * Write out the arcfile.
     *
     * @param reader
     * @param format Format to use outputting.
     * @throws IOException
     * @throws java.text.ParseException
     */
    protected static void output(ARCReader reader, String format)
    throws IOException, java.text.ParseException {
        if (!reader.output(format)) {
            throw new IOException("Unsupported format: " + format);
        }
    }

    /**
     * Generate a CDX index file for an ARC file.
     *
     * @param urlOrPath The ARC file to generate a CDX index for.
     * @throws IOException
     * @throws java.text.ParseException
     */
    public static void createCDXIndexFile(String urlOrPath)
    throws IOException, java.text.ParseException {
        ARCReader r = ARCReaderFactory.get(urlOrPath);
        r.setStrict(false);
        r.setParseHttpHeaders(true);
        r.setDigest(true);
        output(r, CDX_FILE);
    }
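    // Usage sketch for createCDXIndexFile (the path is hypothetical):
    // generate a CDX index for a local ARC.
    //
    //   ARCReader.createCDXIndexFile("/tmp/IAH-20040101000000-sample.arc.gz");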
    /**
     * Command-line interface to ARCReader.
     *
     * Here is the command-line interface:
     * <pre>
     * usage: java com.cyberway.issue.io.arc.ARCReader [--offset=#] ARCFILE
     *  -h,--help      Prints this message and exits.
     *  -o,--offset    Outputs record at this offset into arc file.</pre>
     *
     * <p>See <code>$HERITRIX_HOME/bin/arcreader</code> for a script that'll
     * take care of classpaths and the calling of ARCReader.
     *
     * <p>Outputs using a pseudo-CDX format as described here:
     * <a href="http://www.archive.org/web/researcher/cdx_legend.php">CDX
     * Legend</a> and here
     * <a href="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.
     * Legend used below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
     * Hash is hard-coded straight SHA-1 hash of content.
     *
     * @param args Command-line arguments.
     * @throws ParseException Failed parse of the command line.
     * @throws IOException
     * @throws java.text.ParseException
     */
    public static void main(String[] args)
    throws ParseException, IOException, java.text.ParseException {
        Options options = getOptions();
        options.addOption(new Option("p", "parse", false, "Parse headers."));
        PosixParser parser = new PosixParser();
        CommandLine cmdline = parser.parse(options, args, false);
        List cmdlineArgs = cmdline.getArgList();
        Option[] cmdlineOptions = cmdline.getOptions();
        HelpFormatter formatter = new HelpFormatter();

        // If no args, print help.
        if (cmdlineArgs.size() <= 0) {
            usage(formatter, options, 0);
        }

        // Now look at options passed.
        long offset = -1;
        boolean digest = false;
        boolean strict = false;
        boolean parse = false;
        String format = CDX;
        for (int i = 0; i < cmdlineOptions.length; i++) {
            switch (cmdlineOptions[i].getId()) {
                case 'h':
                    usage(formatter, options, 0);
                    break;

                case 'o':
                    offset = Long.parseLong(cmdlineOptions[i].getValue());
                    break;

                case 's':
                    strict = true;
                    break;

                case 'p':
                    parse = true;
                    break;

                case 'd':
                    digest = getTrueOrFalse(cmdlineOptions[i].getValue());
                    break;

                case 'f':
                    format = cmdlineOptions[i].getValue().toLowerCase();
                    boolean match = false;
                    // List of supported formats.
                    final String[] supportedFormats =
                        {CDX, DUMP, GZIP_DUMP, HEADER, NOHEAD, CDX_FILE};
                    for (int ii = 0; ii < supportedFormats.length; ii++) {
                        if (supportedFormats[ii].equals(format)) {
                            match = true;
                            break;
                        }
                    }
                    if (!match) {
                        usage(formatter, options, 1);
                    }
                    break;

                default:
                    throw new RuntimeException("Unexpected option: " +
                        cmdlineOptions[i].getId());
            }
        }

        if (offset >= 0) {
            if (cmdlineArgs.size() != 1) {
                System.out.println("Error: Pass one arcfile only.");
                usage(formatter, options, 1);
            }
            ARCReader arc = ARCReaderFactory.
                get((String)cmdlineArgs.get(0), offset);
            arc.setStrict(strict);
            // We must parse headers if we need to skip them.
            if (format.equals(NOHEAD) || format.equals(HEADER)) {
                parse = true;
            }
            arc.setParseHttpHeaders(parse);
            outputRecord(arc, format);
        } else {
            for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {
                String urlOrPath = (String)i.next();
                try {
                    ARCReader r = ARCReaderFactory.get(urlOrPath);
                    r.setStrict(strict);
                    r.setParseHttpHeaders(parse);
                    r.setDigest(digest);
                    output(r, format);
                } catch (RuntimeException e) {
                    // Write out name of file we failed on to help with
                    // debugging.  Then print stack trace and try to keep
                    // going.  We do this for case where we're being fed
                    // a bunch of ARCs; just note the bad one and move
                    // on to the next.
                    System.err.println("Exception processing " + urlOrPath +
                        ": " + e.getMessage());
                    e.printStackTrace(System.err);
                    System.exit(1);
                }
            }
        }
    }
}