// Java tutorial
/* ARCRecord * * $Id: ARCRecord.java 5943 2008-08-01 23:01:27Z gojomo $ * * Created on Jan 7, 2004 * * Copyright (C) 2004 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package com.cyberway.issue.io.arc; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpParser; import org.apache.commons.httpclient.StatusLine; import org.apache.commons.httpclient.util.EncodingUtil; import com.cyberway.issue.io.ArchiveRecord; import com.cyberway.issue.io.ArchiveRecordHeader; import com.cyberway.issue.io.RecoverableIOException; /** * An ARC file record. * Does not compass the ARCRecord metadata line, just the record content. * @author stack */ public class ARCRecord extends ArchiveRecord implements ARCConstants { /** * Http status line object. * * May be null if record is not http. */ private StatusLine httpStatus = null; /** * Http header bytes. * * If non-null and bytes available, give out its contents before we * go back to the underlying stream. */ private InputStream httpHeaderStream = null; /** * Http headers. * * Only populated after reading of headers. 
*/ private Header[] httpHeaders = null; /** * Minimal http header length. * * I've seen in arcs content length of 1 with no * header. */ private static final long MIN_HTTP_HEADER_LENGTH = "HTTP/1.1 200 OK\r\n".length(); /** * Constructor. * * @param in Stream cue'd up to be at the start of the record this instance * is to represent. * @param metaData Meta data. * @throws IOException */ public ARCRecord(InputStream in, ArchiveRecordHeader metaData) throws IOException { this(in, metaData, 0, true, false, true); } /** * Constructor. * * @param in Stream cue'd up to be at the start of the record this instance * is to represent. * @param metaData Meta data. * @param bodyOffset Offset into the body. Usually 0. * @param digest True if we're to calculate digest for this record. Not * digesting saves about ~15% of cpu during an ARC parse. * @param strict Be strict parsing (Parsing stops if ARC inproperly * formatted). * @param parseHttpHeaders True if we are to parse HTTP headers. Costs * about ~20% of CPU during an ARC parse. * @throws IOException */ public ARCRecord(InputStream in, ArchiveRecordHeader metaData, int bodyOffset, boolean digest, boolean strict, final boolean parseHttpHeaders) throws IOException { super(in, metaData, bodyOffset, digest, strict); if (parseHttpHeaders) { this.httpHeaderStream = readHttpHeader(); } } /** * Skip over the the http header if one present. * * Subsequent reads will get the body. * * <p>Calling this method in the midst of reading the header * will make for strange results. Otherwise, safe to call * at any time though before reading any of the arc record * content is only time that it makes sense. * * <p>After calling this method, you can call * {@link #getHttpHeaders()} to get the read http header. 
* * @throws IOException */ public void skipHttpHeader() throws IOException { if (this.httpHeaderStream != null) { // Empty the httpHeaderStream for (int available = this.httpHeaderStream.available(); this.httpHeaderStream != null && (available = this.httpHeaderStream.available()) > 0;) { // We should be in this loop once only we should only do this // buffer allocation once. byte[] buffer = new byte[available]; // The read nulls out httpHeaderStream when done with it so // need check for null in the loop control line. read(buffer, 0, available); } } } public void dumpHttpHeader() throws IOException { if (this.httpHeaderStream == null) { return; } // Dump the httpHeaderStream to STDOUT for (int available = this.httpHeaderStream.available(); this.httpHeaderStream != null && (available = this.httpHeaderStream.available()) > 0;) { // We should be in this loop only once and should do this // buffer allocation once. byte[] buffer = new byte[available]; // The read nulls out httpHeaderStream when done with it so // need check for null in the loop control line. int read = read(buffer, 0, available); System.out.write(buffer, 0, read); } } /** * Read http header if present. Technique borrowed from HttpClient HttpParse * class. * * @return ByteArrayInputStream with the http header in it or null if no * http header. * @throws IOException */ private InputStream readHttpHeader() throws IOException { // If judged a record that doesn't have an http header, return // immediately. if (!getHeader().getUrl().startsWith("http") || getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) { return null; } byte[] statusBytes = HttpParser.readRawLine(getIn()); int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new IOException("Failed to read http status where one was expected: " + ((statusBytes == null) ? 
"" : new String(statusBytes))); } String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) { if (statusLine.startsWith("DELETED")) { // Some old ARCs have deleted records like following: // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202 // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist // (follows ~29K spaces) // For now, throw a RecoverableIOException so if iterating over // records, we keep going. TODO: Later make a legitimate // ARCRecord from the deleted record rather than throw // exception. throw new DeletedARCRecordIOException(statusLine); } else { throw new IOException("Failed parse of http status line."); } } this.httpStatus = new StatusLine(statusLine); // Save off all bytes read. Keep them as bytes rather than // convert to strings so we don't have to worry about encodings // though this should never be a problem doing http headers since // its all supposed to be ascii. ByteArrayOutputStream baos = new ByteArrayOutputStream(statusBytes.length + 4 * 1024); baos.write(statusBytes); // Now read rest of the header lines looking for the separation // between header and body. for (byte[] lineBytes = null; true;) { lineBytes = HttpParser.readRawLine(getIn()); eolCharCount = getEolCharsCount(lineBytes); if (eolCharCount <= 0) { throw new IOException( "Failed reading http headers: " + ((lineBytes != null) ? new String(lineBytes) : null)); } // Save the bytes read. baos.write(lineBytes); if ((lineBytes.length - eolCharCount) <= 0) { // We've finished reading the http header. break; } } byte[] headerBytes = baos.toByteArray(); // Save off where body starts. 
this.getMetaData().setContentBegin(headerBytes.length); ByteArrayInputStream bais = new ByteArrayInputStream(headerBytes); if (!bais.markSupported()) { throw new IOException("ByteArrayInputStream does not support mark"); } bais.mark(headerBytes.length); // Read the status line. Don't let it into the parseHeaders function. // It doesn't know what to do with it. bais.read(statusBytes, 0, statusBytes.length); this.httpHeaders = HttpParser.parseHeaders(bais, ARCConstants.DEFAULT_ENCODING); this.getMetaData().setStatusCode(Integer.toString(getStatusCode())); bais.reset(); return bais; } private static class DeletedARCRecordIOException extends RecoverableIOException { public DeletedARCRecordIOException(final String reason) { super(reason); } } /** * Return status code for this record. * * This method will return -1 until the http header has been read. * @return Status code. */ public int getStatusCode() { return (this.httpStatus == null) ? -1 : this.httpStatus.getStatusCode(); } /** * @param bytes Array of bytes to examine for an EOL. * @return Count of end-of-line characters or zero if none. */ private int getEolCharsCount(byte[] bytes) { int count = 0; if (bytes != null && bytes.length >= 1 && bytes[bytes.length - 1] == '\n') { count++; if (bytes.length >= 2 && bytes[bytes.length - 2] == '\r') { count++; } } return count; } /** * @return Meta data for this record. */ public ARCRecordMetaData getMetaData() { return (ARCRecordMetaData) getHeader(); } /** * @return http headers (Only available after header has been read). */ public Header[] getHttpHeaders() { return this.httpHeaders; } /** * @return Next character in this ARCRecord's content else -1 if at end of * this record. * @throws IOException */ public int read() throws IOException { int c = -1; if (this.httpHeaderStream != null && (this.httpHeaderStream.available() > 0)) { // If http header, return bytes from it before we go to underlying // stream. 
c = this.httpHeaderStream.read(); // If done with the header stream, null it out. if (this.httpHeaderStream.available() <= 0) { this.httpHeaderStream = null; } incrementPosition(); } else { c = super.read(); } return c; } public int read(byte[] b, int offset, int length) throws IOException { int read = -1; if (this.httpHeaderStream != null && (this.httpHeaderStream.available() > 0)) { // If http header, return bytes from it before we go to underlying // stream. read = Math.min(length, this.httpHeaderStream.available()); if (read == 0) { read = -1; } else { read = this.httpHeaderStream.read(b, offset, read); } // If done with the header stream, null it out. if (this.httpHeaderStream.available() <= 0) { this.httpHeaderStream = null; } incrementPosition(read); } else { read = super.read(b, offset, length); } return read; } /** * @return Offset at which the body begins (Only known after * header has been read) or -1 if none or if we haven't read * headers yet. Usually length of HTTP headers (does not include ARC * metadata line length). */ public int getBodyOffset() { return this.getMetaData().getContentBegin(); } @Override protected String getIp4Cdx(ArchiveRecordHeader h) { String result = null; if (h instanceof ARCRecordMetaData) { result = ((ARCRecordMetaData) h).getIp(); } return (result != null) ? result : super.getIp4Cdx(h); } @Override protected String getStatusCode4Cdx(ArchiveRecordHeader h) { String result = null; if (h instanceof ARCRecordMetaData) { result = ((ARCRecordMetaData) h).getStatusCode(); } return (result != null) ? result : super.getStatusCode4Cdx(h); } @Override protected String getDigest4Cdx(ArchiveRecordHeader h) { String result = null; if (h instanceof ARCRecordMetaData) { result = ((ARCRecordMetaData) h).getDigest(); } return (result != null) ? result : super.getDigest4Cdx(h); } }