Java tutorial
/* HDFSWriterDocument * * $Id$ * * Created on January 28th, 2007 * * Copyright (C) 2007 Zvents * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.archive.io.hdfs; import java.io.DataInput; import java.io.DataInputStream; import java.io.DataOutput; import java.io.DataOutputStream; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import java.util.HashMap; import java.util.Iterator; import java.util.StringTokenizer; import java.util.Map.Entry; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.io.Writable; import org.apache.log4j.Logger; import org.archive.io.hdfs.util.ByteScan; /** * A class to parse and generate the document objects created by the hdfs-writer-processor * * @author Doug Judd */ public class HDFSWriterDocument implements Writable { @SuppressWarnings("unused") private final Logger LOG = Logger.getLogger(this.getClass().getName()); private byte[] header = { 'H', 'D', 'F', 'S', 'W', 'r', 'i', 't', 'e', 'r', '/', '0', '.', '3', '\r', '\n' }; private char[] contentTypeChars = { 'c', 'o', 'n', 't', 'e', 'n', 't', '-', 't', 'y', 'p', 'e' }; /*private char [] httpLower = { 'h','t','t','p' }; private char [] httpUpper = { 'H','T','T','P' };*/ private static int HIGH_WATER_BUFFER_LENGTH = 524288; protected byte[] buf = null; protected int pos; protected int length; private byte[] requestBase; private int requestOffset; private int requestLength; private byte[] responseBase; private int responseOffset; private int responseBodyOffset; private int responseLength; private int responseCode; private String charset; private String contentType; private HashMap<String, String> fieldMap; private boolean isHttp = false; private boolean isModified = false; private String url = null; private String scheme = null; private ByteScan.State bss = new ByteScan.State(); private ByteScan.State tmpBss = new ByteScan.State(); private String extension = null; /** * Returns the byte array holding the document */ public byte[] getBytes() { reconstructDocument(); return buf; } public byte[] getRequestBytes() { return (requestBase != null) ? requestBase : buf; } public int getRequestOffset() { return requestOffset; } public int getRequestLength() { return requestLength; } public String getRequestString() throws UnsupportedEncodingException { return StringUtils .chomp(new String(getRequestBytes(), getRequestOffset(), getRequestLength(), getValidCharset())); } public byte[] getResponseBytes() { return (responseBase != null) ? responseBase : buf; } public int getResponseOffset() { return responseOffset; } public int getResponseBodyOffset() { return responseBodyOffset; } public int getResponseLength() { return responseLength; } public String getResponseString() throws UnsupportedEncodingException { return StringUtils .chomp(new String(getResponseBytes(), getResponseOffset(), getResponseLength(), getValidCharset())); } public int getResponseCode() { return responseCode; } public String getCharset() { return charset; } //private static byte [] testBytes = { 'f','o','o' }; public String getValidCharset() { if (charset == null) return "ISO-8859-1"; try { //String tmp = new String(testBytes, charset); return charset; } catch (Exception e) { } return "ISO-8859-1"; } public String getContentType() { return contentType; } public String getURLScheme() { return scheme; } /** * Returns lowercased file extension */ public String getURLFileExtension() { if (extension != null) return extension; if (url == null) return null; try { URL u = new URL(url); String path = u.getPath(); int lastDot = path.lastIndexOf('.'); int lastSlash = path.lastIndexOf('/'); if (lastDot == -1 || lastDot < lastSlash) return null; extension = path.substring(lastDot + 1).toLowerCase(); if (extension.length() == 0) extension = null; } catch (MalformedURLException e) { // do nothing } return extension; } /** * Adds a field mapping * * @param label field label * @param value field value */ public void setField(String label, String value) { if (fieldMap == null) fieldMap = new HashMap<String, String>(); fieldMap.put(label, value); if (label.equals("URL")) { url = value; int colon = url.indexOf(':'); if (colon > 0) { scheme = url.substring(0, colon).toLowerCase(); isHttp = scheme.equals("http") ? true : false; } } isModified = true; } /** * Get a field mapping * * @param label field label * @return value for <code>label</code> */ public String getField(String label) { return fieldMap.get(label); } /** * Returns a hash map of field mappings * * @return HashMap of field mappings */ public HashMap<String, String> getFieldMap() { return fieldMap; } /** * Set the HTTP request * * @param httpRequest byte array holding the CRLF terminated HTTP request * @param offset offset within <code>httpRequest</code> of request start * @param length length of request */ public void setHttpRequest(byte[] httpRequest, int offset, int length) { requestBase = httpRequest; requestOffset = offset; requestLength = length; isModified = true; } /** * Set the HTTP response * * @param httpResponse byte array holding the HTTP response * @param offset Offset within <code>httpResponse</code> of response start * @param length Length of response */ public void setHttpResponse(byte[] httpResponse, int offset, int length) { responseBase = httpResponse; responseOffset = responseBodyOffset = offset; responseLength = length; parseResponse(); isModified = true; } public void load(byte[] docBytes) throws IOException { load(docBytes, 4, docBytes.length - 4); } /** * Parses the given document, populating all of the interal attributes * * @param docBytes byte array holding the document * @param offset offset into <code>docBytes</code> where document begins * @param length length of document */ public void load(byte[] docBytes, int offset, int length) throws IOException { int base; isHttp = false; isModified = false; url = null; scheme = null; extension = null; // allocate new buffer if necessary if (buf == null || buf.length < length || buf.length > HIGH_WATER_BUFFER_LENGTH) { buf = new byte[length]; } System.arraycopy(docBytes, offset, buf, 0, length); this.length = length; this.pos = 0; this.responseCode = 0; this.charset = null; this.contentType = null; this.requestBase = buf; this.requestOffset = offset; this.requestLength = length; this.responseBase = buf; this.responseOffset = offset; this.responseBodyOffset = offset; this.responseLength = length; this.url = null; this.scheme = null; this.extension = null; if (this.length < header.length) throw new IOException("document truncated"); for (pos = 0; pos < header.length; pos++) { if (buf[pos] != header[pos]) throw new IOException("bad document header at position " + pos + "(" + (new String(buf, 0, header.length + 4)) + ")"); } /** * Load ANVLRecord */ fieldMap = new HashMap<String, String>(); while (pos < this.length - 1) { // check for ANVL termination if (buf[pos] == '\n') { pos++; break; } else if (buf[pos] == '\r' && buf[pos + 1] == '\n') { pos += 2; break; } base = pos; // find colon while (pos < this.length && buf[pos] != ':' && buf[pos] != '\n') pos++; if (buf[pos] == '\n') { pos++; break; } else { boolean isUrl = false; String label = new String(buf, base, pos - base); if (pos - base >= 3 && buf[base] == 'U' && buf[base + 1] == 'R' && buf[base + 2] == 'L') isUrl = true; pos++; // skip whitespace while (pos < this.length && (buf[pos] == ' ' || buf[pos] == '\t')) pos++; base = pos; // find LF while (pos < this.length && buf[pos] != '\n') pos++; int endpos = (buf[pos - 1] == '\r') ? pos - 1 : pos; String value = new String(buf, base, endpos - base); // check for http if (isUrl) { url = value; int colon = url.indexOf(':'); if (colon > 0) { scheme = url.substring(0, colon).toLowerCase(); isHttp = scheme.equals("http") ? true : false; } } if (value.length() > 0) fieldMap.put(label, value); pos++; } } requestBase = buf; /** * Read HTTP Request */ if (isHttp) { for (requestOffset = pos; pos < this.length; pos++) { if (buf[pos] == '\n') { if (pos < this.length - 2) { if (buf[pos + 1] == '\n') { requestLength = (pos + 2) - requestOffset; pos += 2; break; } else if (buf[pos + 1] == '\r' && buf[pos + 2] == '\n') { requestLength = (pos + 3) - requestOffset; pos += 3; break; } } else { requestLength = this.length - requestOffset; pos = this.length; break; } } } if (pos == this.length) requestLength = this.length - requestOffset; } else { requestOffset = 0; requestLength = 0; } /** * Parse response */ responseBase = buf; responseOffset = responseBodyOffset = pos; responseLength = this.length - responseOffset; if (isHttp) parseResponse(); } /** * Parses the HTTP response section, determining the body offset and * setting the charset field */ private void parseResponse() { //int base; boolean parsingContentType = false; bss.init(responseBase, responseOffset, responseOffset + responseLength); if (!ByteScan.SkipToWhitespace(bss)) return; if (!ByteScan.SkipWhitespace(bss)) return; ByteScan.ParseInt(bss); responseCode = bss.ival; while (!bss.eob()) { if (bss.buf[bss.offset] == '\n') { if (parsingContentType) { tmpBss.init(bss.buf, bss.mark, bss.offset); parseContentType(tmpBss); parsingContentType = false; } if (bss.offset < bss.end - 2) { if (bss.buf[bss.offset + 1] == '\n') { responseBodyOffset = bss.offset + 2; bss.offset += 2; break; } else if (bss.buf[bss.offset + 1] == '\r' && bss.buf[bss.offset + 2] == '\n') { responseBodyOffset = bss.offset + 3; bss.offset += 3; break; } } else { responseBodyOffset = bss.end; bss.offset = bss.end; break; } bss.offset++; bss.mark(); } else if (bss.buf[bss.offset] == ':') { tmpBss.init(bss.buf, bss.mark, bss.offset); if (ByteScan.Equals(tmpBss, contentTypeChars)) parsingContentType = true; bss.offset++; bss.mark(); } else bss.offset++; } responseBodyOffset = bss.offset; if (contentType == null || contentType.startsWith("text")) findAndParseContentType(bss); } private char[] metaChars = { 'm', 'e', 't', 'a' }; private char[] xmlChars = { '?', 'x', 'm', 'l' }; private char[] httpEquivChars = { 'h', 't', 't', 'p', '-', 'e', 'q', 'u', 'i', 'v' }; private char[] contentChars = { 'c', 'o', 'n', 't', 'e', 'n', 't' }; private char[] encodingChars = { 'e', 'n', 'c', 'o', 'd', 'i', 'n', 'g' }; private char[] closeHeadChars = { '/', 'h', 'e', 'a', 'd' }; private char[] xmlEndChars = { '?', '>' }; /** * Looks for <META http-equiv="Content-Type" ... or * <?xml ... encoding=" and extracts content-type and/or * charset info */ private void findAndParseContentType(ByteScan.State bss) { if (!ByteScan.FindSkip(bss, '<')) return; /** * XML */ if (ByteScan.StartsWithSkip(bss, xmlChars)) { // set byte scan region to that inside <?xml ... ?> bss.mark(); if (!ByteScan.Find(bss, xmlEndChars)) return; bss.end = bss.offset; bss.flip(); // find encoding if (!ByteScan.FindSkip(bss, encodingChars)) return; // find starting doublequote if (!ByteScan.FindSkip(bss, '"')) return; bss.mark(); if (!ByteScan.Find(bss, '"')) return; charset = bss.toString().toUpperCase(); return; } /** * HTML */ bss.offset--; while (ByteScan.FindSkip(bss, '<')) { if (ByteScan.StartsWithSkip(bss, metaChars)) { // make sure next comes whitespace bss.mark(); if (!ByteScan.SkipWhitespace(bss)) return; if (bss.offset == bss.mark) continue; if (!ByteScan.StartsWithSkip(bss, httpEquivChars)) continue; if (!ByteScan.SkipWhitespace(bss)) return; // skip '=' character if (bss.buf[bss.offset++] != '=') continue; if (!ByteScan.SkipWhitespace(bss)) return; // skip '"' character if (bss.buf[bss.offset++] != '"') continue; bss.mark(); if (!ByteScan.FindSkip(bss, '"')) return; bss.flip(); if (!ByteScan.StartsWith(bss, contentTypeChars)) continue; bss.flip(); if (!ByteScan.SkipWhitespace(bss)) return; if (!ByteScan.StartsWithSkip(bss, contentChars)) return; if (!ByteScan.SkipWhitespace(bss)) return; // skip '=' character if (bss.buf[bss.offset++] != '=') continue; if (!ByteScan.SkipWhitespace(bss)) return; // skip '=' character if (bss.buf[bss.offset++] != '"') continue; bss.mark(); if (!ByteScan.Find(bss, '"')) return; tmpBss.init(bss.buf, bss.mark, bss.offset); parseContentType(tmpBss); if (contentType == null) contentType = "text/html"; break; } else if (ByteScan.Equals(bss, closeHeadChars)) { break; } } } /** * Parses the Content-Type HTTP header */ private void parseContentType(ByteScan.State bss) { if (!ByteScan.SkipWhitespace(bss)) return; if (bss.buf[bss.offset] == '"') bss.offset++; if (!ByteScan.SkipWhitespace(bss)) return; bss.mark(); while (!bss.eob() && (Character.isLetterOrDigit(bss.buf[bss.offset]) || bss.buf[bss.offset] == '/' || bss.buf[bss.offset] == '-' || bss.buf[bss.offset] == '+' || bss.buf[bss.offset] == '.')) bss.offset++; contentType = bss.toString().toLowerCase().trim(); if (contentType != null && contentType.indexOf("/") == -1) contentType = null; if (!ByteScan.Find(bss, ';')) return; if (!bss.eob()) { bss.mark(); bss.offset = bss.end; String inputStr = bss.toString().toLowerCase(); String paramStr; StringTokenizer st = new StringTokenizer(inputStr, ";"); while (st.hasMoreTokens()) { paramStr = st.nextToken().trim(); if (paramStr.startsWith("charset")) { int eqOff = paramStr.indexOf('=', 7); if (eqOff >= 0) { charset = paramStr.substring(eqOff + 1).trim().toUpperCase(); charset = cleanCharset(charset); } } } } } private String cleanCharset(String charset) { if (charset.startsWith("\"") && charset.endsWith("\"")) charset = charset.substring(1, charset.length() - 1); char[] csetChars = charset.toCharArray(); int len = 0; while (len < csetChars.length && (Character.isLetterOrDigit(csetChars[len]) || csetChars[len] == '-')) len++; if (len < csetChars.length) charset = new String(csetChars, 0, len); if (!charset.startsWith("ISO-") && ((charset.endsWith("8859-1") || charset.endsWith("8859_1")))) { charset = "ISO-8859-1"; } else if (charset.endsWith("UTF8")) { charset = "UTF-8"; } return charset; } /** * Reconstructs document, from internal fields, into byte array for serialization */ private void reconstructDocument() { if (isModified) { StringBuilder anvlBlock = new StringBuilder(); for (Iterator<Entry<String, String>> iter = fieldMap.entrySet().iterator(); iter.hasNext();) { Entry<String, String> entry = (Entry<String, String>) iter.next(); String key = (String) entry.getKey(); String value = (String) entry.getValue(); anvlBlock.append(key + ": " + value + "\r\n"); } anvlBlock.append("\r\n"); byte[] anvlHeaderBytes = anvlBlock.toString().getBytes(); byte[] newbuf = new byte[header.length + anvlHeaderBytes.length + requestLength + responseLength]; int pos = 0; // write header System.arraycopy(header, 0, newbuf, pos, header.length); pos += header.length; // write ANVL fields System.arraycopy(anvlHeaderBytes, 0, newbuf, pos, anvlHeaderBytes.length); pos += anvlHeaderBytes.length; // write request if (requestLength > 0) { System.arraycopy(requestBase, requestOffset, newbuf, pos, requestLength); pos += requestLength; } // write response System.arraycopy(responseBase, responseOffset, newbuf, pos, responseLength); buf = newbuf; isModified = false; } } /** * Writes the fields of this object to <code>out</code>. * * @param out output object to serialize to */ public void write(DataOutput out) throws IOException { reconstructDocument(); out.writeInt(buf.length); out.write(buf); } /** * Reads the fields of this object from <code>in</code>. * * @param in input object to de-serialize from */ public void readFields(DataInput in) throws IOException { int length = in.readInt(); byte[] docBytes = new byte[length]; in.readFully(docBytes); load(docBytes, 0, length); } static void printUsage() { System.out.println("HDFSWriterDocument <input-file>"); System.exit(1); } /** * Test driver for HDFSWriterDocument * @throws IOException When there is an IO error */ public static void main(String[] args) throws IOException { HDFSWriterDocument hdfsDoc = new HDFSWriterDocument(); HDFSWriterDocument hdfsDocNew = new HDFSWriterDocument(); if (args.length < 1) printUsage(); FileInputStream fis = new FileInputStream(args[0]); DataInputStream dis = new DataInputStream(fis); hdfsDoc.readFields(dis); System.out.println("\nANVL FIELDS:"); HashMap<String, String> fmap = hdfsDoc.getFieldMap(); // Assuming map = Map<String, String> for (Iterator<Entry<String, String>> iter = fmap.entrySet().iterator(); iter.hasNext();) { Entry<String, String> entry = (Entry<String, String>) iter.next(); String key = (String) entry.getKey(); String value = (String) entry.getValue(); System.out.println(key + ": " + value); hdfsDocNew.setField(key, value); } System.out.println(); if (hdfsDoc.getRequestLength() > 0) { byte[] requestBytes = hdfsDoc.getRequestBytes(); String requestStr = new String(requestBytes, hdfsDoc.getRequestOffset(), hdfsDoc.getRequestLength()); System.out.println("REQUEST:"); System.out.print(requestStr); hdfsDocNew.setHttpRequest(requestBytes, hdfsDoc.getRequestOffset(), hdfsDoc.getRequestLength()); } if (hdfsDoc.getResponseLength() > 0) { byte[] responseBytes = hdfsDoc.getResponseBytes(); String responseStr = new String(responseBytes, hdfsDoc.getResponseOffset(), 512); System.out.println("RESPONSE:"); System.out.print(responseStr); System.out.print("\n[...]\n\n"); /* if (hdfsDoc.getResponseBodyOffset() > 0) { responseStr = new String(responseBytes, hdfsDoc.getResponseBodyOffset(), 256); System.out.println("BODY:"); System.out.print(responseStr); System.out.print("\n[...]\n\n"); } */ hdfsDocNew.setHttpResponse(responseBytes, hdfsDoc.getResponseOffset(), hdfsDoc.getResponseLength()); } System.out.println("DERIVED FIELDS:"); System.out.println("Content-Type = '" + hdfsDoc.getContentType() + "'"); System.out.println("Charset = '" + hdfsDoc.getCharset() + "'"); System.out.println("Response Code = " + hdfsDoc.getResponseCode() + "'"); System.out.println("Scheme = '" + hdfsDoc.getURLScheme() + "'"); System.out.println("File Extension = '" + hdfsDoc.getURLFileExtension() + "'"); System.out.println(); if (args.length == 2) { FileOutputStream fos = new FileOutputStream(args[1]); DataOutputStream dos = new DataOutputStream(fos); hdfsDocNew.write(dos); dos.flush(); } } }