com.digitalpebble.behemoth.io.warc.HttpResponse.java Source code

Java tutorial

Introduction

Here is the source code for com.digitalpebble.behemoth.io.warc.HttpResponse.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.digitalpebble.behemoth.io.warc;

// JDK imports
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

import org.apache.commons.lang.StringUtils;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;

/** An HTTP response - borrowed from Nutch */
public class HttpResponse {

    private byte[] content;
    private int code;
    private Metadata headers = new org.apache.nutch.metadata.Metadata();

    public static final int BUFFER_SIZE = 8 * 1024;

    public HttpResponse(byte[] response) throws ProtocolException, IOException {

        PushbackInputStream in = // process response
                new PushbackInputStream(new ByteArrayInputStream(response), BUFFER_SIZE);

        StringBuilder line = new StringBuilder();

        boolean haveSeenNonContinueStatus = false;
        while (!haveSeenNonContinueStatus) {
            // parse status code line
            this.code = parseStatusLine(in, line);
            // parse headers
            parseHeaders(in, line);
            haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
        }

        readPlainContent(in);
    }

    public int getCode() {
        return code;
    }

    public String getHeader(String name) {
        return headers.get(name);
    }

    public Metadata getHeaders() {
        return headers;
    }

    public byte[] getContent() {
        return content;
    }

    /*
     * ------------------------- * <implementation:Response> *
     * -------------------------
     */

    private void readPlainContent(InputStream in) throws IOException {

        int contentLength = Integer.MAX_VALUE; // get content length
        String contentLengthString = headers.get(Response.CONTENT_LENGTH);
        if (StringUtils.isNotBlank(contentLengthString)) {
            contentLengthString = contentLengthString.trim();
            try {
                contentLength = Integer.parseInt(contentLengthString);
            } catch (NumberFormatException e) {
                throw new IOException("bad content length: " + contentLengthString);
            }
        }

        ByteArrayOutputStream out = new ByteArrayOutputStream(BUFFER_SIZE);
        byte[] bytes = new byte[BUFFER_SIZE];
        int length = 0; // read content
        for (int i = in.read(bytes); i != -1; i = in.read(bytes)) {

            out.write(bytes, 0, i);
            length += i;
            if (length >= contentLength)
                break;
        }
        content = out.toByteArray();
    }

    private int parseStatusLine(PushbackInputStream in, StringBuilder line) throws IOException {
        // skip first character if "\n"
        if (peek(in) == '\n') {
            in.read();
        }
        readLine(in, line, false);

        int codeStart = line.indexOf(" ");
        int codeEnd = line.indexOf(" ", codeStart + 1);

        // handle lines with no plaintext result code, ie:
        // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
        if (codeEnd == -1)
            codeEnd = line.length();

        int code;
        try {
            code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
        } catch (NumberFormatException e) {
            throw new IOException("bad status line '" + line + "': " + e.getMessage(), e);
        }

        return code;
    }

    private void processHeaderLine(StringBuilder line) throws IOException {

        int colonIndex = line.indexOf(":"); // key is up to colon
        if (colonIndex == -1) {
            int i;
            for (i = 0; i < line.length(); i++)
                if (!Character.isWhitespace(line.charAt(i)))
                    break;
            if (i == line.length())
                return;
            throw new IOException("No colon in header:" + line);
        }
        String key = line.substring(0, colonIndex);

        int valueStart = colonIndex + 1; // skip whitespace
        while (valueStart < line.length()) {
            int c = line.charAt(valueStart);
            if (c != ' ' && c != '\t')
                break;
            valueStart++;
        }
        String value = line.substring(valueStart);
        headers.set(key, value);
    }

    // Adds headers to our headers Metadata
    private void parseHeaders(PushbackInputStream in, StringBuilder line) throws IOException {

        while (readLine(in, line, true) != 0) {

            // handle HTTP responses with missing blank line after headers
            int pos;
            if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
                    || ((pos = line.indexOf("<html")) != -1)) {

                in.unread(line.substring(pos).getBytes("UTF-8"));
                line.setLength(pos);

                try {
                    // TODO: (CM) We don't know the header names here
                    // since we're just handling them generically. It would
                    // be nice to provide some sort of mapping function here
                    // for the returned header names to the standard metadata
                    // names in the ParseData class
                    processHeaderLine(line);
                } catch (Exception e) {

                }
                return;
            }

            processHeaderLine(line);
        }
    }

    private static int readLine(PushbackInputStream in, StringBuilder line, boolean allowContinuedLine)
            throws IOException {
        line.setLength(0);
        for (int c = in.read(); c != -1; c = in.read()) {
            switch (c) {
            case '\r':
                if (peek(in) == '\n') {
                    in.read();
                }
            case '\n':
                if (line.length() > 0) {
                    // at EOL -- check for continued line if the current
                    // (possibly continued) line wasn't blank
                    if (allowContinuedLine)
                        switch (peek(in)) {
                        case ' ':
                        case '\t': // line is continued
                            in.read();
                            continue;
                        }
                }
                return line.length(); // else complete
            default:
                line.append((char) c);
            }
        }
        throw new EOFException();
    }

    private static int peek(PushbackInputStream in) throws IOException {
        int value = in.read();
        in.unread(value);
        return value;
    }

}