org.voyanttools.trombone.input.source.UriInputSource.java Source code

Java tutorial

Introduction

Here is the source code for org.voyanttools.trombone.input.source.UriInputSource.java

Source

/*******************************************************************************
 * Trombone is a flexible text processing and analysis library used
 * primarily by Voyant Tools (voyant-tools.org).
 * 
 * Copyright (c) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
 * 
 * This file is part of Trombone.
 * 
 * Trombone is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Trombone is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Trombone.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package org.voyanttools.trombone.input.source;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URLConnection;

import org.apache.commons.codec.digest.DigestUtils;
import org.voyanttools.trombone.model.DocumentFormat;
import org.voyanttools.trombone.model.DocumentMetadata;

/**
 * An {@link InputSource} associated with a URI.
 * 
 * <p>
 * Creating an instance queries the URI's connection for its last-modified
 * time, content length and content type. These values, together with the URI
 * itself, are hashed to produce the id returned by {@link #getUniqueId()}.
 * The content itself is only read when {@link #getInputStream()} is called.
 * </p>
 * 
 * @author Stéfan Sinclair
 */
public class UriInputSource implements InputSource {

    /**
     * read timeout (in milliseconds) used when fetching header information
     */
    private static final int HEADER_READ_TIMEOUT_MILLISECONDS = 15000;

    /**
     * connect timeout (in milliseconds) used when fetching header information
     */
    private static final int HEADER_CONNECT_TIMEOUT_MILLISECONDS = 10000;

    /**
     * read timeout (in milliseconds) used when fetching the content
     */
    private static final int CONTENT_READ_TIMEOUT_MILLISECONDS = 60000;

    /**
     * connect timeout (in milliseconds) used when fetching the content
     */
    private static final int CONTENT_CONNECT_TIMEOUT_MILLISECONDS = 15000;

    /**
     * the URI for this input source
     */
    private final URI uri;

    /**
     * the id (hash) for this input source
     */
    private final String id;

    /**
     * the metadata for this input source
     */
    private final DocumentMetadata metadata;

    /**
     * Create a new instance with the specified URI.
     * 
     * <p>
     * The metadata's location, source and title are set from the URI; the
     * modified time and (when recognized) the default format are set from the
     * connection's header information.
     * </p>
     * 
     * @param uri the URI associated with this input source
     * @throws IOException
     *             thrown when there's a problem creating or accessing header
     *             information for the URI
     * @throws IllegalArgumentException
     *             thrown if the URI cannot be converted to a valid URL
     */
    public UriInputSource(URI uri) throws IOException {
        this.uri = uri;
        this.metadata = new DocumentMetadata();
        this.metadata.setLocation(uri.toString());
        this.metadata.setSource(Source.URI);
        this.metadata.setTitle(guessTitle(uri));

        // the id is a hash of the URI plus whatever header values are available
        StringBuilder idBuilder = new StringBuilder(uri.toString());

        // establish connection to find other and default metadata
        URLConnection c = null;
        try {
            c = getURLConnection(uri, HEADER_READ_TIMEOUT_MILLISECONDS, HEADER_CONNECT_TIMEOUT_MILLISECONDS);

            // last modified of file
            long modified = c.getLastModified();
            this.metadata.setModified(modified);
            idBuilder.append(modified);

            // try and get length for id (kept as the int variant on purpose:
            // switching to the long variant would change ids of very large
            // resources that have already been hashed)
            int length = c.getContentLength();
            idBuilder.append(length);

            String format = c.getContentType();
            if (format != null && !format.isEmpty()) {
                idBuilder.append(format);
                DocumentFormat docFormat = DocumentFormat.fromContentType(format);
                if (docFormat != DocumentFormat.UNKNOWN) {
                    this.metadata.setDefaultFormat(docFormat);
                }
            }

        } finally {
            // instanceof is false for null, so no separate null check is needed
            if (c instanceof HttpURLConnection) {
                ((HttpURLConnection) c).disconnect();
            }
        }

        this.id = DigestUtils.md5Hex(idBuilder.toString());
    }

    /**
     * Derive a title from the URI: the host when there is no meaningful path,
     * the full path when it ends with a slash, otherwise the last path
     * segment.
     * 
     * @param uri the URI to derive a title from
     * @return the title to use for the URI
     */
    private static String guessTitle(URI uri) {
        String path = uri.getPath();
        if (path == null) {
            // opaque URIs (e.g. jar:file:/a.jar!/b.txt) have no path
            // component, so fall back to the scheme-specific part rather than
            // failing with a NullPointerException
            path = uri.getSchemeSpecificPart();
        }
        if (path == null || path.isEmpty() || path.equals("/")) { // no path, use host
            String host = uri.getHost();
            // the host may be undefined (e.g. file:///), use the whole URI then
            return host != null ? host : uri.toString();
        }
        if (path.endsWith("/")) { // ends in slash, use full path
            return path;
        }
        // try to use file part of URI
        return new File(path).getName();
    }

    /**
     * Get a connection for the URI using the default timeouts for reading
     * content.
     * 
     * @param uri the URI to connect to
     * @return the configured connection
     * @throws IOException thrown if the connection can't be opened
     */
    private URLConnection getURLConnection(URI uri) throws IOException {
        return getURLConnection(uri, CONTENT_READ_TIMEOUT_MILLISECONDS, CONTENT_CONNECT_TIMEOUT_MILLISECONDS);
    }

    /**
     * Get a connection for the URI with the specified timeouts and a
     * Trombone-specific User-Agent request property.
     * 
     * @param uri the URI to connect to
     * @param readTimeoutMilliseconds the read timeout, in milliseconds
     * @param connectTimeoutMilliseconds the connect timeout, in milliseconds
     * @return the configured connection
     * @throws IOException thrown if the connection can't be opened
     * @throws IllegalArgumentException
     *             thrown if the URI cannot be converted to a valid URL
     */
    private URLConnection getURLConnection(URI uri, int readTimeoutMilliseconds, int connectTimeoutMilliseconds)
            throws IOException {
        URLConnection c;
        try {
            c = uri.toURL().openConnection();
        } catch (MalformedURLException e) {
            throw new IllegalArgumentException("Attempt to use a malformed URL: " + uri, e);
        }
        c.addRequestProperty("User-Agent", "Mozilla/4.0 (compatible; Trombone)");
        c.setReadTimeout(readTimeoutMilliseconds);
        c.setConnectTimeout(connectTimeoutMilliseconds);
        return c;
    }

    /**
     * Open a new connection to the URI and return its content stream. The
     * caller is responsible for closing the returned stream.
     * 
     * @return the input stream for the content of the URI
     * @throws IOException thrown if the content can't be accessed
     */
    public InputStream getInputStream() throws MalformedURLException, IOException {
        URLConnection c = getURLConnection(uri);
        try {
            // let's hope that the connection is closed when the stream is closed
            return c.getInputStream();
        } catch (IOException e) {
            // no stream was handed to the caller, so nobody else can release
            // the connection
            if (c instanceof HttpURLConnection) {
                ((HttpURLConnection) c).disconnect();
            }
            throw e;
        }
    }

    /**
     * Get the metadata for this input source.
     * 
     * @return the metadata created for the URI
     */
    public DocumentMetadata getMetadata() {
        return this.metadata;
    }

    /**
     * Get the id for this input source.
     * 
     * @return the MD5 hash (in hexadecimal) of the URI and its header values
     */
    public String getUniqueId() {
        return this.id;
    }

}