ru.ksu.niimm.cll.mocassin.crawl.arxiv.ArxivDAOFacadeImpl.java Source code

Java tutorial

Introduction

Here is the source code for ru.ksu.niimm.cll.mocassin.crawl.arxiv.ArxivDAOFacadeImpl.java

Source

/*******************************************************************************
 * Copyright (c) 2010-2012 Nikita Zhiltsov.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl.html
 * 
 * Contributors:
 *     Nikita Zhiltsov - initial API and implementation
 *     Azat Khasanshin - implementation
 ******************************************************************************/
package ru.ksu.niimm.cll.mocassin.crawl.arxiv;

import java.io.IOException;
import java.io.InputStream;
import java.net.Authenticator;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.PasswordAuthentication;
import java.net.Proxy;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.zip.GZIPInputStream;

import org.slf4j.Logger;

import ru.ksu.niimm.cll.mocassin.util.inject.log.InjectLogger;
import ru.ksu.niimm.cll.mocassin.util.model.ArticleMetadata;
import ru.ksu.niimm.cll.mocassin.util.model.ArticleMetadataReader;
import ru.ksu.niimm.cll.mocassin.util.model.Link;
import ru.ksu.niimm.cll.mocassin.util.model.Link.PdfLinkPredicate;

import com.google.common.collect.Iterables;
import com.google.inject.Inject;
import com.google.inject.name.Named;

/**
 * The class implements calling ArXiv.org API and handling its response
 * 
 * @author Nikita Zhiltsov
 * 
 */
public class ArxivDAOFacadeImpl implements ArxivDAOFacade {
    private static final String USER_AGENT = "Mozilla/4.0";

    private static final String XML_CONTENT_TYPE = "application/xml";

    private static final String TXT_CONTENT_TYPE = "application/txt";

    private static final String PDF_CONTENT_TYPE = "application/pdf";

    private static final String SOURCE_PREFIX = "e-print";

    private static final String ABSTRACT_PREFIX = "abs";

    private static final String DEFAULT_CHARSET = "UTF-8";

    private String arxivConnectionUrl;

    private boolean isUseProxy;

    private String proxyHost;

    private int proxyPort;

    private String proxyUser;

    private String proxyPassword;

    @InjectLogger
    private Logger logger;

    @Inject
    private ArxivDAOFacadeImpl(@Named("arxiv.api.connection.url") String arxivConnectionUrl,
            @Named("isUseProxy") boolean isUseProxy, @Named("proxy.host") String proxyHost,
            @Named("proxy.port") int proxyPort, @Named("proxy.user") String proxyUser,
            @Named("proxy.password") String proxyPassword) {
        this.arxivConnectionUrl = arxivConnectionUrl;
        this.isUseProxy = isUseProxy;
        this.proxyHost = proxyHost;
        this.proxyPort = proxyPort;
        this.proxyUser = proxyUser;
        this.proxyPassword = proxyPassword;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public ArticleMetadata retrieve(String arxivId) {
        // TODO: more subtle validation is required
        if (arxivId == null)
            throw new NullPointerException("arxiv id cannot be null");
        try {
            String paramValue = arxivId;
            String query = String.format("id_list=%s", URLEncoder.encode(paramValue, DEFAULT_CHARSET));
            URL url = new URL(this.arxivConnectionUrl + "?" + query);
            InputStream response = loadFromUrl(url, XML_CONTENT_TYPE);

            ArticleMetadata metadata = ArticleMetadataReader.read(response);

            if (isUseProxy) {
                Authenticator.setDefault(null);
            }
            return metadata;

        } catch (Exception e) {
            logger.error("Failed to process the request with arXiv id='{}'", arxivId, e);
            return null;
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public InputStream loadSource(ArticleMetadata metadata) {
        String id = metadata.getId();
        String sourceLink = id.replace(ABSTRACT_PREFIX, SOURCE_PREFIX);
        InputStream inputStream = null;
        try {
            URL sourceUrl = new URL(sourceLink);
            inputStream = new GZIPInputStream(loadFromUrl(sourceUrl, TXT_CONTENT_TYPE));
        } catch (MalformedURLException e) {
            logger.error("Failed to prepare a source URL from: {}", sourceLink, e);
        } catch (IOException e) {
            logger.error("Failed to get the source of {}", id, e);
        }
        if (isUseProxy) {
            Authenticator.setDefault(null);
        }
        return inputStream;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public InputStream loadPDF(ArticleMetadata metadata) {
        Link pdfLink = Iterables.find(metadata.getLinks(), new PdfLinkPredicate(), null);
        if (pdfLink == null || (pdfLink.getHref() == null || pdfLink.getHref().length() == 0)) {
            logger.warn("Article metadata with id='{}' doesn't include PDF link; empty stream will be returned",
                    metadata.getId());
            return null;
        }
        InputStream inputStream = null;
        try {
            URL sourceUrl = new URL(pdfLink.getHref());
            inputStream = loadFromUrl(sourceUrl, PDF_CONTENT_TYPE);
        } catch (MalformedURLException e) {
            logger.error("Failed to prepare source URL from: {}", pdfLink.getHref(), e);
        } catch (IOException e) {
            logger.error("Failed to get the source of {}", metadata.getId(), e);
        }
        if (isUseProxy) {
            Authenticator.setDefault(null);
        }
        return inputStream;
    }

    private InputStream loadFromUrl(URL url, String contentType) throws IOException {
        Proxy proxy = null;
        if (isUseProxy) {
            Authenticator.setDefault(new Authenticator() {
                protected PasswordAuthentication getPasswordAuthentication() {
                    return new PasswordAuthentication(proxyUser, proxyPassword.toCharArray());
                }
            });
            proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyHost, proxyPort));

        } else {
            proxy = Proxy.NO_PROXY;
        }
        URLConnection connection = url.openConnection(proxy);

        connection.setRequestProperty("Accept-Charset", DEFAULT_CHARSET);
        connection.setRequestProperty("Content-Type", contentType);
        connection.setRequestProperty("User-Agent", USER_AGENT);

        InputStream response = connection.getInputStream();
        return response;
    }

}