net.sf.jabref.importer.fetcher.OAI2Fetcher.java Source code

Java tutorial

Introduction

Here is the source code for net.sf.jabref.importer.fetcher.OAI2Fetcher.java

Source

/*  Copyright (C) 2003-2016 JabRef contributors.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
    
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package net.sf.jabref.importer.fetcher;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.Locale;

import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import net.sf.jabref.gui.help.HelpFile;
import net.sf.jabref.importer.ImportInspector;
import net.sf.jabref.importer.OAI2Handler;
import net.sf.jabref.importer.OutputPrinter;
import net.sf.jabref.logic.l10n.Localization;
import net.sf.jabref.model.entry.BibEntry;
import net.sf.jabref.model.entry.FieldName;
import net.sf.jabref.model.entry.IdGenerator;
import net.sf.jabref.model.entry.MonthUtil;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Fetcher for archives that offer an OAI2 interface. Unless configured
 * otherwise through the six-argument constructor, it queries ArXiv.org.
 * <p>
 * A query passed to {@link #processQuery(String, ImportInspector, OutputPrinter)}
 * may contain several keys separated by {@code ;} or spaces; each key is
 * fetched with a separate {@code GetRecord} request. Between two requests the
 * fetcher waits for the configured wait time.
 *
 * @author Ulrich Stärk
 * @author Christian Kopf
 */
public class OAI2Fetcher implements EntryFetcher {

    private static final Log LOGGER = LogFactory.getLog(OAI2Fetcher.class);
    private static final String OAI2_ARXIV_PREFIXIDENTIFIER = "oai%3AarXiv.org%3A";
    private static final String OAI2_ARXIV_HOST = "export.arxiv.org";
    private static final String OAI2_ARXIV_SCRIPT = "oai2";
    private static final String OAI2_ARXIV_METADATAPREFIX = "arXiv";
    private static final String OAI2_ARXIV_ARCHIVENAME = "ArXiv.org";
    private static final String OAI2_IDENTIFIER_FIELD = "oai2identifier";

    /** Longest single sleep while waiting, so the status line and cancel flag are re-checked regularly. */
    private static final long WAIT_POLL_INTERVAL_MS = 1000L;

    /** Stays {@code null} when parser creation failed in the constructor. */
    private SAXParser saxParser;
    private final String oai2Host;
    private final String oai2Script;
    private final String oai2MetaDataPrefix;
    private final String oai2PrefixIdentifier;
    // NOTE(review): stored but never read; getTitle() returns a fixed string instead.
    private final String oai2ArchiveName;
    // volatile: stopFetching() is presumably called from another thread (e.g. a cancel
    // button) than the one running processQuery(); without it the write may never be seen.
    private volatile boolean shouldContinue = true;
    /** Only set by processQuery(); may be {@code null} when importOai2Entry() is called directly. */
    private OutputPrinter status;
    private final long waitTime;
    private Date lastCall;

    /**
     * Creates a fetcher for an arbitrary OAI2 archive.
     *
     * @param oai2Host
     *            the host to query without leading http:// and without trailing /
     * @param oai2Script
     *            the relative location of the oai2 interface without leading
     *            and trailing /
     * @param oai2Metadataprefix
     *            the urlencoded metadataprefix
     * @param oai2Prefixidentifier
     *            the urlencoded prefix identifier
     * @param oai2ArchiveName
     *            the name of the archive
     * @param waitTimeMs
     *            Time to wait in milliseconds between query-requests. Values
     *            less than or equal to 0 disable waiting.
     */
    public OAI2Fetcher(String oai2Host, String oai2Script, String oai2Metadataprefix, String oai2Prefixidentifier,
            String oai2ArchiveName, long waitTimeMs) {
        this.oai2Host = oai2Host;
        this.oai2Script = oai2Script;
        this.oai2MetaDataPrefix = oai2Metadataprefix;
        this.oai2PrefixIdentifier = oai2Prefixidentifier;
        this.oai2ArchiveName = oai2ArchiveName;
        this.waitTime = waitTimeMs;
        try {
            SAXParserFactory parserFactory = SAXParserFactory.newInstance();
            saxParser = parserFactory.newSAXParser();
        } catch (ParserConfigurationException | SAXException e) {
            // The fetcher stays constructible; importOai2Entry() reports the missing parser later.
            LOGGER.error("Error creating SAXParser for OAI2Fetcher", e);
        }
    }

    /**
     * Default Constructor. The archive queried will be ArXiv.org, with a wait
     * time of 20 seconds between requests.
     */
    public OAI2Fetcher() {
        this(OAI2Fetcher.OAI2_ARXIV_HOST, OAI2Fetcher.OAI2_ARXIV_SCRIPT, OAI2Fetcher.OAI2_ARXIV_METADATAPREFIX,
                OAI2Fetcher.OAI2_ARXIV_PREFIXIDENTIFIER, OAI2Fetcher.OAI2_ARXIV_ARCHIVENAME, 20000L);
    }

    /**
     * Construct the query URL
     *
     * @param key
     *            The key of the OAI2 entry that the url should point to. It is
     *            URL-encoded as UTF-8 before being appended to the prefix
     *            identifier.
     *
     * @return a String denoting the query URL, or an empty String if the key
     *         could not be encoded
     */
    public String constructUrl(String key) {
        String identifier;
        try {
            identifier = URLEncoder.encode(key, StandardCharsets.UTF_8.name());
        } catch (UnsupportedEncodingException e) {
            return "";
        }
        return "http://" + oai2Host + "/" + oai2Script + "?" + "verb=GetRecord" + "&identifier="
                + oai2PrefixIdentifier + identifier + "&metadataPrefix=" + oai2MetaDataPrefix;
    }

    /**
     * some archives - like ArXiv.org - might expect of you to wait some time
     *
     * @return true if a positive wait time has been configured
     */
    private boolean shouldWait() {
        return waitTime > 0;
    }

    /**
     * Strip subcategories from ArXiv key.
     * <p>
     * A leading {@code arxiv:} (case-insensitive) is removed. If the key then
     * contains a dot before its first slash, everything from that dot up to the
     * slash is dropped, e.g. {@code math.RA/0601001} becomes
     * {@code math/0601001}. Keys without a slash are otherwise left untouched.
     *
     * @param key The key to fix.
     * @return Fixed key.
     */
    public static String fixKey(String key) {

        String resultingKey = key;
        if (resultingKey.toLowerCase(Locale.ENGLISH).startsWith("arxiv:")) {
            resultingKey = resultingKey.substring(6);
        }

        int dot = resultingKey.indexOf('.');
        int slash = resultingKey.indexOf('/');

        // dot < slash also implies that a slash exists (slash > -1)
        if ((dot > -1) && (dot < slash)) {
            resultingKey = resultingKey.substring(0, dot) + resultingKey.substring(slash, resultingKey.length());
        }

        return resultingKey;
    }

    /**
     * Normalizes line breaks and spacing of a field value.
     * <p>
     * Single line breaks become spaces, runs of blank lines collapse into one
     * line break, repeated spaces collapse into one, and leading and trailing
     * whitespace is removed.
     *
     * @param s the text to normalize
     * @return the normalized text
     */
    public static String correctLineBreaks(String s) {
        // a newline that is not followed by a blank line is only a soft wrap
        String result = s.replaceAll("\\n(?!\\s*\\n)", " ");
        // what is left are paragraph breaks; strip the whitespace around them
        result = result.replaceAll("\\s*\\n\\s*", "\n");
        return result.replaceAll(" {2,}", " ").replaceAll("(^\\s*|\\s+$)", "");
    }

    /**
     * Import an entry from an OAI2 archive. The returned BibEntry has the
     * field OAI2_IDENTIFIER_FIELD set to the (fixed) key.
     * <p>
     * Errors are logged and, if a status printer has been set by a preceding
     * call to {@link #processQuery(String, ImportInspector, OutputPrinter)},
     * additionally shown to the user.
     *
     * @param key
     *            The OAI2 key to fetch from ArXiv.
     * @return The imported BibEntry or null if none.
     */
    public BibEntry importOai2Entry(String key) {
        /*
         * Fix for problem reported in mailing-list:
         *   https://sourceforge.net/forum/message.php?msg_id=4087158
         */
        String fixedKey = OAI2Fetcher.fixKey(key);

        String url = constructUrl(fixedKey);
        try {
            if (saxParser == null) {
                // parser creation failed in the constructor; fail with a meaningful message
                throw new IllegalStateException("No SAXParser available for OAI2Fetcher");
            }

            URL oai2Url = new URL(url);
            HttpURLConnection oai2Connection = (HttpURLConnection) oai2Url.openConnection();
            oai2Connection.setRequestProperty("User-Agent", "JabRef");

            /* create an empty BibEntry and set the oai2identifier field */
            BibEntry be = new BibEntry(IdGenerator.next(), "article");
            be.setField(OAI2Fetcher.OAI2_IDENTIFIER_FIELD, fixedKey);
            DefaultHandler handlerBase = new OAI2Handler(be);

            try (InputStream inputStream = oai2Connection.getInputStream()) {

                /* parse the result */
                saxParser.parse(inputStream, handlerBase);

                /* Correct line breaks and spacing */
                for (String name : be.getFieldNames()) {
                    be.getFieldOptional(name)
                            .ifPresent(content -> be.setField(name, OAI2Fetcher.correctLineBreaks(content)));
                }

                /* keys of the form YYMM.xxxx carry year and month of the entry */
                if (fixedKey.matches("\\d\\d\\d\\d\\..*")) {
                    be.setField(FieldName.YEAR, "20" + fixedKey.substring(0, 2));

                    int monthNumber = Integer.parseInt(fixedKey.substring(2, 4));
                    MonthUtil.Month month = MonthUtil.getMonthByNumber(monthNumber);
                    if (month.isValid()) {
                        be.setField(FieldName.MONTH, month.bibtexFormat);
                    }
                }
            }
            return be;
        } catch (IOException e) {
            reportError(Localization.lang("An Exception occurred while accessing '%0'", url) + "\n\n" + e, e);
        } catch (SAXException e) {
            reportError(Localization.lang("An SAXException occurred while parsing '%0':", url) + "\n\n"
                    + e.getMessage(), e);
        } catch (RuntimeException e) {
            reportError(
                    Localization.lang("Error while fetching from %0", "OAI2 source (" + url + "):") + "\n\n"
                            + e.getMessage() + "\n\n" + Localization
                                    .lang("Note: A full text search is currently not supported for %0", getTitle()),
                    e);
        }
        return null;
    }

    /**
     * Logs an import error and, if a status printer is available, shows it to
     * the user as an error message.
     *
     * @param message the message to log and display
     * @param cause the exception that caused the error
     */
    private void reportError(String message, Exception cause) {
        LOGGER.warn(message, cause);
        // status is only set by processQuery(); importOai2Entry() may be called without it
        if (status != null) {
            status.showMessage(message, getTitle(), JOptionPane.ERROR_MESSAGE);
        }
    }

    @Override
    public HelpFile getHelpPage() {
        return HelpFile.FETCHER_OAI2_ARXIV;
    }

    @Override
    public JPanel getOptionsPanel() {
        // we have no additional options
        return null;
    }

    @Override
    public String getTitle() {
        return "ArXiv.org";
    }

    /**
     * Fetches every key contained in the query and adds the resulting entries
     * to the inspection dialog.
     *
     * @param query
     *            one or more keys, delimited by ; or space. Empty keys (e.g.
     *            from consecutive delimiters) are skipped.
     * @param dialog
     *            receives the fetched entries and progress updates
     * @param statusOP
     *            receives status and error messages
     * @return true if the query was processed (also when it was cancelled),
     *         false if an exception occurred or the thread was interrupted
     */
    @Override
    public boolean processQuery(String query, ImportInspector dialog, OutputPrinter statusOP) {

        status = statusOP;

        try {
            shouldContinue = true;

            /* multiple keys can be delimited by ; or space */
            String[] keys = query.replace(" ", ";").split(";");
            for (int i = 0; i < keys.length; i++) {
                String key = keys[i];

                /* consecutive delimiters yield empty keys; do not query (and wait) for them */
                if (key.isEmpty()) {
                    dialog.setProgress(i + 1, keys.length);
                    continue;
                }

                /*
                 * some archives - like arxiv.org - might expect of you to wait
                 * some time
                 */
                if (shouldWait() && (lastCall != null)) {

                    long elapsed = new Date().getTime() - lastCall.getTime();

                    /* stop waiting as soon as the cancel button has been hit */
                    while (shouldContinue && (elapsed < waitTime)) {
                        long remaining = waitTime - elapsed;
                        status.setStatus(
                                Localization.lang("Waiting for ArXiv...") + ((waitTime - elapsed) / 1000) + " s");
                        /* never sleep longer than actually required */
                        Thread.sleep(Math.min(WAIT_POLL_INTERVAL_MS, remaining));
                        elapsed = new Date().getTime() - lastCall.getTime();
                    }
                }

                /* the cancel button has been hit */
                if (!shouldContinue) {
                    break;
                }

                status.setStatus(Localization.lang("Processing %0", key));

                /* query the archive and load the results into the BibEntry */
                BibEntry be = importOai2Entry(key);

                if (shouldWait()) {
                    lastCall = new Date();
                }

                /* add the entry to the inspection dialog */
                if (be != null) {
                    dialog.addEntry(be);
                }

                /* update the dialogs progress bar */
                dialog.setProgress(i + 1, keys.length);
            }

            return true;
        } catch (InterruptedException e) {
            /* restore the interrupt flag so that callers can react to the interruption */
            Thread.currentThread().interrupt();
            status.setStatus(Localization.lang("Error while fetching from %0", "OAI2"));
            LOGGER.error("Error while fetching from OAI2", e);
        } catch (Exception e) {
            status.setStatus(Localization.lang("Error while fetching from %0", "OAI2"));
            LOGGER.error("Error while fetching from OAI2", e);
        }
        return false;
    }

    /**
     * Requests that a running {@link #processQuery(String, ImportInspector, OutputPrinter)}
     * stops before fetching the next key.
     */
    @Override
    public void stopFetching() {
        shouldContinue = false;
    }
}