Java tutorial
/*  Copyright (C) 2015 JabRef contributors.
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package net.sf.jabref.logic.fulltext;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;

import net.sf.jabref.logic.util.DOI;
import net.sf.jabref.model.entry.BibEntry;
import net.sf.jabref.model.entry.FieldName;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * FullTextFinder implementation that follows the DOI resolution redirects and scans for a full-text PDF URL.
 */
public class DoiResolution implements FullTextFinder {

    private static final Log LOGGER = LogFactory.getLog(DoiResolution.class);

    @Override
    public Optional<URL> findFullText(BibEntry entry) throws IOException {
        Objects.requireNonNull(entry);
        Optional<URL> pdfLink = Optional.empty();

        Optional<DOI> doi = entry.getFieldOptional(FieldName.DOI).flatMap(DOI::build);

        if (doi.isPresent()) {
            String sciLink = doi.get().getURIAsASCIIString();

            // follow all redirects and scan for a single pdf link
            if (!sciLink.isEmpty()) {
                try {
                    Connection connection = Jsoup.connect(sciLink);
                    connection.followRedirects(true);
                    connection.ignoreHttpErrors(true);
                    // some publishers are quite slow (default is 3s)
                    connection.timeout(5000);

                    Document html = connection.get();

                    // scan for PDF
                    Elements elements = html.body().select("[href]");
                    List<Optional<URL>> links = new ArrayList<>();

                    for (Element element : elements) {
                        String href = element.attr("abs:href");
                        // Only check if pdf is included in the link
                        // See https://github.com/lehner/LocalCopy for scrape ideas
                        if (href.contains("pdf") && MimeTypeDetector.isPdfContentType(href)) {
                            links.add(Optional.of(new URL(href)));
                        }
                    }

                    // return if only one link was found (high accuracy)
                    if (links.size() == 1) {
                        LOGGER.info("Fulltext PDF found @ " + sciLink);
                        pdfLink = links.get(0);
                    }
                } catch (IOException e) {
                    LOGGER.warn("DoiResolution fetcher failed: ", e);
                }
            }
        }
        return pdfLink;
    }
}
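
The class above resolves an entry's DOI, follows the redirects, and accepts a PDF link only when exactly one candidate is found. The following usage sketch is not part of the original class; it assumes the surrounding JabRef API shown above (BibEntry, FieldName.DOI, FullTextFinder) and a BibEntry.setField(String, String) setter, and the DOI value is a placeholder.

// Hypothetical usage sketch: ask DoiResolution for a full-text PDF link.
BibEntry entry = new BibEntry();
entry.setField(FieldName.DOI, "10.1000/xyz123"); // placeholder DOI for illustration

FullTextFinder finder = new DoiResolution();
try {
    Optional<URL> pdf = finder.findFullText(entry);
    pdf.ifPresent(url -> System.out.println("Full-text PDF found: " + url));
} catch (IOException e) {
    // network or resolution failures while contacting the publisher end up here
    e.printStackTrace();
}

If the resolved page contains zero or more than one PDF-like link, findFullText returns Optional.empty(), so callers should always handle the absent case.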