org.lockss.plugin.springer.api.SpringerApiCrawlSeed.java Source code

Introduction

Here is the source code for org.lockss.plugin.springer.api.SpringerApiCrawlSeed.java, a LOCKSS crawl seed that queries Springer's Meta API to enumerate article metadata and synthesize the start URLs for a crawl.

Source

/*
 * $Id$
 */

/*
    
Copyright (c) 2000-2015 Board of Trustees of Leland Stanford Jr. University,
all rights reserved.
    
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
    
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
    
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
STANFORD UNIVERSITY BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    
Except as contained in this notice, the name of Stanford University shall not
be used in advertising or otherwise to promote the sale, use or other dealings
in this Software without prior written authorization from Stanford University.
    
*/

package org.lockss.plugin.springer.api;

import java.io.*;
import java.util.*;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.lockss.crawler.*;
import org.lockss.daemon.*;
import org.lockss.daemon.Crawler.CrawlerFacade;
import org.lockss.extractor.LinkExtractor.Callback;
import org.lockss.plugin.*;
import org.lockss.plugin.ArchivalUnit.ConfigurationException;
import org.lockss.plugin.UrlFetcher.FetchResult;
import org.lockss.plugin.base.SimpleUrlConsumer;
import org.lockss.util.*;
import org.lockss.util.urlconn.CacheException;

/**
 * <p>
 * A crawl seed that queries Springer's Meta API to enumerate article metadata
 * and synthesize start URLs for crawls.
 * </p>
 * <p>
 * Note that this is the newer Meta API, not the older Metadata API.
 * </p>
 * 
 * @since 1.67.5
 * @see https://dev.springer.com/
 */
public class SpringerApiCrawlSeed extends BaseCrawlSeed {

    /**
     * <p>
     * A logger for this class.
     * </p>
     * 
     * @since 1.67.5
     */
    private static final Logger log = Logger.getLogger(SpringerApiCrawlSeed.class);

    // Will become a definitional param
    private static final String CDN_URL = "http://download.springer.com/";

    private static final String API_KEY;
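    // Loads the Springer API key from the "api-key.txt" plugin external on the
    // classpath; class initialization fails if the file is missing or empty.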
    static {
        InputStream is = null;
        BufferedReader br = null;
        try {
            is = SpringerApiCrawlSeed.class.getResourceAsStream("api-key.txt");
            if (is == null) {
                throw new ExceptionInInitializerError("Plugin external not found");
            }
            br = new BufferedReader(new InputStreamReader(is, Constants.ENCODING_US_ASCII));
            API_KEY = br.readLine();
            if (StringUtils.isEmpty(API_KEY)) {
                throw new ExceptionInInitializerError("Plugin external not loaded");
            }
        } catch (IOException ioe) {
            ExceptionInInitializerError eiie = new ExceptionInInitializerError("Error reading plugin external");
            eiie.initCause(ioe);
            throw eiie;
        } finally {
            IOUtils.closeQuietly(br);
            IOUtils.closeQuietly(is);
        }
    }

    /**
     * <p>
     * A crawl rate limit for requests to the API service.
     * </p>
     * 
     * @since 1.67.5
     */
    // Overall: 50,000 hits per day or 1 hit per 1.728s
    // Over 100 boxes: 1 hit per 172.8s rounded to 1 hit per 173s
    private static final String API_CRAWL_RATE_LIMIT = "1/173s";

    /**
     * <p>
     * A crawl rate limiter for requests to the API service.
     * </p>
     * 
     * @since 1.67.5
     */
    private static final CrawlRateLimiter API_CRAWL_RATE_LIMITER = new FileTypeCrawlRateLimiter(
            new RateLimiterInfo(SpringerApiCrawlSeed.class.getSimpleName(), API_CRAWL_RATE_LIMIT));

    /**
     * <p>
     * A constant for the maximum number of records expected per response from the
     * API. If the API behaves otherwise, a site warning will be logged.
     * </p>
     * 
     * @since 1.67.5
     */
    protected static final int EXPECTED_RECORDS_PER_RESPONSE = 100;

    /**
     * <p>
     * The API URL (<code>api_url</code>) of this crawl seed's AU.
     * </p>
     * 
     * @since 1.67.5
     */
    protected String apiUrl;

    /**
     * <p>
     * The journal ISSN (<code>journal_issn</code>) of this crawl seed's AU.
     * </p>
     * 
     * @since 1.67.5
     */
    protected String issn;

    /**
     * <p>
     * The volume name (<code>volume_name</code>) of this crawl seed's AU.
     * </p>
     * 
     * @since 1.67.5
     */
    protected String volume;

    /**
     * <p>
     * This crawl seed's crawler façade.
     * </p>
     * 
     * @since 1.67.5
     */
    protected CrawlerFacade facade;

    /**
     * <p>
     * This crawl seed's list of start URLs.
     * </p>
     * 
     * @since 1.67.5
     */
    protected List<String> urlList;

    /**
     * <p>
     * Builds a new crawl seed with the given crawler façade.
     * </p>
     * 
     * @param facade
     *          A crawler façade for this crawl seed.
     * @since 1.67.5
     */
    public SpringerApiCrawlSeed(CrawlerFacade facade) {
        super(facade);
        if (au == null) {
            throw new IllegalArgumentException("Valid archival unit required for crawl seed");
        }
        this.facade = facade;
    }

    @Override
    protected void initialize() throws ConfigurationException, PluginException, IOException {
        super.initialize();
        this.apiUrl = au.getConfiguration().get("api_url");
        this.issn = au.getConfiguration().get(ConfigParamDescr.JOURNAL_ISSN.getKey());
        this.volume = au.getConfiguration().get(ConfigParamDescr.VOLUME_NAME.getKey());
        this.urlList = null;
    }

    @Override
    public Collection<String> doGetStartUrls() throws PluginException, IOException {
        if (urlList == null) {
            populateUrlList();
        }
        return urlList;
    }

    /**
     * <p>
     * Populates the URL list with start URLs.
     * </p>
     * 
     * @throws IOException
     * @since 1.67.5
     */
    protected void populateUrlList() throws IOException {
        // Initialization
        boolean siteWarning = false; // Flag to log the potential siteWarning only once
        urlList = new ArrayList<String>();
        int index = 1; // API numbers records starting with 1
        SpringerApiPamLinkExtractor ple = new SpringerApiPamLinkExtractor();

        // Query API until done
        while (!ple.isDone()) {
            log.debug2("Beginning at index " + index);

            if (facade.isAborted()) {
                log.debug2("Crawl aborted");
                return;
            }

            // Make URL fetcher for this request
            String url = makeApiUrl(index);
            String loggerUrl = loggerUrl(url);
            UrlFetcher uf = makeApiUrlFetcher(ple, url, loggerUrl);
            log.debug2("Request URL: " + loggerUrl);
            facade.getCrawlerStatus().addPendingUrl(loggerUrl);

            // Make request
            FetchResult fr = null;
            try {
                fr = uf.fetch();
            } catch (CacheException ce) {
                log.debug2("Stopping due to fatal CacheException", ce);
                Throwable cause = ce.getCause();
                if (cause != null && IOException.class.equals(cause.getClass())) {
                    throw (IOException) cause; // Unwrap IOException
                } else {
                    throw ce;
                }
            }
            if (fr == FetchResult.FETCHED) {
                facade.getCrawlerStatus().removePendingUrl(loggerUrl);
                facade.getCrawlerStatus().signalUrlFetched(loggerUrl);
            } else {
                log.debug2("Stopping due to fetch result " + fr);
                Map<String, String> errors = facade.getCrawlerStatus().getUrlsWithErrors();
                if (errors.containsKey(url)) {
                    errors.put(loggerUrl, errors.remove(url));
                } else {
                    facade.getCrawlerStatus().signalErrorForUrl(loggerUrl, "Cannot fetch seed URL");
                }
                throw new CacheException("Cannot fetch seed URL");
            }

            // Site warning for unexpected response length
            int records = ple.getPageLength();
            if (records != EXPECTED_RECORDS_PER_RESPONSE && !siteWarning) {
                siteWarning = true;
                log.siteWarning(
                        String.format("Unexpected number of records per response in %s: expected %d, got %d",
                                loggerUrl, EXPECTED_RECORDS_PER_RESPONSE, records));
            }

            // Next batch of records
            index += records;
        }
        log.debug2(String.format("Ending with %d URLs", urlList.size()));
        if (log.isDebug3()) {
            log.debug3("Start URLs: " + urlList.toString());
        }
    }

    /**
     * <p>
     * Assembles the query URL for a given starting index.
     * </p>
     * 
     * @param startingIndex
     *          A starting index (starts at 1).
     * @return The query URL for the given starting index.
     * @since 1.67.5
     */
    protected String makeApiUrl(int startingIndex) {
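        // Resulting URL has the form:
        // <api_url>meta/v1/pam?q=issn:<issn>%20volume:<volume>&api_key=<key>&p=100&s=<startingIndex>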
        String url = String.format("%smeta/v1/pam?q=issn:%s%%20volume:%s&api_key=%s&p=%d&s=%d", apiUrl, issn,
                volume, API_KEY, EXPECTED_RECORDS_PER_RESPONSE, startingIndex);
        return url;
    }

    /**
     * <p>
     * Makes a URL fetcher for the given API request, that will parse the result
     * using the given {@link SpringerApiPamLinkExtractor} instance.
     * </p>
     * 
     * @param ple
     *          A {@link SpringerApiPamLinkExtractor} instance to parse the API
     *          response with.
     * @param url
     *          A query URL.
     * @param loggerUrl
     *          A version of the query URL with the API key removed, used for
     *          logging and crawler status reporting.
     * @return A URL fetcher for the given query URL.
     * @since 1.67.5
     */
    protected UrlFetcher makeApiUrlFetcher(final SpringerApiPamLinkExtractor ple, final String url,
            final String loggerUrl) {
        // Make a URL fetcher
        UrlFetcher uf = facade.makeUrlFetcher(url);

        // Set refetch flag
        BitSet permFetchFlags = uf.getFetchFlags();
        permFetchFlags.set(UrlCacher.REFETCH_FLAG);
        uf.setFetchFlags(permFetchFlags);

        // Set custom crawl rate limiter
        uf.setCrawlRateLimiter(API_CRAWL_RATE_LIMITER);

        // Set custom URL consumer factory
        uf.setUrlConsumerFactory(new UrlConsumerFactory() {
            @Override
            public UrlConsumer createUrlConsumer(CrawlerFacade ucfFacade, FetchedUrlData ucfFud) {
                // Make custom URL consumer
                return new SimpleUrlConsumer(ucfFacade, ucfFud) {
                    @Override
                    public void consume() throws IOException {
                        // Apply link extractor to URL and output results into a list
                        final List<String> partial = new ArrayList<String>();
                        try {
                            ple.extractUrls(au, fud.input, AuUtil.getCharsetOrDefault(fud.headers), // FIXME
                                    loggerUrl, // rather than fud.origUrl
                                    new Callback() {
                                        @Override
                                        public void foundLink(String url) {
                                            partial.add(url);
                                        }
                                    });
                        } catch (IOException ioe) {
                            log.debug2("Link extractor threw", ioe);
                            throw new IOException("Error while parsing PAM response for " + url, ioe);
                        } finally {
                            // Logging
                            log.debug2(String.format("Step ending with %d URLs", partial.size()));
                            if (log.isDebug3()) {
                                log.debug3("URLs from step: " + partial.toString());
                            }
                            // Output accumulated URLs to start URL list
                            urlList.addAll(partial);
                        }
                    }
                };
            }
        });
        return uf;
    }

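    /**
     * <p>
     * Strips the <code>api_key</code> query parameter from a query URL, so that
     * the API key does not appear in logs or in the crawler status.
     * </p>
     * 
     * @param srcUrl
     *          A query URL, possibly containing an API key.
     * @return The given URL without its <code>api_key</code> query parameter.
     */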
    public static final String loggerUrl(String srcUrl) {
        return srcUrl.replaceAll("&api_key=[^&]*", "");
    }

}
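
Example

Below is a small, self-contained sketch (not part of the plugin) that mirrors the format string used by makeApiUrl(int) and the transformation applied by loggerUrl(String). The base URL, ISSN, volume and API key values are illustrative placeholders, not real parameters.

public class SpringerApiUrlDemo {

    public static void main(String[] args) {
        // Illustrative values only; in the plugin these come from the AU configuration
        String apiUrl = "http://api.springer.com/";
        String issn = "1234-5678";
        String volume = "123";
        String apiKey = "MY_KEY"; // placeholder; the real key is loaded from api-key.txt
        int recordsPerResponse = 100; // EXPECTED_RECORDS_PER_RESPONSE
        int startingIndex = 1; // the API numbers records starting with 1

        // Same format string as SpringerApiCrawlSeed.makeApiUrl(int)
        String url = String.format("%smeta/v1/pam?q=issn:%s%%20volume:%s&api_key=%s&p=%d&s=%d",
                apiUrl, issn, volume, apiKey, recordsPerResponse, startingIndex);
        // -> http://api.springer.com/meta/v1/pam?q=issn:1234-5678%20volume:123&api_key=MY_KEY&p=100&s=1
        System.out.println(url);

        // Same transformation as SpringerApiCrawlSeed.loggerUrl(String): strip the API key before logging
        String loggerUrl = url.replaceAll("&api_key=[^&]*", "");
        // -> http://api.springer.com/meta/v1/pam?q=issn:1234-5678%20volume:123&p=100&s=1
        System.out.println(loggerUrl);
    }

}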