de.fhg.iais.asc.oai.strategy.AbstractHarvester.java Source code

Java tutorial

Introduction

Here is the source code for de.fhg.iais.asc.oai.strategy.AbstractHarvester.java

Source

package de.fhg.iais.asc.oai.strategy;

/******************************************************************************
 * Copyright 2011 (c) Fraunhofer IAIS Netmedia  http://www.iais.fraunhofer.de *
 * ************************************************************************** *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may    *
 * not use this file except in compliance with the License.                   *
 * You may obtain a copy of the License at                                    *
 * http://www.apache.org/licenses/LICENSE-2.0                                 *
 * Unless required by applicable law or agreed to in writing,                 *
 * software distributed under the License is distributed on an "AS IS" BASIS, *
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   *
 * See the License for the specific language governing permissions and        *
 * limitations under the License.                                             *
 ******************************************************************************/

import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpHost;
import org.apache.http.client.HttpClient;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import se.kb.oai.OAIException;
import se.kb.oai.pmh.ErrorResponseException;
import se.kb.oai.pmh.Header;
import se.kb.oai.pmh.IdentifiersList;
import se.kb.oai.pmh.OaiPmhServer;
import se.kb.oai.pmh.ResumptionToken;
import se.kb.oai.pmh.SetsList;
import de.fhg.iais.asc.commons.AscConfiguration;
import de.fhg.iais.asc.commons.LogMessageBuilder;
import de.fhg.iais.asc.commons.exceptions.AscTechnicalErrorException;
import de.fhg.iais.asc.commons.state.ASCState;
import de.fhg.iais.asc.contexts.AscProviderIngest;
import de.fhg.iais.asc.oai.consumer.identifier.IIdConsumer;
import de.fhg.iais.asc.oai.consumer.identifier.IdCounter;
import de.fhg.iais.asc.oai.consumer.set.ISetConsumer;
import de.fhg.iais.asc.oai.localwriter.RepositoryWriter;
import de.fhg.iais.commons.dbc.Check;

public abstract class AbstractHarvester {

    private static final Logger LOG = LoggerFactory.getLogger(AbstractHarvester.class);

    protected AscConfiguration config;
    protected ASCState ascState;
    protected AscProviderIngest ingestEvent;

    protected OaiPmhServer server;

    protected String uri = null;
    protected String metadataPrefix = null;

    protected String fromDate = null;
    protected String untilDate = null;

    /**
     * @param uri The repository url.
     * @param metadataPrefix The metadata prefix to harvest.
     * @param proxyHost
     * @param proxyPort
     * @param fromDateParameter
     * @param untilDateParameter
     * @param config The current ASC configuration, non-<code>null</code>.
     * @param ascState the state object to report the progress, non-<code>null</code>.
     * @param ingestEvent The ingest event in which the harvesting takes place, non-<code>null</code>.
     */
    public AbstractHarvester(String uri, String metadataPrefix, String proxyHost, Integer proxyPort,
            String fromDateParameter, String untilDateParameter, AscConfiguration config, ASCState ascState,
            AscProviderIngest ingestEvent) {

        Check.notNull(config, "config must be non-null");
        Check.notNull(ascState, "ascState must be non-null");
        Check.notNull(ingestEvent, "ingestEvent must be non-null");

        this.uri = uri;
        this.metadataPrefix = metadataPrefix;

        this.fromDate = StringUtils.defaultIfEmpty(fromDateParameter, null);
        this.untilDate = StringUtils.defaultIfEmpty(untilDateParameter, null);

        int connectionTimeout = config.get(AscConfiguration.HARVESTING_CONNECTION_TIMEOUT, 600000);
        LOG.info("set connection and socket timeouts to approx. " + (connectionTimeout / 1000) + " seconds"); // request from a.schenk DDB-724

        HttpParams httpParams = new BasicHttpParams();
        HttpConnectionParams.setConnectionTimeout(httpParams, connectionTimeout);
        HttpConnectionParams.setSoTimeout(httpParams, connectionTimeout);
        HttpClient client = new DefaultHttpClient(httpParams);

        // set some parameters that might help but will not harm
        // see: http://hc.apache.org/httpclient-legacy/preference-api.html

        // the user-agent:
        // tell them who we are (they see that from the IP anyway), thats a good habit,
        // shows that we are professional and not some script kiddies
        // and this is also a little bit of viral marketing :-)
        client.getParams().setParameter("http.useragent", "myCortex Harvester; http://www.iais.fraunhofer.de/");
        // the following option "can result in noticeable performance improvement" (see api docs)
        // it may switch on a keep-alive, may reduce load on server side (if they are smart)
        // and might reduce latency
        client.getParams().setParameter("http.protocol.expect-continue", true);

        // ignore all cookies because some OAI-PMH implementations don't know how to handle cookies
        client.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.IGNORE_COOKIES);

        // setting of the proxy if needed
        if (proxyHost != null && !proxyHost.isEmpty()) {
            HttpHost proxy = new HttpHost(proxyHost, proxyPort);
            client.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
        }

        this.server = new OaiPmhServer(client, this.uri);

        this.config = config;
        this.ascState = ascState;
        this.ingestEvent = ingestEvent;
    }

    /**
     * Harvest all available sets.
     * 
     * @param writer
     * @return
     */
    public abstract int retrieveAll(final RepositoryWriter writer);

    /**
     * Harvest a specific set.
     * 
     * @param spec Set
     * @param writer
     * @return
     */
    public abstract int retrieveSet(final String spec, final RepositoryWriter writer);

    /**
     * Downloads the list of available sets from the OAI-PMH repository. Every set is given to a {@link ISetConsumer}.
     * 
     * @param setConsumer The {@link ISetConsumer} to which sets should be given, non-<code>null</code>.
     * @return A boolean that indicates whether the process was canceled because <code>setConsumer</code> returned <code>true</code> when set was given.
     */
    protected boolean listSets(ISetConsumer setConsumer) {
        Check.notNull(setConsumer, "setConsumer must be non-null");

        SetsList list = null;
        try {
            list = this.server.listSets();
        } catch (OAIException e) {
            throw AscTechnicalErrorException.wrap(e.getLocalizedMessage(), e);
        }

        boolean more = list.size() > 0;
        while (more) {
            for (se.kb.oai.pmh.Set set : list.asList()) {
                // Let the consumer consume the set.
                if (setConsumer.consume(set.getName(), set.getSpec())) {
                    return true; // stop processing
                }
                if (list.getResumptionToken() == null) {
                    more = false;
                } else {
                    try {
                        list = this.server.listSets(list.getResumptionToken());
                    } catch (Exception e) {
                        throw AscTechnicalErrorException.wrap(e.getLocalizedMessage(), e);
                    }
                }
            }
        }
        return false;
    }

    /**
     * Downloads a list of available record identifiers from the OAI-PMH repository, optionally restricted to
     * one set. The records themselves are not downloaded. Every identifier is given to a {@link IIdConsumer}.
     * 
     * @param idConsumer The {@link IIdConsumer} to which record identifiers should be given, non- <code>null</code>.
     * @param setSpec The <code>setSpec</code> of the repository set that should be included, may be <code>null</code> to indicate all sets.
     * @return A boolean that indicates whether the process was canceled, either because <code>idConsumer</code> returned <code>true</code> when set was given
     *         or because the user
     *         requested a cancel through {@link ASCState}.
     */
    protected boolean listIdentifiers(IIdConsumer idConsumer, String setSpec) {
        Check.notNull(idConsumer, "idConsumer must be non-null");

        IdentifiersList list = null;
        try {
            list = this.server.listIdentifiers(this.metadataPrefix, this.fromDate, this.untilDate, setSpec);
        } catch (ErrorResponseException e) {
            if (e.getCode().equals("noRecordsMatch")) {
                list = null;
            } else {
                throw AscTechnicalErrorException.wrap(e.getLocalizedMessage(), e);
            }
        } catch (OAIException e) {
            throw AscTechnicalErrorException.wrap(e.getLocalizedMessage(), e);
        }

        boolean more = list != null && list.size() > 0;
        while (more) {
            for (Header header : list.asList()) {
                String id = header.getIdentifier();
                String dateString = header.getDatestamp();
                List<String> setSpecs = header.getSetSpecs();

                // Interrupt if needed.
                if (this.ascState.isCancelRequested()) {
                    return true;
                }

                // Let the consumer consume the identifier.
                if (idConsumer.consume(id, dateString, setSpecs)) {
                    return true; // stop processing
                }
            }
            if (list.getResumptionToken() == null) {
                more = false;
            } else {
                try {
                    // Interrupt if needed.
                    if (this.ascState.isCancelRequested()) {
                        return true;
                    }
                    list = this.server.listIdentifiers(list.getResumptionToken());
                } catch (ErrorResponseException e) {
                    if (e.getCode().equals("noRecordsMatch")) {
                        more = false;
                    } else {
                        throw AscTechnicalErrorException.wrap(e.getLocalizedMessage(), e);
                    }
                } catch (OAIException e) {
                    throw AscTechnicalErrorException.wrap(e.getLocalizedMessage(), e);
                }
            }
        }
        return false;
    }

    /**
     * Determines the number of records in a repository.
     * 
     * @param setSpec The <code>setSpec</code> of the repository set that should be counted, may be <code>null</code> to indicate all sets.
     * @return The number of records in the repository or the repository set.
     */
    protected int retrieveRepositorySize(String setSpec) {
        if (this.ascState.getProgressInstance().getMaxFilesEach() == 0) {
            return retrieveRepositorySizeStandard(setSpec);
        } else {
            return retrieveRepositorySizeMaxFilesEach(setSpec);
        }
    }

    /**
     * This method is for determining the size of the repository. We first try to get the complete repository
     * size out of the resumption token. If this doesn't work we count the ids manually. This method is
     * invoked if maxFilesEach isn't set.
     * 
     * @param setSpec The <code>setSpec</code> of the repository set that should be counted, may be <code>null</code> to indicate all sets.
     * @return The number of records in the repository or the repository set.
     */
    private int retrieveRepositorySizeStandard(String setSpec) {
        int totalSize = -1;

        // First try to get the complete repository size out of the resumption token.
        try {
            IdentifiersList list = this.server.listIdentifiers(this.metadataPrefix, this.fromDate, this.untilDate,
                    setSpec);
            ResumptionToken resumptionToken = list.getResumptionToken();
            if (resumptionToken != null) {
                totalSize = resumptionToken.getCompleteListSize();
                if (totalSize == -1) {
                    LOG.info("Resumption token doesn't provide the number of records available in the repository.");
                } else {
                    LOG.info("Repository size as retrieved from the resumption token is " + totalSize);
                }
            } else {
                LOG.info("Resumption token doesn't provide the number of records available in the repository.");
            }
        } catch (OAIException e) {
            String message = LogMessageBuilder.getMessage(
                    "Error while retrieving the repository's size from the resumption token", this.config);
            LOG.warn(message);
        }

        // If the repository size could not be determined, try to count the size manually.
        if (totalSize == -1) {
            LOG.info("Counting number of records manually.");
            IdCounter idCounter = new IdCounter();
            listIdentifiers(idCounter, setSpec);
            totalSize = idCounter.getCounter();
            LOG.debug("Retrieved repository size by counting identifiers is " + totalSize);
        }

        return totalSize;
    }

    // TODO: This can be replaced by retrieveRepositorySizeStandard(maxSizeEach), if retrieveRepositorySizeStandard() and IdCounter are given an optional maximum value.
    private int retrieveRepositorySizeMaxFilesEach(String setSpec) {
        int totalsize = 0;

        try {
            IdentifiersList list = this.server.listIdentifiers(this.metadataPrefix, this.fromDate, this.untilDate,
                    setSpec);
            ResumptionToken resumptionToken = list.getResumptionToken();
            if (resumptionToken != null) {
                int listsize = resumptionToken.getCompleteListSize();
                if (listsize > this.ascState.getProgressInstance().getMaxFilesEach()) {
                    listsize = this.ascState.getProgressInstance().getMaxFilesEach();
                }
                totalsize += listsize;
            } else {
                // we assume that we have the complete list because there's no resumption token
                if (list.size() > this.ascState.getProgressInstance().getMaxFilesEach()) {
                    totalsize += this.ascState.getProgressInstance().getMaxFilesEach();
                } else {
                    totalsize += list.size();
                }
            }
        } catch (OAIException e) {
            throw AscTechnicalErrorException.wrap(e.getLocalizedMessage(), e);
        }

        return totalsize;
    }

}