// Java tutorial
package de.fhg.iais.asc.oai.strategy; /****************************************************************************** * Copyright 2011 (c) Fraunhofer IAIS Netmedia http://www.iais.fraunhofer.de * * ************************************************************************** * * Licensed under the Apache License, Version 2.0 (the "License"); you may * * not use this file except in compliance with the License. * * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * * software distributed under the License is distributed on an "AS IS" BASIS, * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * * See the License for the specific language governing permissions and * * limitations under the License. * ******************************************************************************/ import java.util.List; import org.apache.commons.lang.StringUtils; import org.apache.http.HttpHost; import org.apache.http.client.HttpClient; import org.apache.http.client.params.ClientPNames; import org.apache.http.client.params.CookiePolicy; import org.apache.http.conn.params.ConnRoutePNames; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.params.BasicHttpParams; import org.apache.http.params.HttpConnectionParams; import org.apache.http.params.HttpParams; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import se.kb.oai.OAIException; import se.kb.oai.pmh.ErrorResponseException; import se.kb.oai.pmh.Header; import se.kb.oai.pmh.IdentifiersList; import se.kb.oai.pmh.OaiPmhServer; import se.kb.oai.pmh.ResumptionToken; import se.kb.oai.pmh.SetsList; import de.fhg.iais.asc.commons.AscConfiguration; import de.fhg.iais.asc.commons.LogMessageBuilder; import de.fhg.iais.asc.commons.exceptions.AscTechnicalErrorException; import de.fhg.iais.asc.commons.state.ASCState; import de.fhg.iais.asc.contexts.AscProviderIngest; import 
de.fhg.iais.asc.oai.consumer.identifier.IIdConsumer; import de.fhg.iais.asc.oai.consumer.identifier.IdCounter; import de.fhg.iais.asc.oai.consumer.set.ISetConsumer; import de.fhg.iais.asc.oai.localwriter.RepositoryWriter; import de.fhg.iais.commons.dbc.Check; public abstract class AbstractHarvester { private static final Logger LOG = LoggerFactory.getLogger(AbstractHarvester.class); protected AscConfiguration config; protected ASCState ascState; protected AscProviderIngest ingestEvent; protected OaiPmhServer server; protected String uri = null; protected String metadataPrefix = null; protected String fromDate = null; protected String untilDate = null; /** * @param uri The repository url. * @param metadataPrefix The metadata prefix to harvest. * @param proxyHost * @param proxyPort * @param fromDateParameter * @param untilDateParameter * @param config The current ASC configuration, non-<code>null</code>. * @param ascState the state object to report the progress, non-<code>null</code>. * @param ingestEvent The ingest event in which the harvesting takes place, non-<code>null</code>. */ public AbstractHarvester(String uri, String metadataPrefix, String proxyHost, Integer proxyPort, String fromDateParameter, String untilDateParameter, AscConfiguration config, ASCState ascState, AscProviderIngest ingestEvent) { Check.notNull(config, "config must be non-null"); Check.notNull(ascState, "ascState must be non-null"); Check.notNull(ingestEvent, "ingestEvent must be non-null"); this.uri = uri; this.metadataPrefix = metadataPrefix; this.fromDate = StringUtils.defaultIfEmpty(fromDateParameter, null); this.untilDate = StringUtils.defaultIfEmpty(untilDateParameter, null); int connectionTimeout = config.get(AscConfiguration.HARVESTING_CONNECTION_TIMEOUT, 600000); LOG.info("set connection and socket timeouts to approx. 
" + (connectionTimeout / 1000) + " seconds"); // request from a.schenk DDB-724 HttpParams httpParams = new BasicHttpParams(); HttpConnectionParams.setConnectionTimeout(httpParams, connectionTimeout); HttpConnectionParams.setSoTimeout(httpParams, connectionTimeout); HttpClient client = new DefaultHttpClient(httpParams); // set some parameters that might help but will not harm // see: http://hc.apache.org/httpclient-legacy/preference-api.html // the user-agent: // tell them who we are (they see that from the IP anyway), thats a good habit, // shows that we are professional and not some script kiddies // and this is also a little bit of viral marketing :-) client.getParams().setParameter("http.useragent", "myCortex Harvester; http://www.iais.fraunhofer.de/"); // the following option "can result in noticeable performance improvement" (see api docs) // it may switch on a keep-alive, may reduce load on server side (if they are smart) // and might reduce latency client.getParams().setParameter("http.protocol.expect-continue", true); // ignore all cookies because some OAI-PMH implementations don't know how to handle cookies client.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.IGNORE_COOKIES); // setting of the proxy if needed if (proxyHost != null && !proxyHost.isEmpty()) { HttpHost proxy = new HttpHost(proxyHost, proxyPort); client.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy); } this.server = new OaiPmhServer(client, this.uri); this.config = config; this.ascState = ascState; this.ingestEvent = ingestEvent; } /** * Harvest all available sets. * * @param writer * @return */ public abstract int retrieveAll(final RepositoryWriter writer); /** * Harvest a specific set. * * @param spec Set * @param writer * @return */ public abstract int retrieveSet(final String spec, final RepositoryWriter writer); /** * Downloads the list of available sets from the OAI-PMH repository. Every set is given to a {@link ISetConsumer}. 
* * @param setConsumer The {@link ISetConsumer} to which sets should be given, non-<code>null</code>. * @return A boolean that indicates whether the process was canceled because <code>setConsumer</code> returned <code>true</code> when set was given. */ protected boolean listSets(ISetConsumer setConsumer) { Check.notNull(setConsumer, "setConsumer must be non-null"); SetsList list = null; try { list = this.server.listSets(); } catch (OAIException e) { throw AscTechnicalErrorException.wrap(e.getLocalizedMessage(), e); } boolean more = list.size() > 0; while (more) { for (se.kb.oai.pmh.Set set : list.asList()) { // Let the consumer consume the set. if (setConsumer.consume(set.getName(), set.getSpec())) { return true; // stop processing } if (list.getResumptionToken() == null) { more = false; } else { try { list = this.server.listSets(list.getResumptionToken()); } catch (Exception e) { throw AscTechnicalErrorException.wrap(e.getLocalizedMessage(), e); } } } } return false; } /** * Downloads a list of available record identifiers from the OAI-PMH repository, optionally restricted to * one set. The records themselves are not downloaded. Every identifier is given to a {@link IIdConsumer}. * * @param idConsumer The {@link IIdConsumer} to which record identifiers should be given, non- <code>null</code>. * @param setSpec The <code>setSpec</code> of the repository set that should be included, may be <code>null</code> to indicate all sets. * @return A boolean that indicates whether the process was canceled, either because <code>idConsumer</code> returned <code>true</code> when set was given * or because the user * requested a cancel through {@link ASCState}. 
*/ protected boolean listIdentifiers(IIdConsumer idConsumer, String setSpec) { Check.notNull(idConsumer, "idConsumer must be non-null"); IdentifiersList list = null; try { list = this.server.listIdentifiers(this.metadataPrefix, this.fromDate, this.untilDate, setSpec); } catch (ErrorResponseException e) { if (e.getCode().equals("noRecordsMatch")) { list = null; } else { throw AscTechnicalErrorException.wrap(e.getLocalizedMessage(), e); } } catch (OAIException e) { throw AscTechnicalErrorException.wrap(e.getLocalizedMessage(), e); } boolean more = list != null && list.size() > 0; while (more) { for (Header header : list.asList()) { String id = header.getIdentifier(); String dateString = header.getDatestamp(); List<String> setSpecs = header.getSetSpecs(); // Interrupt if needed. if (this.ascState.isCancelRequested()) { return true; } // Let the consumer consume the identifier. if (idConsumer.consume(id, dateString, setSpecs)) { return true; // stop processing } } if (list.getResumptionToken() == null) { more = false; } else { try { // Interrupt if needed. if (this.ascState.isCancelRequested()) { return true; } list = this.server.listIdentifiers(list.getResumptionToken()); } catch (ErrorResponseException e) { if (e.getCode().equals("noRecordsMatch")) { more = false; } else { throw AscTechnicalErrorException.wrap(e.getLocalizedMessage(), e); } } catch (OAIException e) { throw AscTechnicalErrorException.wrap(e.getLocalizedMessage(), e); } } } return false; } /** * Determines the number of records in a repository. * * @param setSpec The <code>setSpec</code> of the repository set that should be counted, may be <code>null</code> to indicate all sets. * @return The number of records in the repository or the repository set. 
*/ protected int retrieveRepositorySize(String setSpec) { if (this.ascState.getProgressInstance().getMaxFilesEach() == 0) { return retrieveRepositorySizeStandard(setSpec); } else { return retrieveRepositorySizeMaxFilesEach(setSpec); } } /** * This method is for determining the size of the repository. We first try to get the complete repository * size out of the resumption token. If this doesn't work we count the ids manually. This method is * invoked if maxFilesEach isn't set. * * @param setSpec The <code>setSpec</code> of the repository set that should be counted, may be <code>null</code> to indicate all sets. * @return The number of records in the repository or the repository set. */ private int retrieveRepositorySizeStandard(String setSpec) { int totalSize = -1; // First try to get the complete repository size out of the resumption token. try { IdentifiersList list = this.server.listIdentifiers(this.metadataPrefix, this.fromDate, this.untilDate, setSpec); ResumptionToken resumptionToken = list.getResumptionToken(); if (resumptionToken != null) { totalSize = resumptionToken.getCompleteListSize(); if (totalSize == -1) { LOG.info("Resumption token doesn't provide the number of records available in the repository."); } else { LOG.info("Repository size as retrieved from the resumption token is " + totalSize); } } else { LOG.info("Resumption token doesn't provide the number of records available in the repository."); } } catch (OAIException e) { String message = LogMessageBuilder.getMessage( "Error while retrieving the repository's size from the resumption token", this.config); LOG.warn(message); } // If the repository size could not be determined, try to count the size manually. 
if (totalSize == -1) { LOG.info("Counting number of records manually."); IdCounter idCounter = new IdCounter(); listIdentifiers(idCounter, setSpec); totalSize = idCounter.getCounter(); LOG.debug("Retrieved repository size by counting identifiers is " + totalSize); } return totalSize; } // TODO: This can be replaced by retrieveRepositorySizeStandard(maxSizeEach), if retrieveRepositorySizeStandard() and IdCounter are given an optional maximum value. private int retrieveRepositorySizeMaxFilesEach(String setSpec) { int totalsize = 0; try { IdentifiersList list = this.server.listIdentifiers(this.metadataPrefix, this.fromDate, this.untilDate, setSpec); ResumptionToken resumptionToken = list.getResumptionToken(); if (resumptionToken != null) { int listsize = resumptionToken.getCompleteListSize(); if (listsize > this.ascState.getProgressInstance().getMaxFilesEach()) { listsize = this.ascState.getProgressInstance().getMaxFilesEach(); } totalsize += listsize; } else { // we assume that we have the complete list because there's no resumption token if (list.size() > this.ascState.getProgressInstance().getMaxFilesEach()) { totalsize += this.ascState.getProgressInstance().getMaxFilesEach(); } else { totalsize += list.size(); } } } catch (OAIException e) { throw AscTechnicalErrorException.wrap(e.getLocalizedMessage(), e); } return totalsize; } }