xc.mst.harvester.HarvestManager.java Source code

Java tutorial

Introduction

Here is the source code for xc.mst.harvester.HarvestManager.java

Source

/**
 * Copyright (c) 2009 eXtensible Catalog Organization
 *
 * This program is free software; you can redistribute it and/or modify it under the terms of the MIT/X11 license. The text of the
 * license can be found at http://www.opensource.org/licenses/mit-license.php and copy of the license can be found on the project
 * website http://www.extensiblecatalog.org/.
 *
 */
package xc.mst.harvester;

import gnu.trove.TLongByteHashMap;

import java.io.File;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.net.URLEncoder;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.httpclient.HttpException;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.jdom.Document;
import org.jdom.Element;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;

import xc.mst.bo.harvest.Harvest;
import xc.mst.bo.harvest.HarvestSchedule;
import xc.mst.bo.harvest.HarvestScheduleStep;
import xc.mst.bo.provider.Format;
import xc.mst.bo.provider.Provider;
import xc.mst.bo.provider.Set;
import xc.mst.bo.record.Record;
import xc.mst.bo.record.RecordCounts;
import xc.mst.cache.DynKeyLongMap;
import xc.mst.constants.Constants;
import xc.mst.constants.Status;
import xc.mst.dao.DataException;
import xc.mst.dao.DatabaseConfigException;
import xc.mst.email.Emailer;
import xc.mst.scheduling.WorkerThread;
import xc.mst.utils.LogWriter;
import xc.mst.utils.MSTConfiguration;
import xc.mst.utils.TimingLogger;
import xc.mst.utils.XmlHelper;

public class HarvestManager extends WorkerThread {

    /**
     * A reference to the logger which writes to the HarvestIn log file
     */
    private static Logger log = Logger.getLogger("harvestIn");
    private static Logger LOG = Logger.getLogger(HarvestManager.class);

    protected static DateTimeFormatter UTC_SECOND_FORMATTER = null;
    protected static DateTimeFormatter UTC_DAY_FORMATTER = null;
    static {
        UTC_SECOND_FORMATTER = ISODateTimeFormat.dateTime();
        UTC_SECOND_FORMATTER = UTC_SECOND_FORMATTER.withZone(DateTimeZone.UTC);

        UTC_DAY_FORMATTER = ISODateTimeFormat.date();
        UTC_DAY_FORMATTER = UTC_DAY_FORMATTER.withZone(DateTimeZone.UTC);
    }
    // Map<MostSigToken, ListOfAllOaiIdsThatHaveToken<EntireOaiId, recordId>>
    protected boolean cacheSetup = false;
    protected DynKeyLongMap oaiIdCache = new DynKeyLongMap();
    protected TLongByteHashMap previousStatuses = new TLongByteHashMap();

    protected static int LARGE_HARVEST_THRESHOLD_DEFAULT = 10000;
    protected int largeHarvestThreshold = LARGE_HARVEST_THRESHOLD_DEFAULT;

    // This is public and static simply for the MockHarvestTest
    public static String lastOaiRequest = null;
    protected HarvestSchedule harvestSchedule = null;
    protected List<HarvestScheduleStep> harvestScheduleSteps = null;
    protected boolean hssFirstTime = true;
    protected int harvestScheduleStepIndex = 0;
    protected Harvest currentHarvest = null;
    protected Date startDate = null;
    protected String resumptionToken = null;
    protected int requestsSent4Step = 0;
    protected long startTime = 0;

    protected long recordsProcessedThisRun = 0l;
    protected long records2ProcessThisRun = 0l;

    /**
     * Formats a date with {@link #UTC_SECOND_FORMATTER} and replaces the last five characters of the
     * formatted text with a literal "Z".
     *
     * @param d the date to format; must not be null
     * @return the formatted UTC timestamp, ending in "Z"
     */
    public String printDateTime(Date d) {
        String formatted = UTC_SECOND_FORMATTER.print(d.getTime());
        // Drop the five trailing characters the formatter emits and terminate with "Z" instead.
        int keep = formatted.length() - 5;
        return formatted.substring(0, keep) + "Z";
    }

    /**
     * Formats a date with {@link #UTC_DAY_FORMATTER}.
     *
     * @param d the date to format; must not be null
     * @return the formatted UTC date
     */
    public String printDate(Date d) {
        return UTC_DAY_FORMATTER.print(d.getTime());
    }

    /**
     * The policy for tracking deleted records that the OAI repository uses (either DELETED_RECORD_NO, DELETED_RECORD_TRANSIENT, or DELETED_RECORD_PERSISTENT)
     */
    protected int deletedRecord = -1;

    protected Emailer mailer = (Emailer) MSTConfiguration.getInstance().getBean("Emailer");

    /**
     * Builds the name of this worker.
     *
     * @return "harvest-" followed by the provider's name when a schedule with a provider is set,
     *         otherwise just "harvest"
     */
    public String getName() {
        if (harvestSchedule == null || harvestSchedule.getProvider() == null) {
            return "harvest";
        }
        return "harvest-" + harvestSchedule.getProvider().getName();
    }

    /**
     * Reports progress of the current run.
     *
     * @return a string of the form "processed X of Y", built from recordsProcessedThisRun and
     *         records2ProcessThisRun
     */
    public String getDetailedStatus() {
        StringBuilder status = new StringBuilder("processed ");
        status.append(recordsProcessedThisRun).append(" of ").append(records2ProcessThisRun);
        return status.toString();
    }

    /**
     * Sets the harvest schedule this manager works on. setup() and doSomeWork() read the schedule's
     * provider and steps from it.
     *
     * @param harvestSchedule the schedule to run
     */
    public void setHarvestSchedule(HarvestSchedule harvestSchedule) {
        this.harvestSchedule = harvestSchedule;
    }

    /**
     * Resets the per-run state and loads everything a harvest run needs: the schedule steps, the
     * repository for the schedule's provider, the large-harvest threshold from configuration, the
     * current harvest and a fresh incoming record counter.
     * <p>
     * A DatabaseConfigException raised while doing so is handed to getUtil().throwIt().
     */
    public void setup() {
        try {
            // Reset the state left over from a previous run.
            hssFirstTime = true;
            resumptionToken = null;
            startTime = System.currentTimeMillis();

            // BDA - the check for an empty list exists because the initialization of HarvestSchedule.steps
            // creates a new list of size zero, and the DAO which creates the harvestSchedule doesn't inject
            // steps into it. So there's really no other way to tell that the steps still need loading.
            List<HarvestScheduleStep> attachedSteps = harvestSchedule.getSteps();
            if (attachedSteps != null && attachedSteps.size() != 0) {
                harvestScheduleSteps = attachedSteps;
            } else {
                harvestScheduleSteps = getHarvestScheduleStepDAO().getStepsForSchedule(harvestSchedule.getId());
            }
            harvestScheduleStepIndex = 0;
            repo = getRepositoryService().getRepository(harvestSchedule.getProvider());

            // Use the configured threshold when it is present and numeric; otherwise keep the default.
            String configuredThreshold = MSTConfiguration.getInstance()
                    .getProperty(Constants.CONFIG_LARGE_HARVEST_THRESHOLD);
            int threshold = LARGE_HARVEST_THRESHOLD_DEFAULT;
            if (configuredThreshold != null) {
                try {
                    threshold = Integer.parseInt(configuredThreshold);
                } catch (NumberFormatException ignored) {
                    // not a number: the default stays in effect
                    threshold = LARGE_HARVEST_THRESHOLD_DEFAULT;
                }
            }
            largeHarvestThreshold = threshold;

            // The cache is no longer set up for all harvests here; parseRecords() does it for "large" ones only.

            currentHarvest = getScheduleService().getHarvest(harvestSchedule);
            incomingRecordCounts = new RecordCounts(currentHarvest.getEndTime(), RecordCounts.INCOMING);
        } catch (DatabaseConfigException e) {
            getUtil().throwIt(e);
        }
    }

    /**
     * Fills oaiIdCache and previousStatuses from the repository DAO for the current repo.
     * Does nothing once cacheSetup has been set, so the load happens at most once per instance.
     */
    private void setupCache() {
        if (!cacheSetup) {
            oaiIdCache.clear();
            previousStatuses.clear();
            TimingLogger.outputMemory();

            getRepositoryDAO().populateHarvestCache(repo.getName(), oaiIdCache);
            TimingLogger.reset();

            getRepositoryDAO().populatePreviousStatuses(repo.getName(), previousStatuses, false);
            TimingLogger.reset();

            cacheSetup = true;
        }
    }

    /**
     * Looks up the record id stored for an OAI identifier.
     * <p>
     * When the full cache has been loaded (cacheSetup), the cache's answer is returned as is. Otherwise
     * a non-null, non-zero cached value is used and anything else falls back to the repository DAO.
     *
     * @param oaiId the harvested OAI identifier
     * @return the record id found in the cache or returned by the DAO
     */
    private Long getRecordId(String oaiId) {
        Long cachedId = oaiIdCache.getLong(getUtil().getNonRedundantOaiId(oaiId));
        if (cacheSetup) {
            return cachedId;
        }
        boolean cacheHit = cachedId != null && cachedId != 0;
        if (cacheHit) {
            return cachedId;
        }
        return getRepositoryDAO().getRecordId(repo.getName(), oaiId);
    }

    /**
     * Stores a record id in oaiIdCache under the non-redundant form of the OAI identifier.
     *
     * @param oaiId the harvested OAI identifier
     * @param recordId the record id to remember for it
     */
    private void cacheRecordId(String oaiId, Long recordId) {
        oaiIdCache.put(getUtil().getNonRedundantOaiId(oaiId), recordId);
    }

    /**
     * Returns the previous status stored for a record.
     * <p>
     * The cached value is returned when the full cache has been loaded (cacheSetup) or when it is not
     * the zero character; otherwise the repository DAO is asked.
     *
     * @param recordId the record id; must not be null
     * @return the previous status character
     */
    private char getPreviousStatus(Long recordId) {
        char cachedStatus = (char) previousStatuses.get(recordId);
        boolean trustCache = cacheSetup || cachedStatus != (char) 0;
        if (trustCache) {
            return cachedStatus;
        }
        return getRepositoryDAO().getPreviousStatus(repo.getName(), recordId, false);
    }

    /**
     * Remembers the status of a record in previousStatuses.
     *
     * @param recordId the record id; must not be null
     * @param status the status byte to store
     */
    private void cachePreviousStatus(Long recordId, byte status) {
        previousStatuses.put(recordId.longValue(), status);
    }

    /**
     * Runs the superclass finish logic, then writes record counts for this repository to the
     * provider's harvest log.
     * <p>
     * Nothing is written when the most recent incoming counts (or their harvest start date) are
     * missing, or when no records were processed in this run.
     *
     * @param success passed through to the superclass
     */
    @Override
    public void finishInner(boolean success) {
        super.finishInner(success);
        RecordCounts latestCounts = getRecordCountsDAO().getMostRecentIncomingRecordCounts(repo.getName());

        if (latestCounts == null) {
            LOG.error("*** HarvestManager.finishInner: mostRecentIncomingRecordCounts == null!");
            LogWriter.addInfo(harvestSchedule.getProvider().getLogFileName(),
                    "Harvest Manager - unable to print record counts, null mostRecentIncomingRecordCounts!");
            return;
        }

        if (latestCounts.getHarvestStartDate() == null) {
            LOG.error(
                    "*** HarvestManager.finishInner: mostRecentIncomingRecordCounts.getHarvestStartDate() == null!");
            LogWriter.addInfo(harvestSchedule.getProvider().getLogFileName(),
                    "Harvest Manager - unable to print record counts, null harvest start date!");
            return;
        }

        if (recordsProcessedThisRun <= 0) {
            LOG.debug("HarvestManager will not write record counts to harvest log because recordsProcessThisRun="
                    + recordsProcessedThisRun);
            return;
        }

        // Report the counts of the latest harvest first, then the running totals.
        RecordCounts[] countsToReport = { latestCounts,
                getRecordCountsDAO().getTotalIncomingRecordCounts(repo.getName()) };
        for (RecordCounts rc : countsToReport) {
            LOG.debug("harvestSchedule: " + harvestSchedule);
            LOG.debug("harvestSchedule.getProvider(): " + harvestSchedule.getProvider());
            LOG.debug("harvestSchedule.getProvider().getLogFileName(): "
                    + harvestSchedule.getProvider().getLogFileName());
            LOG.debug("rc: " + rc);
            LOG.debug("repo: " + repo);
            LogWriter.addInfo(harvestSchedule.getProvider().getLogFileName(), rc.toString(repo.getName()));
        }
        LogWriter.addInfo(harvestSchedule.getProvider().getLogFileName(), repo.getRecordStatsByType());

        // If record count calculations are ever needed here, fetch
        // getRecordCountsDAO().getTotalIncomingRecordCounts(repo.getName()) and read the per-type map
        // stored under RecordCounts.TOTALS from its getCounts().
    }

    /**
     * Logs the throwable to the harvest log, increments the error counter of the current harvest's
     * provider, persists the provider, and finally hands the throwable to getUtil().throwIt().
     *
     * @param t the problem to record
     * @throws RuntimeException wrapping the DataException if the provider update fails
     */
    public void logError(Throwable t) {
        try {
            log.error(t.getMessage(), t);

            // Count the failure against the provider being harvested and persist the new count.
            Provider failingProvider = currentHarvest.getProvider();
            failingProvider.setErrors(failingProvider.getErrors() + 1);
            getProviderDAO().update(failingProvider);
        } catch (DataException updateFailure) {
            throw new RuntimeException(updateFailure);
        }
        getUtil().throwIt(t);
    }

    /**
     * Validates the provider's OAI repository and confirms that it still offers the format (and the
     * set, if any) that the given schedule step wants to harvest.
     * <p>
     * Side effects visible here: deletedRecord is updated from the validator, and the schedule's
     * provider is replaced with a copy re-read through the provider DAO.
     *
     * @param scheduleStep the step whose format and optional set are checked
     * @throws DataException declared by this method; NOTE(review): presumably raised by the validator
     *         or DAO calls - confirm
     * @throws RuntimeException if the format or the set is no longer offered by the provider; the
     *         message is first written to the provider's log and passed to sendReportEmail
     */
    public void validate(HarvestScheduleStep scheduleStep) throws DataException {
        Provider provider = harvestSchedule.getProvider();

        // Validate that the repository conforms to the OAI protocol.
        TimingLogger.log("about to validate repo");
        ValidateRepository validator = (ValidateRepository) MSTConfiguration.getInstance()
                .getBean("ValidateRepository");
        validator.validate(provider.getId());
        TimingLogger.log("validated repo");
        deletedRecord = validator.getDeletedRecordSupport();

        // Re-read the provider so we know the formats and sets it supports
        // according to the validation we just performed.
        harvestSchedule.setProvider(getProviderDAO().getById(harvestSchedule.getProvider().getId()));
        provider = harvestSchedule.getProvider();

        // If the provider no longer supports the requested format we can't harvest it.
        String metadataPrefix = scheduleStep.getFormat().getName();
        Format format = getFormatDAO().getByName(metadataPrefix);
        boolean formatSupported = harvestSchedule.getProvider().getFormats().contains(format);
        if (!formatSupported) {
            String errorMsg = "The harvest could not be run because the MetadataFormat " + metadataPrefix
                    + " is no longer supported by the OAI repository " + provider.getOaiProviderUrl() + ".";

            LogWriter.addError(harvestSchedule.getProvider().getLogFileName(), errorMsg);
            sendReportEmail(errorMsg);
            throw new RuntimeException(errorMsg);
        }

        // A step without a set (or without a setSpec) has nothing more to check.
        String setSpec = scheduleStep.getSet() == null ? null : scheduleStep.getSet().getSetSpec();
        if (setSpec == null) {
            return;
        }

        // If the provider no longer contains the requested set we can't harvest it.
        boolean setSupported = harvestSchedule.getProvider().getSets().contains(getSetDAO().getBySetSpec(setSpec));
        if (!setSupported) {
            String errorMsg = "The harvest could not be run because the Set " + setSpec
                    + " is no longer supported by the OAI repository " + provider.getOaiProviderUrl() + ".";

            LogWriter.addError(harvestSchedule.getProvider().getLogFileName(), errorMsg);
            sendReportEmail(errorMsg);
            throw new RuntimeException(errorMsg);
        }
    }

    /**
     * Performs one unit of harvest work for the current schedule step: either reads the next XML file
     * of a "file:" provider or sends one OAI ListRecords request to an "http:" provider, then passes
     * the resulting document to parseRecords().
     * <p>
     * The whole unit runs while holding {@code running}. The lock is released in a {@code finally}
     * block, so it is also released on the early return taken when no file is left to harvest and
     * when logError() propagates a failure.
     *
     * @return {@code false} when there is no file left to harvest, when the step index is outside the
     *         list of steps, or when the last step has just been completed; {@code true} otherwise
     */
    public boolean doSomeWork() {
        running.lock();
        try {
            boolean retVal = true;
            requestsSent4Step++;
            log.debug("harvestScheduleSteps.size(): " + harvestScheduleSteps.size());
            if (harvestScheduleStepIndex >= 0 && harvestScheduleStepIndex < harvestScheduleSteps.size()) {
                Provider provider = null;
                try {
                    HarvestScheduleStep scheduleStep = harvestScheduleSteps.get(harvestScheduleStepIndex);

                    String metadataPrefix = null;
                    if (scheduleStep != null && scheduleStep.getFormat() != null) {
                        metadataPrefix = scheduleStep.getFormat().getName();
                    }

                    String setSpec = null;

                    // If there was a set, set up the setSpec
                    if (scheduleStep.getSet() != null)
                        setSpec = scheduleStep.getSet().getSetSpec();

                    HarvestSchedule schedule = scheduleStep.getSchedule();
                    String baseURL = currentHarvest.getProvider().getOaiProviderUrl();

                    LogWriter.addInfo(scheduleStep.getSchedule().getProvider().getLogFileName(),
                            "Starting harvest of " + baseURL);

                    provider = harvestSchedule.getProvider();
                    String request = null;
                    Document doc = null;
                    // NOTE(review): a baseURL matching neither branch below (e.g. "https:") leaves doc null,
                    // and that null is then handed to parseRecords() - confirm whether that is intended.
                    if (baseURL.startsWith("file:")) {
                        File pwd = new File(".");
                        log.debug("pwd: " + pwd.getAbsolutePath());
                        String folderStr = baseURL.substring("file://".length());
                        log.debug("folderStr: " + folderStr);
                        File folder = new File(folderStr);
                        boolean nextOne = false;
                        File file2harvest = null;
                        log.debug("provider.getLastOaiRequest(): " + provider.getLastOaiRequest());
                        log.debug("provider: " + provider);
                        log.debug("provider.hashCode(): " + provider.hashCode());
                        if (!folder.exists()) {
                            throw new RuntimeException("folder " + folder.getAbsolutePath() + " does not exist");
                        }
                        // List the folder once so the null check, the name check and the scan all see
                        // the same snapshot of its content.
                        File[] files = folder.listFiles();
                        if (files != null) {
                            if (provider.getLastOaiRequest() != null) {
                                // Forget the last harvested file name if that file no longer exists.
                                List<String> fileNames = new ArrayList<String>();
                                for (File file : files) {
                                    fileNames.add(file.getName());
                                }
                                if (!fileNames.contains(provider.getLastOaiRequest())) {
                                    provider.setLastOaiRequest(null);
                                }
                            }

                            //TODO note during unit test, file_harvest, sometimes does not find file to harvest/hangs if a *~ file exists, why?
                            // maybe if running test 2x?
                            files = sortFiles(files);
                            // Pick the first .xml file, or the .xml file following the last harvested one.
                            for (File file : files) {
                                log.debug("file.getName(): " + file.getName());
                                log.debug("provider.getLastOaiRequest(): " + provider.getLastOaiRequest());
                                if (!file.getName().endsWith(".xml")) {
                                    continue;
                                } else if (nextOne || provider.getLastOaiRequest() == null) {
                                    file2harvest = file;
                                    break;
                                } else {
                                    if (provider.getLastOaiRequest().equals(file.getName())) {
                                        nextOne = true;
                                    }
                                }
                            }
                        }
                        log.info("file2harvest: " + file2harvest);
                        if (file2harvest == null) {
                            // Nothing left to harvest; the finally block below releases the lock.
                            return false;
                        }
                        provider.setLastOaiRequest(file2harvest.getName());
                        lastOaiRequest = file2harvest.getName();
                        doc = new XmlHelper().getJDomDocument(getUtil().slurp(file2harvest));
                    } else if (baseURL.startsWith("http:")) {
                        String verb = "ListRecords";
                        request = baseURL;

                        String baseRequest = null;

                        // If this is the first request, setup a ListRecords request with the
                        // correct metadataPrefix. If we are supposed harvest a specific set
                        // or use a known from or until parameter, set them here as well.
                        if (resumptionToken == null) {
                            this.recordsProcessedThisRun = 0;
                            this.records2ProcessThisRun = 0;
                            validate(scheduleStep);

                            request += "?verb=" + verb;
                            request += "&metadataPrefix=" + metadataPrefix;

                            if (setSpec != null && setSpec.length() > 0) {
                                // strip off the first part of the setSpec because it's the reponame
                                int idx0 = setSpec.indexOf(':');
                                request += "&set=" + URLEncoder.encode(setSpec.substring(idx0 + 1), "UTF-8");
                            }

                            baseRequest = request;

                            // both of these null checks are pointless at this time as I'll
                            // always be passing in a start and end date
                            if (currentHarvest.getStartTime() != null) {
                                if (Provider.DAY_GRANULARITY.equals(provider.getGranularity())) {
                                    request += "&from=" + printDate(currentHarvest.getStartTime());
                                } else if (Provider.SECOND_GRANULARITY.equals(provider.getGranularity())) {
                                    request += "&from=" + printDateTime(currentHarvest.getStartTime());
                                }
                            }
                            if (currentHarvest.getEndTime() != null) {
                                if (Provider.DAY_GRANULARITY.equals(provider.getGranularity())) {
                                    request += "&until=" + printDate(currentHarvest.getEndTime());
                                } else if (Provider.SECOND_GRANULARITY.equals(provider.getGranularity())) {
                                    request += "&until=" + printDateTime(currentHarvest.getEndTime());
                                }
                            }

                            harvestSchedule.setRequest(baseRequest);
                            harvestSchedule.setStatus(Status.RUNNING);
                            getHarvestScheduleDAO().update(harvestSchedule, false);
                            currentHarvest.setRequest(request);
                            getHarvestDAO().update(currentHarvest);
                        } else {
                            // Follow-up request: only the verb and the (URL-encoded) resumption token are sent.
                            try {
                                resumptionToken = URLEncoder.encode(resumptionToken, "utf-8");
                            } catch (UnsupportedEncodingException uee) {
                                log.error("couldn't encode resumption token: " + resumptionToken);
                            }
                            request += "?verb=" + verb + "&resumptionToken=" + resumptionToken;
                        }

                        LogWriter.addInfo(schedule.getProvider().getLogFileName(), "The OAI request is " + request);

                        if (log.isDebugEnabled()) {
                            log.debug("Sending the OAI request: " + request);
                        }

                        // Perform the harvest
                        TimingLogger.start("sendRequest");
                        doc = getHttpService().sendRequest(request);
                        TimingLogger.stop("sendRequest");

                        provider.setLastOaiRequest(request);
                    }

                    TimingLogger.start("parseRecords");
                    resumptionToken = parseRecords(metadataPrefix, doc, baseURL);
                    log.debug("resumptionToken: " + resumptionToken);
                    TimingLogger.stop("parseRecords");

                    getProviderDAO().update(provider, false);

                    LogWriter.addInfo(scheduleStep.getSchedule().getProvider().getLogFileName(),
                            "Finished harvesting " + baseURL);

                } catch (DataException de) {
                    // logError() ends by handing the throwable to getUtil().throwIt().
                    logError(de);
                    retVal = false;
                } catch (HttpException he) {
                    logError(he);
                } catch (Throwable t) {
                    logError(t);
                }
                hssFirstTime = false;
                retVal = true;
                if (resumptionToken == null) {
                    // No resumption token: this step is finished, move on to the next one.
                    try {
                        if (provider.getNumberOfRecordsToHarvest() == 0) {
                            provider.setLastHarvestEndTime(new Date());
                            getProviderDAO().update(provider, false);
                        }
                    } catch (Throwable t) {
                        LOG.debug("", t);
                    }

                    hssFirstTime = true;
                    harvestScheduleStepIndex++;
                    if (harvestScheduleStepIndex >= harvestScheduleSteps.size()) {
                        retVal = false;
                    }
                }
                if (requestsSent4Step % 10 == 0) {
                    // requestsSent4Step is a positive multiple of 10 here, so the divisor is at least 1.
                    long num = (recordsProcessedThisRun / (requestsSent4Step / 10));
                    LOG.debug("**** Reset with performance, requestsSent4Step=" + requestsSent4Step
                            + " recordsProcessedThisRun=" + num);
                    TimingLogger.reset(num);
                }
            } else {
                retVal = false;
                TimingLogger.reset();
            }
            // A provider with a positive record limit ends the step once that many records were processed.
            if (harvestSchedule.getProvider().getNumberOfRecordsToHarvest() > 0
                    && harvestSchedule.getProvider().getNumberOfRecordsToHarvest() <= this.recordsProcessedThisRun) {
                hssFirstTime = true;
                harvestScheduleStepIndex++;
                if (harvestScheduleStepIndex >= harvestScheduleSteps.size()) {
                    retVal = false;
                }
            }
            repo.commitIfNecessary(false, recordsProcessedThisRun, this.incomingRecordCounts, null);
            return retVal;
        } finally {
            // Always release the lock, including on the early return and on propagated exceptions.
            running.unlock();
        }
    }

    // this is for debug use with a harvest from filesystem, files have format like:
    // 7_969999_970000_6679727.xml
    // 7_974999_975000_6679727.xml
    // must then sort by token after 2nd _
    // also note that the first file gets the special name, initial.xml
    //
    // assumptions: already know files != null
    /**
     * Decides whether the files follow the "a_b_number_..." naming scheme and, if so, delegates to
     * reallySortFiles(). The decision is made from one probe file: the first file, or the second one
     * when the first is the specially named "initial" file.
     *
     * @param files the files to sort; must not be null
     * @return the sorted files, or the given array unchanged when it has fewer than two entries or the
     *         probe name has fewer than three "_"-separated tokens
     */
    private File[] sortFiles(File[] files) {
        if (files.length < 2) {
            return files;
        }
        // Skip over the "initial" file when it happens to come first.
        int probeIndex = files[0].getName().startsWith("initial") ? 1 : 0;
        String probeName = files[probeIndex].getName();
        // At least 2 underscores are needed to sort by the token after the 2nd underscore.
        boolean sortable = new StringTokenizer(probeName, "_").countTokens() >= 3;
        return sortable ? reallySortFiles(files) : files;
    }

    // this is for debug use with a harvest from filesystem, files have format like:
    // 7_969999_970000_6679727.xml
    // 7_974999_975000_6679727.xml
    // must then sort by token after 2nd _
    //
    // by earlier method determined we have files containing 2 '_' characters, sort these by int after 2nd underscore.
    /**
     * Sorts files ascending by the number found after the 2nd underscore of their name. Files whose
     * name starts with "initial" get the key 0 and are placed in front of other files with that key.
     * <p>
     * Files sharing the same key are all kept, in the order they were given. If any name has fewer
     * than three "_"-separated tokens, or its third token is not a number, the problem is logged and
     * the given array is returned unsorted.
     *
     * @param files the files to sort; must not be null
     * @return a new sorted array, or {@code files} itself when a name could not be parsed
     */
    private File[] reallySortFiles(File[] files) {
        // Each key maps to a list of files so that files with equal keys do not overwrite one another.
        TreeMap<Long, List<File>> filesByKey = new TreeMap<Long, List<File>>();
        for (File file : files) {
            boolean isInitial = file.getName().startsWith("initial");
            long sortKey;
            if (isInitial) { // want this one to be 1st.
                sortKey = 0L;
            } else {
                StringTokenizer st = new StringTokenizer(file.getName(), "_");
                try {
                    st.nextToken();
                    st.nextToken();
                    // The third nextToken() is inside the same try: a name with only two tokens is
                    // handled like any other unparsable name instead of escaping as an exception.
                    sortKey = Long.parseLong(st.nextToken());
                } catch (NoSuchElementException e) {
                    LOG.error("HarvestManager, trying to harvest from file, unexpected exception handling file "
                            + file.toString(), e);
                    return files;
                } catch (NumberFormatException nfe) {
                    LOG.error("HarvestManager, trying to harvest from file, unexpected exception handling file "
                            + file.toString(), nfe);
                    return files;
                }
            }

            List<File> bucket = filesByKey.get(sortKey);
            if (bucket == null) {
                bucket = new ArrayList<File>();
                filesByKey.put(sortKey, bucket);
            }
            if (isInitial) {
                bucket.add(0, file);
            } else {
                bucket.add(file);
            }
        }

        // Flatten the buckets in ascending key order.
        List<File> sorted = new ArrayList<File>(files.length);
        for (List<File> bucket : filesByKey.values()) {
            sorted.addAll(bucket);
        }
        return sorted.toArray(new File[sorted.size()]);
    }

    @SuppressWarnings("unchecked")
    protected String parseRecords(String prefix, Document doc, String baseURL) {

        String resumption = null;
        Element root = doc.getRootElement();

        // Check whether or not the response contained an error
        // If it did, throw an exception describing the error
        Element errorEl = root.getChild("error", root.getNamespace());
        if (errorEl != null) {
            String errorCode = errorEl.getAttributeValue("code");
            log.info("errorCode: " + errorCode + " " + errorEl.getText());
            return null;
            // throw new RuntimeException("errorCode: "+errorCode+" "+errorEl.getText());
        }

        Element listRecordsEl = null;
        // Get the verb (ListRecords) element. Try to get it as though it were the child of
        // the root element. If that doesn't work, assume that it is the root element itself
        try {
            listRecordsEl = root.getChild("ListRecords", root.getNamespace());
        } catch (Throwable e) {
            listRecordsEl = root;
        }

        // If the record contained a resumption token, store that resumption token
        Element resumptionEl = listRecordsEl.getChild("resumptionToken", root.getNamespace());

        if (resumptionEl != null) {
            resumption = resumptionEl.getText();
        }
        log.debug("resumption: " + resumption);
        if (!StringUtils.isEmpty(resumption)) {
            try {
                this.records2ProcessThisRun = Integer.parseInt(resumptionEl.getAttributeValue("completeListSize"));
            } catch (Throwable t) {
                this.records2ProcessThisRun = -1;
            }
            log.debug("The resumption string is " + resumption);
        } else {
            resumption = null;
        }

        // Is this a "large" update?
        // If so, we will cache OAI IDs and previous statuses; otherwise, we hit the DB each time
        if (resumption != null) {
            if (this.records2ProcessThisRun >= largeHarvestThreshold) {
                log.info("This is a large update; we will cache OAI IDs (" + this.records2ProcessThisRun + " >= "
                        + largeHarvestThreshold + ").");
                oaiIdCache.ensureCapacity((int) this.records2ProcessThisRun);
                setupCache();
            } else {
                log.info("This is not a large update; we will not need to cache OAI IDs ("
                        + this.records2ProcessThisRun + " < " + largeHarvestThreshold + ").");
            }
        } else {
            log.info(
                    "This is not a large update; we will not need to cache OAI IDs (no resumptionToken; assuming it's a \"small\" update\").");
        }

        // Try to get the element containing the first record. It should be a child of the
        // verb element.
        Element recordEl = null;
        try {
            recordEl = listRecordsEl.getChild("record", root.getNamespace());
        } catch (Throwable e) {
            // Check the response for the request URL
            Element requestUrlElement;
            try {
                requestUrlElement = listRecordsEl.getChild("requestURL", root.getNamespace());
            } catch (Exception e1) {
                LogWriter.addError(currentHarvest.getProvider().getLogFileName(),
                        "The OAI provider returned an invalid response to the ListRecords request.");
                sendReportEmail("The OAI provider returned an invalid response to the ListRecords request.");
                throw new RuntimeException(
                        "The data provider returned an invalid response to the ListRecords request: "
                                + e.getMessage()); // exc. e more interesting than e1?
            }

            // If the response contained the URL, report the error "no records found"
            if (requestUrlElement != null) {
                LogWriter.addInfo(currentHarvest.getProvider().getLogFileName(),
                        "The OAI provider did not return any records");
                sendReportEmail("The OAI provider did not return any records");
                // Return null to show that there were no records returned
                return null;
            }

            // If we got here, the URL element wasn't found. In this
            // case report the error as "invalid OAI response"
            LogWriter.addError(currentHarvest.getProvider().getLogFileName(),
                    "The OAI provider returned an invalid response to the ListRecords request.");
            sendReportEmail("The OAI provider returned an invalid response to the ListRecords request.");
            throw new RuntimeException(
                    "The data provider returned an invalid response to the ListRecords request: " + e.getMessage());
        }

        // Loop over all records in the OAI response
        List<Element> recordsEl = listRecordsEl.getChildren("record", root.getNamespace());
        log.debug("recordsEl.size(): " + recordsEl.size());

        for (Object recordElObj : recordsEl) {
            recordEl = (Element) recordElObj;

            try {
                HarvestScheduleStep scheduleStep = harvestScheduleSteps.get(harvestScheduleStepIndex);
                TimingLogger.start("getRecordService().parse(recordEl)");
                Record record = getRecordService().parse(recordEl, currentHarvest.getProvider());
                TimingLogger.stop("getRecordService().parse(recordEl)");
                record.setFormat(scheduleStep.getFormat());
                record.setHarvest(currentHarvest);
                record.setProvider(currentHarvest.getProvider());

                String oaiId = record.getHarvestedOaiIdentifier();
                Long recordId = getRecordId(oaiId);
                char prevStatus = 0;
                if (recordId == null || recordId == 0) {
                    getRepositoryDAO().injectId(record);
                } else {
                    record.setId(recordId);
                    prevStatus = getPreviousStatus(recordId);
                    log.debug("found prevStatus: " + prevStatus);
                    record.setPreviousStatus(prevStatus);
                }
                cachePreviousStatus(record.getId(), (byte) record.getStatus());
                cacheRecordId(oaiId, record.getId());

                repo.addRecord(record);
                if (record.getSets() != null && record.getSets().size() > 1) {
                    for (Set s : record.getSets()) {
                        if (s.getSetSpec().contains(":")) {
                            incomingRecordCounts.incr(s.getSetTypeShort(), record.getStatus(), prevStatus);
                        }
                    }
                } else {
                    incomingRecordCounts.incr(RecordCounts.OTHER, record.getStatus(), prevStatus);
                }
                incomingRecordCounts.incr(null, record.getStatus(), prevStatus);
            } catch (Exception e) {
                log.error("An error occurred in insertion ", e);
            }
            this.recordsProcessedThisRun++;
        }

        return resumption;
    }

    /**
     * Builds and sends an email report about the harvest to the schedule's notify email address.
     * <p>
     * Nothing is sent when the schedule has no notify address or the mailer reports that it is
     * not configured; that case is logged at debug level.
     *
     * @param problem
     *            The problem which prevented the harvest from finishing, or null if the harvest was successful
     * @return the value returned by {@code mailer.sendEmail(...)} when a send was attempted, or
     *         {@code false} when no email was attempted
     */
    protected boolean sendReportEmail(String problem) {
        final String notifyEmail = harvestSchedule.getNotifyEmail();
        if (notifyEmail == null || !mailer.isConfigured()) {
            // note, after configuring email, seem to have to restart MST for it to work.
            log.debug("HarvestManager.sendReportEmail-mail is not configured right! sendto:"
                    + harvestSchedule.getNotifyEmail() + " isConfigured:" + mailer.isConfigured());
            return false;
        }

        // Resolve the local host name for the subject line. A failed lookup must not abort
        // the report (previously a null address was dereferenced here), so fall back to a
        // placeholder instead.
        String hostName;
        try {
            hostName = InetAddress.getLocalHost().getHostName();
        } catch (UnknownHostException e) {
            log.error("Host name query failed.", e);
            hostName = "unknown host";
        }

        // The email's subject
        String subject = "Results of harvesting " + harvestSchedule.getProvider().getOaiProviderUrl()
                + " by MST Server on " + hostName;

        // The email's body
        StringBuilder body = new StringBuilder();

        // First report any problems which prevented the harvest from finishing
        if (problem != null) {
            body.append("The harvest failed for the following reason: ").append(problem).append("\n\n");
        }

        // NOTE: record counts are currently not included in the body, so a report without a
        // problem is sent with an empty body.

        return mailer.sendEmail(notifyEmail, subject, body.toString());
    }

    /**
     * Returns the current value of the {@code records2ProcessThisRun} field.
     *
     * @return the value of {@code records2ProcessThisRun}
     */
    public long getRecords2ProcessThisRun() {
        final long toProcess = records2ProcessThisRun;
        return toProcess;
    }

    /**
     * Returns the current value of the {@code recordsProcessedThisRun} field.
     *
     * @return the value of {@code recordsProcessedThisRun}
     */
    public long getRecordsProcessedThisRun() {
        final long processed = recordsProcessedThisRun;
        return processed;
    }
}