eionet.cr.harvest.PullHarvest.java Source code

Java tutorial

Introduction

Here is the source code for eionet.cr.harvest.PullHarvest.java

Source

/*
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is Content Registry 3
 *
 * The Initial Owner of the Original Code is European Environment
 * Agency. Portions created by Zero Technologies are Copyright
 * (C) European Environment Agency.  All Rights Reserved.
 *
 * Contributor(s):
 *        Jaanus Heinlaid
 */

package eionet.cr.harvest;

import static eionet.cr.harvest.ResponseCodeUtil.isError;
import static eionet.cr.harvest.ResponseCodeUtil.isNotModified;
import static eionet.cr.harvest.ResponseCodeUtil.isPermanentError;
import static eionet.cr.harvest.ResponseCodeUtil.isRedirect;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.httpclient.HttpConnectionManager;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.log4j.Logger;
import org.openrdf.model.vocabulary.XMLSchema;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.xml.sax.SAXException;

import eionet.cr.common.Predicates;
import eionet.cr.common.TempFilePathGenerator;
import eionet.cr.config.GeneralConfig;
import eionet.cr.dao.DAOException;
import eionet.cr.dao.DAOFactory;
import eionet.cr.dao.HarvestSourceDAO;
import eionet.cr.dao.HelperDAO;
import eionet.cr.dao.PostHarvestScriptDAO;
import eionet.cr.dto.HarvestMessageDTO;
import eionet.cr.dto.HarvestSourceDTO;
import eionet.cr.dto.ObjectDTO;
import eionet.cr.dto.SubjectDTO;
import eionet.cr.filestore.FileStore;
import eionet.cr.harvest.load.ContentLoader;
import eionet.cr.harvest.load.FeedFormatLoader;
import eionet.cr.harvest.load.RDFFormatLoader;
import eionet.cr.harvest.util.EndpointHttpClient;
import eionet.cr.harvest.util.HarvestMessageType;
import eionet.cr.harvest.util.MediaTypeToDcmiTypeConverter;
import eionet.cr.harvest.util.RDFMediaTypes;
import eionet.cr.util.FileDeletionJob;
import eionet.cr.util.Hashes;
import eionet.cr.util.URLUtil;
import eionet.cr.util.Util;
import eionet.cr.util.xml.ConversionsParser;

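/**
 * A harvest that pulls the content from its source. Depending on the source, the content is taken from a file in the
 * local filestore, from a SPARQL endpoint (currently not supported) or from a remote URL.
 *
 * @author Jaanus Heinlaid
 */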
public class PullHarvest extends BaseHarvest {

    /** Pseudo response code used when no HTTP response code has been obtained from the source. */
    private static final int NO_RESPONSE = -1;

    /** Logger of this class. */
    private static final Logger LOGGER = Logger.getLogger(PullHarvest.class);

    /** Maximum number of redirections that one harvest follows before it fails. */
    private static final int MAX_REDIRECTIONS = 4;

    /**
     * Maximum allowed content length.
     * NOTE(review): neither the unit nor the place of enforcement is visible in this part of the file
     * (presumably used by validateContentLength) - confirm.
     */
    private static final int MAX_CONTENTLENGTH = 5;

    /**
     * Comma-separated list of media types: everything in {@link RDFMediaTypes#collection()}, followed by text/xml
     * and a catch-all wildcard with q=0.6.
     * NOTE(review): presumably sent as the HTTP "Accept" header when opening the URL connection - confirm.
     */
    private static final String ACCEPT_HEADER = StringUtils.join(RDFMediaTypes.collection(), ',')
            + ",text/xml,*/*;q=0.6";

    /** Set to true once the source has been found to be available (local file exists, or server said not-modified). */
    private boolean isSourceAvailable;

    /** URLs that answered with a redirection during this harvest, in the order they were visited. */
    private final List<String> redirectedUrls = new ArrayList<String>();

    /**
     * Creates a pull harvest for the source with the given URL. Simply delegates to the superclass constructor.
     *
     * @param contextUrl
     *            URL of the source to harvest
     * @throws HarvestException
     *             if thrown by the superclass constructor
     */
    public PullHarvest(String contextUrl) throws HarvestException {
        super(contextUrl);
    }

    /**
     * Creates a pull harvest for the given harvest source. Simply delegates to the superclass constructor.
     *
     * @param contextSourceDTO
     *            DTO of the source to harvest
     * @throws DAOException
     *             if thrown by the superclass constructor
     */
    public PullHarvest(HarvestSourceDTO contextSourceDTO) throws DAOException {
        super(contextSourceDTO);
    }

    /**
     * Harvests a file that has already been uploaded to a CR folder and resides in the filestore.
     * <p>
     * The media type and title of the file are read from the triples saved about it earlier, the file content is
     * loaded, and the source metadata is refreshed. As there is no HTTP exchange for local files, the response code
     * stays at {@link #NO_RESPONSE} until the content has been loaded, after which it is set to 0.
     *
     * @throws HarvestException
     *             if harvest fails
     */
    private void doLocalFileHarvest() throws HarvestException {

        final String fileUri = getContextUrl();

        // This local never receives a value: there is no server that could have sent a response message.
        final String responseMessage = null;
        httpResponseCode = NO_RESPONSE;

        try {
            LOGGER.debug(loggerMsg("Opening connection to local file"));

            File localFile = FileStore.getByUri(fileUri);
            if (localFile == null) {
                // a missing file is recorded as a failed harvest, but it is not thrown to the caller
                finishWithError(NO_RESPONSE, "The file does not exist",
                        new HarvestException("The file does not exist"));
                return;
            }

            isSourceAvailable = true;

            // media type and title come from the triples that were saved about this file earlier
            SubjectDTO factsheet = DAOFactory.get().getDao(HelperDAO.class).getFactsheet(fileUri, null, null);
            String mediaType = null;
            String title = null;
            if (factsheet != null) {
                mediaType = factsheet.getObjectValue(Predicates.CR_MEDIA_TYPE);
                title = factsheet.getObjectValue(Predicates.DC_TITLE);
            }

            int tripleCount = processLocalContent(localFile, mediaType);
            setStoredTriplesCount(tripleCount);
            LOGGER.debug(loggerMsg(tripleCount + " triples loaded"));

            // re-add the metadata that was read above, plus the file's size and modification time
            if (!StringUtils.isBlank(title)) {
                addSourceMetadata(Predicates.DC_TITLE, ObjectDTO.createLiteral(title));
            }
            if (!StringUtils.isBlank(mediaType)) {
                addSourceMetadata(Predicates.CR_MEDIA_TYPE, ObjectDTO.createLiteral(mediaType));
            }
            String byteSize = String.valueOf(localFile.length());
            addSourceMetadata(Predicates.CR_BYTE_SIZE, ObjectDTO.createLiteral(byteSize));
            String lastModified = formatDate(new Date());
            addSourceMetadata(Predicates.CR_LAST_MODIFIED, ObjectDTO.createLiteral(lastModified, XMLSchema.DATETIME));

            httpResponseCode = 0;
            finishWithOK(null, tripleCount);

        } catch (Exception e) {

            LOGGER.debug(loggerMsg("Exception occurred (will be further logged by caller below): " + e.toString()));

            // let the superclass decide from the cause whether this failure is a fatal one
            checkAndSetFatalExceptionFlag(e.getCause());

            // a failure while recording the error must not hide the original exception
            try {
                finishWithError(httpResponseCode, responseMessage, e);
            } catch (RuntimeException finishingException) {
                LOGGER.error("Error when finishing up: ", finishingException);
            }

            if (!(e instanceof HarvestException)) {
                throw new HarvestException(e.getMessage(), e);
            }
            throw (HarvestException) e;
        }
    }

    /**
     * Harvesting of SPARQL endpoints. Currently disabled: the method unconditionally throws
     * {@link UnsupportedOperationException}. The previous implementation is kept below, commented out, for reference.
     *
     * @throws HarvestException
     *             declared for callers, but never thrown by the current body
     * @throws UnsupportedOperationException
     *             always
     */
    private void doEndpointHarvest() throws HarvestException {

        throw new UnsupportedOperationException(
                "Method not supported due to backward compatibility issues with Sesame 2.7.x!");

        // Previous implementation, kept for reference only:
        //
        //        int numberOfTriples = 0;
        //        EndpointHttpClient httpClient = prepareEndpointHttpClient();
        //
        //        GraphQueryResult queryResult = null;
        //        RepositoryConnection localConn = null;
        //        RepositoryConnection endpointConn = null;
        //        try {
        //            // First see if this particular endpoint has any active harvest queries mapped to it at all.
        //            String endpointUrl = getContextUrl();
        //            List<EndpointHarvestQueryDTO> queries =
        //                    DAOFactory.get().getDao(EndpointHarvestQueryDAO.class).listByEndpointUrl(endpointUrl, true);
        //            if (queries == null || queries.isEmpty()) {
        //                LOGGER.warn(loggerMsg("Found no active harvest queries for this endpoint"));
        //                return;
        //            }
        //
        //            // Prepare remote repository connection
        //            SPARQLRepository sparqlRepository = new SPARQLRepository(endpointUrl);
        //            endpointConn = sparqlRepository.getConnection();
        //
        //            // Prepare local repository connection
        //            localConn = SesameUtil.getRepositoryConnection();
        //            localConn.setAutoCommit(false);
        //
        //            // Prepare local repository value factory
        //            ValueFactory vf = localConn.getValueFactory();
        //            org.openrdf.model.URI graphURI = vf.createURI(endpointUrl);
        //
        //            // Loop through the harvest queries, execute each one of them on the remote repository,
        //            // write the returned statements straight into local repository.
        //            for (EndpointHarvestQueryDTO queryDTO : queries) {
        //
        //                LOGGER.debug(loggerMsg("Executing endpoint harvest query with id = " + queryDTO.getId()));
        //                GraphQuery graphQuery = new SPARQLGraphQuery(httpClient, endpointUrl, queryDTO.getQuery());
        //
        //                // Note that the returned GraphQueryResult is always a org.openrdf.repository.sparql.query.BackgroundGraphResult
        //                // for remote endpoints, and it's a result that returns its statements (i.e. triples) AS THEY ARE PARSED.
        //                // i.e. the statements ARE NOT written to the memory first and then passed back. So no memory problems here, and
        //                // safe to use it this way.
        //                queryResult = graphQuery.evaluate();
        //                if (queryResult != null) {
        //                    while (queryResult.hasNext()) {
        //
        //                        // Clear the graph just before first insert.
        //                        if (numberOfTriples == 0) {
        //                            localConn.clear(graphURI);
        //                        }
        //                        localConn.add(queryResult.next(), graphURI);
        //                        numberOfTriples++;
        //                    }
        //                }
        //            }
        //
        //            localConn.commit();
        //
        //            setStoredTriplesCount(numberOfTriples);
        //            LOGGER.debug(loggerMsg("All queries executed, total of " + numberOfTriples + " triples loaded"));
        //            finishWithOK(null, numberOfTriples);
        //
        //        } catch (Exception e) {
        //
        //            SesameUtil.rollback(localConn);
        //            LOGGER.debug(loggerMsg("Exception occurred (will be further logged by caller below): " + e.toString()));
        //
        //            checkAndSetFatalExceptionFlag(e.getCause());
        //            try {
        //                finishWithError(httpClient.getLastExecutionResponseCode(), httpClient.getLastExecutionResponseText(), e);
        //            } catch (RuntimeException finishingException) {
        //                LOGGER.error("Error when finishing up: ", finishingException);
        //            }
        //
        //            if (e instanceof HarvestException) {
        //                throw (HarvestException) e;
        //            } else {
        //                throw new HarvestException(e.getMessage(), e);
        //            }
        //        } finally {
        //            SesameUtil.close(queryResult);
        //            SesameUtil.close(localConn);
        //            SesameUtil.close(endpointConn);
        //        }
    }

    /**
     * Builds the HTTP client for talking to a SPARQL endpoint.
     * <p>
     * Connection and socket timeouts are both taken from the {@link GeneralConfig#HARVESTER_HTTP_TIMEOUT} property,
     * with this harvest's own timeout as the fallback. The client identifies itself with
     * {@link URLUtil#userAgentHeader()}, follows at most {@link #MAX_REDIRECTIONS} redirections and is given a
     * "Connection: close" additional header.
     *
     * @return the configured client
     */
    private EndpointHttpClient prepareEndpointHttpClient() {

        // connection manager: one timeout value serves both for connecting and for reading
        int timeout = GeneralConfig.getIntProperty(GeneralConfig.HARVESTER_HTTP_TIMEOUT, getTimeout());
        HttpConnectionManagerParams connectionParams = new HttpConnectionManagerParams();
        connectionParams.setDefaultMaxConnectionsPerHost(20);
        connectionParams.setStaleCheckingEnabled(false);
        connectionParams.setConnectionTimeout(timeout);
        connectionParams.setSoTimeout(timeout);

        HttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
        connectionManager.setParams(connectionParams);

        // client: user agent, redirection limit and the additional headers
        HashMap<String, String> additionalHeaders = new HashMap<String, String>();
        additionalHeaders.put("Connection", "close");

        HttpClientParams httpParams = new HttpClientParams();
        httpParams.setParameter("http.useragent", URLUtil.userAgentHeader());
        httpParams.setParameter("http.protocol.max-redirects", MAX_REDIRECTIONS);
        httpParams.setParameter("additionalHTTPHeaders", additionalHeaders);

        return new EndpointHttpClient(httpParams, connectionManager);
    }

    /**
     * Harvests an external source over HTTP.
     * <p>
     * The connection is opened to the context URL and redirections are followed, at most {@link #MAX_REDIRECTIONS}
     * of them. Every redirection to a URL that is not essentially the same as the current context URL finishes the
     * current harvest as "redirected" and continues with the redirected-to URL as the new context. Once a
     * non-redirection response is received, the content is downloaded and loaded, unless the server answered with
     * not-modified or with an error code.
     * <p>
     * A failure to obtain the response code is treated as a temporary error: it is recorded, but not thrown.
     *
     * @throws HarvestException
     *             if harvest fails
     */
    private void doUrlHarvest() throws HarvestException {

        String initialContextUrl = getContextUrl();
        HttpURLConnection urlConn = null;
        httpResponseCode = NO_RESPONSE;
        String responseMessage = null;
        int noOfRedirections = 0;

        try {
            String connectUrl = getContextUrl();
            do {
                String message = "Opening URL connection";
                if (!connectUrl.equals(getContextUrl())) {
                    message = message + " to " + connectUrl;
                }
                LOGGER.debug(loggerMsg(message));

                urlConn = openUrlConnection(connectUrl);

                try {
                    httpResponseCode = urlConn.getResponseCode();
                    responseMessage = urlConn.getResponseMessage();
                } catch (IOException ioe) {
                    // an error when connecting to server is considered a temporary error-
                    // don't throw it, but log in the database and exit
                    LOGGER.debug("Error when connecting to server: " + ioe);
                    finishWithError(NO_RESPONSE, null, ioe);
                    return;
                }

                // Throws exception when the content-length indicated in HTTP response is more than the maximum allowed.
                validateContentLength(urlConn);

                // Handle redirection.
                if (isRedirect(httpResponseCode)) {

                    noOfRedirections++;

                    // if number of redirections more than maximum allowed, throw exception
                    if (noOfRedirections > MAX_REDIRECTIONS) {
                        throw new TooManyRedirectionsException(
                                "Too many redirections, originally started from " + initialContextUrl);
                    }

                    // Get redirected-to-url and remember the redirecting URL and source.
                    // NOTE(review): the current source is recorded as redirected even when the redirection is
                    // rejected or ignored below; on a later error finishWithError() then updates the same DTO both
                    // as the context source and as a redirected source - confirm this is intended.
                    String redirectedToUrl = getRedirectUrl(urlConn);
                    redirectedUrls.add(connectUrl);
                    redirectedHarvestSources.add(getContextSourceDTO());

                    // a redirection without the target location cannot be followed
                    if (StringUtils.isBlank(redirectedToUrl)) {
                        throw new NoRedirectLocationException(
                                "Redirection response code without \"Location\" header!");
                    }
                    LOGGER.debug(loggerMsg(connectUrl + " redirects to " + redirectedToUrl));

                    // treat this as a redirection only if the context URL and the redirected-to-URL
                    // are not essentially the same
                    if (!URLUtil.equalUrls(getContextUrl(), redirectedToUrl)) {

                        finishRedirectedHarvest(redirectedToUrl, httpResponseCode);

                        LOGGER.debug(loggerMsg("Redirection details saved"));
                        startWithNewContext(redirectedToUrl);
                    } else {
                        LOGGER.debug(loggerMsg("Ignoring this redirection, as it is essentially to the same URL"));
                    }

                    connectUrl = redirectedToUrl;
                    // Close redirected URL connection
                    URLUtil.disconnect(urlConn);
                }
            } while (isRedirect(httpResponseCode));

            // if URL connection returned no errors and its content has been modified since last harvest,
            // proceed to downloading
            if (!isError(httpResponseCode) && !isNotModified(httpResponseCode)) {

                int noOfTriples = downloadAndProcessContent(urlConn);
                setStoredTriplesCount(noOfTriples);
                LOGGER.debug(loggerMsg(noOfTriples + " triples loaded"));
                finishWithOK(urlConn, noOfTriples);

            } else if (isNotModified(httpResponseCode)) {
                LOGGER.debug(loggerMsg("Source not modified since last harvest"));
                finishWithNotModified(urlConn, 0);

            } else if (isError(httpResponseCode)) {
                LOGGER.debug(loggerMsg("Server returned error code " + httpResponseCode));
                finishWithError(httpResponseCode, responseMessage, null);
            }
        } catch (Exception e) {

            LOGGER.debug(loggerMsg("Exception occurred (will be further logged by caller below): " + e.toString()));

            // let the superclass decide from the cause whether this failure is a fatal one
            checkAndSetFatalExceptionFlag(e.getCause());

            // a failure while recording the error must not hide the original exception
            try {
                finishWithError(httpResponseCode, responseMessage, e);
            } catch (RuntimeException finishingException) {
                LOGGER.error("Error when finishing up: ", finishingException);
            }
            if (e instanceof HarvestException) {
                throw (HarvestException) e;
            } else {
                throw new HarvestException(e.getMessage(), e);
            }
        } finally {
            // the last opened connection is closed here, whatever the outcome
            URLUtil.disconnect(urlConn);
        }
    }

    /**
     * Picks the harvesting strategy by the kind of the source: a file in the local filestore, a SPARQL endpoint, or
     * (in all other cases) a remote URL.
     *
     * @see eionet.cr.harvest.BaseHarvest#doHarvest()
     */
    @Override
    protected void doHarvest() throws HarvestException {

        // files residing in the local filestore folder are not fetched over HTTP
        if (FileStore.isFileStoreUri(getContextUrl())) {
            doLocalFileHarvest();
            return;
        }

        if (getContextSourceDTO().isSparqlEndpoint()) {
            doEndpointHarvest();
            return;
        }

        doUrlHarvest();
    }

    /**
     * Finishes a successful harvest: records the result in the context source DTO, adds the source metadata and
     * requests that the previously harvested metadata of the source be cleaned.
     *
     * @param urlConn
     *            connection the content was obtained from, may be null (as for local files)
     * @param noOfTriples
     *            number of triples that were harvested
     */
    private void finishWithOK(HttpURLConnection urlConn, int noOfTriples) {

        // the harvest succeeded, so all failure indicators of the source are reset
        HarvestSourceDTO source = getContextSourceDTO();
        source.setStatements(noOfTriples);
        source.setLastHarvest(new Date());
        source.setLastHarvestFailed(false);
        source.setPermanentError(false);
        source.setCountUnavail(0);

        // response code 0 with neither message nor exception: no error metadata gets added
        addSourceMetadata(urlConn, 0, null, null);

        // fresh metadata was generated above, the old one is to be replaced
        setCleanAllPreviousSourceMetadata(true);
    }

    /**
     * Finishes a harvest whose source reported that it has not been modified since the last harvest. The source is
     * marked as available and successfully harvested, its statement count and previously harvested metadata are
     * kept, and only the cr:lastRefreshed metadata is renewed.
     *
     * @param urlConn
     *            connection to the source (not used by this method)
     * @param noOfTriples
     *            number of harvested triples (not used by this method)
     */
    private void finishWithNotModified(HttpURLConnection urlConn, int noOfTriples) {

        addHarvestMessage("Source not modified since last harvest", HarvestMessageType.INFO);
        isSourceAvailable = true;

        // the statement count is deliberately left as it is: nothing new was harvested
        HarvestSourceDTO source = getContextSourceDTO();
        source.setLastHarvest(new Date());
        source.setLastHarvestFailed(false);
        source.setPermanentError(false);
        source.setCountUnavail(0);

        // keep the old metadata, renew only the time of the last refresh
        setCleanAllPreviousSourceMetadata(false);
        String refreshedAt = formatDate(new Date());
        addSourceMetadata(Predicates.CR_LAST_REFRESHED, ObjectDTO.createLiteral(refreshedAt, XMLSchema.DATETIME));
    }

    /**
     * Finishes a failed harvest: updates the context source DTO (and the DTOs of the sources that redirected to it),
     * adds harvest messages about the failure and adds the resulting source metadata.
     * <p>
     * On a permanent error the previously harvested metadata of the source is scheduled for cleaning, and unless the
     * source is a priority source, its previously harvested content is deleted too.
     *
     * @param responseCode
     *            HTTP response code, or {@link #NO_RESPONSE} if none was obtained
     * @param responseMessage
     *            HTTP response message, may be null
     * @param exception
     *            the exception that failed the harvest, may be null
     */
    private void finishWithError(int responseCode, String responseMessage, Exception exception) {

        final boolean errorCode = isError(responseCode);
        final boolean permanentError = isPermanentError(responseCode);

        // the source counts as unavailable if it gave no response, an error code, or content that failed RDF parsing
        boolean rdfParsingFailed = false;
        if (exception != null) {
            rdfParsingFailed = exception.getCause() instanceof RDFParseException;
        }
        boolean sourceNotAvailable = responseCode == NO_RESPONSE || errorCode || rdfParsingFailed;

        // the unavailability count grows by one for an unavailable source, otherwise it is reset
        HarvestSourceDTO source = getContextSourceDTO();
        int countUnavail = 0;
        if (sourceNotAvailable) {
            countUnavail = source.getCountUnavail() + 1;
        }

        // permanent errors get "now" as the last harvest time, temporary ones a specially calculated time
        Date now = new Date();
        Date lastHarvest;
        if (permanentError) {
            lastHarvest = now;
        } else {
            lastHarvest = temporaryErrorLastHarvest(now);
        }

        int noOfStatements = source.getStatements();
        if (permanentError) {

            setCleanAllPreviousSourceMetadata(true);

            // content of priority sources is preserved even after a permanent error
            if (!source.isPrioritySource()) {
                try {
                    getHarvestSourceDAO().clearGraph(getContextUrl());
                    noOfStatements = 0;
                } catch (DAOException e) {
                    LOGGER.error("Failed to delete previous content after permanent error", e);
                }
            }
        }

        source.setStatements(noOfStatements);
        source.setLastHarvest(lastHarvest);
        source.setLastHarvestFailed(true);
        source.setPermanentError(permanentError);
        source.setCountUnavail(countUnavail);

        // the sources that redirected to this one receive the same error indicators
        handleRedirectedHarvestDTOs(lastHarvest, responseCode, sourceNotAvailable);

        // harvest message about the exception, carriage returns stripped from its stack trace
        if (exception != null) {
            String message = exception.getMessage();
            if (message == null) {
                message = exception.toString();
            }
            String stackTrace = StringUtils.replace(Util.getStackTrace(exception), "\r", "");
            addHarvestMessage(message, HarvestMessageType.ERROR, stackTrace);
        }

        // harvest message about the response code, but only if it really is an error code
        // (with a "no response" code the exception above is all there is to report)
        String reportedMessage = responseMessage;
        if (errorCode) {
            if (reportedMessage == null) {
                reportedMessage = "";
            }
            String text = "Server returned error: " + reportedMessage + " (HTTP response code: " + responseCode + ")";
            addHarvestMessage(text, HarvestMessageType.ERROR);
        }

        addSourceMetadata(null, responseCode, reportedMessage, exception);
    }

    /**
     * Applies the error indicators of a failed harvest to every source recorded in {@code redirectedHarvestSources}.
     *
     * @param lastHarvest
     *            last harvest time to set
     * @param responseCode
     *            HTTP response code of the failed harvest
     * @param sourceNotAvailable
     *            true if the harvested source was found to be unavailable
     */
    private void handleRedirectedHarvestDTOs(Date lastHarvest, int responseCode, boolean sourceNotAvailable) {

        for (HarvestSourceDTO redirectingSource : redirectedHarvestSources) {
            setErrorsToRedirectedHarvestDTO(redirectingSource, lastHarvest, responseCode, sourceNotAvailable);
        }
    }

    /**
     * Stores the error indicators of a failed harvest in the given source DTO. The statement count of the source is
     * set to zero.
     *
     * @param harvestSourceDTO
     *            the source DTO to update
     * @param lastHarvest
     *            last harvest time to set
     * @param responseCode
     *            HTTP response code, decides whether the error is flagged as permanent
     * @param sourceNotAvailable
     *            if true, the unavailability count of the source is increased by one, otherwise it is reset to zero
     */
    private void setErrorsToRedirectedHarvestDTO(HarvestSourceDTO harvestSourceDTO, Date lastHarvest,
            int responseCode, boolean sourceNotAvailable) {

        int newCountUnavail = 0;
        if (sourceNotAvailable) {
            newCountUnavail = harvestSourceDTO.getCountUnavail() + 1;
        }

        harvestSourceDTO.setStatements(0);
        harvestSourceDTO.setLastHarvest(lastHarvest);
        harvestSourceDTO.setLastHarvestFailed(true);
        harvestSourceDTO.setPermanentError(isPermanentError(responseCode));
        harvestSourceDTO.setCountUnavail(newCountUnavail);
    }

    /**
     * Calculates the {@link Date} to which the last harvest time of the source is set after a temporary harvest
     * error: "now - harvest_interval + max(harvest_interval * 0.1, 120 min)", but never later than "now".
     *
     * @param now The "now" of the above formula.
     * @return The calculated last harvest time.
     */
    private Date temporaryErrorLastHarvest(Date now) {

        int intervalMinutes = getContextSourceDTO().getIntervalMinutes();

        // The part added back to "now - interval": a tenth of the interval, but no less than two hours.
        int addedMinutes = (intervalMinutes * 10) / 100;
        if (addedMinutes < 120) {
            addedMinutes = 120;
        }

        Calendar calendar = Calendar.getInstance();
        calendar.setTime(now);
        calendar.add(Calendar.MINUTE, -1 * intervalMinutes);
        calendar.add(Calendar.MINUTE, addedMinutes);

        // With an interval shorter than the added part the result would land in the future, hence the cap.
        Date calculated = calendar.getTime();
        return calculated.after(now) ? now : calculated;
    }

    /**
     * Adds the metadata that results from this harvest of the context source: the first-seen and last-refreshed
     * times, the error message (if the harvest failed) and, when a connection is given, the content's media type,
     * last modification time and size.
     *
     * @param urlConn
     *            connection the content was obtained from; if null, no content-related metadata is added
     * @param responseCode
     *            HTTP response code; if it is an error code, an error message is generated from it
     * @param responseMessage
     *            HTTP response message, null is treated as an empty string
     * @param exception
     *            exception that failed the harvest; its string form becomes the error message, unless the response
     *            code already is an error code; may be null
     */
    private void addSourceMetadata(HttpURLConnection urlConn, int responseCode, String responseMessage,
            Exception exception) {

        String firstSeen = formatDate(getContextSourceDTO().getTimeCreated());
        String lastRefreshed = formatDate(new Date());

        addSourceMetadata(Predicates.CR_FIRST_SEEN, ObjectDTO.createLiteral(firstSeen, XMLSchema.DATETIME));
        addSourceMetadata(Predicates.CR_LAST_REFRESHED, ObjectDTO.createLiteral(lastRefreshed, XMLSchema.DATETIME));

        // an error response code takes precedence over the exception as the source of the error message
        if (isError(responseCode)) {
            if (responseMessage == null) {
                responseMessage = "";
            }
            addSourceMetadata(Predicates.CR_ERROR_MESSAGE, ObjectDTO.createLiteral(
                    "Server returned error: " + responseMessage + " (HTTP response code: " + responseCode + ")"));
        } else if (exception != null) {
            addSourceMetadata(Predicates.CR_ERROR_MESSAGE, ObjectDTO.createLiteral(exception.toString()));
        }

        if (urlConn != null) {

            // content type
            String contentType = getSourceContentType(urlConn);
            if (!StringUtils.isBlank(contentType)) {

                addSourceMetadata(Predicates.CR_MEDIA_TYPE, ObjectDTO.createLiteral(contentType));

                // If content type is not "application/rdf+xml", generate rdf:type from the DublinCore type mappings.
                // The comparison is case-insensitive and must not depend on the default locale: with a Turkish
                // locale the plain toLowerCase() does not turn "I" into "i", and the prefix would not be recognised.
                String lowerCaseType = contentType.toLowerCase(java.util.Locale.ENGLISH);
                if (!lowerCaseType.startsWith("application/rdf+xml")) {

                    String rdfType = MediaTypeToDcmiTypeConverter.getDcmiTypeFor(contentType);
                    if (rdfType != null) {
                        addSourceMetadata(Predicates.RDF_TYPE, ObjectDTO.createResource(rdfType));
                    }
                }
            }

            // content's last modification, skipped when the connection reports no positive timestamp
            long contentLastModified = urlConn.getLastModified();
            if (contentLastModified > 0) {
                String lastModifString = formatDate(new Date(contentLastModified));
                addSourceMetadata(Predicates.CR_LAST_MODIFIED,
                        ObjectDTO.createLiteral(lastModifString, XMLSchema.DATETIME));
            }

            // content size, skipped when the connection reports a negative length
            int contentLength = urlConn.getContentLength();
            if (contentLength >= 0) {
                addSourceMetadata(Predicates.CR_BYTE_SIZE, ObjectDTO.createLiteral(contentLength));
            }
        }

    }

    /**
     * Downloads the content behind the given connection and loads it.
     * <p>
     * If a content loader can be created for the connection (the content is in RDF or web feed format), the
     * downloaded file is loaded as it is. Otherwise the file is first processed into RDF, as it could be a zipped
     * RDF, an XML with an RDF conversion, N3, or an RDF file whose real type was not declared by the server. Both
     * the downloaded and the processed file are registered for deletion, whatever the outcome.
     *
     * @param urlConn
     *            - connection to the remote source.
     * @return number of triples harvested, 0 if the file could not be processed into RDF.
     *
     * @throws IOException
     * @throws DAOException
     * @throws SAXException
     * @throws RDFParseException
     *             if RDF parsing fails while analyzing file with unknown format
     * @throws RDFHandlerException
     *             if RDF parsing fails while analyzing file with unknown format
     */
    private int downloadAndProcessContent(HttpURLConnection urlConn)
            throws IOException, DAOException, SAXException, RDFHandlerException, RDFParseException {

        File rawFile = null;
        try {
            rawFile = downloadFile(urlConn);

            // A loader is available only for content that can be loaded straight away.
            ContentLoader directLoader = createContentLoader(urlConn);
            if (directLoader != null) {
                directLoader.setTimeout(getTimeout());
                LOGGER.debug(loggerMsg("Downloaded file is in RDF or web feed format"));
                return loadFile(rawFile, directLoader);
            }

            LOGGER.debug(loggerMsg("Downloaded file is not in RDF or web feed format, processing the file further"));

            File rdfFile = null;
            try {
                FileToRdfProcessor processor = new FileToRdfProcessor(rawFile, getContextUrl());
                rdfFile = processor.process();

                // Without a resulting file or a detected RDF format there is nothing to load.
                if (rdfFile == null || processor.getRdfFormat() == null) {
                    LOGGER.debug(loggerMsg("File couldn't be processed into RDF format"));
                    return 0;
                }

                LOGGER.debug(loggerMsg("File processed into RDF format"));
                ContentLoader rdfLoader = new RDFFormatLoader(processor.getRdfFormat());
                rdfLoader.setTimeout(getTimeout());
                return loadFile(rdfFile, rdfLoader);
            } finally {
                FileDeletionJob.register(rdfFile);
            }
        } finally {
            FileDeletionJob.register(rawFile);
        }
    }

    /**
     * Wraps up a harvest whose context URL turned out to redirect elsewhere. In this order: marks the context source
     * and the current harvest as finished with zero statements, stores an informational harvest message about the
     * redirection, replaces the context source's harvester-generated metadata with redirection metadata, creates a
     * harvest source for the redirection target if there is none yet, and deletes the context source's old harvests.
     *
     * @param redirectedToUrl
     *            URL that the context source redirects to
     * @param responseCode
     *            HTTP Code from the redirected URL
     * @throws DAOException
     *             propagated from the DAO calls made by this method
     */
    private void finishRedirectedHarvest(String redirectedToUrl, int responseCode) throws DAOException {

        Date seenAt = new Date();

        // Context source: this harvest did not fail, it simply yielded no statements.
        getContextSourceDTO().setLastHarvest(seenAt);
        getContextSourceDTO().setLastHarvestFailed(false);
        getContextSourceDTO().setStatements(0);
        getContextSourceDTO().setLastHarvestId(getHarvestId());
        getHarvestSourceDAO().updateSourceHarvestFinished(getContextSourceDTO());

        // Current harvest: finished, with 0 harvested triples.
        getHarvestDAO().updateFinishedHarvest(getHarvestId(), 0, responseCode);

        // Leave an informational message about the redirection on the current harvest.
        String redirectionText = getContextUrl() + "  redirects to  " + redirectedToUrl;
        HarvestMessageDTO redirectionMessage =
                HarvestMessageDTO.create(redirectionText, HarvestMessageType.INFO, null);
        redirectionMessage.setHarvestId(getHarvestId());
        getHarvestMessageDAO().insertHarvestMessage(redirectionMessage);

        // Swap the context source's metadata in the harvester graph for the redirection metadata.
        getHarvestSourceDAO().deleteSubjectTriplesInSource(getContextUrl(), GeneralConfig.HARVESTER_URI);
        SubjectDTO redirectionMetadata = createRedirectionMetadata(getContextSourceDTO(), seenAt, redirectedToUrl);
        getHelperDAO().addTriples(redirectionMetadata);

        // Make sure the redirection target is itself a harvest source.
        HarvestSourceDTO targetSource = getHarvestSource(redirectedToUrl);
        if (targetSource == null) {

            LOGGER.debug(loggerMsg("Creating harvest source for " + redirectedToUrl));

            // The target starts as a copy of the context source
            // (no null-checking, i.e. assuming the context source already exists) ...
            targetSource = getContextSourceDTO().clone();

            // ... with its own URL, URL hash and creation time.
            targetSource.setUrl(redirectedToUrl);
            targetSource.setUrlHash(Long.valueOf(Hashes.spoHash(redirectedToUrl)));
            targetSource.setTimeCreated(seenAt);

            getHarvestSourceDAO().addSource(targetSource);
        }

        // Finally drop the context source's old harvests history.
        LOGGER.debug(loggerMsg("Deleting old redirected harvests history"));
        getHarvestDAO().deleteOldHarvests(getContextSourceDTO().getSourceId());
    }

    /**
     * Builds the metadata that describes a redirected source. The returned subject is identified by the source's URL
     * and carries three objects, all attributed to the harvester URI: the "first seen" time (the source's creation
     * time), the "last refreshed" time (when the redirection was seen) and the resource the source redirects to.
     *
     * @param sourceDTO
     *            the source that redirects
     * @param redirectionSeen
     *            time when the redirection was observed
     * @param redirectedToUrl
     *            URL the source redirects to
     * @return subject holding the redirection metadata
     */
    private SubjectDTO createRedirectionMetadata(HarvestSourceDTO sourceDTO, Date redirectionSeen,
            String redirectedToUrl) {

        String firstSeenValue = formatDate(sourceDTO.getTimeCreated());
        String lastRefreshedValue = formatDate(redirectionSeen);

        SubjectDTO metadata = new SubjectDTO(sourceDTO.getUrl(), false);
        String harvesterUri = eionet.cr.config.GeneralConfig.HARVESTER_URI;

        ObjectDTO firstSeen = ObjectDTO.createLiteral(firstSeenValue, XMLSchema.DATETIME);
        firstSeen.setSourceUri(harvesterUri);
        metadata.addObject(Predicates.CR_FIRST_SEEN, firstSeen);

        ObjectDTO lastRefreshed = ObjectDTO.createLiteral(lastRefreshedValue, XMLSchema.DATETIME);
        lastRefreshed.setSourceUri(harvesterUri);
        metadata.addObject(Predicates.CR_LAST_REFRESHED, lastRefreshed);

        ObjectDTO redirectedTo = ObjectDTO.createResource(redirectedToUrl);
        redirectedTo.setSourceUri(harvesterUri);
        metadata.addObject(Predicates.CR_REDIRECTED_TO, redirectedTo);

        return metadata;
    }

    /**
     * Download file from remote source to a temporary file locally. Side effects: flags the source as available as
     * soon as the connection's input stream has been obtained, and adds the file size to the metadata to save in the
     * harvester context.
     * <p>
     * The streams are closed and the connection is disconnected in all cases. If the download does not complete for
     * any reason (I/O or runtime failure), the temporary file is registered for deletion before the exception
     * propagates.
     *
     * @param urlConn
     *            - connection to the remote source.
     * @return object representing the temporary file.
     * @throws IOException
     *             if the file is not downloadable.
     */
    private File downloadFile(HttpURLConnection urlConn) throws IOException {

        LOGGER.debug(loggerMsg("Downloading file"));

        InputStream inputStream = null;
        OutputStream outputStream = null;
        File file = TempFilePathGenerator.generate();
        boolean downloaded = false;
        try {
            outputStream = new FileOutputStream(file);
            inputStream = urlConn.getInputStream();
            isSourceAvailable = true;

            // copyLarge() reports the count as a long; IOUtils.copy() returns -1 once the stream exceeds
            // Integer.MAX_VALUE bytes, which would end up as a bogus byte size in the metadata.
            long bytesCopied = IOUtils.copyLarge(inputStream, outputStream);

            // add number of bytes to source metadata, unless it's already there
            addSourceMetadata(Predicates.CR_BYTE_SIZE, ObjectDTO.createLiteral(String.valueOf(bytesCopied)));

            downloaded = true;
        } finally {
            // Covers runtime failures too, not just IOException, so that no temporary file is left behind.
            if (!downloaded) {
                FileDeletionJob.register(file);
            }
            IOUtils.closeQuietly(inputStream);
            IOUtils.closeQuietly(outputStream);
            URLUtil.disconnect(urlConn);
        }

        return file;
    }

    /**
     * Creates (without connecting) an HTTP connection to the given URL, configured for harvesting: fragment part
     * dropped and spaces escaped in the URL, "Accept", "User-Agent" and "Connection: close" request headers set,
     * automatic redirect following switched off, and connect/read timeouts applied.
     * <p>
     * Unless this is an on-demand harvest, "If-Modified-Since" is set to the context source's last harvest time,
     * provided that such a time exists and neither the URL's conversion stylesheet nor the post-harvest scripts have
     * been modified since then.
     *
     * @param connectUrl
     *            the URL to connect to
     * @return the prepared connection
     * @throws IOException
     *             if the connection cannot be opened, or propagated from the conversion stylesheet lookup
     * @throws DAOException
     *             propagated from the DAO calls made by this method
     * @throws SAXException
     *             propagated from the conversion stylesheet lookup
     * @throws ParserConfigurationException
     *             propagated from the conversion stylesheet lookup
     */
    private HttpURLConnection openUrlConnection(String connectUrl)
            throws IOException, DAOException, SAXException, ParserConfigurationException {

        String sanitizedUrl = StringUtils.replace(StringUtils.substringBefore(connectUrl, "#"), " ", "%20");

        HttpURLConnection connection = (HttpURLConnection) new URL(sanitizedUrl).openConnection();
        connection.setRequestProperty("Accept", ACCEPT_HEADER);
        connection.setRequestProperty("User-Agent", URLUtil.userAgentHeader());
        connection.setRequestProperty("Connection", "close");
        connection.setInstanceFollowRedirects(false);

        // The same timeout applies to establishing the connection and to reading from it once established.
        int httpTimeout = GeneralConfig.getIntProperty(GeneralConfig.HARVESTER_HTTP_TIMEOUT, getTimeout());
        connection.setConnectTimeout(httpTimeout);
        connection.setReadTimeout(httpTimeout);

        // On-demand harvests never use "If-Modified-Since".
        if (isOnDemandHarvest) {
            return connection;
        }

        // "If-Modified-Since" is compared to this URL's last harvest; without one there is nothing to compare to.
        Date lastHarvestDate = getContextSourceDTO().getLastHarvest();
        long lastHarvest = lastHarvestDate == null ? 0L : lastHarvestDate.getTime();
        if (lastHarvest <= 0) {
            return connection;
        }

        // Has this URL's conversion stylesheet (if any) been modified since the last harvest?
        String conversionStylesheetUrl = getConversionStylesheetUrl(sanitizedUrl);
        boolean hasModifiedConversion = !StringUtils.isBlank(conversionStylesheetUrl)
                && URLUtil.isModifiedSince(conversionStylesheetUrl, lastHarvest);

        // Have the post-harvest scripts been modified since the last harvest?
        boolean scriptsModified = DAOFactory.get().getDao(PostHarvestScriptDAO.class)
                .isScriptsModified(lastHarvestDate, getContextSourceDTO().getUrl());

        // If either was modified, the content must be fetched again and run through the conversion or scripts,
        // regardless of when the content itself was last modified.
        if (hasModifiedConversion || scriptsModified) {
            return connection;
        }

        LOGGER.debug(loggerMsg(
                "Using if-modified-since, compared to last harvest " + formatDate(lastHarvestDate)));
        connection.setIfModifiedSince(lastHarvest);
        return connection;
    }

    /**
     * Reads the redirection target from the given connection's "Location" response header.
     * <p>
     * A location that is not an absolute URI is resolved against the connection's own URL. A location that is not a
     * syntactically valid URI at all (e.g. it contains unescaped spaces) is resolved leniently through
     * {@link URL#URL(URL, String)}, so that relative locations of that kind still become absolute; if even that
     * fails, the header value is kept as it is. The fragment part, if any, is always removed.
     *
     * @param connection
     *            connection whose response headers are inspected
     * @return the redirection URL without its fragment part, or null if there is no "Location" header
     * @throws MalformedURLException
     *             if a syntactically valid relative location cannot be resolved against the connection's URL
     */
    private String getRedirectUrl(HttpURLConnection connection) throws MalformedURLException {

        String location = connection.getHeaderField("Location");
        if (location == null) {
            return null;
        }

        try {
            // If location does not seem to be an absolute URI, consider it relative to the
            // URL of this URL connection.
            if (!(new URI(location).isAbsolute())) {
                location = new URL(connection.getURL(), location).toString();
            }
        } catch (URISyntaxException e) {
            // java.net.URI is strict about illegal characters whereas java.net.URL is not. Without this fallback a
            // relative location such as "/my file.rdf" would be returned unresolved.
            try {
                location = new URL(connection.getURL(), location).toString();
            } catch (MalformedURLException ignored) {
                // Best effort on purpose: keep the header value as the server sent it.
            }
        }

        // we want to avoid fragment parts in CR harvest source URLs
        return StringUtils.substringBefore(location, "#");
    }

    /**
     * Looks up the RDF conversion stylesheet of the given harvest source. The source's schema URI is asked from the
     * helper DAO; if there is one, the conversions available for that schema are parsed and the XSL file name of the
     * RDF conversion is returned.
     *
     * @param harvestSourceUrl
     *            URL of the harvest source
     * @return the RDF conversion's XSL file name as reported by the conversions parser, or null if the source has no
     *         schema URI or the schema has no RDF conversion
     * @throws DAOException
     *             propagated from the schema URI lookup
     * @throws ParserConfigurationException
     *             propagated from parsing the schema's conversions
     * @throws SAXException
     *             propagated from parsing the schema's conversions
     * @throws IOException
     *             propagated from parsing the schema's conversions
     */
    private String getConversionStylesheetUrl(String harvestSourceUrl)
            throws DAOException, IOException, SAXException, ParserConfigurationException {

        String schemaUri = getHelperDAO().getSubjectSchemaUri(harvestSourceUrl);
        if (StringUtils.isBlank(schemaUri)) {
            return null;
        }

        // see if schema has RDF conversion
        ConversionsParser conversions = ConversionsParser.parseForSchema(schemaUri);
        boolean hasRdfConversion = !StringUtils.isBlank(conversions.getRdfConversionId());
        return hasRdfConversion ? conversions.getRdfConversionXslFileName() : null;
    }

    /**
     * Returns RDF format from url connection.
     * <p>
     * The format is derived from the source's content type (see {@link #getSourceContentType(HttpURLConnection)}).
     * For plain-text content the extension of the URL's path is consulted first, so that Turtle and N3 files served
     * as plain text are still recognized. The plain-text check ignores content type parameters (such as a charset)
     * and letter case.
     *
     * @param urlConn
     *            connection to the source
     * @return the detected RDF format, or null if the content type is unknown or does not map to an RDF format
     */
    private RDFFormat getRdfFormat(HttpURLConnection urlConn) {
        String contentType = getSourceContentType(urlConn);

        if (contentType == null) {
            return null;
        }

        // An exact equals() would miss values like "<type>; charset=UTF-8", so the bare media type is compared too.
        // NOTE(review): assumes CONTENT_TYPE_TEXT is a bare media type without parameters - confirm.
        String mediaType = StringUtils.substringBefore(contentType, ";").trim();
        boolean isPlainText = contentType.equals(CONTENT_TYPE_TEXT) || mediaType.equalsIgnoreCase(CONTENT_TYPE_TEXT);

        if (isPlainText) {
            String path = urlConn.getURL().getPath();
            String[] arr = path.split("\\.");
            // split() drops trailing empty strings, so the array can be empty (e.g. for the path ".")
            if (arr.length > 0) {
                String ext = arr[arr.length - 1];
                if (StringUtils.isNotEmpty(ext)) {
                    if (ext.equalsIgnoreCase(EXT_TTL)) {
                        return RDFFormat.TURTLE;
                    }
                    if (ext.equalsIgnoreCase(EXT_N3)) {
                        return RDFFormat.N3;
                    }
                }
            }
        }

        // The original, unmodified content type is handed over to the generic media type mapping.
        return RDFMediaTypes.toRdfFormat(contentType);
    }

    /**
     * Resolves the content type to assume for the source: the media type stored on the context source DTO if it is
     * not blank, otherwise the content type reported by the URL connection.
     *
     * @param urlConn
     *            connection to the source
     * @return the content type, possibly null if the connection reports none
     */
    private String getSourceContentType(HttpURLConnection urlConn) {

        // the media type from the context source DTO takes precedence over the one from the URL connection
        String declaredMediaType = getContextSourceDTO().getMediaType();
        return StringUtils.isBlank(declaredMediaType) ? urlConn.getContentType() : declaredMediaType;
    }

    /**
     * Picks the content loader that can load the source's content as it is: an RDF loader if an RDF format can be
     * detected for the connection, a feed loader if the source's content type starts with the RSS or Atom media
     * type.
     *
     * @param urlConn
     *            connection to the source
     * @return the loader to use, or null if the content is neither in a detected RDF format nor a web feed
     */
    private ContentLoader createContentLoader(HttpURLConnection urlConn) {

        RDFFormat rdfFormat = getRdfFormat(urlConn);
        if (rdfFormat != null) {
            return new RDFFormatLoader(rdfFormat);
        }

        String contentType = getSourceContentType(urlConn);
        boolean isWebFeed = !StringUtils.isBlank(contentType)
                && (contentType.startsWith("application/rss+xml")
                        || contentType.startsWith("application/atom+xml"));

        return isWebFeed ? new FeedFormatLoader() : null;
    }

    /**
     * Checks the "Content-Length" response header against the configured maximum content length. No check is made
     * if the configured maximum is missing, not a number or not positive. A missing or unparseable "Content-Length"
     * header counts as 0 and therefore passes.
     * <p>
     * The header is parsed as a long: parsed as an int, a length above {@link Integer#MAX_VALUE} would be
     * unparseable, fall back to 0 and slip through the check.
     *
     * @param urlConn
     *            connection whose response headers are inspected
     * @throws ContentTooLongException
     *             if the declared content length exceeds the allowed maximum
     */
    private void validateContentLength(HttpURLConnection urlConn) throws ContentTooLongException {

        int maxLengthAllowed = NumberUtils
                .toInt(GeneralConfig.getProperty(GeneralConfig.HARVESTER_MAX_CONTENT_LENGTH));
        if (maxLengthAllowed <= 0) {
            return;
        }

        long contentLength = NumberUtils.toLong(urlConn.getHeaderField("Content-Length"));
        if (contentLength > maxLengthAllowed) {
            throw new ContentTooLongException(
                    contentLength + " is more than the allowed maximum " + maxLengthAllowed);
        }
    }

    /**
     * Returns the type of this harvest, which is always {@code HarvestConstants.TYPE_PULL}.
     *
     * @return the pull harvest type constant
     * @see eionet.cr.harvest.BaseHarvest#getHarvestType()
     */
    @Override
    protected String getHarvestType() {

        return HarvestConstants.TYPE_PULL;
    }

    /**
     * Returns the current value of the {@code isSourceAvailable} flag. Within the download step the flag is set to
     * true once the remote connection's input stream has been obtained.
     *
     * @return the isSourceAvailable
     */
    protected boolean isSourceAvailable() {
        return isSourceAvailable;
    }

    /**
     * Tells whether notifications should be sent for this harvest. They never are for an on-demand harvest;
     * otherwise they are sent when the context source is a priority source or a fatal error occurred.
     * P.S. The harvest's timeout is also considered a fatal error.
     *
     * @return true if notifications should be sent
     * @see eionet.cr.harvest.BaseHarvest#isSendNotifications()
     */
    @Override
    protected boolean isSendNotifications() {

        if (isOnDemandHarvest) {
            return false;
        }
        return getContextSourceDTO().isPrioritySource() || isFatalErrorOccured;
    }

    /**
     * Tells whether the given URL is being harvested by this harvest. In addition to what the superclass reports,
     * a URL found among this harvest's redirected URLs is considered to be harvested as well.
     *
     * @param url
     *            the URL to check
     * @return true if the superclass reports the URL as being harvested or the redirected URLs contain it
     * @see eionet.cr.harvest.BaseHarvest#isBeingHarvested(java.lang.String)
     */
    @Override
    public boolean isBeingHarvested(String url) {

        // the redirected URLs are consulted only if the superclass says "no"
        return super.isBeingHarvested(url) || redirectedUrls.contains(url);
    }

    /**
     * Starts a background thread that attempts to clear the graph of every redirected source of this harvest.
     * A failure to clear one graph is logged and does not stop the remaining graphs from being cleared.
     *
     * @see eionet.cr.harvest.BaseHarvest#afterFinish()
     */
    @Override
    protected void afterFinish() {

        Runnable graphCleaner = new Runnable() {
            @Override
            public void run() {
                for (String sourceUrl : PullHarvest.this.redirectedUrls) {
                    try {
                        PullHarvest.LOGGER.debug("Clearing the graph of redirected source " + sourceUrl);
                        DAOFactory.get().getDao(HarvestSourceDAO.class).clearGraph(sourceUrl);
                    } catch (DAOException e) {
                        // Logged only: the remaining redirected sources should still be attempted.
                        PullHarvest.LOGGER.error("Failed to clear the graph of redirected source " + sourceUrl,
                                e);
                    }
                }
            }
        };

        // Clearing runs in the background, this method does not wait for it.
        new Thread(graphCleaner).start();
    }
}