org.fao.geonet.kernel.harvest.harvester.csw.Harvester.java Source code

Java tutorial

Introduction

Here is the source code for org.fao.geonet.kernel.harvest.harvester.csw.Harvester.java

Source

//=============================================================================
//===   Copyright (C) 2001-2007 Food and Agriculture Organization of the
//===   United Nations (FAO-UN), United Nations World Food Programme (WFP)
//===   and United Nations Environment Programme (UNEP)
//===
//===   This program is free software; you can redistribute it and/or modify
//===   it under the terms of the GNU General Public License as published by
//===   the Free Software Foundation; either version 2 of the License, or (at
//===   your option) any later version.
//===
//===   This program is distributed in the hope that it will be useful, but
//===   WITHOUT ANY WARRANTY; without even the implied warranty of
//===   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
//===   General Public License for more details.
//===
//===   You should have received a copy of the GNU General Public License
//===   along with this program; if not, write to the Free Software
//===   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
//===
//===   Contact: Jeroen Ticheler - FAO - Viale delle Terme di Caracalla 2,
//===   Rome - Italy. email: geonetwork@osgeo.org
//==============================================================================

package org.fao.geonet.kernel.harvest.harvester.csw;

import jeeves.exceptions.BadParameterEx;
import jeeves.exceptions.OperationAbortedEx;
import jeeves.interfaces.Logger;
import jeeves.resources.dbms.Dbms;
import jeeves.server.context.ServiceContext;
import jeeves.utils.Xml;
import jeeves.utils.XmlRequest;
import org.apache.commons.lang.StringUtils;
import org.fao.geonet.GeonetContext;
import org.fao.geonet.constants.Geonet;
import org.fao.geonet.csw.common.ConstraintLanguage;
import org.fao.geonet.csw.common.Csw;
import org.fao.geonet.csw.common.CswOperation;
import org.fao.geonet.csw.common.CswServer;
import org.fao.geonet.csw.common.ElementSetName;
import org.fao.geonet.csw.common.ResultType;
import org.fao.geonet.csw.common.TypeName;
import org.fao.geonet.csw.common.exceptions.CatalogException;
import org.fao.geonet.csw.common.requests.CatalogRequest;
import org.fao.geonet.csw.common.requests.GetRecordsRequest;
import org.fao.geonet.kernel.DataManager;
import org.fao.geonet.kernel.harvest.harvester.RecordInfo;
import org.fao.geonet.lib.Lib;
import org.jdom.Element;

import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;

//=============================================================================

class Harvester {
    //--------------------------------------------------------------------------
    //---
    //--- Constructor
    //---
    //--------------------------------------------------------------------------

    public Harvester(Logger log, ServiceContext context, Dbms dbms, CswParams params) {
        this.log = log;
        this.context = context;
        this.dbms = dbms;
        this.params = params;
    }

    //---------------------------------------------------------------------------
    //---
    //--- API methods
    //---
    //---------------------------------------------------------------------------

    public CswResult harvest() throws Exception {
        log.info("Retrieving capabilities file for : " + params.name);

        CswServer server = retrieveCapabilities(log);

        //--- perform all searches

        Set<RecordInfo> records = new HashSet<RecordInfo>();

        Search s = new Search();

        for (Element element : params.eltSearches) {
            if (element.getChildText("value") != null) {
                if (!element.getChildText("value").trim().equals("")) {
                    s.addAttribute(element.getName(), element.getChildText("value").trim());
                }
            } else if (element.getText() != null) {
                // Queryables with a namespace are stored in the Settings replacing : with __
                // Otherwise SettingsManager causes issues
                // When building the filter replacing __ with :
                s.addAttribute(element.getName().replace("__", ":"), element.getText().trim());
            }
        }

        records.addAll(search(server, s));

        /*
        for(Search s : params.getSearches()){
           records.addAll(search(server, s));
        }*/

        if (params.isSearchEmpty()) {
            records.addAll(search(server, Search.createEmptySearch()));
        }

        log.info("Total records processed in all searches :" + records.size());

        //--- align local node

        Aligner aligner = new Aligner(log, context, dbms, server, params);

        return aligner.align(records);
    }

    //---------------------------------------------------------------------------

    /**
     * Does CSW GetCapabilities request
     * and check that operations needed for harvesting
     * (ie. GetRecords and GetRecordById)
     * are available in remote node.
     */
    private CswServer retrieveCapabilities(Logger log) throws Exception {
        if (!Lib.net.isUrlValid(params.capabUrl))
            throw new BadParameterEx("Capabilities URL", params.capabUrl);

        XmlRequest req;
        // Support both full GetCapbilities URL or CSW entry point
        if (params.capabUrl.contains("GetCapabilities")) {
            req = new XmlRequest(new URL(params.capabUrl));
        } else {
            req = new XmlRequest(new URL(
                    params.capabUrl + (params.capabUrl.contains("?") ? "&" : "?") + GETCAPABILITIES_PARAMETERS));
        }

        Lib.net.setupProxy(context, req);

        if (params.useAccount)
            req.setCredentials(params.username, params.password);

        Element capabil = req.execute();

        if (log.isDebugEnabled())
            log.debug("Capabilities:\n" + Xml.getString(capabil));

        if (capabil.getName().equals("ExceptionReport"))
            CatalogException.unmarshal(capabil);

        CswServer server = new CswServer(capabil);

        if (!checkOperation(log, server, "GetRecords"))
            throw new OperationAbortedEx("GetRecords operation not found");

        if (!checkOperation(log, server, "GetRecordById"))
            throw new OperationAbortedEx("GetRecordById operation not found");

        return server;
    }

    //---------------------------------------------------------------------------

    private boolean checkOperation(Logger log, CswServer server, String name) {
        CswOperation oper = server.getOperation(name);

        if (oper == null) {
            log.warning("Operation not present in capabilities : " + name);
            return false;
        }

        if (oper.getUrl == null && oper.postUrl == null) {
            log.warning("Operation has no GET and POST bindings : " + name);
            return false;
        }

        return true;
    }

    //---------------------------------------------------------------------------

    /**
     * Does CSW GetRecordsRequest.
     */
    private Set<RecordInfo> search(CswServer server, Search s) throws Exception {
        int start = 1;

        GetRecordsRequest request = new GetRecordsRequest(context);

        request.setResultType(ResultType.RESULTS);
        //request.setOutputSchema(OutputSchema.OGC_CORE);   // Use default value
        request.setElementSetName(ElementSetName.SUMMARY);
        request.setMaxRecords(GETRECORDS_NUMBER_OF_RESULTS_PER_PAGE + "");

        CswOperation oper = server.getOperation(CswServer.GET_RECORDS);

        // Use the preferred HTTP method and check one exist.

        configRequest(request, oper, server, s, PREFERRED_HTTP_METHOD);

        if (params.useAccount)
            request.setCredentials(params.username, params.password);

        // Simple fallback mechanism. Try search with PREFERRED_HTTP_METHOD method, if fails change it
        try {
            request.setStartPosition(start + "");
            doSearch(request, start, 1);
        } catch (Exception ex) {
            if (log.isDebugEnabled()) {
                log.debug(ex.getMessage());
                log.debug("Changing to CSW harvester to use "
                        + (PREFERRED_HTTP_METHOD.equals("GET") ? "POST" : "GET"));
            }

            configRequest(request, oper, server, s, PREFERRED_HTTP_METHOD.equals("GET") ? "POST" : "GET");
        }

        Set<RecordInfo> records = new HashSet<RecordInfo>();

        while (true) {
            request.setStartPosition(start + "");
            Element response = doSearch(request, start, GETRECORDS_NUMBER_OF_RESULTS_PER_PAGE);
            if (log.isDebugEnabled())
                log.debug("Number of child elements in response: " + response.getChildren().size());

            Element results = response.getChild("SearchResults", Csw.NAMESPACE_CSW);
            // heikki: some providers forget to update their CSW namespace to the CSW 2.0.2 specification
            if (results == null) {
                // in that case, try to accommodate them anyway:
                results = response.getChild("SearchResults", Csw.NAMESPACE_CSW_OLD);
                if (results == null) {
                    throw new OperationAbortedEx("Missing 'SearchResults'", response);
                } else {
                    log.warning("Received GetRecords response with incorrect namespace: " + Csw.NAMESPACE_CSW_OLD);
                }
            }

            List list = results.getChildren();
            int counter = 0;

            for (Object e : list) {
                Element record = (Element) e;
                RecordInfo recInfo = getRecordInfo((Element) record.clone());

                if (recInfo != null)
                    records.add(recInfo);

                counter++;
            }

            //--- check to see if we have to perform other searches

            int recCount = getRecordCount(results);

            if (log.isDebugEnabled()) {
                log.debug("Records declared in response : " + recCount);
                log.debug("Records found in response    : " + counter);
            }

            if (start + GETRECORDS_NUMBER_OF_RESULTS_PER_PAGE > recCount)
                break;

            start += GETRECORDS_NUMBER_OF_RESULTS_PER_PAGE;
        }

        log.info("Records added to result list : " + records.size());

        return records;
    }

    //---------------------------------------------------------------------------

    /**
     * Configs the harvester request
     *
     * @param request
     * @param oper
     * @param server
     * @param s
     * @param preferredMethod
     * @throws Exception
     */
    private void configRequest(GetRecordsRequest request, CswOperation oper, CswServer server, Search s,
            String preferredMethod) throws Exception {
        if (oper.getUrl == null && oper.postUrl == null) {
            throw new OperationAbortedEx("No GET or POST DCP available in this service.");
        }

        // Use the preferred HTTP method and check one exist.
        if (oper.getUrl != null && preferredMethod.equals("GET") && oper.constraintLanguage.contains("cql_text")) {
            setUpRequest(request, oper, server, s, oper.getUrl, ConstraintLanguage.CQL, getCqlConstraint(s),
                    CatalogRequest.Method.GET);
        } else if (oper.postUrl != null && preferredMethod.equals("POST")
                && oper.constraintLanguage.contains("filter")) {
            setUpRequest(request, oper, server, s, oper.postUrl, ConstraintLanguage.FILTER, getFilterConstraint(s),
                    CatalogRequest.Method.POST);
        } else {
            if (oper.getUrl != null && oper.constraintLanguage.contains("cql_text")) {
                setUpRequest(request, oper, server, s, oper.getUrl, ConstraintLanguage.CQL, getCqlConstraint(s),
                        CatalogRequest.Method.GET);
            } else if (oper.postUrl != null && oper.constraintLanguage.contains("filter")) {
                setUpRequest(request, oper, server, s, oper.postUrl, ConstraintLanguage.FILTER,
                        getFilterConstraint(s), CatalogRequest.Method.POST);
            } else {
                // TODO : add GET+FE and POST+CQL support
                log.warning(
                        "No GET (using CQL) or POST (using FE) DCP available in this service... Trying GET CQL anyway ...");
                setUpRequest(request, oper, server, s, oper.getUrl, ConstraintLanguage.CQL, getCqlConstraint(s),
                        CatalogRequest.Method.GET);
            }
        }
    }

    /**
     * TODO Javadoc.
     *
     * @param request
     * @param oper
     * @param server
     * @param s
     * @param url
     * @param constraintLanguage
     * @param constraint
     * @param method
     */
    private void setUpRequest(GetRecordsRequest request, CswOperation oper, CswServer server, Search s, URL url,
            ConstraintLanguage constraintLanguage, String constraint, CatalogRequest.Method method) {

        request.setUrl(url);
        request.setServerVersion(server.getPreferredServerVersion());
        request.setOutputSchema(oper.preferredOutputSchema);
        request.setConstraintLanguage(constraintLanguage);
        request.setConstraintLangVersion(CONSTRAINT_LANGUAGE_VERSION);
        request.setConstraint(constraint);
        request.setMethod(method);
        for (String typeName : oper.typeNamesList) {
            request.addTypeName(TypeName.getTypeName(typeName));
        }
        request.setOutputFormat(oper.preferredOutputFormat);
    }

    //---------------------------------------------------------------------------

    private String getFilterConstraint(Search s) {
        //--- collect queriables

        ArrayList<Element> queriables = new ArrayList<Element>();

        if (!s.attributesMap.isEmpty()) {
            for (Entry<String, String> entry : s.attributesMap.entrySet()) {
                if (entry.getValue() != null) {
                    // If the queriable has the namespace, use it
                    String queryableName = entry.getKey();
                    if (!queryableName.contains(":")) {
                        queryableName = "csw:" + queryableName;
                    }

                    buildFilterQueryable(queriables, queryableName, entry.getValue());
                }
            }
        } else {
            log.debug("no search criterion specified, harvesting all ... ");
        }

        //--- build filter expression

        if (queriables.isEmpty())
            return null;

        Element filter = new Element("Filter", Csw.NAMESPACE_OGC);

        if (queriables.size() == 1)
            filter.addContent(queriables.get(0));
        else {
            Element and = new Element("And", Csw.NAMESPACE_OGC);

            for (Element prop : queriables)
                and.addContent(prop);

            filter.addContent(and);
        }

        return Xml.getString(filter);
    }

    //---------------------------------------------------------------------------
    private void buildFilterQueryable(List<Element> queryables, String name, String value) {
        if (value.contains("%") || StringUtils.containsIgnoreCase(name, "AnyText")) {
            buildFilterQueryable(queryables, name, value, "PropertyIsLike");
        } else {
            buildFilterQueryable(queryables, name, value, "PropertyIsEqualTo");
        }
    }

    private void buildFilterQueryable(List<Element> queryables, String name, String value, String operator) {
        if (value.length() == 0)
            return;

        // add Like operator
        Element prop;

        if (operator.equals("PropertyIsLike")) {
            prop = new Element(operator, Csw.NAMESPACE_OGC);
            prop.setAttribute("wildcard", "%");
            prop.setAttribute("singleChar", "_");
            prop.setAttribute("escapeChar", "\\");
        } else {
            prop = new Element(operator, Csw.NAMESPACE_OGC);
        }

        Element propName = new Element("PropertyName", Csw.NAMESPACE_OGC);
        Element literal = new Element("Literal", Csw.NAMESPACE_OGC);

        propName.setText(name);
        literal.setText(value);

        prop.addContent(propName);
        prop.addContent(literal);

        queryables.add(prop);
    }

    //---------------------------------------------------------------------------

    private String getCqlConstraint(Search s) {
        //--- collect queriables

        ArrayList<String> queryables = new ArrayList<String>();

        if (!s.attributesMap.isEmpty()) {
            for (Entry<String, String> entry : s.attributesMap.entrySet()) {
                if (entry.getValue() != null) {
                    buildCqlQueryable(queryables, "csw:" + entry.getKey(), entry.getValue());
                }
            }
        } else {
            System.out.println("no search criterion specified, harvesting all ... ");
        }

        /*
        buildCqlQueryable(queryables, "csw:AnyText", s.freeText);
        buildCqlQueryable(queryables, "dc:title", s.title);
        buildCqlQueryable(queryables, "dct:abstract", s.abstrac);
        buildCqlQueryable(queryables, "dc:subject", s.subject);
        buildCqlQueryable(queryables, "dct:denominator", s.minscale, ">=");
        buildCqlQueryable(queryables, "dct:denominator", s.maxscale, "<=");
        */

        //--- build CQL query

        StringBuffer sb = new StringBuffer();

        for (int i = 0; i < queryables.size(); i++) {
            sb.append(queryables.get(i));

            if (i < queryables.size() - 1)
                sb.append(" AND ");
        }

        return (queryables.size() == 0) ? null : sb.toString();
    }

    //---------------------------------------------------------------------------

    /**
     * Build CQL from user entry. If parameter value
     * contains '%', then the like operator is used.
     */
    private void buildCqlQueryable(List<String> queryables, String name, String value) {
        if (value.length() != 0) {
            if (value.contains("%"))
                buildCqlQueryable(queryables, name, value, "like");
            else
                buildCqlQueryable(queryables, name, value, "=");
        }
    }

    private void buildCqlQueryable(List<String> queryables, String name, String value, String operator) {
        if (value.length() != 0)
            queryables.add(name + " " + operator + " '" + value + "'");
    }
    //---------------------------------------------------------------------------

    private Element doSearch(CatalogRequest request, int start, int max) throws Exception {
        try {
            log.info("Searching on : " + params.name + " (" + start + ".." + (start + max) + ")");
            Element response = request.execute();
            if (log.isDebugEnabled()) {
                log.debug("Sent request " + request.getSentData());
                log.debug("Search results:\n" + Xml.getString(response));
            }

            return response;
        } catch (Exception e) {
            log.warning("Raised exception when searching : " + e);
            throw new OperationAbortedEx("Raised exception when searching: " + e.getMessage(), e);
        }
    }

    //---------------------------------------------------------------------------

    private int getRecordCount(Element results) throws OperationAbortedEx {
        String numRec = results.getAttributeValue("numberOfRecordsMatched");

        if (numRec == null)
            throw new OperationAbortedEx("Missing 'numberOfRecordsMatched' in 'SearchResults'");

        if (!Lib.type.isInteger(numRec))
            throw new OperationAbortedEx("Bad value for 'numberOfRecordsMatched'", numRec);

        return Integer.parseInt(numRec);
    }

    //---------------------------------------------------------------------------

    private RecordInfo getRecordInfo(Element record) {
        String name = record.getName();
        if (log.isDebugEnabled())
            log.debug("getRecordInfo (name): " + name);

        // get schema
        GeonetContext gc = (GeonetContext) context.getHandlerContext(Geonet.CONTEXT_NAME);
        DataManager dm = gc.getDataManager();

        // get uuid and date modified
        try {
            String schema = dm.autodetectSchema(record);
            if (log.isDebugEnabled())
                log.debug("getRecordInfo (schema): " + schema);

            String identif = dm.extractUUID(schema, record);
            if (identif.length() == 0) {
                log.warning("Record doesn't have a uuid : " + name);
                return null; // skip this one
            }

            String modified = dm.extractDateModified(schema, record);
            if (modified.length() == 0)
                modified = null;
            if (log.isDebugEnabled())
                log.debug("getRecordInfo: adding " + identif + " with modification date " + modified);
            return new RecordInfo(identif, modified);
        } catch (Exception e) {
            log.warning("Skipped record not in supported format : " + name);
            e.printStackTrace();
        }

        // we get here if we didn't recognize the schema and/or couldn't get the 
        // UUID or date modified
        return null;

    }

    //---------------------------------------------------------------------------
    //---
    //--- Variables
    //---
    //---------------------------------------------------------------------------
    // FIXME : Currently switch from POST to GET for testing mainly.
    public static final String PREFERRED_HTTP_METHOD = CatalogRequest.Method.GET.toString();
    private static int GETRECORDS_NUMBER_OF_RESULTS_PER_PAGE = 20;
    private static String CONSTRAINT_LANGUAGE_VERSION = "1.1.0";
    private static String GETCAPABILITIES_PARAMETERS = "SERVICE=CSW&REQUEST=GetCapabilities&VERSION=2.0.2";
    private Logger log;
    private Dbms dbms;
    private CswParams params;
    private ServiceContext context;
}

//=============================================================================