org.ala.lucene.ExternalIndexLoader.java Source code

Java tutorial

Introduction

Here is the source code for org.ala.lucene.ExternalIndexLoader.java

Source

/***************************************************************************
 * Copyright (C) 2010 Atlas of Living Australia
 * All Rights Reserved.
 *
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 ***************************************************************************/
package org.ala.lucene;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.List;
import java.util.Map;

import javax.inject.Inject;
import javax.sql.DataSource;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.ala.dao.IndexedTypes;
import org.ala.dao.SolrUtils;
import org.ala.documentmapper.Mapping;
import org.ala.repository.Predicates;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.type.TypeReference;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

import au.com.bytecode.opencsv.CSVReader;

/**
 * Load tool for loading external data into the indexes to provide a combined
 * single index for the BIE front end.
 *
 * TODO Replace the database links with loading via webservices
 *
 * @author Dave Martin (David.Martin@csiro.au)
 */
@Component("externalIndexLoader")
public class ExternalIndexLoader {

    protected static Logger logger = Logger.getLogger(ExternalIndexLoader.class);

    @Inject
    protected DataSource collectoryDataSource;
    @Inject
    protected SolrUtils solrUtils;

    protected String baseUrlForCollectory = "http://collections.ala.org.au/public/show/";

    /**
     * Run the loading of indexes for institutions, collections, data providers and data resources
     * 
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {

        String[] locations = { "classpath*:spring.xml" };

        ApplicationContext context = new ClassPathXmlApplicationContext(locations);
        ExternalIndexLoader l = (ExternalIndexLoader) context.getBean(ExternalIndexLoader.class);

        l.loadRegions();

        //load collections
        l.loadCollections();

        //load institutions
        l.loadInstitutions();

        //load data providers
        l.loadDataProviders();

        //load datasets
        l.loadDatasets();

        //load layers
        l.loadLayers();

        // load WordPress pages
        CreateWordPressIndex cwpi = (CreateWordPressIndex) context.getBean(CreateWordPressIndex.class);
        logger.info("Start of crawling and indexing WP pages.");
        cwpi.loadSitemap();
        cwpi.indexPages();

        System.exit(0);
    }

    public void loadLayers() throws Exception {

        SolrServer solrServer = solrUtils.getSolrServer();
        solrServer.deleteByQuery("idxtype:" + IndexedTypes.LAYERS); // delete layers!

        HttpClient httpClient = new HttpClient();
        GetMethod gm = new GetMethod("http://spatial.ala.org.au/layers-service/layers.json");
        logger.debug("Response code for get method: " + httpClient.executeMethod(gm));
        String layerJson = gm.getResponseBodyAsString();
        ObjectMapper om = new ObjectMapper();
        List<Map<String, Object>> layers = om.readValue(layerJson, new TypeReference<List<Map<String, Object>>>() {
        });
        for (Map<String, Object> layer : layers) {
            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("name", layer.get("displayname"));
            doc.addField("guid", "http://spatial.ala.org.au/layers/more/" + layer.get("name"));
            doc.addField("description", layer.get("notes"));
            doc.addField("text", layer.get("source"));
            doc.addField("text", layer.get("type"));
            doc.addField("text", layer.get("notes"));
            doc.addField("text", layer.get("keywords"));
            if (layer.get("classification1") != null)
                doc.addField("text", layer.get("classification1"));
            if (layer.get("classification2") != null)
                doc.addField("text", layer.get("classification2"));
            doc.addField("content", layer.get("notes"));
            doc.addField("dataProviderName", layer.get("source"));
            doc.addField("url", "http://spatial.ala.org.au/layers/more/" + layer.get("name"));
            doc.addField("id", layer.get("id"));
            doc.addField("idxtype", IndexedTypes.LAYERS);
            doc.addField("australian_s", "recorded"); // so they appear in default QF search
            doc.addField("image", "http://spatial.ala.org.au/output/layerthumbs/ALA:" + layer.get("name") + ".jpg"); // so they appear in default QF search
            solrServer.add(doc);
        }
        logger.info("Finished syncing layer information with the collectory.");
        solrServer.commit();
        logger.info("Finished syncing layers.");
    }

    /**
     * Loads collections and institutions into the BIE search index.
     * 
     * @throws Exception
     */
    public void loadCollections() throws Exception {

        logger.info("Starting syncing collection information....");
        Connection conn = collectoryDataSource.getConnection();
        Statement stmt = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
        ResultSet rs = stmt.executeQuery(
                "select uid, guid, name, acronym, collection_type, pub_description, sub_collections, keywords from collection");

        SolrServer solrServer = solrUtils.getSolrServer();
        solrServer.deleteByQuery("idxtype:" + IndexedTypes.COLLECTION); // delete collections!

        while (rs.next()) {
            String uid = rs.getString("uid");
            String externalGuid = rs.getString("guid");
            String name = rs.getString("name");
            String acronym = rs.getString("acronym");
            String description = rs.getString("pub_description");
            String subCollections = rs.getString("sub_collections");
            String keywords = rs.getString("keywords");
            String collectionType = rs.getString("collection_type");

            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("acronym", acronym, 1.2f);
            doc.addField("name", name, 1.2f);
            doc.addField("guid", baseUrlForCollectory + uid);

            doc.addField("otherGuid", uid); // the internal UID e.g. co1
            if (externalGuid != null) {
                doc.addField("otherGuid", externalGuid); // the external GUID e.g. url:lsid:bci:123
            }

            //add as text
            doc.addField("text", description);
            doc.addField("text", subCollections);
            doc.addField("text", keywords);
            doc.addField("text", collectionType);

            doc.addField("url", baseUrlForCollectory + uid);
            doc.addField("id", baseUrlForCollectory + uid);
            doc.addField("idxtype", IndexedTypes.COLLECTION);
            //         doc.addField("aus_s", "yes");
            doc.addField("australian_s", "recorded"); // so they appear in default QF search

            solrServer.add(doc);
        }

        solrServer.commit();
        rs.close();
        stmt.close();
        conn.close();
        logger.info("Finished syncing collection information with the collectory.");
    }

    /**
     * Loads collections and institutions into the BIE search index.
     * 
     * @throws Exception
     */
    public void loadInstitutions() throws Exception {

        logger.info("Starting syncing institution information....");
        Connection conn = collectoryDataSource.getConnection();
        Statement stmt = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
        ResultSet rs = stmt.executeQuery(
                "select uid, guid, name, acronym, institution_type, pub_description from institution");

        SolrServer solrServer = solrUtils.getSolrServer();
        solrServer.deleteByQuery("idxtype:" + IndexedTypes.INSTITUTION); // delete institutions!

        while (rs.next()) {
            String uid = rs.getString("uid");
            String externalGuid = rs.getString("guid");
            String name = rs.getString("name");
            String acronym = rs.getString("acronym");
            String institutionType = rs.getString("institution_type"); // university/museum/government
            String pubDescription = rs.getString("pub_description");

            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("acronym", acronym, 1.2f);
            doc.addField("name", name, 1.2f);
            doc.addField("guid", baseUrlForCollectory + uid);

            if (externalGuid != null) {
                doc.addField("otherGuid", externalGuid);
            }

            doc.addField("text", pubDescription);
            doc.addField("url", baseUrlForCollectory + uid);
            doc.addField("id", baseUrlForCollectory + uid);
            doc.addField("idxtype", IndexedTypes.INSTITUTION);
            doc.addField("institutionType", institutionType);
            //         doc.addField("aus_s", "yes");
            doc.addField("australian_s", "recorded"); // so they appear in default QF search
            solrServer.add(doc);
        }

        solrServer.commit();
        rs.close();
        stmt.close();
        conn.close();
        logger.info("Finished syncing institution information.");
    }

    /**
     * Load the datasets
     * 
     * @throws Exception
     */
    public void loadDatasets() throws Exception {

        logger.info("Starting syncing data resource information....");
        Connection conn = collectoryDataSource.getConnection();
        Statement stmt = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
        ResultSet rs = stmt.executeQuery(
                "select dr.uid, dr.guid, dr.name, dr.acronym as acronym, dp.name as data_provider_name, dr.pub_description as description "
                        + "from data_resource dr " + "left join data_provider dp ON dp.id=dr.data_provider_id ");

        SolrServer solrServer = solrUtils.getSolrServer();
        solrServer.deleteByQuery("idxtype:" + IndexedTypes.DATASET);

        while (rs.next()) {
            String uid = rs.getString("uid");
            String externalGuid = rs.getString("guid");
            String name = rs.getString("name");
            String acronym = rs.getString("acronym");
            String dataProviderName = rs.getString("data_provider_name");
            String description = rs.getString("description");

            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("guid", baseUrlForCollectory + uid);
            doc.addField("url", baseUrlForCollectory + uid);
            doc.addField("id", baseUrlForCollectory + uid);
            doc.addField("name", name);
            if (externalGuid != null) {
                doc.addField("otherGuid", externalGuid);
            }
            if (acronym != null) {
                doc.addField("acronym", acronym);
            }
            doc.addField("dataProviderName", dataProviderName);
            doc.addField("description", description);
            doc.addField("idxtype", IndexedTypes.DATASET);
            //         doc.addField("aus_s", "yes");
            doc.addField("australian_s", "recorded"); // so they appear in default QF search
            solrServer.add(doc);
        }

        solrServer.commit();
        rs.close();
        stmt.close();
        conn.close();
        logger.info("Finished syncing data resource information.");
    }

    /**
     * Load the data providers
     * 
     * @throws Exception
     */
    public void loadDataProviders() throws Exception {

        logger.info("Started syncing data provider information....");
        Connection conn = collectoryDataSource.getConnection();
        Statement stmt = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
        ResultSet rs = stmt
                .executeQuery("select dp.uid, dp.name, dp.pub_description as description from data_provider dp");

        SolrServer solrServer = solrUtils.getSolrServer();
        solrServer.deleteByQuery("idxtype:" + IndexedTypes.DATAPROVIDER);

        while (rs.next()) {
            String uid = rs.getString("uid");
            String name = rs.getString("name");
            String description = rs.getString("description");

            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("guid", baseUrlForCollectory + uid);
            doc.addField("url", baseUrlForCollectory + uid);
            doc.addField("id", baseUrlForCollectory + uid);
            doc.addField("name", name);
            doc.addField("description", description);
            doc.addField("idxtype", IndexedTypes.DATAPROVIDER);
            //         doc.addField("aus_s", "yes");
            doc.addField("australian_s", "recorded"); // so they appear in default QF search
            solrServer.add(doc);
        }

        solrServer.commit();
        rs.close();
        stmt.close();
        conn.close();
        logger.info("Finished syncing data provider information.");
    }

    // =======================
    private String getTagValue(String sTag, Element eElement) {
        NodeList nl = eElement.getElementsByTagName(sTag);
        if (nl == null || nl.getLength() < 1) {
            return null;
        }

        NodeList nlList = nl.item(0).getChildNodes();
        Node nValue = (Node) nlList.item(0);
        return nValue.getNodeValue();
    }

    public void loadRegions() throws Exception {
        int ctr = 0;
        logger.info("Started syncing regions information....");

        // init....
        HttpClient httpClient = new HttpClient();
        GetMethod gm = new GetMethod("http://regions.ala.org.au/data/sitemap.xml");
        logger.debug("Response code for get method: " + httpClient.executeMethod(gm));
        InputStream responseStream = gm.getResponseBodyAsStream();

        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        dbFactory.setNamespaceAware(false);

        DocumentBuilder parser = dbFactory.newDocumentBuilder();
        Document document = parser.parse(responseStream);
        NodeList nList = document.getElementsByTagName("url");

        // start process....
        if (nList.getLength() > 0) {
            SolrServer solrServer = solrUtils.getSolrServer();
            solrServer.deleteByQuery("idxtype:" + IndexedTypes.REGION);
            for (int temp = 0; temp < nList.getLength(); temp++) {
                Node nNode = nList.item(temp);
                if (nNode.getNodeType() == Node.ELEMENT_NODE) {
                    Element eElement = (Element) nNode;
                    String loc = getTagValue("loc", eElement);
                    if (loc != null) {
                        String url = java.net.URLDecoder.decode(loc, "UTF-8");
                        String[] terms = url.split("/");
                        if (terms.length > 1) {
                            String name = terms[terms.length - 1];
                            String region = terms[terms.length - 2];
                            if (name != null && region != null) {
                                SolrInputDocument doc = new SolrInputDocument();
                                doc.addField("idxtype", IndexedTypes.REGION);
                                doc.addField("guid", url);
                                doc.addField("id", url);
                                doc.addField("url", url);
                                doc.addField("regionType", region);
                                doc.addField("name", name);
                                doc.addField("text", name);
                                doc.addField("australian_s", "recorded"); // so they appear in default QF search
                                solrServer.add(doc);
                                ctr++;
                            }
                        }
                    }
                }
            }
            solrServer.commit();
        }
        logger.info("Finished syncing regions information. Total count: " + ctr);
    }

    /**
     * @param collectoryDataSource the collectoryDataSource to set
     */
    public void setCollectoryDataSource(DataSource collectoryDataSource) {
        this.collectoryDataSource = collectoryDataSource;
    }

    /**
     * @param solrUtils the solrUtils to set
     */
    public void setSolrUtils(SolrUtils solrUtils) {
        this.solrUtils = solrUtils;
    }

    /**
     * @return the collectoryDataSource
     */
    public DataSource getCollectoryDataSource() {
        return collectoryDataSource;
    }

    /**
     * @param baseUrlForCollectory the baseUrlForCollectory to set
     */
    public void setBaseUrlForCollectory(String baseUrlForCollectory) {
        this.baseUrlForCollectory = baseUrlForCollectory;
    }
}