org.ala.harvester.IdentifyLifeHarvester.java Source code


Introduction

Here is the source code for org.ala.harvester.IdentifyLifeHarvester.java, a harvester from the Atlas of Living Australia (ALA) project. It sweeps the IdentifyLife key search service with two-letter wildcard queries, writes the returned keys to a CSV file, and can also map the XML search responses and store them in the ALA document repository.

Source

/***************************************************************************
 * Copyright (C) 2009 Atlas of Living Australia
 * All Rights Reserved.
 *
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 ***************************************************************************/

package org.ala.harvester;

import java.io.ByteArrayInputStream;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import javax.inject.Inject;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;

import org.ala.client.util.RestfulClient;
import org.ala.documentmapper.DocumentMapper;
import org.ala.documentmapper.IdentifyLifeDocumentMapper;
import org.ala.model.IdentifyLifeVO;
import org.ala.repository.ParsedDocument;
import org.ala.repository.Repository;
import org.ala.util.MimeType;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.log4j.Logger;
import org.codehaus.jackson.map.DeserializationConfig;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.map.type.TypeFactory;
import org.springframework.beans.factory.config.BeanDefinition;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.Scope;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.NodeList;

import au.com.bytecode.opencsv.CSVWriter;

/**
 * A Harvester class for IdentifyLife
 * 
 * @author MOK011
 *
 */

@Component("identifyLifeHarvester")
@Scope(BeanDefinition.SCOPE_PROTOTYPE)
public class IdentifyLifeHarvester implements Harvester {
    protected Logger logger = Logger.getLogger(this.getClass());
    @Inject
    protected Repository repository;
    // Create a RESTful client with no connection timeout.
    protected RestfulClient restfulClient = new RestfulClient(0);

    protected DocumentMapper documentMapper;
    protected Map<String, String> connectionParams;
    protected String defaultMimeType = MimeType.HTML.getMimeType();
    protected int timeGap = 0;

    public static final String SEARCH_END_POINT_ATTR_NAME = "searchEndpoint";
    public static final String KEY_END_POINT_ATTR_NAME = "keyEndpoint";
    public static final String KEY_FIELD_ATTR_NAME = "keyField";
    public static final String SEARCH_FIELD_TAXO_SCOPE = "taxonomicscope";
    public static final String SEARCH_FIELD_GEOG_SCOPE = "GeographicScope";
    public static final String OUTPUT_FILENAME = "/data/bie-staging/identifylife/data.csv";

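    // Column order of the output CSV; csvWriteLine() uses each constant's ordinal as the column index.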
    public static enum IDLIFE_IDX {
        ID, TITLE, URL, DESCRIPTION, PUBLISHER, PUBLISHYEAR, TAXONOMICSCOPE, GEOGRAPHICSCOPE, KEYTYPE, ACCESSIBILITY, VOCABULARY, TECHNICALSKILLS, IMAGERY
    }

    /**
     * Harvest the keys listed in the XML search response and store the mapped documents in the repository.
     * 
     * @param infosourceId
     * @param url
     * @throws Exception
     */
    private void harvestDoc(int infosourceId, String url) throws Exception {
        byte[] content = null;

        System.out.println("******** request: " + url);
        Object[] resp = restfulClient.restGet(url);
        if ((Integer) resp[0] == HttpStatus.SC_OK) {
            content = resp[1].toString().getBytes("UTF-8");
        }

        if (content != null) {
            List<String> ids = new ArrayList<String>();
            String keyUrl = connectionParams.get(KEY_END_POINT_ATTR_NAME);

            // Instantiates a DOM builder to create a DOM of the response.
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            DocumentBuilder builder = factory.newDocumentBuilder();

            // Parse the response into a DOM Document
            Document doc = builder.parse(new ByteArrayInputStream(content));

            XPathFactory xfactory = XPathFactory.newInstance();
            XPath xpath = xfactory.newXPath();

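            // Select every <key> element (with an id attribute) under <keys> and collect the ids.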
            XPathExpression xe = xpath.compile("//keys/key[@id]");
            NodeList nodeSet = (NodeList) xe.evaluate(doc, XPathConstants.NODESET);
            for (int i = 0; i < nodeSet.getLength(); i++) {
                NamedNodeMap map = nodeSet.item(i).getAttributes();
                ids.add(map.getNamedItem("id").getNodeValue());
            }

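            // Fetch each key individually from the key endpoint, map it and store the result.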
            for (int i = 0; i < ids.size(); i++) {
                Object[] res = restfulClient.restGet(keyUrl + "/" + ids.get(i));
                if ((Integer) res[0] == HttpStatus.SC_OK) {
                    // Map the key document, using the same URL that was fetched above
                    List<ParsedDocument> pds = documentMapper.map(keyUrl + "/" + ids.get(i),
                            res[1].toString().getBytes("UTF-8"));

                    //store the results
                    for (ParsedDocument pd : pds) {
                        //debugParsedDoc(pd);
                        repository.storeDocument(infosourceId, pd);
                        logger.debug("Parent guid for stored doc: " + pd.getParentGuid());
                    }
                }
            }
        } else {
            logger.warn("Unable to process url: " + url);
        }
    }

    /**
     * Harvest the keys returned by a JSON search request and write one CSV line per key.
     * 
     * @param infosourceId
     * @param url
     * @param writer
     * @throws Exception
     */
    private void harvestJsonDoc(int infosourceId, String url, CSVWriter writer) throws Exception {
        String content = "";
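        // Configure Jackson (1.x) to ignore JSON properties that have no matching field on IdentifyLifeVO.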
        ObjectMapper mapper = new ObjectMapper();
        mapper.getDeserializationConfig().set(DeserializationConfig.Feature.FAIL_ON_UNKNOWN_PROPERTIES, false);

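        // Request a JSON response (accept: application/json).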
        Map<String, String> hashTable = new Hashtable<String, String>();
        hashTable.put("accept", "application/json");

        System.out.println("******** request: " + url);
        Object[] resp = restfulClient.restGet(url, hashTable);
        if ((Integer) resp[0] == HttpStatus.SC_OK) {
            content = resp[1].toString();
            if (content != null && content.length() > "[]".length()) {
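                // Deserialise the JSON array into IdentifyLifeVO beans; each bean becomes one CSV row.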
                List<IdentifyLifeVO> objectList = mapper.readValue(content,
                        TypeFactory.collectionType(ArrayList.class, IdentifyLifeVO.class));
                for (IdentifyLifeVO vo : objectList) {
                    csvWriteLine(writer, vo);
                }
            }
        } else {
            logger.warn("Unable to process url: " + url);
        }
    }

    private void csvWriteLine(CSVWriter writer, IdentifyLifeVO vo) {
        String[] outLine = new String[IDLIFE_IDX.values().length];

        if (writer != null && vo != null) {
            outLine[IDLIFE_IDX.ID.ordinal()] = vo.getId();
            outLine[IDLIFE_IDX.TITLE.ordinal()] = vo.getTitle();
            outLine[IDLIFE_IDX.URL.ordinal()] = vo.getUrl();
            outLine[IDLIFE_IDX.DESCRIPTION.ordinal()] = vo.getDescription();
            outLine[IDLIFE_IDX.PUBLISHER.ordinal()] = vo.getPublisher();
            outLine[IDLIFE_IDX.PUBLISHYEAR.ordinal()] = "" + vo.getPublishedyear();
            outLine[IDLIFE_IDX.TAXONOMICSCOPE.ordinal()] = vo.getTaxonomicscope();
            outLine[IDLIFE_IDX.GEOGRAPHICSCOPE.ordinal()] = vo.getGeographicscope();
            outLine[IDLIFE_IDX.KEYTYPE.ordinal()] = vo.getKeytype();
            outLine[IDLIFE_IDX.ACCESSIBILITY.ordinal()] = vo.getAccessibility();
            outLine[IDLIFE_IDX.VOCABULARY.ordinal()] = vo.getVocabulary();
            outLine[IDLIFE_IDX.TECHNICALSKILLS.ordinal()] = vo.getTechnicalskills();
            outLine[IDLIFE_IDX.IMAGERY.ordinal()] = vo.getImagery();

            writer.writeNext(outLine);
        }
    }

    @Override
    public void start(int infosourceId) throws Exception {
        CSVWriter writer = new CSVWriter(new FileWriter(OUTPUT_FILENAME), ',');

        String url = connectionParams.get(SEARCH_END_POINT_ATTR_NAME) + "?"
                + connectionParams.get(KEY_FIELD_ATTR_NAME) + "=";
        // Build a search request for every two-letter prefix from AA% to ZZ%;
        // the taxonomic and geographic scope fields both support the '%' wildcard.
        for (char first = 'A'; first <= 'Z'; first++) {
            for (char second = 'A'; second <= 'Z'; second++) {
                // harvestDoc(infosourceId, url + first + second + "%");
                harvestJsonDoc(infosourceId, url + first + second + "%", writer);
                Thread.sleep(timeGap);
            }
        }
        writer.close();
    }

    @Override
    public void start(int infosourceId, int timeGap) throws Exception {
        this.timeGap = timeGap;
        start(infosourceId);
    }

    @Override
    public void setConnectionParams(Map<String, String> connectionParams) {
        this.connectionParams = connectionParams;
    }

    @Override
    public void setDocumentMapper(DocumentMapper documentMapper) {
        this.documentMapper = documentMapper;
    }

    @Override
    public void setRepository(Repository repository) {
        this.repository = repository;
    }

    //==================<Main>=========================
    /**
     * Run the harvester stand-alone against the IdentifyLife web service.
     *
     * @param args a single argument: the infosource id (e.g. 1036)
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            System.out.print("Usage: ");
            System.out.print("<infosourceId>");
            System.out.println();
            System.out.println("infosourceId e.g. 1036");
            System.exit(1);
        }

        String[] locations = { "classpath*:spring.xml" };
        ApplicationContext context = new ClassPathXmlApplicationContext(locations);
        IdentifyLifeHarvester identifyLifeHarvester = context.getBean(IdentifyLifeHarvester.class);
        HashMap<String, String> connectParams = new HashMap<String, String>();
        connectParams.put(SEARCH_END_POINT_ATTR_NAME, "http://www.identifylife.org:8001/Keys/Search");
        connectParams.put(KEY_END_POINT_ATTR_NAME, "http://www.identifylife.org:8001/Keys");
        connectParams.put(KEY_FIELD_ATTR_NAME, SEARCH_FIELD_TAXO_SCOPE);
        identifyLifeHarvester.setConnectionParams(connectParams);
        identifyLifeHarvester.setDocumentMapper(new IdentifyLifeDocumentMapper());
        identifyLifeHarvester.start(Integer.parseInt(args[0]), 500);
        System.exit(0);
    }
}

//================================================
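
For orientation, here is a minimal sketch (not part of the class above; it only reuses the endpoints and search field configured in its main method) of the requests the harvester builds. start() sweeps the taxonomicscope search field with every two-letter prefix from AA% to ZZ%; the JSON flow (harvestJsonDoc) writes the returned key records straight to CSV, while the XML flow (harvestDoc) additionally fetches each key id from the key endpoint.

public class IdentifyLifeUrlSketch {
    public static void main(String[] args) {
        // Endpoints as configured in IdentifyLifeHarvester.main()
        String searchEndpoint = "http://www.identifylife.org:8001/Keys/Search";
        String keyEndpoint = "http://www.identifylife.org:8001/Keys";

        // The first few of the 26 x 26 = 676 wildcard searches issued by start()
        for (char first = 'A'; first <= 'B'; first++) {
            for (char second = 'A'; second <= 'B'; second++) {
                System.out.println(searchEndpoint + "?taxonomicscope=" + first + second + "%");
            }
        }

        // In the XML flow, each key id found in a search response is fetched individually
        // ("123" is a hypothetical key id)
        System.out.println(keyEndpoint + "/123");
    }
}

In the JSON flow used by start(), each returned key becomes one row of /data/bie-staging/identifylife/data.csv rather than a document stored in the repository.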