org.ala.hbase.ANBGDataLoader.java Source code

Java tutorial

Introduction

Here is the source code for org.ala.hbase.ANBGDataLoader.java

Source

/***************************************************************************
 * Copyright (C) 2010 Atlas of Living Australia
 * All Rights Reserved.
 *
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 ***************************************************************************/
package org.ala.hbase;

import java.io.FileReader;
import java.util.List;

import javax.inject.Inject;

import org.ala.dao.InfoSourceDAO;
import org.ala.dao.TaxonConceptDao;
import org.ala.lucene.CreateLoadingIndex;
import org.ala.model.Publication;
import org.ala.model.Reference;
import org.ala.model.SynonymConcept;
import org.ala.model.TaxonConcept;
import org.ala.model.TaxonName;
import org.ala.util.LoadUtils;
import org.ala.util.SpringUtils;
import org.ala.util.TabReader;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.springframework.context.ApplicationContext;
import org.springframework.stereotype.Component;

import au.com.bytecode.opencsv.CSVReader;

/**
 * This class loads data from exported ANBG dump files into the table
 * "taxonConcept" after they have been preprocessed by a Scala script.
 * 
 * It makes use of lucene indexes for lookups of concepts, to add synonyms
 * and parent/child relationships. These indexes are generated using 
 * <code>CreateLoadingIndex</code>
 * 
 * This is currently filtering vernacular concepts and congruent 
 * concepts (favouring the "fromTaxon" in the relationship).
 * 
 * @see CreateLoadingIndex
 * 
 * @author David Martin
 */
@Component("anbgDataLoader")
public class ANBGDataLoader {

    protected static Logger logger = Logger.getLogger(ANBGDataLoader.class);
    @Inject
    protected InfoSourceDAO infoSourceDAO;
    @Inject
    protected TaxonConceptDao taxonConceptDao;

    private static final String AFD_TAXON_CONCEPTS = "/data/bie-staging/anbg/ALA_AFD_TAXON.csv";
    private static final String APNI_TAXON_CONCEPTS = "/data/bie-staging/anbg/ALA_APNI_TAXON.csv";
    private static final String AFD_TAXON_NAMES = "/data/bie-staging/anbg/ALA_AFD_NAME.csv";
    private static final String APNI_TAXON_NAMES = "/data/bie-staging/anbg/ALA_APNI_NAME.csv";
    private static final String RELATIONSHIPS = "/data/bie-staging/anbg/relationships.txt";
    private static final String AFD_PUBLICATIONS = "/data/bie-staging/anbg/ALA_AFD_PUBLICATION.csv";
    private static final String APNI_PUBLICATIONS = "/data/bie-staging/anbg/ALA_APNI_PUBLICATION.csv";
    private static final String AFD_REFERENCES = "/data/bie-staging/anbg/ALA_AFD_REFERENCE.csv";
    private static final String APNI_REFERENCES = "/data/bie-staging/anbg/ALA_APNI_REFERENCE.csv";

    /**
     * @param args
     */
    public static void main(String[] args) throws Exception {
        logger.info("Starting ANBG load....");
        long start = System.currentTimeMillis();
        ApplicationContext context = SpringUtils.getContext();
        ANBGDataLoader loader = (ANBGDataLoader) context.getBean(ANBGDataLoader.class);
        loader.load();
        long finish = System.currentTimeMillis();
        logger.info("Data loaded in: " + ((finish - start) / 60000) + " minutes.");
        System.exit(1);
    }

    /**
     * Load the profile data for taxon concepts. This takes around
     * 90 mins on my laptop to run over all the ANBG data.
     * 
     * @throws Exception
     */
    public void load() throws Exception {
        loadTaxonConceptPublicationInfo();
        loadTaxonNames(); // includes rank information
        //       loadVernacularConcepts();
        //TO DO we potentially want to add additional relationship information including hybrid parents
        //       loadRelationships();
        loadPublications();
        loadReferences();
    }

    private boolean isValidToLoad(LoadUtils loadUtils, String guid) throws Exception {
        //check to see if this guid exists in the name matching
        return loadUtils.getAlaAcceptedSource(guid) != null;
    }

    private void loadReferences() throws Exception {
        loadReferencesFile(AFD_REFERENCES);
        loadReferencesFile(APNI_REFERENCES);
    }

    private void loadReferencesFile(String file) throws Exception {
        logger.info("Starting to load taxon references");

        CSVReader reader = new CSVReader(new FileReader(file), ',', '"', '\\', 1);//Reader reader, char separator, char quotechar, char escape, int line
        LoadUtils loadUtils = new LoadUtils();
        String[] record = null;
        int i = 0;
        int j = 0;
        while ((record = reader.readNext()) != null) {
            i++;
            //LSID,URI,Title,ContainedIn,cc_license,cc_attributionURL
            if (record.length < 5) {
                logger.warn("truncated at line " + i + " record: " + record);
                continue;
            }

            List<String> taxonConceptIds = loadUtils.getGuidsForPublicationGuid(record[0], 100);
            java.util.Set<String> preferredLsids = new java.util.HashSet<String>();

            Reference r = new Reference();
            r.setGuid(record[0]);
            r.setTitle(record[2]);
            r.setContainedIn(record[3]);
            //p.setPublicationType(record[4]);

            //add this taxon name to each taxon concept
            for (String tcId : taxonConceptIds) {
                String preferredGuid = loadUtils.getPreferredGuid(tcId);
                if (!preferredLsids.contains(preferredGuid)) {
                    logger.debug("Adding reference to " + tcId + " record: " + r.getGuid());
                    if (isValidToLoad(loadUtils, preferredGuid)) {
                        boolean success = taxonConceptDao.addPublicationReference(preferredGuid, r);
                        if (success)
                            j++;
                    }
                    preferredLsids.add(preferredGuid);
                }

            }
        }
        logger.info(i + " lines read. " + j + " publications added to concept records.");
    }

    private void loadPublications() throws Exception {
        loadPublicationsFile(AFD_PUBLICATIONS);
        loadPublicationsFile(APNI_PUBLICATIONS);
    }

    /**
     * Load the publications for each concept.
     * 
     * @throws Exception
     */
    private void loadPublicationsFile(String file) throws Exception {
        logger.info("Starting to load taxon publications");

        CSVReader reader = new CSVReader(new FileReader(file), ',', '"', '\\', 1);//Reader reader, char separator, char quotechar, char escape, int line
        LoadUtils loadUtils = new LoadUtils();
        String[] record = null;
        int i = 0;
        int j = 0;
        while ((record = reader.readNext()) != null) {
            i++;
            if (record.length < 10) {
                logger.warn("truncated at line " + i + " record: " + record);
                continue;
            }

            List<String> taxonConceptIds = loadUtils.getGuidsForPublicationGuid(record[0], 100);
            java.util.Set<String> preferredLsids = new java.util.HashSet<String>();

            Publication p = new Publication();
            p.setGuid(record[0]);
            p.setTitle(record[2]);
            //p.setAuthor(record[2]);
            p.setDatePublished(StringUtils.stripToNull(record[4]));
            p.setCitation(StringUtils.stripToNull(record[5]));
            p.setYear(StringUtils.stripToNull(record[3]));
            p.setPublisher(StringUtils.stripToNull(record[9]));
            p.setContainedInGuid(StringUtils.stripToNull(record[6]));
            p.setEdition(StringUtils.stripToNull(record[7]));
            //p.setPublicationType(record[4]);

            //add this taxon name to each taxon concept
            for (String tcId : taxonConceptIds) {
                String preferredGuid = loadUtils.getPreferredGuid(tcId);
                if (!preferredLsids.contains(preferredGuid)) {
                    logger.debug("Adding publication to " + tcId + " record: " + p.getGuid());
                    if (isValidToLoad(loadUtils, preferredGuid)) {
                        boolean success = taxonConceptDao.addPublication(preferredGuid, p);

                        if (success)
                            j++;
                    }
                    preferredLsids.add(preferredGuid);
                }

            }
        }
        logger.info(i + " lines read. " + j + " publications added to concept records.");
    }

    /**
     * Add the relationships to the taxon concepts.
     * 
     * Synonyms are currently loaded by checklistbank, but this
     * adds in publication information for the synonym.
     * 
     * @throws Exception
     */
    private void loadRelationships() throws Exception {

        logger.info("Starting to load synonyms, parents, children");

        LoadUtils loadUtils = new LoadUtils();

        long start = System.currentTimeMillis();
        //add the relationships
        TabReader tr = new TabReader(RELATIONSHIPS);
        String[] keyValue = null;
        int i = 0;
        int j = 0;
        while ((keyValue = tr.readNext()) != null) {
            if (keyValue.length == 3) {
                i++;
                //add the relationship to the "toTaxon"
                if (++i % 10000 == 0)
                    logger.info(i + " relationships processed");

                //add the synonym information to the accepted concept
                if (keyValue[2].endsWith("HasSynonym")) {
                    TaxonConcept synonym = loadUtils.getByGuid(keyValue[1], 1);
                    if (synonym != null) {
                        taxonConceptDao.addSynonym(keyValue[0], (org.ala.model.SynonymConcept) synonym);
                        j++;
                    }
                }

                //add the synonym information to the accepted concept
                if (keyValue[2].endsWith("IsCongruentTo")) {

                    //currently AFD/APNI seems to organised so that
                    // the accepted concepts are marked as congruent to others...hence 
                    TaxonConcept congruentTc = loadUtils.getByGuid(keyValue[1], 1);
                    if (congruentTc != null) {
                        //get the congruent object from the loading indicies
                        TaxonConcept acceptedConcept = taxonConceptDao.getByGuid(keyValue[0]);

                        if (acceptedConcept == null) {
                            logger.error("acceptedConcept is null for guid: " + keyValue[0]);
                        } else if (!congruentTc.getNameString().equals(acceptedConcept.getNameString())) {
                            taxonConceptDao.addIsCongruentTo(keyValue[0], congruentTc);
                            j++;
                        } else {
                            logger.debug("Avoiding adding congruent taxon with same name:"
                                    + acceptedConcept.getNameString() + ", " + keyValue[1]);
                        }
                    }
                }

                //             http://rs.tdwg.org/ontology/voc/TaxonConcept#Includes
                //             http://rs.tdwg.org/ontology/voc/TaxonConcept#Overlaps
                //             http://rs.tdwg.org/ontology/voc/TaxonConcept#IsHybridParentOf
                //             http://rs.tdwg.org/ontology/voc/TaxonConcept#IsHybridChildOf
                /*             
                             doc.add(new Field("fromTaxon", keyValue[0], Store.YES, Index.ANALYZED));
                             doc.add(new Field("toTaxon", keyValue[1], Store.YES, Index.ANALYZED));
                             doc.add(new Field("relationship", keyValue[2], Store.YES, Index.ANALYZED));
                */
            }
        }
        tr.close();
        long finish = System.currentTimeMillis();
        logger.info(i + " loaded relationships, added " + j + " synonyms. Time taken "
                + (((finish - start) / 1000) / 60) + " minutes, " + (((finish - start) / 1000) % 60) + " seconds.");
    }

    private void loadTaxonNames() throws Exception {
        loadTaxonNames(AFD_TAXON_NAMES);
        loadTaxonNames(APNI_TAXON_NAMES);
    }

    /**
     * Load the taxon names
     * 
     * @throws Exception
     */
    private void loadTaxonNames(String file) throws Exception {

        logger.info("Starting to load taxon names");

        CSVReader reader = new CSVReader(new FileReader(file), ',', '"', '\\', 1);//Reader reader, char separator, char quotechar, char escape, int line
        LoadUtils loadUtils = new LoadUtils();
        String[] record = null;
        int i = 0;
        int j = 0;
        while ((record = reader.readNext()) != null) {
            i++;
            if (record.length < 21) {
                logger.info("truncated at line " + i + "record: " + record);
                continue;
            }

            List<TaxonConcept> tcs = loadUtils.getByNameGuid(record[0], 100);
            java.util.Set<String> preferredLsids = new java.util.HashSet<String>();
            TaxonName tn = new TaxonName();

            tn.guid = record[0];
            tn.nameComplete = record[2];
            tn.authorship = record[5];
            tn.rankString = record[4];
            tn.publishedInCitation = record[8];
            tn.nomenclaturalCode = record[10];
            tn.typificationString = record[19];
            tn.genusPart = record[11];
            tn.specificEpithet = record[12];
            tn.infraspecificEpithet = record[13];
            tn.nomenclaturalStatus = record[14];
            tn.hybridForm = record[15];
            tn.infragenericEpithet = record[17];
            tn.basionymAuthorship = record[18];
            tn.microReference = record[9];

            //load the publication information for the name
            if (record[8] != null) {
                Publication pub = loadUtils.getPublicationByGuid(record[8]);
                if (pub != null) {
                    tn.publishedIn = pub.getTitle();
                }
            }

            //add this taxon name to each taxon concept
            for (TaxonConcept tc : tcs) {
                //get the preferred lsid for the supplied concept
                String preferredGuid = loadUtils.getPreferredGuid(tc.getGuid());
                if (!preferredLsids.contains(preferredGuid)) {
                    j++;
                    if (isValidToLoad(loadUtils, preferredGuid)) {
                        taxonConceptDao.addTaxonName(preferredGuid, tn);
                    }
                    preferredLsids.add(preferredGuid);
                }

            }
        }
        logger.info(i + " lines read. " + j + " names added to concept records.");
    }

    /**
     * Adds some additional details to an existing taxon concept.
     * 
     * @throws Exception
     */
    private void loadTaxonConceptPublicationInfo() throws Exception {
        loadTCPublicationInfoFromFile(AFD_TAXON_CONCEPTS);
        loadTCPublicationInfoFromFile(APNI_TAXON_CONCEPTS);
    }

    private void loadTCPublicationInfoFromFile(String file) throws Exception {

        LoadUtils loadUtils = new LoadUtils();
        CSVReader reader = new CSVReader(new FileReader(file), ',', '"', '\\', 1);//Reader reader, char separator, char quotechar, char escape, int line
        String[] record;

        long start = System.currentTimeMillis();
        int i = 0;
        int j = 0;
        int acc = 0;
        int same = 0;
        int syn = 0;
        int sameSyn = 0;
        String guid = null;
        try {
            while ((record = reader.readNext()) != null) {
                i++;
                if (record.length >= 13) {

                    guid = record[0];
                    String nameGuid = record[4];
                    String name = record[3];

                    //check to see if this is the preferred guid for the item
                    String preferredGuid = loadUtils.getPreferredGuid(guid);

                    //get the acceptedConcept if this is a synonym
                    String acceptedGuid = loadUtils.getAlaAcceptedConcept(preferredGuid);
                    //System.out.println(name+" : guid: "+ guid + " preferred: " + preferredGuid + " accepted: " + acceptedGuid);
                    boolean protologue = "Y".equals(record[7]);
                    boolean draft = "Y".equals(record[11]);

                    Publication publication = loadUtils.getPublicationByGuid(record[8]);
                    Reference reference = loadUtils.getReferenceByGuid(record[8]);

                    if (acceptedGuid == null) {

                        //we are dealing with an accepted concept
                        if (guid.equals(preferredGuid)) {
                            //System.out.println(guid + "is accepted and preferred");
                            acc++;
                            //update the base taxon concept if necesary
                            if (publication != null || reference != null || protologue || draft) {
                                TaxonConcept tc = taxonConceptDao.getByGuid(record[0]);
                                if (tc != null) {
                                    if (publication != null) {
                                        tc.setPublishedIn(publication.getTitle());
                                        tc.setPublishedInCitation(publication.getGuid());
                                    }
                                    if (reference != null) {
                                        tc.setReferencedIn(reference.getTitle());
                                        tc.setReferencedInGuid(reference.getGuid());
                                    }
                                    tc.setIsProtologue(protologue);
                                    tc.setIsDraft(draft);
                                    if (isValidToLoad(loadUtils, tc.getGuid())) {
                                        taxonConceptDao.update(tc);
                                    }
                                }
                            }

                        } else {
                            //add a new sameAs taxonConcept
                            // System.out.println(guid + "is accepted and sameAs");
                            same++;
                            TaxonConcept tc = new TaxonConcept();
                            //get the taxon name because we don't want to use the "sensu" name
                            TaxonName tn = loadUtils.getNameByGuid(nameGuid);
                            if (tn != null) {
                                tc.setAuthor(tn.getAuthorship());
                                tc.setNameString(tn.getNameComplete());
                            } else
                                tc.setNameString(name);
                            tc.setGuid(guid);
                            tc.setNameGuid(nameGuid);
                            if (publication != null) {
                                tc.setPublishedIn(publication.getTitle());
                                tc.setPublishedInCitation(publication.getGuid());
                            }
                            if (reference != null) {
                                tc.setReferencedIn(reference.getTitle());
                                tc.setReferencedInGuid(reference.getGuid());
                            }
                            tc.setIsProtologue(protologue);
                            tc.setIsDraft(draft);
                            if (isValidToLoad(loadUtils, preferredGuid)) {
                                taxonConceptDao.addSameAsTaxonConcept(preferredGuid, tc);
                            }
                        }
                    } else {
                        sameSyn++;
                        //need to add the synonym information
                        if (guid.equals(preferredGuid)) {
                            // System.out.println(guid + "is synonym and preferred");
                            syn++;
                            if (publication != null || reference != null) {
                                List<SynonymConcept> synonyms = taxonConceptDao.getSynonymsFor(acceptedGuid);
                                for (SynonymConcept synonym : synonyms) {
                                    if (synonym.getGuid().equals(guid)) {
                                        if (publication != null) {
                                            synonym.setPublishedIn(publication.getTitle());
                                            synonym.setPublishedInCitation(publication.getGuid());
                                        }
                                        if (reference != null) {
                                            synonym.setReferencedIn(reference.getTitle());
                                            synonym.setReferencedInGuid(reference.getGuid());
                                        }
                                        if (isValidToLoad(loadUtils, acceptedGuid)) {
                                            taxonConceptDao.addSynonym(acceptedGuid, synonym);
                                        }
                                        break;
                                    }
                                }
                            }
                        } else {
                            //System.out.println(guid + "is synonym NOT predferred");
                            SynonymConcept synonym = new SynonymConcept();
                            synonym.setGuid(guid);
                            //get the taxon name because we don't want to use the "sensu" name
                            TaxonName tn = loadUtils.getNameByGuid(nameGuid);
                            synonym.setNameGuid(nameGuid);
                            if (tn != null) {
                                synonym.setAuthor(tn.getAuthorship());
                                synonym.setNameString(tn.getNameComplete());
                            } else
                                synonym.setNameString(name);
                            if (publication != null) {
                                synonym.setPublishedIn(publication.getTitle());
                                synonym.setPublishedInCitation(publication.getGuid());
                            }
                            if (reference != null) {
                                synonym.setReferencedIn(reference.getTitle());
                                synonym.setReferencedInGuid(reference.getGuid());
                            }
                            if (isValidToLoad(loadUtils, acceptedGuid)) {
                                taxonConceptDao.addSynonym(acceptedGuid, synonym);
                            }
                        }
                    }

                } else {
                    logger.error(i + " - missing fields: " + record.length + " fields:" + record);
                }
            }
            long finish = System.currentTimeMillis();
            logger.info(i + " lines read from + " + file + ", " + j + " loaded taxon concepts in: "
                    + (((finish - start) / 1000) / 60) + " minutes.");
        } catch (Exception e) {
            logger.error(i + " error on line " + guid);
            e.printStackTrace();
        }
    }

    /**
     * A check to see if the supplied taxon should be added to the profiler
     * 
     * @param loadUtils
     * @param guid
     * @return
     * @throws Exception
     */
    public boolean addTaxonToProfile(LoadUtils loadUtils, String guid) throws Exception {
        boolean isVernacular = loadUtils.isVernacularConcept(guid);
        boolean isCongruentTo = loadUtils.isCongruentConcept(guid);
        boolean isSynonymFor = loadUtils.isSynonymFor(guid);
        return !isVernacular && !isCongruentTo && !isSynonymFor;
    }

    //   /**
    //    * Load the vernacular concepts
    //    * 
    //    * @throws Exception
    //    */
    //   private void loadVernacularConcepts() throws Exception {
    //      
    //      logger.info("Starting load of common names...");
    //      
    //      LoadUtils loadUtils = new LoadUtils();
    //      TabReader tr = new TabReader(TAXON_CONCEPTS);
    //       String[] record = null;
    //       long start = System.currentTimeMillis();
    //       int i=0;
    //       try {
    //          while((record = tr.readNext())!=null){
    //             i++;
    //             if(record.length==9){
    //                
    //                boolean isVernacular = loadUtils.isVernacularConcept(record[0]);
    //                if(isVernacular){
    //                   CommonName cn = new CommonName();
    //                   cn.guid = record[0];
    ////                   tc.nameGuid = record[1];
    //                   cn.nameString = record[2];
    ////                   tc.author = record[3];
    ////                   tc.authorYear = record[4];
    ////                   tc.publishedInCitation = record[5];
    ////                   tc.publishedIn = record[6];
    ////                   tc.acceptedConceptGuid = record[8];
    //                   
    //                   List<String> guids = loadUtils.getIsVernacularConceptFor(record[0]);
    //                   for(String guid: guids){
    //                      taxonConceptDao.addCommonName(guid, cn);
    //                   }
    //                }
    //             } else {
    //                logger.error(i+" - missing fields: "+record.length+" record:"+record);
    //             }
    //          }
    //          
    //          long finish = System.currentTimeMillis();
    //          logger.info("loaded taxon concepts in: "+(((finish-start)/1000)/60)+" minutes.");
    //       } catch (Exception e){
    //          logger.error(i+" error on line", e);
    //          e.printStackTrace();
    //       }
    //   }

    /**
     * @param infoSourceDAO the infoSourceDAO to set
     */
    public void setInfoSourceDAO(InfoSourceDAO infoSourceDAO) {
        this.infoSourceDAO = infoSourceDAO;
    }

    /**
     * @param taxonConceptDao the taxonConceptDao to set
     */
    public void setTaxonConceptDao(TaxonConceptDao taxonConceptDao) {
        this.taxonConceptDao = taxonConceptDao;
    }
}