edu.ku.brc.specify.toycode.mexconabio.CopyFromGBIF.java Source code

Java tutorial

Introduction

Here is the source code for edu.ku.brc.specify.toycode.mexconabio.CopyFromGBIF.java

Source

/* Copyright (C) 2015, University of Kansas Center for Research
 * 
 * Specify Software Project, specify@ku.edu, Biodiversity Institute,
 * 1345 Jayhawk Boulevard, Lawrence, Kansas, 66045, USA
 * 
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
*/
package edu.ku.brc.specify.toycode.mexconabio;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Statement;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import edu.ku.brc.specify.conversion.BasicSQLUtils;

/**
 * @author rods
 *
 * @code_status Alpha
 *
 * Created Date: Aug 23, 2010
 *
 */
public class CopyFromGBIF {

    private Connection dbConn = null;
    private Connection dbConn2 = null;
    private Connection srcDBConn = null;

    //-------------------------------
    // Lucene Indexing
    //-------------------------------

    protected File INDEX_DIR = new File("index-gbif");

    protected IndexReader reader;
    protected Searcher searcher;
    protected Analyzer analyzer;

    /**
     * @param server
     * @param port
     * @param dbName
     * @param username
     * @param pwd
     */
    public CopyFromGBIF() {
        super();
    }

    /**
     * @param server
     * @param port
     * @param dbName
     * @param username
     * @param pwd
     */
    public void createDBConnection(final String server, final String port, final String dbName,
            final String username, final String pwd) {
        String connStr = "jdbc:mysql://%s:%s/%s?characterEncoding=UTF-8&autoReconnect=true";
        try {
            dbConn = DriverManager.getConnection(String.format(connStr, server, port, dbName), username, pwd);
            dbConn2 = DriverManager.getConnection(String.format(connStr, server, port, dbName), username, pwd);

        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * @param server
     * @param port
     * @param dbName
     * @param username
     * @param pwd
     */
    public void createSrcDBConnection(final String server, final String port, final String dbName,
            final String username, final String pwd) {

        String connStr = "jdbc:mysql://%s:%s/%s?characterEncoding=UTF-8&autoReconnect=true";
        try {
            srcDBConn = DriverManager.getConnection(String.format(connStr, server, port, dbName), username, pwd);

        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * 
     */
    public void process() {
        boolean doQueryForCollNum = true;

        String pSQL = "INSERT INTO raw (old_id,data_provider_id,data_resource_id,resource_access_point_id, institution_code, collection_code, "
                + "catalogue_number, scientific_name, author, rank, kingdom, phylum, class, order_rank, family, genus, species, subspecies, latitude, longitude,  "
                + "lat_long_precision, max_altitude, min_altitude, altitude_precision, min_depth, max_depth, depth_precision, continent_ocean, country, state_province, county, collector_name, "
                + "locality,year, month, day, basis_of_record, identifier_name, identification_date,unit_qualifier, created, modified, deleted, collector_num) "
                + "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";

        String gbifSQLBase = "SELECT r.id, r.data_provider_id, r.data_resource_id, r.resource_access_point_id, r.institution_code, r.collection_code, "
                + "r.catalogue_number, r.scientific_name, r.author, r.rank, r.kingdom, r.phylum, r.class, r.order_rank, r.family, r.genus, r.species, r.subspecies, "
                + "r.latitude, r.longitude, r.lat_long_precision, r.max_altitude, r.min_altitude, r.altitude_precision, r.min_depth, r.max_depth, r.depth_precision, "
                + "r.continent_ocean, r.country, r.state_province, r.county, r.collector_name, r.locality, r.year, r.month, r.day, r.basis_of_record, r.identifier_name, "
                + "r.identification_date, r.unit_qualifier, r.created, r.modified, r.deleted";

        String gbifSQL;
        if (doQueryForCollNum) {
            gbifSQL = gbifSQLBase + " FROM raw_occurrence_record r";
        } else {
            gbifSQL = gbifSQLBase
                    + ", i.identifier FROM raw_occurrence_record r, identifier_record i WHERE r.id = i.occurrence_id AND i.identifier_type = 3";
        }

        BasicSQLUtils.update(srcDBConn, "DELETE FROM raw WHERE id > 0");

        long totalRecs = BasicSQLUtils.getCount(dbConn, "SELECT COUNT(*) FROM raw_occurrence_record");
        long procRecs = 0;
        long startTime = System.currentTimeMillis();
        int secsThreshold = 0;

        PrintWriter pw = null;

        final double HRS = 1000.0 * 60.0 * 60.0;

        Statement gStmt = null;
        PreparedStatement pStmt = null;
        PreparedStatement stmt = null;

        try {
            pw = new PrintWriter("gbif.log");

            pStmt = srcDBConn.prepareStatement(pSQL);

            stmt = dbConn2.prepareStatement(
                    "SELECT identifier FROM identifier_record WHERE occurrence_id = ? AND identifier_type = 3");
            //stmt.setFetchSize(Integer.MIN_VALUE);

            System.out.println("Total Records: " + totalRecs);
            pw.println("Total Records: " + totalRecs);

            gStmt = dbConn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
            gStmt.setFetchSize(Integer.MIN_VALUE);

            String fullSQL = gbifSQL;
            System.out.println(fullSQL);

            ResultSet gRS = gStmt.executeQuery(fullSQL);
            ResultSetMetaData rsmd = gRS.getMetaData();
            int lastColInx = rsmd.getColumnCount() + (doQueryForCollNum ? 1 : 0);

            while (gRS.next()) {
                int id = gRS.getInt(1);
                pStmt.setObject(1, id);

                for (int i = 2; i <= rsmd.getColumnCount(); i++) {
                    Object obj = gRS.getObject(i);
                    pStmt.setObject(i, obj);
                }

                String collNum = null;
                if (doQueryForCollNum) {
                    //String tmpSQL = String.format("SELECT identifier FROM identifier_record WHERE occurrence_id = %d AND identifier_type = 3", id);
                    //System.out.println(tmpSQL);
                    stmt.setInt(1, id);
                    ResultSet rs = stmt.executeQuery();
                    if (rs.next()) {
                        collNum = rs.getString(1);
                    }
                    rs.close();
                } else {
                    collNum = gRS.getString(lastColInx - 1);
                }

                if (StringUtils.isNotEmpty(collNum)) {
                    if (collNum.length() < 256) {
                        pStmt.setString(lastColInx, collNum);

                    } else {
                        pStmt.setString(lastColInx, collNum.substring(0, 255));
                    }
                } else {
                    pStmt.setObject(lastColInx, null);
                }

                try {
                    pStmt.executeUpdate();

                } catch (Exception ex) {
                    System.err.println("For ID[" + gRS.getObject(1) + "]");
                    ex.printStackTrace();
                    pw.print("For ID[" + gRS.getObject(1) + "] " + ex.getMessage());
                    pw.flush();
                }

                procRecs++;
                if (procRecs % 10000 == 0) {
                    long endTime = System.currentTimeMillis();
                    long elapsedTime = endTime - startTime;

                    double avergeTime = (double) elapsedTime / (double) procRecs;

                    double hrsLeft = (((double) elapsedTime / (double) procRecs) * (double) totalRecs - procRecs)
                            / HRS;

                    int seconds = (int) (elapsedTime / 60000.0);
                    if (secsThreshold != seconds) {
                        secsThreshold = seconds;

                        String msg = String.format(
                                "Elapsed %8.2f hr.mn   Ave Time: %5.2f    Percent: %6.3f  Hours Left: %8.2f ",
                                ((double) (elapsedTime)) / HRS, avergeTime,
                                100.0 * ((double) procRecs / (double) totalRecs), hrsLeft);
                        System.out.println(msg);
                        pw.println(msg);
                        pw.flush();
                    }
                }
            }

        } catch (Exception ex) {
            ex.printStackTrace();

        } finally {
            try {
                if (gStmt != null) {
                    gStmt.close();
                }
                if (pStmt != null) {
                    pStmt.close();
                }
                if (stmt != null) {
                    stmt.close();
                }
                pw.close();

            } catch (Exception ex) {

            }
        }
        System.out.println("Done transferring.");
        pw.println("Done transferring.");

        /*
        int     count = 0;
        boolean cont = true;
        while (cont)
        {
        long start = System.currentTimeMillis();
            
        Statement         gStmt = null;
        PreparedStatement pStmt = null;
            
        try
        {
            gStmt = dbConn.createStatement(ResultSet.TYPE_FORWARD_ONLY,ResultSet.CONCUR_READ_ONLY);
                
            String fullSQL = gbifSQL + String.format(" AND r.id >= %d AND r.id < %d", count, count+recordStep);
            //System.out.println(fullSQL);
                
            int               cnt  = 0;
            ResultSet         rs   = gStmt.executeQuery(fullSQL);
            ResultSetMetaData rsmd = rs.getMetaData();
                
            //System.out.println("Done with query.");
                
            pStmt = srcDBConn.prepareStatement(pSQL);
            count += recordStep;
                
            while (rs.next())
            {
                Integer id  = rs.getInt(1);
                pStmt.setInt(1, id);
                    
                for (int i=2;i<=rsmd.getColumnCount();i++)
                {
                    Object obj = rs.getObject(i);
                    pStmt.setObject(i, obj);
                }
                    
                pStmt.executeUpdate();
                
                cnt++;
                procRecs++;
            }
            rs.close();
                
            if (count == 0)
            {
                break;
            }
                
        } catch (Exception ex)
        {
            ex.printStackTrace();
                
        } finally 
        {
            try
            {
                if (gStmt != null)
                {
                    gStmt.close();
                }
                if (pStmt != null)
                {
                    pStmt.close();
                }
            } catch (Exception ex)
            {
                    
            }
        }
            
        long endTime   = System.currentTimeMillis();
        long deltaTime = endTime - start;
               
        long elapsedTime = endTime - startTime;
            
        double avergeTime = (double)elapsedTime / (double)procRecs;
            
        double hrsLeft = (((double)procRecs / (double)elapsedTime) * (double)totalRecs)  / 3600000.0;
            
        int seconds = (int)(elapsedTime / 60000.0);
        if (secsThreshold != seconds)
        {
            secsThreshold = seconds;
                
            System.out.println(String.format("Elapsed %8.2f hr.mn   Time: %5.2f  Ave Time: %5.2f    Percent: %6.3f  Hours Left: Elapsed %8.2f ", 
                    ((double)(elapsedTime)) / 3600000.0, 
                    ((double)(deltaTime)) / 1000.0, 
                    avergeTime,
                    100.0 * ((double)procRecs / (double)totalRecs),
                    hrsLeft));
        }
        }
        System.out.println("Done transferring.");*/

        /*Statement uStmt = null;
        try
        {
        uStmt = srcDBConn.createStatement();
        int rv = uStmt.executeUpdate("ALTER TABLE raw ADD FULLTEXT(catalogue_number, genus, species, subspecies, collector_num)");
            
        System.out.println("Indexing rv = "+rv);
            
        } catch (Exception ex)
        {
        ex.printStackTrace();
            
        } finally 
        {
        try
        {
            if (uStmt != null)
            {
                uStmt.close();
            }
        } catch (Exception ex)
        {
            ex.printStackTrace();
        }
        }
        System.out.println("Done Indexing.");*/
    }

    /**
     * 
     */
    public void processMissingGenusSpecies() {
        String pSQL = "UPDATE raw SET genus=?, species=?, subspecies=? WHERE id = ?";

        String where = " WHERE genus IS NULL AND species IS NULL AND scientific_name IS NOT NULL";
        String gbifSQLBase = "SELECT id, scientific_name FROM raw" + where;

        long totalRecs = BasicSQLUtils.getCount(srcDBConn, "SELECT COUNT(*) FROM raw " + where);
        long procRecs = 0;
        long startTime = System.currentTimeMillis();
        int secsThreshold = 0;

        PrintWriter pw = null;

        final double HRS = 1000.0 * 60.0 * 60.0;

        Statement gStmt = null;
        PreparedStatement pStmt = null;

        try {
            pw = new PrintWriter("gbif.log");

            pStmt = dbConn.prepareStatement(pSQL);

            System.out.println("Total Records: " + totalRecs);
            pw.println("Total Records: " + totalRecs);

            gStmt = srcDBConn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
            gStmt.setFetchSize(Integer.MIN_VALUE);

            System.out.println(gbifSQLBase);

            ResultSet gRS = gStmt.executeQuery(gbifSQLBase);
            while (gRS.next()) {
                int id = gRS.getInt(1);
                pStmt.setObject(4, id);

                String[] gs = StringUtils.split(gRS.getString(2), ' ');
                switch (gs.length) {
                case 1:
                    pStmt.setString(1, gs[0]);
                    pStmt.setString(2, null);
                    pStmt.setString(3, null);
                    break;

                case 2:
                    pStmt.setString(1, gs[0]);
                    pStmt.setString(2, gs[1]);
                    pStmt.setString(3, null);
                    break;

                case 3:
                    pStmt.setString(1, gs[0]);
                    pStmt.setString(2, gs[1]);
                    pStmt.setString(3, gs[2]);
                    break;

                default:
                    continue;
                }
                try {
                    pStmt.executeUpdate();

                } catch (Exception ex) {
                    System.err.println("For ID[" + gRS.getObject(1) + "][" + gRS.getObject(2) + "]");
                    ex.printStackTrace();
                    pw.print("For ID[" + gRS.getObject(1) + "] " + ex.getMessage());
                    pw.flush();
                }

                procRecs++;
                if (procRecs % 10000 == 0) {
                    long endTime = System.currentTimeMillis();
                    long elapsedTime = endTime - startTime;

                    double avergeTime = (double) elapsedTime / (double) procRecs;

                    double hrsLeft = (((double) elapsedTime / (double) procRecs) * (double) totalRecs - procRecs)
                            / HRS;

                    int seconds = (int) (elapsedTime / 60000.0);
                    if (secsThreshold != seconds) {
                        secsThreshold = seconds;

                        String msg = String.format(
                                "Elapsed %8.2f hr.mn   Ave Time: %5.2f    Percent: %6.3f  Hours Left: %8.2f ",
                                ((double) (elapsedTime)) / HRS, avergeTime,
                                100.0 * ((double) procRecs / (double) totalRecs), hrsLeft);
                        System.out.println(msg);
                        pw.println(msg);
                        pw.flush();
                    }
                }
            }

        } catch (Exception ex) {
            ex.printStackTrace();

        } finally {
            try {
                if (gStmt != null) {
                    gStmt.close();
                }
                if (pStmt != null) {
                    pStmt.close();
                }
                pw.close();

            } catch (Exception ex) {

            }
        }
        System.out.println("Done transferring.");
        pw.println("Done transferring.");
    }

    /**
     * 
     */
    public void cleanup() {
        try {
            if (dbConn != null) {
                dbConn.close();
            }
            if (dbConn2 != null) {
                dbConn2.close();
            }
            if (srcDBConn != null) {
                srcDBConn.close();
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * @return the dbConn
     */
    public Connection getDBConn() {
        return dbConn;
    }

    /**
     * @return the srcDBConn
     */
    public Connection getSrcDBConn() {
        return srcDBConn;
    }

    /**
     * 
     */
    public void index() {
        IndexWriter writer = null;
        try {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);

            FileUtils.deleteDirectory(INDEX_DIR);

            System.out.println("Indexing to directory '" + INDEX_DIR + "'...");

            long totalRecs = BasicSQLUtils.getCount(srcDBConn, "SELECT COUNT(*) FROM raw");
            long procRecs = 0;
            long startTime = System.currentTimeMillis();
            int secsThreshold = 0;

            PrintWriter pw = null;

            final double HRS = 1000.0 * 60.0 * 60.0;

            Statement stmt = null;

            try {
                writer = new IndexWriter(FSDirectory.open(INDEX_DIR), analyzer, true,
                        IndexWriter.MaxFieldLength.LIMITED);

                pw = new PrintWriter("gbif.log");

                System.out.println("Total Records: " + totalRecs);
                pw.println("Total Records: " + totalRecs);

                stmt = srcDBConn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
                stmt.setFetchSize(Integer.MIN_VALUE);

                //String[]          fldNames = {"id", "cn", "gn", "sp", "cln", "ctr", "yr", "mn", "dy"};
                ResultSet rs = stmt.executeQuery(
                        "SELECT id, catalogue_number, genus, species, collector_num, collector_name, year, month, day, state_province, county FROM raw");// LIMIT 100000,1000");
                ResultSetMetaData rsmd = rs.getMetaData();

                StringBuilder sb = new StringBuilder();
                while (rs.next()) {
                    String id = rs.getString(1);
                    Document doc = new Document();
                    doc.add(new Field("id", id.toString(), Field.Store.YES, Field.Index.NO));

                    sb.setLength(0);
                    for (int i = 2; i <= rsmd.getColumnCount(); i++) {
                        String val = rs.getString(i);
                        if (StringUtils.isNotEmpty(val)) {
                            sb.append(val);
                            sb.append(' ');
                        }
                    }
                    doc.add(new Field("contents", sb.toString(), Field.Store.NO, Field.Index.ANALYZED));

                    writer.addDocument(doc);

                    procRecs++;
                    if (procRecs % 10000 == 0) {
                        long endTime = System.currentTimeMillis();
                        long elapsedTime = endTime - startTime;

                        double timePerRecord = (elapsedTime / procRecs);

                        double hrsLeft = ((totalRecs - procRecs) * timePerRecord) / HRS;

                        int seconds = (int) (elapsedTime / 60000.0);
                        if (secsThreshold != seconds) {
                            secsThreshold = seconds;

                            String msg = String.format("Elapsed %8.2f hr.mn   Percent: %6.3f  Hours Left: %8.2f ",
                                    ((double) (elapsedTime)) / HRS,
                                    100.0 * ((double) procRecs / (double) totalRecs), hrsLeft);
                            System.out.println(msg);
                            pw.println(msg);
                            pw.flush();
                        }
                    }
                }

            } catch (SQLException sqlex) {
                sqlex.printStackTrace();

            } catch (IOException e) {
                e.printStackTrace();
                System.out.println("IOException adding Lucene Document: " + e.getMessage());

            } finally {

                if (stmt != null) {
                    try {
                        stmt.close();
                    } catch (SQLException e) {
                        e.printStackTrace();
                    }
                }

            }

        } catch (IOException e) {
            e.printStackTrace();

            System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());

        } finally {
            analyzer.close();
            analyzer = null;

            if (writer != null) {
                try {
                    System.out.println("Optimizing...");
                    writer.optimize();
                    writer.close();
                    System.out.println("Done Optimizing.");

                } catch (CorruptIndexException e) {
                    e.printStackTrace();

                } catch (IOException e) {
                    e.printStackTrace();
                }
                writer = null;
            }
        }
    }

    public void testSearch() {
        Statement stmt = null;

        String querystr = "Andrew AND Bentley AND Apogon AND angustatus";
        String term = "contents";
        try {
            stmt = srcDBConn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);

            if (analyzer == null) {
                analyzer = new StandardAnalyzer(Version.LUCENE_36);
            }
            reader = IndexReader.open(FSDirectory.open(INDEX_DIR), true);

            long startTime = System.currentTimeMillis();
            Query q = new QueryParser(Version.LUCENE_36, term, analyzer).parse(querystr);
            int hitsPerPage = 10;
            searcher = new IndexSearcher(reader);
            TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            System.out.println("Found " + hits.length + " hits.");
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("id"));

                String id = d.get("id");

                ResultSet rs = stmt.executeQuery(
                        "SELECT id, catalogue_number, genus, species, collector_num, collector_name, year, month, day FROM raw WHERE id = "
                                + id);
                ResultSetMetaData rsmd = rs.getMetaData();

                while (rs.next()) {
                    for (int j = 1; j <= rsmd.getColumnCount(); j++) {
                        System.out.print(rs.getObject(j) + "\t");
                    }
                    System.out.println();
                }
                rs.close();
            }
            System.out.println(String.format("Time: %8.2f", (System.currentTimeMillis() - startTime) / 1000.0));
            searcher.close();
            reader.close();
            analyzer.close();

        } catch (SQLException e) {
            e.printStackTrace();

        } catch (IOException e) {
            e.printStackTrace();

        } catch (ParseException e) {
            e.printStackTrace();
        } finally {
            if (stmt != null) {
                try {
                    stmt.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    //------------------------------------------------------------------------------------------
    public static void main(String[] args) {
        CopyFromGBIF awg = new CopyFromGBIF();

        boolean doBuild = false;

        if (doBuild) {
            awg.createDBConnection("lm2gbdb.nhm.ku.edu", "3399", "gbc20100726", "rods", "specify4us");
            awg.createSrcDBConnection("localhost", "3306", "gbif", "root", "root");
            awg.process();
            awg.cleanup();
        } else {
            awg.createDBConnection("localhost", "3306", "plants", "root", "root");

            awg.createSrcDBConnection("localhost", "3306", "plants_ref", "root", "root");
            awg.processMissingGenusSpecies();

            //awg.createSrcDBConnection("localhost", "3306", "gbif", "root", "root");
            awg.index();
            //awg.testSearch();
            awg.cleanup();
        }
    }
}