edu.mayo.informatics.lexgrid.convert.directConversions.UmlsCommon.LoadRRFToDB.java Source code

Java tutorial

Introduction

Here is the source code for edu.mayo.informatics.lexgrid.convert.directConversions.UmlsCommon.LoadRRFToDB.java

Source

/*
 * Copyright: (c) 2004-2010 Mayo Foundation for Medical Education and 
 * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
 * triple-shield Mayo logo are trademarks and service marks of MFMER.
 *
 * Except as contained in the copyright notice above, or as used to identify 
 * MFMER as the author of this software, the trade names, trademarks, service
 * marks, or product names of the copyright holder shall not be used in
 * advertising, promotion or otherwise in connection with this software without
 * prior written authorization of the copyright holder.
 * 
 * Licensed under the Eclipse Public License, Version 1.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at 
 * 
 *       http://www.eclipse.org/legal/epl-v10.html
 * 
 */
package edu.mayo.informatics.lexgrid.convert.directConversions.UmlsCommon;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URI;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.Set;

import org.LexGrid.LexBIG.Utility.logging.LgMessageDirectorIF;
import org.LexGrid.util.sql.DBUtility;
import org.LexGrid.util.sql.GenericSQLModifier;
import org.apache.commons.lang.StringUtils;

/**
 * This class loads that tables that I need from RRF files into a database.
 * 
 * @author <A HREF="mailto:armbrust.daniel@mayo.edu">Dan Armbrust</A>
 * @author <A HREF="mailto:johnson.thomas@mayo.edu">Thomas Johnson</A>
 * @version subversion $Revision: $ checked in on $Date: $
 */
public class LoadRRFToDB {
    public static String[] validateRRF(URI rrfLocation, boolean skipNonLexGridFiles, LgMessageDirectorIF md)
            throws Exception {
        Random r = new Random();
        // generate a random db label
        return createAndLoadTables(rrfLocation, skipNonLexGridFiles, false, "jdbc:hsqldb:mem:" + r.nextInt(100),
                "org.hsqldb.jdbcDriver", "sa", "", md, true);
    }

    public static String[] loadRRFToDB(URI rrfLocation, boolean skipNonLexGridFiles, boolean recalcRootOnly,
            String dbServer, String dbDriver, String username, String password, LgMessageDirectorIF md)
            throws Exception {
        return createAndLoadTables(rrfLocation, skipNonLexGridFiles, recalcRootOnly, dbServer, dbDriver, username,
                password, md, false);
    }

    private static String[] createAndLoadTables(URI rrfLocation, boolean skipNonLexGridFiles,
            boolean recalcRootOnly, String dbServer, String dbDriver, String username, String password,
            LgMessageDirectorIF md, boolean validateMode) throws Exception {
        md.info("Connecting to RRF Files");
        BufferedReader reader = getReader(rrfLocation.resolve("MRFILES.RRF"));

        md.info("Connecting to db Files");
        Connection sqlConnection = DBUtility.connectToDatabase(dbServer, dbDriver, username, password);

        GenericSQLModifier gsm = new GenericSQLModifier(sqlConnection);

        Hashtable columnTypeMap = readMRCols(rrfLocation);
        Hashtable tableColumnMap = new Hashtable();

        List tables = new ArrayList();

        if (skipNonLexGridFiles) {
            // the only tables that I need to load
            tables.add("MRCONSO");
            tables.add("MRDOC");
            tables.add("MRREL");
            tables.add("MRSAB");
            tables.add("MRRANK");

            if (!recalcRootOnly) {
                tables.add("MRDEF");
                tables.add("MRSTY");
                tables.add("MRSAT");
                tables.add("MRHIER");
            }
        }

        md.info("Creating SQL database tables");

        PreparedStatement create = null;
        PreparedStatement drop = null;
        String line = reader.readLine();

        int mrhierHCDCol = -1;
        while (line != null) {
            String[] vals = stringToArray(line, '|');

            // for MRFILES, all I care about is the following
            String file = vals[0];
            String tableName = file.substring(0, file.indexOf('.'));

            // if file is MRHIER, remember HCD column number (base 0)
            if ("MRHIER".equalsIgnoreCase(tableName) && vals.length > 1) {
                mrhierHCDCol = Arrays.asList(vals[2].split(",")).indexOf("HCD");
            }

            if (skipNonLexGridFiles || recalcRootOnly) {
                if (!tables.contains(tableName)) {
                    line = reader.readLine();
                    continue;
                }
            } else {
                if (file.indexOf('/') != -1) {
                    // skip files in subfolders.
                    line = reader.readLine();
                    continue;
                }
                if (!tables.contains(tableName))
                    tables.add(tableName);
            }

            String[] columns = stringToArray(vals[2], ',');

            tableColumnMap.put(file, columns);

            StringBuffer tableCreateSQL = new StringBuffer();
            tableCreateSQL.append("CREATE TABLE {IF NOT EXISTS} ^" + tableName + "^ (");

            for (int i = 0; i < columns.length; i++) {

                tableCreateSQL.append(" ^" + columns[i] + "^ "
                        + mapUMLSType((String) columnTypeMap.get(columns[i] + "|" + file)) + " default NULL,");
            }

            // chop the last comma
            tableCreateSQL.deleteCharAt(tableCreateSQL.length() - 1);
            tableCreateSQL.append(") {TYPE}");

            // make sure the table doesn't exist
            try {
                drop = sqlConnection.prepareStatement(gsm.modifySQL("DROP TABLE " + tableName + " {CASCADE}"));
                drop.executeUpdate();
                drop.close();
            } catch (SQLException e) {
                // most likely means that the table didn't exist.
            }

            create = sqlConnection.prepareStatement(gsm.modifySQL(tableCreateSQL.toString()));
            create.executeUpdate();

            create.close();

            line = reader.readLine();
        }
        reader.close();

        md.info("Creating indexes");

        PreparedStatement createIndex = null;

        createIndex = sqlConnection
                .prepareStatement(gsm.modifySQL("CREATE INDEX ^mi1^ ON ^MRCONSO^ (^CUI^, ^SAB^)"));
        createIndex.executeUpdate();
        createIndex.close();

        createIndex = sqlConnection
                .prepareStatement(gsm.modifySQL("CREATE INDEX ^mi2^ ON ^MRCONSO^ (^CUI^, ^AUI^)"));
        createIndex.executeUpdate();
        createIndex.close();

        createIndex = sqlConnection
                .prepareStatement(gsm.modifySQL("CREATE INDEX ^mi3^ ON ^MRCONSO^ (^AUI^, ^CODE^)"));
        createIndex.executeUpdate();
        createIndex.close();

        createIndex = sqlConnection.prepareStatement(gsm.modifySQL("CREATE INDEX ^mi4^ ON ^MRREL^ (^RELA^)"));
        createIndex.executeUpdate();
        createIndex.close();

        createIndex = sqlConnection.prepareStatement(gsm.modifySQL("CREATE INDEX ^mi5^ ON ^MRREL^ (^REL^)"));
        createIndex.executeUpdate();
        createIndex.close();

        createIndex = sqlConnection.prepareStatement(gsm.modifySQL("CREATE INDEX ^mi6^ ON ^MRREL^ (^RUI^)"));
        createIndex.executeUpdate();
        createIndex.close();

        createIndex = sqlConnection
                .prepareStatement(gsm.modifySQL("CREATE INDEX ^mi7^ ON ^MRREL^ (^SAB^, ^RELA^)"));
        createIndex.executeUpdate();
        createIndex.close();

        createIndex = sqlConnection.prepareStatement(gsm.modifySQL("CREATE INDEX ^mi8^ ON ^MRSAB^ (^RSAB^)"));
        createIndex.executeUpdate();
        createIndex.close();

        createIndex = sqlConnection.prepareStatement(gsm.modifySQL("CREATE INDEX ^mi9^ ON ^MRRANK^ (^SAB^)"));
        createIndex.executeUpdate();
        createIndex.close();

        createIndex = sqlConnection.prepareStatement(gsm.modifySQL("CREATE INDEX ^mi10^ ON ^MRRANK^ (^TTY^)"));
        createIndex.executeUpdate();
        createIndex.close();

        if (!recalcRootOnly) {
            createIndex = sqlConnection
                    .prepareStatement(gsm.modifySQL("CREATE INDEX ^mi11^ ON ^MRDEF^ (^CUI^, ^SAB^)"));
            createIndex.executeUpdate();
            createIndex.close();

            createIndex = sqlConnection
                    .prepareStatement(gsm.modifySQL("CREATE INDEX ^mi12^ ON ^MRSAT^ (^METAUI^)"));
            createIndex.executeUpdate();
            createIndex.close();

            createIndex = sqlConnection
                    .prepareStatement(gsm.modifySQL("CREATE INDEX ^mi13^ ON ^MRSAT^ (^CUI^, ^SAB^)"));
            createIndex.executeUpdate();
            createIndex.close();

            createIndex = sqlConnection
                    .prepareStatement(gsm.modifySQL("CREATE INDEX ^mi14^ ON ^MRSAT^ (^CODE^, ^SAB^)"));
            createIndex.executeUpdate();
            createIndex.close();

            createIndex = sqlConnection.prepareStatement(gsm.modifySQL("CREATE INDEX ^mi15^ ON ^MRSTY^ (^CUI^)"));
            createIndex.executeUpdate();
            createIndex.close();

            createIndex = sqlConnection.prepareStatement(
                    gsm.modifySQL("CREATE INDEX ^mi16^ ON ^MRHIER^ (^CUI^, ^AUI^, ^HCD^, ^SAB^, ^CXN^)"));
            createIndex.executeUpdate();
            createIndex.close();

            createIndex = sqlConnection
                    .prepareStatement(gsm.modifySQL("CREATE INDEX ^mi17^ ON ^MRHIER^ (^CUI^, ^SAB^, ^CXN^)"));
            createIndex.executeUpdate();
            createIndex.close();
        }

        PreparedStatement insert = null;

        Iterator allTables = tables.iterator();
        Set rootCUIs = new HashSet();
        while (allTables.hasNext()) {
            System.gc();
            String table = (String) allTables.next();
            md.info("Loading " + table);

            boolean loadingMrHier = table.equalsIgnoreCase("MRHIER");

            StringBuffer insertSQL = new StringBuffer();

            insertSQL.append("INSERT INTO " + table + " (");

            String[] vals = (String[]) tableColumnMap.get(table + ".RRF");
            for (int i = 0; i < vals.length; i++) {
                if (gsm.getDatabaseType().equals("ACCESS") && vals[i].equals("VALUE")) {
                    // reserved word in MSAccess
                    insertSQL.append("\"" + vals[i] + "\", ");
                } else {
                    insertSQL.append(vals[i] + ", ");
                }
            }

            // chop the last comma and space
            insertSQL.deleteCharAt(insertSQL.length() - 2);
            insertSQL.append(") VALUES (");

            for (int i = 0; i < vals.length; i++) {
                insertSQL.append("?, ");
            }

            // chop the last comma and space
            insertSQL.deleteCharAt(insertSQL.length() - 2);
            insertSQL.append(")");
            insert = sqlConnection.prepareStatement(gsm.modifySQL(insertSQL.toString()));

            URI tableURI = rrfLocation.resolve(table + ".RRF");

            if (verifyTableExists(tableURI)) {
                try {
                    reader = getReader(tableURI);

                    int count = 1;
                    line = reader.readLine();
                    boolean restrictToRootCUIs = recalcRootOnly && table.equalsIgnoreCase("MRCONSO");
                    boolean restrictToRootRels = recalcRootOnly && table.equalsIgnoreCase("MRREL");
                    while (line != null && line.length() > 0) {
                        // Note: If we are only using the data to recalculate
                        // root nodes,
                        // we only need CUIs defining root hierarchical terms
                        // and any related
                        // relationships.
                        if (restrictToRootCUIs && !line.contains("|SRC|RHT|")) {
                            line = reader.readLine();
                            continue;
                        }
                        String[] data = stringToArray(line, '|');

                        // If processing MRHIER, we only care about entries
                        // relevant to
                        // the specified MRHIER processing option. Many entries
                        // in this file
                        // we do not require since they can be derived from
                        // MRREL.
                        // MRHIER typically is much larger since it pre-computes
                        // the entire
                        // hierarchy, so we want to conserve time and space by
                        // loading only
                        // those entries that require special handling.
                        if (loadingMrHier && mrhierHCDCol > 0 && data.length > mrhierHCDCol
                                && StringUtils.isBlank(data[mrhierHCDCol])) {
                            line = reader.readLine();
                            continue;
                        }

                        if (restrictToRootCUIs && data.length >= 1)
                            rootCUIs.add(data[0]);
                        if (restrictToRootRels && (data.length < 5
                                || (!rootCUIs.contains(data[0]) && !rootCUIs.contains(data[4])))) {
                            line = reader.readLine();
                            continue;
                        }

                        for (int i = 0; i < vals.length; i++) {
                            insert.setString(i + 1, data[i]);
                        }
                        insert.executeUpdate();

                        count++;
                        line = reader.readLine();

                        if (validateMode && count > 100) {
                            line = null;
                        }

                        if (count % 10000 == 0) {
                            md.busy();
                        }

                        if (count % 100000 == 0) {
                            md.info("Loaded " + count + " into " + table);
                        }
                    }
                    reader.close();
                } catch (Exception e) {
                    md.fatalAndThrowException("problem loading the table " + table, e);
                }

            } else {
                md.warn("Could not load table " + table + ". This" + "most likely means the corresponding RRF file"
                        + "was not found in the source.");
            }

            insert.close();
            System.gc();
        }

        sqlConnection.close();
        return (String[]) tables.toArray(new String[tables.size()]);
    }

    private static String mapUMLSType(String type) throws Exception {
        if (type.startsWith("varchar")) {
            return "{limitedText}" + type.substring(type.indexOf('('), type.length());
        } else if (type.startsWith("char")) {
            return "{limitedText}" + type.substring(type.indexOf('('), type.length());
        } else if (type.startsWith("numeric") || type.equals("integer")) {
            return "{limitedText}(20)";
        } else {
            throw new Exception("Unknown type");
        }
    }

    private static Hashtable readMRCols(URI rrfLocation) throws MalformedURLException, IOException {
        Hashtable result = new Hashtable();
        BufferedReader reader = getReader(rrfLocation.resolve("MRCOLS.RRF"));

        String line = reader.readLine();
        while (line != null) {
            String[] vals = stringToArray(line, '|');
            String key = vals[0] + "|" + vals[6];
            result.put(key, vals[7]);
            line = reader.readLine();
        }
        reader.close();

        return result;
    }

    private static String[] stringToArray(String string, char token) {
        ArrayList vals = new ArrayList();
        int startPos = 0;
        int endPos = string.indexOf(token);
        while (endPos != -1) {
            vals.add(string.substring(startPos, endPos));
            startPos = endPos + 1;
            endPos = string.indexOf(token, startPos);
        }
        vals.add(string.substring(startPos, string.length()));
        return (String[]) vals.toArray(new String[vals.size()]);
    }

    private static BufferedReader getReader(URI filePath) throws MalformedURLException, IOException {
        BufferedReader reader;
        if (filePath.getScheme().equals("file")) {
            reader = new BufferedReader(new FileReader(new File(filePath)));
        } else {
            reader = new BufferedReader(new InputStreamReader(filePath.toURL().openConnection().getInputStream()));
        }
        return reader;
    }

    public static void main(String[] args) throws Exception {
        URI testURI = new URI("http://www.cnn.com/");
        // URI testURI = new URI("file:///W:/temp");
        // System.out.println(new File(testURI).exists());

        HttpURLConnection connecition = (HttpURLConnection) testURI.toURL().openConnection();
        System.out.println(connecition.getResponseCode());

        System.out.println(testURI.resolve("TEST").toString());
    }

    private static boolean verifyTableExists(URI tableURI) {
        // if its a file, check if it exists
        if (tableURI.getScheme().equals("file")) {
            return new File(tableURI).exists();
        }
        // otherwise, just try to connect..
        // TODO: find a better way to check this instead of catching an
        // exception.
        else {
            try {
                tableURI.toURL().openConnection();
            } catch (Exception e) {
                return false;
            }
            return true;
        }
    }
}