mitll.xdata.dataset.kiva.ingest.KivaIngest.java Source code

Java tutorial

Introduction

Here is the source code for mitll.xdata.dataset.kiva.ingest.KivaIngest.java

Source

// Copyright 2013 MIT Lincoln Laboratory, Massachusetts Institute of Technology 
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mitll.xdata.dataset.kiva.ingest;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;

public class KivaIngest {
    public static Map<String, String> TYPE_TO_DB = new HashMap<String, String>();
    static {
        TYPE_TO_DB.put("INTEGER", "INT");
        TYPE_TO_DB.put("STRING", "VARCHAR");
        TYPE_TO_DB.put("DATE", "TIMESTAMP");
        TYPE_TO_DB.put("REAL", "DOUBLE");
        TYPE_TO_DB.put("BOOLEAN", "BOOLEAN");
    }

    /*    public static Object[] getColumnSpecs(String schema) {
    List<String> names = new ArrayList<String>();
    List<String> types = new ArrayList<String>();
    String[] columns = schema.split(";");
    for (int i = 0; i < columns.length; i++) {
        int index = columns[i].indexOf(":");
        names.add(columns[i].substring(0, index).trim());
        types.add(columns[i].substring(index + 1).trim());
    }
    return new Object[] { names, types };
        }*/

    public static Object[] processSchema(String filename) throws Exception {
        // teams:category: [String]
        // teams:image:id: [Integer]

        String schema = FileUtils.readFileToString(new File(filename), "UTF-8").trim();

        List<String> names = new ArrayList<String>();
        List<String> types = new ArrayList<String>();

        String regex = "([a-zA-Z_0-9:\\[\\]]+):\\s+\\[([a-zA-Z_0-9]+)\\]";
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(schema);

        while (matcher.find()) {
            String type = matcher.group(2).trim();
            String name = matcher.group(1).trim();

            // replace empty [] with nothing
            name = name.replaceAll("\\[\\]", "");

            // strip off final colons
            name = name.replaceAll(":+$", "");

            // replace internal ":" with "_"
            name = name.replaceAll(":+", "_");

            names.add(name);
            types.add(type);
        }

        return new Object[] { names, types };
    }

    public static String createCreateSQL(String tableName, List<String> names, List<String> types) {
        String sql = "CREATE TABLE " + tableName + " (" + "\n";
        for (int i = 0; i < names.size(); i++) {
            sql += (i > 0 ? ",\n  " : "  ") + names.get(i) + " " + TYPE_TO_DB.get(types.get(i).toUpperCase());
        }
        sql += "\n);";
        return sql;
    }

    public static String createInsertSQL(String tableName, List<String> names) {
        String sql = "INSERT INTO " + tableName + " (";
        for (int i = 0; i < names.size(); i++) {
            sql += (i > 0 ? ", " : "") + names.get(i);
        }
        sql += ") VALUES (";
        for (int i = 0; i < names.size(); i++) {
            sql += (i > 0 ? ", " : "") + "?";
        }
        sql += ");";
        return sql;
    }

    public static int executePreparedStatement(PreparedStatement statement, List<String> types, List<String> values)
            throws Exception {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss Z");
        try {
            for (int i = 0; i < types.size(); i++) {
                String type = TYPE_TO_DB.get(types.get(i).toUpperCase());
                String value = values.get(i);
                if (value != null && value.trim().length() == 0) {
                    value = null;
                }
                if (type.equalsIgnoreCase("INT")) {
                    if (value == null) {
                        statement.setNull(i + 1, java.sql.Types.INTEGER);
                    } else {
                        statement.setInt(i + 1, Integer.parseInt(value, 10));
                    }
                } else if (type.equalsIgnoreCase("DOUBLE")) {
                    if (value == null) {
                        statement.setNull(i + 1, java.sql.Types.DOUBLE);
                    } else {
                        statement.setDouble(i + 1, Double.parseDouble(value));
                    }
                } else if (type.equalsIgnoreCase("BOOLEAN")) {
                    if (value == null) {
                        statement.setNull(i + 1, java.sql.Types.BOOLEAN);
                    } else {
                        statement.setBoolean(i + 1, Boolean.parseBoolean(value));
                    }
                } else if (type.equalsIgnoreCase("VARCHAR")) {
                    statement.setString(i + 1, value);
                } else if (type.equalsIgnoreCase("TIMESTAMP")) {
                    if (value == null) {
                        statement.setNull(i + 1, java.sql.Types.TIMESTAMP);
                    } else {
                        statement.setTimestamp(i + 1, new Timestamp(sdf.parse(value).getTime()));
                    }
                }
            }
        } catch (Throwable e) {
            System.out.println("types = " + types);
            System.out.println("values = " + values);
            System.out.println("types.size() = " + types.size());
            System.out.println("values.size() = " + values.size());
            e.printStackTrace();
            System.out.println(e.getMessage());
            throw new Exception(e);
        }
        return statement.executeUpdate();
    }

    /**
     * Assumes at least one field.
     */
    public static List<String> split(String s, String separator) {
        List<String> fields = new ArrayList<String>();
        int i = 0;
        // add fields up to last separator
        while (i < s.length()) {
            int index = s.indexOf(separator, i);
            if (index < 0) {
                break;
            }
            fields.add(s.substring(i, index));
            i = index + 1;
        }
        // add field after last separator
        fields.add(s.substring(i, s.length()));
        return fields;
    }

    @SuppressWarnings("unchecked")
    public static void loadTable(String tableName, String schemaFilename, String dataFilename) throws Exception {
        Object[] temp = processSchema(schemaFilename);
        List<String> names = (List<String>) temp[0];
        List<String> types = (List<String>) temp[1];

        Class.forName("org.h2.Driver");
        Connection connection = DriverManager.getConnection("jdbc:h2:tcp://localhost//h2data/kiva/kiva", "sa", "");

        String createSQL = createCreateSQL(tableName, names, types);
        String insertSQL = createInsertSQL(tableName, names);

        PreparedStatement statement = connection.prepareStatement(createSQL);
        statement.executeUpdate();
        statement.close();

        statement = connection.prepareStatement(insertSQL);

        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(dataFilename), "UTF-8"));
        String line = null;
        int count = 0;
        long t0 = System.currentTimeMillis();
        while ((line = br.readLine()) != null) {
            count++;
            List<String> values = split(line, "\t");
            executePreparedStatement(statement, types, values);
            if (count % 10000 == 0) {
                System.out.println(
                        "count = " + count + "; " + (System.currentTimeMillis() - 1.0 * t0) / count + " ms/insert");
            }
        }
        br.close();

        statement.close();

        long t1 = System.currentTimeMillis();
        System.out.println("total count = " + count);
        System.out.println("total time = " + ((t1 - t0) / 1000.0) + " s");
        System.out.println((t1 - 1.0 * t0) / count + " ms/insert");
        System.out.println((1000.0 * count / (t1 - 1.0 * t0)) + " inserts/s");
    }

    /*    public static void testTabs(String dataFilename) throws Exception {
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(dataFilename), "UTF-8"));
    String line = null;
    int count = 0;
    int firstNumTabs = -1;
    while ((line = br.readLine()) != null) {
        count++;
        int numTabs = 0;
        boolean lastTabLastCharacter = false;
        for (int i = 0; i < line.length(); i++) {
            if (line.charAt(i) == '\t') {
                numTabs++;
                if (i == line.length() - 1) {
                    lastTabLastCharacter = true;
                }
            }
        }
        if (firstNumTabs == -1) {
            firstNumTabs = numTabs;
            System.out.println("firstNumTabs = " + firstNumTabs);
        }
        if (numTabs != firstNumTabs) {
            System.out.println("line " + count + ": numTabs = " + numTabs);
        }
        if (lastTabLastCharacter) {
            System.out.println("line " + count + ": lastTabLastCharacter = " + lastTabLastCharacter);
            break;
        }
        if (count % 50000 == 0) {
            System.out.println("count = " + count);
        }
    }
    br.close();
        }*/

    public static void main(String[] args) throws Exception {
        // System.out.println(split("\ta\tb\t\tc\t", "\t"));

        String tableName = "loanJournalEntriesLinks";
        String schemaFilename = "kiva_schemas/" + tableName + ".schema";
        String dataFilename = "C:/Users/user/Desktop/xdata/datasets/kiva/" + tableName + ".csv";
        loadTable(tableName, schemaFilename, dataFilename);

        System.out.println("done");
    }
}