Java tutorial
/******************************************************************************* * relationalcloud.com * * Project Info: http://relationalcloud.com * Project Members: Carlo Curino <carlo.curino@gmail.com> * Evan Jones <ej@evanjones.ca> * Yang Zhang <yaaang@gmail.com> * Sam Madden <madden@csail.mit.edu> * This library is free software; you can redistribute it and/or modify it under the terms * of the GNU General Public License as published by the Free Software Foundation; * either version 3.0 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. ******************************************************************************/ package com.relationalcloud.partitioning.explanation; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.ArrayList; import java.util.Enumeration; import java.util.Properties; import weka.classifiers.Classifier; import weka.core.Attribute; import weka.core.Instance; import weka.core.Instances; import weka.filters.Filter; import weka.filters.unsupervised.attribute.NumericToNominal; import weka.filters.unsupervised.attribute.StringToNominal; import weka.filters.unsupervised.instance.Resample; import com.relationalcloud.tsqlparser.loader.Schema; import com.relationalcloud.partitioning.DecisionTree; import com.relationalcloud.partitioning.ExplanationWorkloadPrepocessor; import com.relationalcloud.partitioning.counters.SimpleCount; import com.relationalcloud.weka.util.WekaHelper; public class ExplanationHandler { public double correlationThreshold = 0.3; public Properties ini; public String schemaname; public String sampledtrainingtable; public String testingtable; public String numTxnsToExtractTemplates; public String txnLogTable; public String driver; public String connection; public String user; public String password; public String dbPropertyFile; public String pcol; public String explainedPartitionCol; public boolean doLabelTraining; public boolean doPopulateRouter; public ExplanationHandler(Properties ini) { this.ini = ini; // loading properties from file schemaname = ini.getProperty("schema"); sampledtrainingtable = ini.getProperty("sampledAccessLogTable"); testingtable = ini.getProperty("testingAccessLogTable"); numTxnsToExtractTemplates = ini.getProperty("Explanation.numTxnsToExtractTemplates"); txnLogTable = ini.getProperty("txnLogTable"); driver = ini.getProperty("driver"); connection = ini.getProperty("conn"); user = ini.getProperty("user"); password = ini.getProperty("password"); dbPropertyFile = ini.getProperty("dbPropertyFile"); doLabelTraining = Boolean.parseBoolean(ini.getProperty("Explanation.populateTraining")); // doPopulateRouter = // Boolean.parseBoolean(ini.getProperty("Explanation.populateRouter")); String partitioningMethod = ini.getProperty("partitioningMethod"); if (partitioningMethod.equals("repGraph")) pcol = ini.getProperty("replicatedPartitionCol"); else pcol = ini.getProperty("graphPartitionCol"); explainedPartitionCol = ini.getProperty("explainedPartitionCol"); } public ExplanationHandler(String schemaname, String conn, String us, String pwd) { this.ini = ini; // loading properties from file this.schemaname = schemaname; sampledtrainingtable = "metarelcloud_training_access_log"; testingtable = "metarelcloud_testing_access_log"; numTxnsToExtractTemplates = "2000"; txnLogTable = "metarelcloud_txn_log"; driver = "com.mysql.Driver"; connection = conn; user = us; password = pwd; dbPropertyFile = "config/WekaHelper.props"; doLabelTraining = false; String partitioningMethod = "repGraph"; pcol = "replicatedpartition"; explainedPartitionCol = "explainedpartition"; } public void labelTest(String tableProcessed, Classifier classifier, Connection conn) throws Exception { tableProcessed = removeQuotes(tableProcessed); DecisionTree t = DecisionTree.parse(classifier.toString().trim()); String sqlCase = t.toSqlCase(); String u = ""; u += "update " + testingtable + ", relcloud_" + tableProcessed + " as " + tableProcessed + " "; u += "set " + pcol + " = " + sqlCase + " where relcloud_id = tupleid;"; System.out.println(u); Statement stmt = conn.createStatement(); stmt.executeUpdate(u); if (doLabelTraining) { u = ""; u += "update " + sampledtrainingtable + ", relcloud_" + tableProcessed + " as " + tableProcessed + " "; u += "set " + pcol + " = " + sqlCase + " where relcloud_id = tupleid;"; stmt.executeUpdate(u); } if (doPopulateRouter) { u = ""; u += "insert into treerouter "; u += "(dbname, dbversion, tablename, tree) "; u += "values (?, ?, ?, ?);"; PreparedStatement ps = conn.prepareStatement(u); ps.setString(1, ini.getProperty("schema")); ps.setString(2, "1"); ps.setString(3, tableProcessed); ps.setString(4, t.toString()); ps.execute(); } } /** * Repeat the selection from the database removing duplicates, since they will * only increase the execution time. And run the tuples through the classifier * to populate the justifiedpartition column. * * @param tableProcessed * @param classifier * @param wa * @throws SQLException * @throws Exception */ public void populateJustifiedColumn(String tableProcessed, Classifier classifier, ArrayList<String> attributes, Connection conn, int numbPart, Enumeration enumclassvalues) throws SQLException, Exception { if (true) { labelTest(tableProcessed, classifier, conn); return; } tableProcessed = removeQuotes(tableProcessed); // get from the DB the tuples content and their partitioning column String sqlstring = "SELECT distinct g.tupleid, "; for (String sc : attributes) { sqlstring += "s." + sc + ", "; } sqlstring += "g." + pcol + " FROM " + "(SELECT distinct tupleid," + pcol + " FROM `" + testingtable + "` WHERE tableid = '" + tableProcessed + "') AS g, relcloud_" + tableProcessed + " AS s " + "WHERE s.relcloud_id = g.tupleid;"; System.out.println(sqlstring); Statement stmt = conn.createStatement(); // initializing the testing table to avoid complaints from classifier with // an hash partition like distribution if (!testingtable.equals(sampledtrainingtable)) { int i = 0; Object o = enumclassvalues.nextElement(); // set everything to an existing value to ensure that every field is // covered stmt.executeUpdate("UPDATE " + testingtable + " SET " + pcol + "=" + o + " WHERE tableid = '" + tableProcessed + "'"); // and than sparkly in a bunch of other values (unsure whether it is // required); while (enumclassvalues.hasMoreElements()) { o = enumclassvalues.nextElement(); // FIXME there might still be an issue in which tupleid%i do not exists, // and thus one of the "o" never appears in the instance... stmt.executeUpdate("UPDATE " + testingtable + " SET " + pcol + "=" + o + " WHERE tupleid%" + numbPart + "=" + i + " AND tableid = '" + tableProcessed + "'"); i++; } } ResultSet res = stmt.executeQuery(sqlstring); // create an instance from the resultset Instances data_tupleid = WekaHelper.retrieveInstanceFromResultSetComplete(res, dbPropertyFile); res.close(); data_tupleid.setClassIndex(data_tupleid.numAttributes() - 1); Instances data_no_tupleid = makeLastNominal(data_tupleid); data_no_tupleid.setClassIndex(data_no_tupleid.numAttributes() - 1); // remove tupleid from data_no_tupleid, still available in data_tupleid data_no_tupleid.deleteAttributeAt(0); // if(data_no_tupleid.classAttribute().numValues()>1){ System.out.println("Running the tuples through the classifier to populate " + explainedPartitionCol); // use data that still has the tupleid and newData for the classification Enumeration enum_data_tupleid = data_tupleid.enumerateInstances(); Enumeration enum_data_no_tupleid = data_no_tupleid.enumerateInstances(); PreparedStatement updateJustCol = conn.prepareStatement("UPDATE `" + testingtable + "` SET `" + explainedPartitionCol + "` = ? " + "WHERE tableid = '" + tableProcessed + "' AND tupleid = ?;"); while (enum_data_tupleid.hasMoreElements() && enum_data_no_tupleid.hasMoreElements()) { Instance tupIDinstance = (Instance) enum_data_tupleid.nextElement(); Instance instance = (Instance) enum_data_no_tupleid.nextElement(); double part = classifier.classifyInstance(instance); if (part == Instance.missingValue()) System.err.println("No classification for:" + instance.toString()); updateJustCol.setInt(1, (int) part); updateJustCol.setInt(2, (int) tupIDinstance.value(0)); // System.out.println(tableProcessed+" "+ instance.value(0) + " " + // tupIDinstance.classValue() +" "+ part); updateJustCol.execute(); updateJustCol.clearParameters(); } updateJustCol.close(); } public void populateExplainedColumn(String tableProcessed, int partitionvalue, ArrayList<String> attributesForPopulation, Connection conn) { System.out.println( "The entire table is in one partition: " + partitionvalue + " populating " + explainedPartitionCol); try { Statement updateJustCol = conn.createStatement(); updateJustCol.execute("UPDATE `" + testingtable + "` SET `" + explainedPartitionCol + "` = '" + partitionvalue + "' WHERE tableid = '" + tableProcessed + "';"); if (doLabelTraining) { updateJustCol.execute("UPDATE `" + sampledtrainingtable + "` SET `" + explainedPartitionCol + "` = '" + partitionvalue + "' WHERE tableid = '" + tableProcessed + "';"); } updateJustCol.close(); } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * Train the given classifier * * @param newData * @param classifier * @throws Exception */ public static void trainClassifier(Instances newData, Classifier classifier) throws Exception { // if the class attributed is not unary we proceed regularly // verify the Classifier can handle this dataset classifier.getCapabilities().testWithFail(newData); System.out.println("BUILDING CLASSIFIER ON INSTANCE:" + newData.toSummaryString()); long treeTstart = System.currentTimeMillis(); classifier.buildClassifier(newData); // build classifier long treeTend = System.currentTimeMillis(); System.out.println("CLASSIFIER BUILDING TIME: " + (treeTend - treeTstart) + "ms FROM: " + newData.numInstances() + " instances \n" + classifier.toString()); } /** * Applies sample training to reduce the input Instances to a specified size. * * @param classifier_sampling_threshold * @param newData1 * @return * @throws Exception */ public static Instances sampleTraining(Double classifier_sampling_threshold, Instances newData1) throws Exception { String[] options; Instances newData; if (newData1.numInstances() > classifier_sampling_threshold) { double percentage = (double) 100 * ((double) classifier_sampling_threshold) / ((double) newData1.numInstances()); Resample r = new Resample(); options = new String[4]; options[0] = "-C"; options[1] = "last"; options[2] = "-Z"; options[3] = "" + percentage; r.setOptions(options); r.setInputFormat(newData1); newData = Filter.useFilter(newData1, r); } else { newData = newData1; } return newData; } /** * Invokes filter to transform last parameter into a Nominal * * @param data * @return * @throws Exception */ public static Instances makeLastNominal(Instances data) throws Exception { Instances newData; if (data.attribute(data.numAttributes() - 1).type() == Attribute.NUMERIC) { NumericToNominal ntn = new NumericToNominal(); String[] options = new String[2]; options[0] = "-R"; // "range" options[1] = "last"; // first attribute ntn.setOptions(options); // set options ntn.setInputFormat(data); // inform filter about dataset // **AFTER** setting options newData = Filter.useFilter(data, ntn); // apply fil } else { StringToNominal ntn = new StringToNominal(); String[] options = new String[2]; options[0] = "-R"; // "range" options[1] = "last"; // first attribute ntn.setOptions(options); // set options ntn.setInputFormat(data); // inform filter about dataset // **AFTER** setting options newData = Filter.useFilter(data, ntn); // apply fil } return newData; } /** * Fetch from the database the content of the table and the partition lables, * and prepares a Weka Instance by sampling and cleaning it * * @param tableProcessed * @param arraySc * @param conn * @return */ public Instances generateInstancesForTable(String tabname, ArrayList<SimpleCount> arraySc, Connection conn) { tabname = removeQuotes(tabname); Statement stmt; try { stmt = conn.createStatement(); ResultSet test = stmt .executeQuery("SELECT count(*) FROM " + sampledtrainingtable + " WHERE " + pcol + " is null"); // safety check, verifies that there are no nulls in input table. if (test.next() && test.getInt(1) > 0) throw new Exception("Table " + sampledtrainingtable + " contains nulls in " + pcol); // get from the DB the tuples content and their partitioning column String sqlstring = "SELECT "; for (SimpleCount sc : arraySc) { sqlstring += "s." + sc.colname + ", "; } sqlstring += "g." + pcol + " FROM " + "(SELECT tupleid," + pcol + " FROM `" + sampledtrainingtable + "` WHERE tableid = '" + tabname + "') AS g, relcloud_" + tabname + " AS s " + "WHERE s.relcloud_id = g.tupleid"; System.out.println(sqlstring); ResultSet res = stmt.executeQuery(sqlstring); // create an instance from the resultset Instances data = WekaHelper.retrieveInstanceFromResultSetComplete(res, dbPropertyFile); res.close(); // prepare the data, by setting class attributed and sampling if required data = makeLastNominal(data); data.setClassIndex(data.numAttributes() - 1); data = sampleTraining(Double.parseDouble(ini.getProperty("Explanation.j48SamplingThreshold")), data); System.out.println(data.toSummaryString()); return data; } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; } public Instances generateInstancesForTableDemo(String tabname, ArrayList<SimpleCount> arraySc, Connection conn) { tabname = removeQuotes(tabname); Statement stmt; try { stmt = conn.createStatement(); // get from the DB the tuples content and their partitioning column String sqlstring = "SELECT tableid,g.tupleid,"; for (SimpleCount sc : arraySc) { sqlstring += "s." + sc.colname + ", "; } sqlstring += "0 as partitionLabel" + " FROM " + "(SELECT tableid,tupleid," + pcol + " FROM `" + sampledtrainingtable + "` WHERE tableid = '" + tabname + "') AS g, relcloud_" + tabname + " AS s " + "WHERE s.relcloud_id = g.tupleid"; System.out.println(sqlstring); ResultSet res = stmt.executeQuery(sqlstring); // create an instance from the resultset Instances data = WekaHelper.retrieveInstanceFromResultSetComplete(res, dbPropertyFile); res.close(); // prepare the data, by setting class attributed and sampling if required //data = sampleTraining(Double.parseDouble("2000"), data); System.out.println(data.toSummaryString()); return data; } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; } private String removeQuotes(String tabname) { if (tabname.startsWith("`") && tabname.endsWith("`")) tabname = tabname.substring(1, tabname.length() - 1); return tabname; } /** * This helper method is used to analyze the transactions in the workload and * extract frequencies of tables and attributes * * @param txnLogTable * @param numb_trans_to_process * @param schemaname * @param conn * @return * @throws SQLException */ public static ExplanationWorkloadPrepocessor analyzeWorkload(String txnLogTable, String numb_trans_to_process, String schemaname, Connection conn, Schema schema) throws SQLException { conn.setAutoCommit(true); Statement stmt = conn.createStatement(); // NOTE: the paramenter numb_trans_to_process is used to limit // the number of transactions parsed to determine the which attributes // are common in the workload WHERE clauses. This can be a subset of the // overall set String sqlstring = "SELECT sqlstring FROM `" + txnLogTable + "` LIMIT " + numb_trans_to_process; long t1start = System.currentTimeMillis(); ResultSet res = stmt.executeQuery(sqlstring); long t1end = System.currentTimeMillis(); System.out.println(numb_trans_to_process + " statements fetched from DB in " + (t1end - t1start) + "ms"); ExplanationWorkloadPrepocessor wa = new ExplanationWorkloadPrepocessor(schemaname, schema); double tstart = System.currentTimeMillis(); double i = 0; while (res.next()) { String sql = res.getString(1); // PARSE THE STATEMENT wa.processSql(sql); i++; } res.close(); double tend = System.currentTimeMillis(); System.out.println("Processed " + i + " statements in " + (tend - tstart) + "ms average:" + (tend - tstart) / i + "ms per statement"); // System.out.println("ANALISYS RESULTS:\n "); // wa.printStatsByTableColumn(); return wa; } public void populateHashPartition(Connection conn) throws SQLException { Statement stmt = conn.createStatement(); int numPart = Integer.parseInt(ini.getProperty("numPartitions")); String pcol = ini.getProperty("hashPartitionCol"); stmt.executeUpdate("UPDATE " + testingtable + " SET " + pcol + "=tupleid%" + numPart); } public void populateReplicatedPartition(Connection conn, boolean defaultReplicate) throws SQLException { if (!testingtable.equals(sampledtrainingtable)) { Statement stmt = conn.createStatement(); int repPartitionId = Integer.parseInt(ini.getProperty("repPartitionId")); int numPart = Integer.parseInt(ini.getProperty("numPartitions")); String pcol = ini.getProperty("replicatedPartitionCol"); // set by default to replication if (defaultReplicate) stmt.executeUpdate("UPDATE " + testingtable + " SET " + pcol + "= " + repPartitionId); else stmt.executeUpdate("UPDATE " + testingtable + " SET " + pcol + "=tupleid%" + numPart); stmt.executeUpdate("UPDATE " + testingtable + " t , " + sampledtrainingtable + " s SET t." + pcol + "= s." + pcol + " WHERE t.tableid=s.tableid AND t.tupleid=s.tupleid"); } } }