Java tutorial

The class below, DataQuery from Cloudera's RecordBreaker analyzer, handles Hive data importation and query processing. It assumes a HiveServer2 instance is reachable on localhost:10000 and an Impala daemon on localhost:21050, and it falls back from Impala to Hive when a query fails.
/*
 * Copyright (c) 2012, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.analyzer;

import java.sql.SQLException;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.Statement;
import java.sql.DriverManager;

import org.apache.hadoop.fs.permission.FsPermission;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;

import java.util.Map;
import java.util.Set;
import java.util.List;
import java.util.Random;
import java.util.HashMap;
import java.util.HashSet;
import java.util.ArrayList;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.avro.Schema;

/*******************************************************
 * DataQuery handles Hive data-importation and query processing.
 * Assumes Hive is running locally at port 10000.
 *
 * @author "Michael Cafarella" <mjc@lofie.local>
 * @version 1.0
 * @since 1.0
 ********************************************************/
public class DataQuery implements Serializable {
  private static final Log LOG = LogFactory.getLog(DataQuery.class);
  private static boolean inited = false;
  private static DataQuery dataQuery;
  private static String hiveDriverName = "org.apache.hive.jdbc.HiveDriver";
  private static String impalaDriverName = "org.apache.hive.jdbc.HiveDriver";
  private static String tmpTablesDir = "/tmp/tmptables";

  String hiveConnectString;
  String impalaConnectString;
  Configuration conf;
  Connection hiveCon;
  Connection impalaCon;
  HiveTableCache tableCache;
  Random r = new Random();
  Map<Path, String> tables;
  Set<Path> isLoaded;

  public synchronized static DataQuery getInstance() {
    return DataQuery.getInstance(false);
  }

  public synchronized static DataQuery getInstance(boolean force) {
    if (force && dataQuery != null) {
      try {
        dataQuery.close();
      } catch (SQLException sqe) {
      }
      dataQuery = null;
    }
    if (force || (!inited)) {
      try {
        dataQuery = new DataQuery();
      } catch (SQLException se) {
        se.printStackTrace();
      } finally {
        inited = true;
      }
    }
    return dataQuery;
  }

  public DataQuery() throws SQLException {
    try {
      this.conf = new Configuration();
      Class.forName(hiveDriverName);
      Class.forName(impalaDriverName);
      this.hiveConnectString = conf.get("hive.connectstring", "jdbc:hive2://localhost:10000/default");
      this.impalaConnectString = conf.get("impala.connectstring", "jdbc:hive2://localhost:21050/;auth=noSasl");
      LOG.info("Hive connect string: " + hiveConnectString);
      LOG.info("Impala connect string: " + impalaConnectString);
      this.tableCache = new HiveTableCache();
      try {
        this.hiveCon = DriverManager.getConnection(hiveConnectString, "cloudera", "cloudera");
      } catch (Exception ex) {
        ex.printStackTrace();
      }
      this.impalaCon = DriverManager.getConnection(impalaConnectString, "cloudera", "cloudera");
    } catch (ClassNotFoundException e) {
      e.printStackTrace();
    } catch (Exception ex) {
      ex.printStackTrace();
    }

    // Force Impala to refresh metadata
    if (impalaCon != null) {
      Statement stmt = impalaCon.createStatement();
      try {
        try {
          LOG.info("Rebuilding Impala metadata...");
          stmt.execute("INVALIDATE METADATA");
        } catch (Exception iex) {
          LOG.info("Impala metadata rebuild failed: " + iex.toString());
        }
      } finally {
        stmt.close();
      }
    }

    // Misc data structures
    this.tables = new HashMap<Path, String>();
    this.isLoaded = new HashSet<Path>();
  }

  public void close() throws SQLException {
    if (hiveCon != null) {
      this.hiveCon.close();
    }
    this.hiveCon = null;
    if (impalaCon != null) {
      this.impalaCon.close();
    }
    this.impalaCon = null;
  }

  /**
   * Connection string for Hive
   */
  public String getHiveConnectionString() {
    return hiveConnectString;
  }

  /**
   * Run a sample set of Hive test queries to check whether the Hive server is up and active
   */
  public boolean testQueryServer() {
    if (hiveCon == null) {
      return false;
    }
    try {
      //
      // Create table
      //
      String tablename = "test_datatable" + Math.abs(r.nextInt());
      Statement stmt = hiveCon.createStatement();
      try {
        stmt.execute("CREATE TABLE " + tablename + "(a int, b int, c int)");
      } finally {
        stmt.close();
      }

      //
      // Drop table
      //
      stmt = hiveCon.createStatement();
      try {
        stmt.execute("DROP TABLE " + tablename);
      } finally {
        stmt.close();
      }
      return true;
    } catch (Exception ex) {
      ex.printStackTrace();
      return false;
    }
  }

  String grabTable(DataDescriptor desc) throws SQLException, IOException {
    // Set up Hive table
    Path p = desc.getFilename();
    String tablename = tableCache.get(p);
    if (tablename == null) {
      tablename = "datatable" + Math.abs(r.nextInt());
      Statement stmt = hiveCon.createStatement();
      try {
        String creatTxt = desc.getHiveCreateTableStatement(tablename);
        LOG.info("Create: " + creatTxt);
        stmt.execute(creatTxt);
        tables.put(p, tablename);
      } finally {
        stmt.close();
      }

      // Copy avro version of data into secret location prior to Hive import
      FileSystem fs = FileSystem.get(conf);
      Path tmpTables = new Path(tmpTablesDir);
      if (!fs.exists(tmpTables)) {
        fs.mkdirs(tmpTables, new FsPermission("-rwxrwxrwx"));
      }
      Path secretDst = new Path(tmpTables, "r" + r.nextInt());
      LOG.info("Preparing Avro data at " + secretDst);
      desc.prepareAvroFile(fs, fs, secretDst, conf);
      fs.setPermission(secretDst, new FsPermission("-rwxrwxrwx"));

      // Import data
      stmt = hiveCon.createStatement();
      try {
        LOG.info("Import data into Hive: " + desc.getHiveImportDataStatement(tablename, secretDst));
        stmt.execute(desc.getHiveImportDataStatement(tablename, secretDst));
        isLoaded.add(p);
      } finally {
        stmt.close();
      }

      // Refresh impala metadata
      stmt = impalaCon.createStatement();
      try {
        try {
          LOG.info("Rebuilding Impala metadata...");
          stmt.execute("INVALIDATE METADATA");
        } catch (Exception iex) {
          LOG.info("Impala metadata rebuild failed: " + iex.toString());
        }
      } finally {
        stmt.close();
      }

      // Insert into table cache
      tableCache.put(p, tablename);
    }
    return tablename;
  }

  public List<List<Object>> query(DataDescriptor desc1, DataDescriptor desc2,
                                  String projectionClause, String selectionClause)
      throws SQLException, IOException {
    String tablename1 = grabTable(desc1);
    String tablename2 = null;
    if (desc2 != null) {
      tablename2 = grabTable(desc2);
    }

    //
    // Build the SQL query against the table
    //
    if (projectionClause == null || projectionClause.trim().length() == 0) {
      projectionClause = "*";
    }
    if (selectionClause == null) {
      selectionClause = "";
    }
    if (tablename2 == null) {
      projectionClause = projectionClause.replaceAll("DATA", tablename1);
      selectionClause = selectionClause.replaceAll("DATA", tablename1);
    }
    projectionClause = projectionClause.trim();
    selectionClause = selectionClause.trim();

    String query;
    if (tablename2 == null) {
      query = "SELECT " + projectionClause + " FROM " + tablename1;
    } else {
      query = "SELECT " + projectionClause + " FROM " + tablename1 + " DATA1" + ", " + tablename2 + " DATA2";
    }
    if (selectionClause.length() > 0) {
      query = query + " WHERE " + selectionClause;
    }

    //
    // Try to run it first with the impala connection.
    // If that fails, try hive.
    //
    List<List<Object>> result = new ArrayList<List<Object>>();
    Statement stmt = impalaCon.createStatement();
    LOG.info("Processing: " + query);
    try {
      ResultSet res = null;
      try {
        res = stmt.executeQuery(query);
        LOG.info("Ran Impala query: " + query);
      } catch (Exception iex) {
        iex.printStackTrace();
        // Fail back to Hive!
        stmt.close();
        stmt = hiveCon.createStatement();
        res = stmt.executeQuery(query);
        LOG.info("Ran Hive query: " + query);
      }

      // OK now do the real work
      ResultSetMetaData rsmd = res.getMetaData();
      List<Object> metatuple = new ArrayList<Object>();
      for (int i = 1; i <= rsmd.getColumnCount(); i++) {
        metatuple.add(rsmd.getColumnLabel(i));
      }
      result.add(metatuple);

      while (res.next()) {
        List<Object> tuple = new ArrayList<Object>();
        for (int i = 1; i <= rsmd.getColumnCount(); i++) {
          tuple.add(res.getObject(i));
        }
        result.add(tuple);
      }
      return result;
    } finally {
      stmt.close();
    }
  }
}
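For orientation, here is a minimal usage sketch. It is not part of the RecordBreaker sources: the example class, its method name, and the way the DataDescriptor is obtained are all hypothetical, and it assumes the Hive and Impala endpoints described above are reachable.

package com.cloudera.recordbreaker.analyzer;  // same package as DataQuery, for simplicity

import java.io.IOException;
import java.sql.SQLException;
import java.util.List;

public class DataQueryExample {
  /**
   * Runs a single-table query against the Hive/Impala table generated for "desc".
   * The caller supplies a real DataDescriptor from elsewhere in the analyzer code.
   */
  public static void printAll(DataDescriptor desc) throws SQLException, IOException {
    DataQuery dq = DataQuery.getInstance();          // lazily-created singleton
    if (dq == null || !dq.testQueryServer()) {
      throw new IllegalStateException("Hive server is not reachable");
    }
    // Passing null for the second descriptor yields a single-table query;
    // null/empty clauses default to "SELECT * ... " with no WHERE clause.
    List<List<Object>> rows = dq.query(desc, null, "*", null);
    System.out.println(rows.get(0));                 // first tuple holds the column labels
    for (int i = 1; i < rows.size(); i++) {
      System.out.println(rows.get(i));               // remaining tuples are data rows
    }
  }
}

Two details worth noting from the code above: for a single-descriptor query the placeholder "DATA" in the projection and selection clauses is rewritten to the generated table name, while a two-descriptor query exposes the tables under the aliases DATA1 and DATA2; and the JDBC endpoints can be overridden through the Hadoop Configuration keys hive.connectstring and impala.connectstring.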