/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.parser;

import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.antlr.runtime.tree.Tree;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;

/**
 * SemanticAnalyzer.
 */
public abstract class SemanticAnalyzer {

  public static String charSetString(String charSetName, String charSetString)
      throws SemanticException {
    try {
      // The character set name starts with a _, so strip that
      charSetName = charSetName.substring(1);
      if (charSetString.charAt(0) == '\'') {
        return new String(unescapeSQLString(charSetString).getBytes(), charSetName);
      } else { // hex input is also supported
        assert charSetString.charAt(0) == '0';
        assert charSetString.charAt(1) == 'x';
        charSetString = charSetString.substring(2);

        byte[] bArray = new byte[charSetString.length() / 2];
        int j = 0;
        for (int i = 0; i < charSetString.length(); i += 2) {
          int val = Character.digit(charSetString.charAt(i), 16) * 16
              + Character.digit(charSetString.charAt(i + 1), 16);
          if (val > 127) {
            val = val - 256;
          }
          bArray[j++] = (byte) val;
        }

        return new String(bArray, charSetName);
      }
    } catch (UnsupportedEncodingException e) {
      throw new SemanticException(e);
    }
  }

  /**
   * Remove the encapsulating "`" pair from the identifier. We allow users to
   * use "`" to escape identifiers for table names, column names and aliases,
   * in case they coincide with Hive language keywords.
   */
  public static String unescapeIdentifier(String val) {
    if (val == null) {
      return null;
    }
    if (val.charAt(0) == '`' && val.charAt(val.length() - 1) == '`') {
      val = val.substring(1, val.length() - 1);
    }
    return val;
  }
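  // --- Illustrative sketch (editor's addition): the method below is hypothetical and
  // only demonstrates the expected behavior of charSetString and unescapeIdentifier;
  // it is not part of the original class. ---
  private static void charSetExamples() throws SemanticException {
    // Hex form: "0x" followed by two hex digits per byte, decoded with the named
    // charset. The leading "_" on the charset name is produced by the lexer.
    String hello = charSetString("_utf8", "0x68656C6C6F");  // -> "hello"
    // Quoted form: the literal is unescaped first, then decoded with the charset.
    String world = charSetString("_utf8", "'world'");       // -> "world"
    // Backquoted identifiers have their enclosing backticks stripped.
    String keyword = unescapeIdentifier("`select`");        // -> "select"
  }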
  /**
   * Converts parsed key/value properties pairs into a map.
   *
   * @param prop ASTNode parent of the key/value pairs
   * @param mapProp property map which receives the mappings
   */
  public static void readProps(ASTNode prop, Map<String, String> mapProp) {
    for (int propChild = 0; propChild < prop.getChildCount(); propChild++) {
      String key = unescapeSQLString(prop.getChild(propChild).getChild(0).getText());
      String value = null;
      if (prop.getChild(propChild).getChild(1) != null) {
        value = unescapeSQLString(prop.getChild(propChild).getChild(1).getText());
      }
      mapProp.put(key, value);
    }
  }

  private static final int[] multiplier = new int[] {1000, 100, 10, 1};

  @SuppressWarnings("nls")
  public static String unescapeSQLString(String b) {
    Character enclosure = null;

    // Some of the strings can be passed in as unicode. For example, the
    // delimiter can be passed in as \002, so we first check if the string
    // is a unicode number, else go back to the old behavior.
    StringBuilder sb = new StringBuilder(b.length());
    for (int i = 0; i < b.length(); i++) {
      char currentChar = b.charAt(i);
      if (enclosure == null) {
        if (currentChar == '\'' || currentChar == '\"') {
          enclosure = currentChar;
        }
        // ignore all other chars outside the enclosure
        continue;
      }

      if (enclosure.equals(currentChar)) {
        enclosure = null;
        continue;
      }

      if (currentChar == '\\' && (i + 6 < b.length()) && b.charAt(i + 1) == 'u') {
        int code = 0;
        int base = i + 2;
        for (int j = 0; j < 4; j++) {
          int digit = Character.digit(b.charAt(j + base), 16);
          code += digit * multiplier[j];
        }
        sb.append((char) code);
        i += 5;
        continue;
      }

      if (currentChar == '\\' && (i + 4 < b.length())) {
        char i1 = b.charAt(i + 1);
        char i2 = b.charAt(i + 2);
        char i3 = b.charAt(i + 3);
        if ((i1 >= '0' && i1 <= '1') && (i2 >= '0' && i2 <= '7') && (i3 >= '0' && i3 <= '7')) {
          byte bVal = (byte) ((i3 - '0') + ((i2 - '0') * 8) + ((i1 - '0') * 8 * 8));
          byte[] bValArr = new byte[1];
          bValArr[0] = bVal;
          String tmp = new String(bValArr);
          sb.append(tmp);
          i += 3;
          continue;
        }
      }

      if (currentChar == '\\' && (i + 2 < b.length())) {
        char n = b.charAt(i + 1);
        switch (n) {
          case '0':
            sb.append("\0");
            break;
          case '\'':
            sb.append("'");
            break;
          case '"':
            sb.append("\"");
            break;
          case 'b':
            sb.append("\b");
            break;
          case 'n':
            sb.append("\n");
            break;
          case 'r':
            sb.append("\r");
            break;
          case 't':
            sb.append("\t");
            break;
          case 'Z':
            sb.append("\u001A");
            break;
          case '\\':
            sb.append("\\");
            break;
          // The following 2 cases are exactly what MySQL does. TODO: why do we do this?
          case '%':
            sb.append("\\%");
            break;
          case '_':
            sb.append("\\_");
            break;
          default:
            sb.append(n);
        }
        i++;
      } else {
        sb.append(currentChar);
      }
    }
    return sb.toString();
  }
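  // --- Illustrative sketch (editor's addition): hypothetical method showing what
  // unescapeSQLString produces for the escape forms handled above. ---
  private static void unescapeExamples() {
    // Simple escape inside a quoted literal: the outer quotes are dropped and
    // \n becomes a real newline.
    String s1 = unescapeSQLString("'a\\nb'");     // -> "a" + '\n' + "b"
    // Octal escape: \001 decodes to the single byte 0x01, a common field delimiter.
    String s2 = unescapeSQLString("'\\001'");     // -> one char, U+0001
    // Note the unicode branch above combines its four digits with DECIMAL weights
    // (1000/100/10/1), so the four digits 0, 0, 6, 5 decode to (char) 65, i.e. 'A'.
    String s3 = unescapeSQLString("'\\u0065'");   // -> "A"
    // MySQL-compatible: \% and \_ stay escaped, for later use in LIKE patterns.
    String s4 = unescapeSQLString("'100\\%'");    // -> backslash kept: 100\%
  }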
  /**
   * Get the list of FieldSchema out of the ASTNode.
   */
  public static List<FieldSchema> getColumns(ASTNode ast, boolean lowerCase)
      throws SemanticException {
    List<FieldSchema> colList = new ArrayList<FieldSchema>();
    int numCh = ast.getChildCount();
    for (int i = 0; i < numCh; i++) {
      FieldSchema col = new FieldSchema();
      ASTNode child = (ASTNode) ast.getChild(i);
      Tree grandChild = child.getChild(0);
      if (grandChild != null) {
        String name = grandChild.getText();
        if (lowerCase) {
          name = name.toLowerCase();
        }
        // child 0 is the name of the column
        col.setName(unescapeIdentifier(name));
        // child 1 is the type of the column
        ASTNode typeChild = (ASTNode) (child.getChild(1));
        col.setType(getTypeStringFromAST(typeChild));
        // child 2 is the optional comment of the column
        if (child.getChildCount() == 3) {
          col.setComment(unescapeSQLString(child.getChild(2).getText()));
        }
      }
      colList.add(col);
    }
    return colList;
  }

  protected static String getTypeStringFromAST(ASTNode typeNode) throws SemanticException {
    switch (typeNode.getType()) {
      case SparkSqlParser.TOK_LIST:
        return serdeConstants.LIST_TYPE_NAME + "<"
            + getTypeStringFromAST((ASTNode) typeNode.getChild(0)) + ">";
      case SparkSqlParser.TOK_MAP:
        return serdeConstants.MAP_TYPE_NAME + "<"
            + getTypeStringFromAST((ASTNode) typeNode.getChild(0)) + ","
            + getTypeStringFromAST((ASTNode) typeNode.getChild(1)) + ">";
      case SparkSqlParser.TOK_STRUCT:
        return getStructTypeStringFromAST(typeNode);
      case SparkSqlParser.TOK_UNIONTYPE:
        return getUnionTypeStringFromAST(typeNode);
      default:
        return getTypeName(typeNode);
    }
  }

  private static String getStructTypeStringFromAST(ASTNode typeNode) throws SemanticException {
    String typeStr = serdeConstants.STRUCT_TYPE_NAME + "<";
    typeNode = (ASTNode) typeNode.getChild(0);
    int children = typeNode.getChildCount();
    if (children <= 0) {
      throw new SemanticException("empty struct not allowed.");
    }
    StringBuilder buffer = new StringBuilder(typeStr);
    for (int i = 0; i < children; i++) {
      ASTNode child = (ASTNode) typeNode.getChild(i);
      buffer.append(unescapeIdentifier(child.getChild(0).getText())).append(":");
      buffer.append(getTypeStringFromAST((ASTNode) child.getChild(1)));
      if (i < children - 1) {
        buffer.append(",");
      }
    }
    buffer.append(">");
    return buffer.toString();
  }

  private static String getUnionTypeStringFromAST(ASTNode typeNode) throws SemanticException {
    String typeStr = serdeConstants.UNION_TYPE_NAME + "<";
    typeNode = (ASTNode) typeNode.getChild(0);
    int children = typeNode.getChildCount();
    if (children <= 0) {
      throw new SemanticException("empty union not allowed.");
    }
    StringBuilder buffer = new StringBuilder(typeStr);
    for (int i = 0; i < children; i++) {
      buffer.append(getTypeStringFromAST((ASTNode) typeNode.getChild(i)));
      if (i < children - 1) {
        buffer.append(",");
      }
    }
    buffer.append(">");
    return buffer.toString();
  }

  public static String getAstNodeText(ASTNode tree) {
    return tree.getChildCount() == 0 ? tree.getText()
        : getAstNodeText((ASTNode) tree.getChild(tree.getChildCount() - 1));
  }

  public static String generateErrorMessage(ASTNode ast, String message) {
    StringBuilder sb = new StringBuilder();
    if (ast == null) {
      sb.append(message).append(". Cannot tell the position of null AST.");
      return sb.toString();
    }
    sb.append(ast.getLine());
    sb.append(":");
    sb.append(ast.getCharPositionInLine());
    sb.append(" ");
    sb.append(message);
    sb.append(". Error encountered near token '");
    sb.append(getAstNodeText(ast));
    sb.append("'");
    return sb.toString();
  }
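  // --- Editor's note (illustrative): for a column declared as
  // ARRAY<MAP<STRING, STRUCT<a:INT, b:STRING>>>, getTypeStringFromAST walks the type
  // AST recursively and yields the Hive type string
  //   array<map<string,struct<a:int,b:string>>>
  // and generateErrorMessage prefixes the message with "line:position", e.g.
  //   3:12 missing EOF. Error encountered near token 'foo'
  // where 'foo' is the text of the right-most leaf under the offending node. ---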
  private static final Map<Integer, String> TokenToTypeName = new HashMap<Integer, String>();

  static {
    TokenToTypeName.put(SparkSqlParser.TOK_BOOLEAN, serdeConstants.BOOLEAN_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_TINYINT, serdeConstants.TINYINT_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_SMALLINT, serdeConstants.SMALLINT_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_INT, serdeConstants.INT_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_BIGINT, serdeConstants.BIGINT_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_FLOAT, serdeConstants.FLOAT_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_DOUBLE, serdeConstants.DOUBLE_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_STRING, serdeConstants.STRING_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_CHAR, serdeConstants.CHAR_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_VARCHAR, serdeConstants.VARCHAR_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_BINARY, serdeConstants.BINARY_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_DATE, serdeConstants.DATE_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_DATETIME, serdeConstants.DATETIME_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_TIMESTAMP, serdeConstants.TIMESTAMP_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_INTERVAL_YEAR_MONTH,
        serdeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_INTERVAL_DAY_TIME,
        serdeConstants.INTERVAL_DAY_TIME_TYPE_NAME);
    TokenToTypeName.put(SparkSqlParser.TOK_DECIMAL, serdeConstants.DECIMAL_TYPE_NAME);
  }

  public static String getTypeName(ASTNode node) throws SemanticException {
    int token = node.getType();
    String typeName;

    // datetime type isn't currently supported
    if (token == SparkSqlParser.TOK_DATETIME) {
      throw new SemanticException(ErrorMsg.UNSUPPORTED_TYPE.getMsg());
    }

    switch (token) {
      case SparkSqlParser.TOK_CHAR:
        CharTypeInfo charTypeInfo = ParseUtils.getCharTypeInfo(node);
        typeName = charTypeInfo.getQualifiedName();
        break;
      case SparkSqlParser.TOK_VARCHAR:
        VarcharTypeInfo varcharTypeInfo = ParseUtils.getVarcharTypeInfo(node);
        typeName = varcharTypeInfo.getQualifiedName();
        break;
      case SparkSqlParser.TOK_DECIMAL:
        DecimalTypeInfo decTypeInfo = ParseUtils.getDecimalTypeTypeInfo(node);
        typeName = decTypeInfo.getQualifiedName();
        break;
      default:
        typeName = TokenToTypeName.get(token);
    }
    return typeName;
  }

  public static String relativeToAbsolutePath(HiveConf conf, String location)
      throws SemanticException {
    boolean testMode = conf.getBoolVar(HiveConf.ConfVars.HIVETESTMODE);
    if (testMode) {
      URI uri = new Path(location).toUri();
      String scheme = uri.getScheme();
      String authority = uri.getAuthority();
      String path = uri.getPath();
      if (!path.startsWith("/")) {
        path = (new Path(System.getProperty("test.tmp.dir"), path)).toUri().getPath();
      }
      if (StringUtils.isEmpty(scheme)) {
        scheme = "pfile";
      }
      try {
        uri = new URI(scheme, authority, path, null, null);
      } catch (URISyntaxException e) {
        throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(), e);
      }
      return uri.toString();
    } else {
      // no-op for non-test mode for now
      return location;
    }
  }
}
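// --- Editor's note (illustrative; assumes HiveConf.ConfVars.HIVETESTMODE is enabled
// and the system property test.tmp.dir is "/tmp/hivetest"; neither value is set in
// this file): relativeToAbsolutePath resolves a relative location against
// test.tmp.dir and fills in the "pfile" scheme when none is given, e.g.
//   relativeToAbsolutePath(conf, "warehouse/t1") -> "pfile:/tmp/hivetest/warehouse/t1"
// Outside of test mode the location is returned unchanged. ---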