Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.phoenix.util; import com.google.common.base.Charsets; import com.google.common.base.Joiner; import com.google.common.base.Throwables; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVParser; import org.apache.commons.csv.CSVRecord; import org.apache.phoenix.exception.SQLExceptionCode; import org.apache.phoenix.exception.SQLExceptionInfo; import org.apache.phoenix.jdbc.PhoenixConnection; import org.apache.phoenix.schema.types.PDataType; import org.apache.phoenix.util.csv.CsvUpsertExecutor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.Reader; import java.sql.Connection; import java.sql.DatabaseMetaData; import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; /*** * Upserts CSV data using Phoenix JDBC connection */ public class CSVCommonsLoader { private static final Logger LOG = LoggerFactory.getLogger(CSVCommonsLoader.class); public static final String DEFAULT_ARRAY_ELEMENT_SEPARATOR = ":"; private static final Map<Character, Character> CTRL_CHARACTER_TABLE = ImmutableMap .<Character, Character>builder().put('1', '\u0001').put('2', '\u0002').put('3', '\u0003') .put('4', '\u0004').put('5', '\u0005').put('6', '\u0006').put('7', '\u0007').put('8', '\u0008') .put('9', '\u0009').build(); private final PhoenixConnection conn; private final String tableName; private final List<String> columns; private final boolean isStrict; private final char fieldDelimiter; private final char quoteCharacter; private final Character escapeCharacter; private PhoenixHeaderSource headerSource = PhoenixHeaderSource.FROM_TABLE; private final CSVFormat format; private final String arrayElementSeparator; public enum PhoenixHeaderSource { FROM_TABLE, IN_LINE, SUPPLIED_BY_USER } public CSVCommonsLoader(PhoenixConnection conn, String tableName, List<String> columns, boolean isStrict) { this(conn, tableName, columns, isStrict, ',', '"', null, DEFAULT_ARRAY_ELEMENT_SEPARATOR); } public CSVCommonsLoader(PhoenixConnection conn, String tableName, List<String> columns, boolean isStrict, char fieldDelimiter, char quoteCharacter, Character escapeCharacter, String arrayElementSeparator) { this.conn = conn; this.tableName = tableName; this.columns = columns; this.isStrict = isStrict; this.fieldDelimiter = fieldDelimiter; this.quoteCharacter = quoteCharacter; this.escapeCharacter = escapeCharacter; // implicit in the columns value. if (columns != null && !columns.isEmpty()) { headerSource = PhoenixHeaderSource.SUPPLIED_BY_USER; } else if (columns != null && columns.isEmpty()) { headerSource = PhoenixHeaderSource.IN_LINE; } this.arrayElementSeparator = arrayElementSeparator; this.format = buildFormat(); } public CSVFormat getFormat() { return format; } /** * default settings * delimiter = ',' * quoteChar = '"', * escape = null * recordSeparator = CRLF, CR, or LF * ignore empty lines allows the last data line to have a recordSeparator * * @return CSVFormat based on constructor settings. */ private CSVFormat buildFormat() { CSVFormat format = CSVFormat.DEFAULT.withIgnoreEmptyLines(true) .withDelimiter(asControlCharacter(fieldDelimiter)).withQuote(asControlCharacter(quoteCharacter)); if (escapeCharacter != null) { format = format.withEscape(asControlCharacter(escapeCharacter)); } switch (headerSource) { case FROM_TABLE: // obtain headers from table, so format should not expect a header. break; case IN_LINE: // an empty string array triggers csv loader to grab the first line as the header format = format.withHeader(new String[0]); break; case SUPPLIED_BY_USER: // a populated string array supplied by the user format = format.withHeader(columns.toArray(new String[columns.size()])); break; default: throw new RuntimeException("Header source was unable to be inferred."); } return format; } /** * Translate a field separator, escape character, or phrase delimiter into a control character * if it is a single digit other than 0. * * @param delimiter * @return */ public static char asControlCharacter(char delimiter) { if (CTRL_CHARACTER_TABLE.containsKey(delimiter)) { return CTRL_CHARACTER_TABLE.get(delimiter); } else { return delimiter; } } /** * Upserts data from CSV file. * * Data is batched up based on connection batch size. * Column PDataType is read from metadata and is used to convert * column value to correct type before upsert. * * The constructor determines the format for the CSV files. * * @param fileName * @throws Exception */ public void upsert(String fileName) throws Exception { CSVParser parser = CSVParser.parse(new File(fileName), Charsets.UTF_8, format); upsert(parser); } public void upsert(Reader reader) throws Exception { CSVParser parser = new CSVParser(reader, format); upsert(parser); } private static <T> String buildStringFromList(List<T> list) { return Joiner.on(", ").useForNull("null").join(list); } /** * Data is batched up based on connection batch size. * Column PDataType is read from metadata and is used to convert * column value to correct type before upsert. * * The format is determined by the supplied csvParser. * @param csvParser * CSVParser instance * @throws Exception */ public void upsert(CSVParser csvParser) throws Exception { List<ColumnInfo> columnInfoList = buildColumnInfoList(csvParser); boolean wasAutoCommit = conn.getAutoCommit(); try { conn.setAutoCommit(false); long start = System.currentTimeMillis(); CsvUpsertListener upsertListener = new CsvUpsertListener(conn, conn.getMutateBatchSize(), isStrict); CsvUpsertExecutor csvUpsertExecutor = CsvUpsertExecutor.create(conn, tableName, columnInfoList, upsertListener, arrayElementSeparator); csvUpsertExecutor.execute(csvParser); csvUpsertExecutor.close(); conn.commit(); double elapsedDuration = ((System.currentTimeMillis() - start) / 1000.0); System.out.println("CSV Upsert complete. " + upsertListener.getTotalUpsertCount() + " rows upserted"); System.out.println("Time: " + elapsedDuration + " sec(s)\n"); } finally { // release reader resources. if (csvParser != null) { csvParser.close(); } if (wasAutoCommit) { conn.setAutoCommit(true); } } } private List<ColumnInfo> buildColumnInfoList(CSVParser parser) throws SQLException { List<String> columns = this.columns; switch (headerSource) { case FROM_TABLE: System.out.println(String.format("csv columns from database.")); break; case IN_LINE: columns = new ArrayList<String>(); for (String colName : parser.getHeaderMap().keySet()) { columns.add(colName); // iterates in column order } System.out.println(String.format("csv columns from header line. length=%s, %s", columns.size(), buildStringFromList(columns))); break; case SUPPLIED_BY_USER: System.out.println(String.format("csv columns from user. length=%s, %s", columns.size(), buildStringFromList(columns))); break; default: throw new IllegalStateException("parser has unknown column source."); } return generateColumnInfo(conn, tableName, columns, isStrict); } /** * Get list of ColumnInfos that contain Column Name and its associated * PDataType for an import. The supplied list of columns can be null -- if it is non-null, * it represents a user-supplied list of columns to be imported. * * @param conn Phoenix connection from which metadata will be read * @param tableName Phoenix table name whose columns are to be checked. Can include a schema * name * @param columns user-supplied list of import columns, can be null * @param strict if true, an exception will be thrown if unknown columns are supplied */ public static List<ColumnInfo> generateColumnInfo(Connection conn, String tableName, List<String> columns, boolean strict) throws SQLException { Map<String, Integer> columnNameToTypeMap = Maps.newLinkedHashMap(); Set<String> ambiguousColumnNames = new HashSet<String>(); Map<String, Integer> fullColumnNameToTypeMap = Maps.newLinkedHashMap(); DatabaseMetaData dbmd = conn.getMetaData(); int unfoundColumnCount = 0; // TODO: escape wildcard characters here because we don't want that // behavior here String escapedTableName = StringUtil.escapeLike(tableName); String[] schemaAndTable = escapedTableName.split("\\."); ResultSet rs = null; try { rs = dbmd.getColumns(null, (schemaAndTable.length == 1 ? "" : schemaAndTable[0]), (schemaAndTable.length == 1 ? escapedTableName : schemaAndTable[1]), null); while (rs.next()) { String colName = rs.getString(QueryUtil.COLUMN_NAME_POSITION); String colFam = rs.getString(QueryUtil.COLUMN_FAMILY_POSITION); // use family qualifier, if available, otherwise, use column name String fullColumn = (colFam == null ? colName : String.format("%s.%s", colFam, colName)); String sqlTypeName = rs.getString(QueryUtil.DATA_TYPE_NAME_POSITION); // allow for both bare and family qualified names. if (columnNameToTypeMap.keySet().contains(colName)) { ambiguousColumnNames.add(colName); } columnNameToTypeMap.put(colName, PDataType.fromSqlTypeName(sqlTypeName).getSqlType()); fullColumnNameToTypeMap.put(fullColumn, PDataType.fromSqlTypeName(sqlTypeName).getSqlType()); } if (columnNameToTypeMap.isEmpty()) { throw new IllegalArgumentException("Table " + tableName + " not found"); } } finally { if (rs != null) { rs.close(); } } List<ColumnInfo> columnInfoList = Lists.newArrayList(); Set<String> unresolvedColumnNames = new TreeSet<String>(); if (columns == null) { // use family qualified names by default, if no columns are specified. for (Map.Entry<String, Integer> entry : fullColumnNameToTypeMap.entrySet()) { columnInfoList.add(new ColumnInfo(entry.getKey(), entry.getValue())); } } else { // Leave "null" as indication to skip b/c it doesn't exist for (int i = 0; i < columns.size(); i++) { String columnName = columns.get(i).trim(); Integer sqlType = null; if (fullColumnNameToTypeMap.containsKey(columnName)) { sqlType = fullColumnNameToTypeMap.get(columnName); } else if (columnNameToTypeMap.containsKey(columnName)) { if (ambiguousColumnNames.contains(columnName)) { unresolvedColumnNames.add(columnName); } // fall back to bare column name. sqlType = columnNameToTypeMap.get(columnName); } if (unresolvedColumnNames.size() > 0) { StringBuilder exceptionMessage = new StringBuilder(); boolean first = true; exceptionMessage.append("Unable to resolve these column names to a single column family:\n"); for (String col : unresolvedColumnNames) { if (first) first = false; else exceptionMessage.append(","); exceptionMessage.append(col); } exceptionMessage.append("\nAvailable columns with column families:\n"); first = true; for (String col : fullColumnNameToTypeMap.keySet()) { if (first) first = false; else exceptionMessage.append(","); exceptionMessage.append(col); } throw new SQLException(exceptionMessage.toString()); } if (sqlType == null) { if (strict) { throw new SQLExceptionInfo.Builder(SQLExceptionCode.COLUMN_NOT_FOUND) .setColumnName(columnName).setTableName(tableName).build().buildException(); } unfoundColumnCount++; } else { columnInfoList.add(new ColumnInfo(columnName, sqlType)); } } if (unfoundColumnCount == columns.size()) { throw new SQLExceptionInfo.Builder(SQLExceptionCode.COLUMN_NOT_FOUND) .setColumnName(Arrays.toString(columns.toArray(new String[0]))).setTableName(tableName) .build().buildException(); } } return columnInfoList; } static class CsvUpsertListener implements CsvUpsertExecutor.UpsertListener { private final PhoenixConnection conn; private final int upsertBatchSize; private long totalUpserts = 0L; private final boolean strict; CsvUpsertListener(PhoenixConnection conn, int upsertBatchSize, boolean strict) { this.conn = conn; this.upsertBatchSize = upsertBatchSize; this.strict = strict; } @Override public void upsertDone(long upsertCount) { totalUpserts = upsertCount; if (upsertCount % upsertBatchSize == 0) { if (upsertCount % 1000 == 0) { LOG.info("Processed upsert #{}", upsertCount); } try { LOG.info("Committing after {} records", upsertCount); conn.commit(); } catch (SQLException e) { throw new RuntimeException(e); } } } @Override public void errorOnRecord(CSVRecord csvRecord, Throwable throwable) { LOG.error("Error upserting record " + csvRecord, throwable.getMessage()); if (strict) { throw Throwables.propagate(throwable); } } /** * Get the total number of upserts that this listener has been notified about up until now. * * @return the total count of upserts */ public long getTotalUpsertCount() { return totalUpserts; } } }