Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.ailk.oci.ocnosql.tools.load.csvbulkload;

import java.io.IOException;
import java.io.StringReader;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import javax.annotation.Nullable;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.phoenix.jdbc.PhoenixConnection;
import org.apache.phoenix.jdbc.PhoenixDriver;
import org.apache.phoenix.mapreduce.ImportPreUpsertKeyValueProcessor;
import org.apache.phoenix.util.CSVCommonsLoader;
import org.apache.phoenix.util.ColumnInfo;
import org.apache.phoenix.util.PhoenixRuntime;
import org.apache.phoenix.util.csv.CsvUpsertExecutor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ailk.oci.ocnosql.common.rowkeygenerator.MD5RowKeyGenerator;
import com.ailk.oci.ocnosql.common.rowkeygenerator.RowKeyGenerator;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;

/**
 * MapReduce mapper that converts CSV input lines into KeyValues that can be
 * written to HFiles.
 * <p/>
 * KeyValues are produced by executing UPSERT statements on a Phoenix connection
 * and then extracting the created KeyValues and rolling back the statement
 * execution before it is committed to HBase.
 */
public class PhoenixCsvToKeyValueMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {

    private static final Logger LOG = LoggerFactory.getLogger(PhoenixCsvToKeyValueMapper.class);

    private static final String COUNTER_GROUP_NAME = "Phoenix MapReduce Import";

    /** Configuration key for the class name of an ImportPreUpsertKeyValueProcessor */
    public static final String UPSERT_HOOK_CLASS_CONFKEY = "phoenix.mapreduce.import.kvprocessor";

    /** Configuration key for the field delimiter for input csv records */
    public static final String FIELD_DELIMITER_CONFKEY = "phoenix.mapreduce.import.fielddelimiter";

    /** Configuration key for the array element delimiter for input arrays */
    public static final String ARRAY_DELIMITER_CONFKEY = "phoenix.mapreduce.import.arraydelimiter";

    /** Configuration key for the name of the output table */
    public static final String TABLE_NAME_CONFKEY = "phoenix.mapreduce.import.tablename";

    /** Configuration key for the columns to be imported */
    public static final String COLUMN_INFO_CONFKEY = "phoenix.mapreduce.import.columninfos";

    /** Configuration key for the flag to ignore invalid rows */
    public static final String IGNORE_INVALID_ROW_CONFKEY = "phoenix.mapreduce.import.ignoreinvalidrow";

    // Custom keys that control how the rowkey is generated for the bulk load
    /** Configuration key for the columns whose values are hashed to build the rowkey prefix */
    public static final String ROW_PREFIX_COLUMNS = "phoenix.mapreduce.import.rowprefixcolumns";

    /** Configuration key for the rowkey prefix algorithm; only md5 is supported */
    public static final String ROW_PREFIX_ALG = "phoenix.mapreduce.import.rowprefixalg";

    /** Configuration key for the columns whose values form the body of the rowkey */
    public static final String ROW_COLUMNS = "phoenix.mapreduce.import.rowcolumns";

    /** Configuration key for the columns used to build the unique rowkey suffix; defaults to the whole line */
    public static final String UNIQUE_INDEX_COLUMNS = "phoenix.mapreduce.import.uniqueindexcolumns";

    private PhoenixConnection conn;
    private CsvUpsertExecutor csvUpsertExecutor;
    private MapperUpsertListener upsertListener;
    private CsvLineParser csvLineParser;
    private ImportPreUpsertKeyValueProcessor preUpdateProcessor;
    private RowKeyGenerator rowKeyGenerator;

    /** Field separator of the input CSV records */
    private String separator;

    /** Indexes of the columns hashed into the rowkey prefix */
    private List<Integer> rowPrefixColIdxs;

    /** Indexes of the columns that form the body of the rowkey */
    private List<Integer> rowColIdxs;

    /** Indexes of the columns used for the unique suffix; null means the whole line is used */
    private List<Integer> unqIdxColIdxs;

    // Reusable buffer for assembling the parts of the rowkey
    private StringBuilder rowGentemp;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        String jdbcUrl = getJdbcUrl(conf);

        // This statement also ensures that the driver class is loaded
        LOG.info("Connection with driver {} with url {}", PhoenixDriver.class.getName(), jdbcUrl);

        try {
            conn = (PhoenixConnection) DriverManager.getConnection(jdbcUrl);
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }

        upsertListener = new MapperUpsertListener(context, conf.getBoolean(IGNORE_INVALID_ROW_CONFKEY, true));
        csvUpsertExecutor = buildUpsertExecutor(conf);
        csvLineParser = new CsvLineParser(conf.get(FIELD_DELIMITER_CONFKEY).charAt(0));
        preUpdateProcessor = loadPreUpsertProcessor(conf);

        // Collect the names of the columns configured for import
        List<String> importColumnList = new ArrayList<String>();
        for (ColumnInfo colInfo : buildColumnInfoList(conf)) {
            importColumnList.add(colInfo.getColumnName());
        }

        // Columns whose values are hashed into the rowkey prefix
        List<String> rowPrefixColumns = Lists
                .newArrayList(Splitter.on(",").trimResults().split(conf.get(ROW_PREFIX_COLUMNS)));
        // Resolve the prefix column names to field indexes in the raw CSV line
        rowPrefixColIdxs = new ArrayList<Integer>();
        for (String rpCol : rowPrefixColumns) {
            // indexOf is relative to the full column list, whose first entry is the generated
            // rowkey column; the raw CSV line has no such field, hence the -1
            rowPrefixColIdxs.add(importColumnList.indexOf(rpCol) - 1);
        }

        // Columns whose values form the body of the rowkey
        List<String> rowColumns = Lists.newArrayList(Splitter.on(",").trimResults().split(conf.get(ROW_COLUMNS)));
        rowColIdxs = new ArrayList<Integer>();
        for (String rCol : rowColumns) {
            // Same -1 adjustment as for the prefix columns
            rowColIdxs.add(importColumnList.indexOf(rCol) - 1);
        }

        // Columns used for the unique suffix; "_allColumns" means the whole line is hashed
        List<String> uniqueIndexColumns = Lists
                .newArrayList(Splitter.on(",").trimResults().split(conf.get(UNIQUE_INDEX_COLUMNS, "_allColumns")));
        if (uniqueIndexColumns.size() == 1 && uniqueIndexColumns.get(0).equals("_allColumns")) {
            unqIdxColIdxs = null;
        } else {
            unqIdxColIdxs = new ArrayList<Integer>();
            for (String rCol : uniqueIndexColumns) {
                // Same -1 adjustment as for the prefix columns
                unqIdxColIdxs.add(importColumnList.indexOf(rCol) - 1);
            }
        }

        // Create the rowkey prefix generator (md5 by default)
        rowKeyGenerator = buildRowKeyGenerator(conf.get(ROW_PREFIX_ALG, "md5"));
        separator = conf.get(FIELD_DELIMITER_CONFKEY);
        // Reusable buffer for rowkey assembly
        rowGentemp = new StringBuilder();
    }

    /**
     * Creates the generator used for the rowkey prefix; only md5 is currently supported.
     *
     * @param alg name of the prefix algorithm
     * @return the matching RowKeyGenerator
     */
    private RowKeyGenerator buildRowKeyGenerator(String alg) {
        if (null == alg || alg.equals("") || alg.equals("md5")) {
            return new MD5RowKeyGenerator();
        } else {
            throw new RuntimeException("Temporarily does not support the other algorithms");
        }
    }

    /**
     * Generates the rowkey for a CSV line: hashed prefix + selected column values + 16-character md5 suffix.
     */
    private String generateRowKey(String lineStr) {
        String rowkey = "";
        String[] lineArr = StringUtils.splitByWholeSeparatorPreserveAllTokens(lineStr, separator);

        // 1. hashed prefix computed from the configured prefix columns
        rowGentemp.delete(0, rowGentemp.length());
        for (int idx : rowPrefixColIdxs) {
            rowGentemp.append(lineArr[idx]);
        }
        rowkey += rowKeyGenerator.generatePrefix(rowGentemp.toString());

        // 2. rowkey body built from the configured rowkey columns
        rowGentemp.delete(0, rowGentemp.length());
        for (int idx : rowColIdxs) {
            rowGentemp.append(lineArr[idx]);
        }
        rowkey += rowGentemp.toString();

        // 3. unique suffix
        if (null != unqIdxColIdxs) {
            // a. from the configured unique-index columns
            rowGentemp.delete(0, rowGentemp.length());
            for (int idx : unqIdxColIdxs) {
                rowGentemp.append(lineArr[idx]);
            }
            rowkey += getUniquePostfix(rowGentemp.toString());
        } else {
            // b. from the whole line
            rowkey += getUniquePostfix(lineStr);
        }

        // 4. assembled rowkey
        return rowkey;
    }

    /**
     * Returns a 16-character hex suffix taken from the middle of the MD5 digest of the given text.
     */
    private String getUniquePostfix(String plainText) {
        String result = null;
        try {
            MessageDigest md = MessageDigest.getInstance("MD5");
            md.update(plainText.getBytes());
            byte[] b = md.digest();
            int i;
            StringBuilder buf = new StringBuilder();
            for (int offset = 0; offset < b.length; offset++) {
                i = b[offset];
                if (i < 0) {
                    i += 256;
                }
                if (i < 16) {
                    buf.append("0");
                }
                buf.append(Integer.toHexString(i));
            }
            result = buf.toString().substring(8, 24);
        } catch (NoSuchAlgorithmException e) {
            e.printStackTrace();
        }
        return result;
    }

    @SuppressWarnings("deprecation")
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String lineStr = value.toString();
        // Skip empty lines
        if (null != lineStr && lineStr.length() > 1) {
            // Prepend the generated rowkey (hashed prefix + body + md5 suffix) to the CSV line
            lineStr = generateRowKey(lineStr) + separator + lineStr;
        }
        ImmutableBytesWritable outputKey = new ImmutableBytesWritable();
        try {
            CSVRecord csvRecord = null;
            try {
                csvRecord = csvLineParser.parse(lineStr);
            } catch (IOException e) {
                context.getCounter(COUNTER_GROUP_NAME, "CSV Parser errors").increment(1L);
            }
            if (csvRecord == null) {
                context.getCounter(COUNTER_GROUP_NAME, "Empty records").increment(1L);
                return;
            }
            csvUpsertExecutor.execute(ImmutableList.of(csvRecord));

            Iterator<Pair<byte[], List<KeyValue>>> uncommittedDataIterator = PhoenixRuntime
                    .getUncommittedDataIterator(conn);
            while (uncommittedDataIterator.hasNext()) {
                Pair<byte[], List<KeyValue>> kvPair = uncommittedDataIterator.next();
                List<KeyValue> keyValueList = kvPair.getSecond();
                keyValueList = preUpdateProcessor.preUpsert(kvPair.getFirst(), keyValueList);
                for (KeyValue kv : keyValueList) {
                    outputKey.set(kv.getBuffer(), kv.getRowOffset(), kv.getRowLength());
                    context.write(outputKey, kv);
                }
            }
            conn.rollback();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        try {
            conn.close();
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Load the configured ImportPreUpsertKeyValueProcessor, or supply a dummy
     * processor.
     */
    @VisibleForTesting
    static ImportPreUpsertKeyValueProcessor loadPreUpsertProcessor(Configuration conf) {
        Class<? extends ImportPreUpsertKeyValueProcessor> processorClass = null;
        try {
            processorClass = conf.getClass(UPSERT_HOOK_CLASS_CONFKEY,
                    DefaultImportPreUpsertKeyValueProcessor.class, ImportPreUpsertKeyValueProcessor.class);
        } catch (Exception e) {
            throw new IllegalStateException("Couldn't load upsert hook class", e);
        }
        return ReflectionUtils.newInstance(processorClass, conf);
    }

    /**
     * Build up the JDBC URL for connecting to Phoenix.
     *
     * @return the full JDBC URL for a Phoenix connection
     */
    @VisibleForTesting
    static String getJdbcUrl(Configuration conf) {
        // Both settings must be present to build a usable ZooKeeper quorum address
        String quorum = conf.get("hbase.zookeeper.quorum");
        String clientPort = conf.get("hbase.zookeeper.property.clientPort");
        if (quorum == null || clientPort == null) {
            throw new IllegalStateException(HConstants.ZOOKEEPER_QUORUM + " is not configured");
        }
        String zkQuorum = quorum + ":" + clientPort;
        return PhoenixRuntime.JDBC_PROTOCOL + PhoenixRuntime.JDBC_PROTOCOL_SEPARATOR + zkQuorum;
    }

    @VisibleForTesting
    CsvUpsertExecutor buildUpsertExecutor(Configuration conf) {
        String tableName = conf.get(TABLE_NAME_CONFKEY);
        String arraySeparator = conf.get(ARRAY_DELIMITER_CONFKEY, CSVCommonsLoader.DEFAULT_ARRAY_ELEMENT_SEPARATOR);
        Preconditions.checkNotNull(tableName, "table name is not configured");
        List<ColumnInfo> columnInfoList = buildColumnInfoList(conf);
        return CsvUpsertExecutor.create(conn, tableName, columnInfoList, upsertListener, arraySeparator);
    }

    /**
     * Write the list of to-import columns to a job configuration.
     *
     * @param conf configuration to be written to
     * @param columnInfoList list of ColumnInfo objects to be configured for import
     */
    @VisibleForTesting
    static void configureColumnInfoList(Configuration conf, List<ColumnInfo> columnInfoList) {
        conf.set(COLUMN_INFO_CONFKEY, Joiner.on("|").useForNull("").join(columnInfoList));
    }

    /**
     * Build the list of ColumnInfos for the import based on information in the
     * configuration.
     */
    @VisibleForTesting
    static List<ColumnInfo> buildColumnInfoList(Configuration conf) {
        return Lists.newArrayList(Iterables.transform(Splitter.on("|").split(conf.get(COLUMN_INFO_CONFKEY)),
                new Function<String, ColumnInfo>() {
                    @Nullable
                    @Override
                    public ColumnInfo apply(@Nullable String input) {
                        if (input.isEmpty()) {
                            // An empty string represents a null that was passed in to the
                            // configuration, which corresponds to an input column which is
                            // to be skipped
                            return null;
                        }
                        return ColumnInfo.fromString(input);
                    }
                }));
    }

    /**
     * Listener that logs successful upserts and errors to job counters.
     */
    @VisibleForTesting
    static class MapperUpsertListener implements CsvUpsertExecutor.UpsertListener {

        private final Context context;
        private final boolean ignoreRecordErrors;

        private MapperUpsertListener(Context context, boolean ignoreRecordErrors) {
            this.context = context;
            this.ignoreRecordErrors = ignoreRecordErrors;
        }

        @Override
        public void upsertDone(long upsertCount) {
            context.getCounter(COUNTER_GROUP_NAME, "Upserts Done").increment(1L);
        }

        @Override
        public void errorOnRecord(CSVRecord csvRecord, String errorMessage) {
            LOG.error("Error on record {}: {}", csvRecord, errorMessage);
            context.getCounter(COUNTER_GROUP_NAME, "Errors on records").increment(1L);
            if (!ignoreRecordErrors) {
                throw new RuntimeException("Error on record, " + errorMessage + ", record =" + csvRecord);
            }
        }
    }

    /**
     * Parses a single CSV input line, returning a {@code CSVRecord}.
     */
    @VisibleForTesting
    static class CsvLineParser {

        private final CSVFormat csvFormat;

        CsvLineParser(char fieldDelimiter) {
            this.csvFormat = CSVFormat.newFormat(fieldDelimiter);
        }

        public CSVRecord parse(String input) throws IOException {
            // TODO Creating a new parser for each line seems terribly inefficient, but
            // there's no public way to parse single lines via commons-csv. We should
            // update it to create a LineParser class like this one.
            CSVParser csvParser = new CSVParser(new StringReader(input), csvFormat);
            return Iterables.getFirst(csvParser, null);
        }
    }

    /**
     * A default implementation of {@code ImportPreUpsertKeyValueProcessor} that
     * is used if no specific class is configured. This implementation simply
     * passes through the KeyValue list that is passed in.
     */
    public static class DefaultImportPreUpsertKeyValueProcessor implements ImportPreUpsertKeyValueProcessor {

        @Override
        public List<KeyValue> preUpsert(byte[] rowKey, List<KeyValue> keyValues) {
            return keyValues;
        }
    }
}
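
To make the mapper's configuration keys concrete, here is a minimal driver sketch showing one way the class could be wired into a bulk-load job. It is illustrative only: the driver class name, table name, and column names are hypothetical, and a real job would also need to populate COLUMN_INFO_CONFKEY (for example via configureColumnInfoList), configure an HFile output format, and set input/output paths.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import com.ailk.oci.ocnosql.tools.load.csvbulkload.PhoenixCsvToKeyValueMapper;

public class PhoenixCsvBulkLoadDriverExample {

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();

        // Mandatory mapper settings; the table and column names below are examples only.
        conf.set(PhoenixCsvToKeyValueMapper.TABLE_NAME_CONFKEY, "EXAMPLE_TABLE");
        conf.set(PhoenixCsvToKeyValueMapper.FIELD_DELIMITER_CONFKEY, ",");
        conf.set(PhoenixCsvToKeyValueMapper.ROW_PREFIX_COLUMNS, "MSISDN");
        conf.set(PhoenixCsvToKeyValueMapper.ROW_PREFIX_ALG, "md5");
        conf.set(PhoenixCsvToKeyValueMapper.ROW_COLUMNS, "MSISDN,START_TIME");
        // Leaving UNIQUE_INDEX_COLUMNS unset makes the mapper hash the whole line for the suffix.
        // COLUMN_INFO_CONFKEY must also be populated, e.g. with
        // PhoenixCsvToKeyValueMapper.configureColumnInfoList(conf, columnInfoList).

        Job job = Job.getInstance(conf, "phoenix-csv-bulkload-example");
        job.setJarByClass(PhoenixCsvBulkLoadDriverExample.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(PhoenixCsvToKeyValueMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        // A real bulk load would also configure an HFile output format
        // (e.g. HFileOutputFormat.configureIncrementalLoad) and the input/output paths.

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}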