Java tutorial: a customized HBase ImportTsv bulk-load tool

The listing below is a modified version of HBase's ImportTsv tool. It maps TSV columns to HBase columns, builds the actual row key through a pluggable IRowKeyGenerator, and either writes HFiles for bulk loading (importtsv.bulk.output) or writes Puts directly to the table.
/**
 * Copyright 2010 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package bulkload;

import bulkload.rowkeygenerator.IRowKeyGenerator;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.PutSortReducer;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import utils.StringUtil;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;

import static java.lang.String.format;

/**
 * Tool to import data from a TSV file.
 *
 * This tool is rather simplistic - it doesn't do any quoting or escaping, but
 * is useful for many data loads.
 *
 * @see ImportTsv#usage(String)
 */
public class ImportTsv extends Configured implements Tool {

  protected static final Log LOG = LogFactory.getLog(ImportTsv.class);

  final static String NAME = "importtsv";
  final static String JOB_NAME_CONF_KEY = "mapreduce.job.name";
  final static String SKIP_LINES_CONF_KEY = "importtsv.skip.bad.lines";
  final static String BULK_OUTPUT_CONF_KEY = "importtsv.bulk.output";
  final static String COLUMNS_CONF_KEY = "importtsv.columns";
  final static String SEPARATOR_CONF_KEY = "importtsv.separator";
  final static String ROWKEY_CONF_KEY = "importtsv.hbase.rowkey";
  final static String ROWKEY_GENERATOR_CONF_KEY = "importtsv.hbase.rowkey.generator";
  final static String ROWKEY_GENERATOR_PARAM_CONF_KEY = "importtsv.hbase.rowkey.generator.param";
  // true: call prefixkey to prefix the HBase row key;
  // false/null: keep the original row key
  final static String ROW_KEY_PREFIX = "importtsv.rowkey.prefix";
  final static String DEFAULT_SEPARATOR = "\t";
  final static String TIMESTAMP = "importtsv.timestamp";

  @Override
  public int run(String[] args) throws Exception {
    if (args.length < 2) {
      usage("Wrong number of arguments: " + args.length);
      return -1;
    }
    setConf(HBaseConfiguration.create(getConf()));
    Configuration conf = getConf();

    // Make sure columns are specified
    String[] columns = conf.getStrings(COLUMNS_CONF_KEY);
    if (columns == null) {
      usage("No columns specified. Please specify with -D" + COLUMNS_CONF_KEY + "=...");
      return -1;
    }

    // Make sure the rowkey is specified
    String rowkey = conf.get(ROWKEY_CONF_KEY);
    if (StringUtil.isEmpty(rowkey)) {
      usage("No rowkey specified or rowkey is empty. Please specify with -D" + ROWKEY_CONF_KEY + "=...");
      return -1;
    }

    // Make sure a rowkey generator is specified
    String rowKeyGenerator = conf.get(ROWKEY_GENERATOR_CONF_KEY);
    if (StringUtil.isEmpty(rowKeyGenerator)) {
      usage("No rowkey generator specified or rowkey generator is empty. Please specify with -D"
          + ROWKEY_GENERATOR_CONF_KEY + "=...");
      return -1;
    }

    // Make sure exactly one column is designated as the row key
    int rowkeysFound = 0;
    for (String col : columns) {
      String[] parts = col.split(":", 3);
      if (parts.length > 1 && rowkey.equals(parts[1])) {
        rowkeysFound++;
      }
    }
    if (rowkeysFound != 1) {
      usage("Must specify exactly one column as " + rowkey);
      return -1;
    }

    // Make sure at least one column is specified
    if (columns.length < 1) {
      usage("One or more columns in addition to the row key are required");
      System.exit(-1);
    }

    Job job = createSubmittableJob(conf, args);
    return job.waitForCompletion(true) ? 0 : 1;
  }

  // final static String DEFAULT_SEPARATOR = "\u0001";

  static class TsvParser {
    /**
     * Column families and qualifiers mapped to the TSV columns
     */
    private final byte[][] families;
    private final byte[][] qualifiers;
    private final byte[][] postfix;

    private final byte separatorByte;

    private int rowKeyColumnIndex = -1;

    public static String ROWKEY_COLUMN_SPEC = "";

    /**
     * @param columnsSpecification the list of columns to parse out, comma separated.
     *          The row key column is the one whose qualifier matches TsvParser.ROWKEY_COLUMN_SPEC.
     * @param separatorStr the single-byte field separator
     * @param rowkeyStr the qualifier of the column to use as the row key
     */
    public TsvParser(String columnsSpecification, String separatorStr, String rowkeyStr) {
      // Configure separator
      if (separatorStr.equals("\\001")) {
        separatorStr = "\001";
      }
      if (separatorStr.equals("\\002")) {
        separatorStr = "\002";
      }
      if (separatorStr.equals("\\t")) {
        separatorStr = "\t";
      }
      if (separatorStr.equals("\\n")) {
        separatorStr = "\n";
      }
      byte[] separator = Bytes.toBytes(separatorStr);
      Preconditions.checkArgument(separator.length == 1,
          "TsvParser only supports single-byte separators, got " + separator.length
              + " bytes for separator: " + separatorStr);
      separatorByte = separator[0];
      ROWKEY_COLUMN_SPEC = rowkeyStr; // rowkey prefix_srcfield

      // Configure columns
      ArrayList<String> columnStrings = Lists
          .newArrayList(Splitter.on(',').trimResults().split(columnsSpecification));

      families = new byte[columnStrings.size()][];
      qualifiers = new byte[columnStrings.size()][];
      postfix = new byte[columnStrings.size()][];

      for (int i = 0; i < columnStrings.size(); i++) {
        String str = columnStrings.get(i);
        String[] parts = str.split(":", 3);
        if (parts.length > 1 && ROWKEY_COLUMN_SPEC.equals(parts[1])) {
          rowKeyColumnIndex = i;
        }
        switch (parts.length) {
        case 1:
          families[i] = str.getBytes();
          qualifiers[i] = HConstants.EMPTY_BYTE_ARRAY;
          postfix[i] = HConstants.EMPTY_BYTE_ARRAY;
          break;
        case 2:
          families[i] = parts[0].getBytes();
          qualifiers[i] = parts[1].getBytes();
          postfix[i] = HConstants.EMPTY_BYTE_ARRAY;
          break;
        case 3:
          families[i] = parts[0].getBytes();
          qualifiers[i] = parts[1].getBytes();
          postfix[i] = parts[2].getBytes();
          break;
        }
      }
    }

    public int getRowKeyColumnIndex() {
      return rowKeyColumnIndex;
    }

    public byte[] getFamily(int idx) {
      return families[idx];
    }

    public byte[] getQualifier(int idx) {
      return qualifiers[idx];
    }

    public byte[] getPostFix(int idx) {
      return postfix[idx];
    }

    public static class BadTsvLineException extends Exception {
      public BadTsvLineException(String err) {
        super(err);
      }

      private static final long serialVersionUID = 1L;
    }

    public ParsedLine parse(byte[] lineBytes, int length) throws BadTsvLineException {
      // Enumerate separator offsets
      ArrayList<Integer> tabOffsets = new ArrayList<Integer>(families.length);
      for (int i = 0; i < length; i++) {
        if (lineBytes[i] == separatorByte) {
          tabOffsets.add(i);
        }
      }
      if (tabOffsets.isEmpty()) {
        throw new BadTsvLineException("No delimiter");
      }

      tabOffsets.add(length);

      if (tabOffsets.size() > families.length) {
        throw new BadTsvLineException("Excessive columns");
      } else if (tabOffsets.size() <= getRowKeyColumnIndex()) {
        throw new BadTsvLineException("No row key");
      }
      return new ParsedLine(tabOffsets, lineBytes);
    }

    class ParsedLine {
      private final ArrayList<Integer> tabOffsets;
      private byte[] lineBytes;
      private HashMap<String, Integer> qualifierIdx = new HashMap<String, Integer>();

      ParsedLine(ArrayList<Integer> tabOffsets, byte[] lineBytes) {
        this.tabOffsets = tabOffsets;
        this.lineBytes = lineBytes;
        for (int j = 0; j < qualifiers.length; j++) {
          qualifierIdx.put(new String(qualifiers[j]), j);
        }
      }

      public int getRowKeyOffset() {
        return getColumnOffset(rowKeyColumnIndex);
      }

      public int getRowKeyLength() {
        return getColumnLength(rowKeyColumnIndex);
      }

      public int getColumnOffsetByQualifier(byte[] qualifier) {
        Integer result = qualifierIdx.get(new String(qualifier));
        if (result != null)
          return getColumnOffset(result);
        else
          return -1;
      }

      public int getColumnLengthByQualifier(byte[] qualifier) {
        Integer result = qualifierIdx.get(new String(qualifier));
        if (result != null)
          return getColumnLength(result);
        else
          return -1;
      }

      public int getColumnOffset(int idx) {
        if (idx > 0)
          return tabOffsets.get(idx - 1) + 1;
        else if (idx == 0) {
          return 0;
        } else
          return -1;
      }

      public int getColumnLength(int idx) {
        if (idx >= 0) {
          return tabOffsets.get(idx) - getColumnOffset(idx);
          // if (Arrays.equals(
          //     "\\N".getBytes(),
          //     Arrays.copyOfRange(lineBytes,
          //         getColumnOffset(idx),
          //         tabOffsets.get(idx)))) {
          //   // null field in hive
          //   return 0;
          // }
        } else {
          return -1;
        }
      }

      public int getColumnCount() {
        return tabOffsets.size();
      }

      public byte[] getLineBytes() {
        return lineBytes;
      }
    }
  }

  /**
   * Write table content out to files in hdfs.
   */
  static class TsvImporter extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

    private ByteArrayOutputStream baos = new ByteArrayOutputStream();

    /** Timestamp for all inserted rows */
    private long ts;

    /** Should skip bad lines */
    private boolean skipBadLines;
    private Counter badLineCount;

    private TsvParser parser;

    private IRowKeyGenerator rowkeyGenerator = null;
    private String rowkeyParam = "";

    @Override
    protected void setup(Context context) {
      Configuration conf = context.getConfiguration();

      // If a custom separator has been used,
      // decode it back from Base64 encoding.
      String separator = conf.get(SEPARATOR_CONF_KEY);
      if (separator == null) {
        separator = DEFAULT_SEPARATOR;
      } else {
        separator = new String(Base64.decode(separator));
      }

      skipBadLines = context.getConfiguration().getBoolean(SKIP_LINES_CONF_KEY, true);
      badLineCount = context.getCounter("ImportTsv", "Bad Lines");

      parser = new TsvParser(conf.get(COLUMNS_CONF_KEY), separator, conf.get(ROWKEY_CONF_KEY));
      if (parser.getRowKeyColumnIndex() == -1) {
        throw new RuntimeException("No row key column specified");
      }

      try {
        Class<?> rowkeyCls = Class.forName(conf.get(ROWKEY_GENERATOR_CONF_KEY));
        rowkeyGenerator = (IRowKeyGenerator) rowkeyCls.newInstance();
      } catch (Exception e) {
        throw new IllegalArgumentException(e.getMessage(), e);
      }
      rowkeyParam = conf.get(ROWKEY_GENERATOR_PARAM_CONF_KEY);

      String timestamp = conf.get(TIMESTAMP);
      if (StringUtil.isEmpty(timestamp)) {
        ts = System.currentTimeMillis();
      } else {
        ts = Long.parseLong(timestamp);
      }
    }

    /**
     * Convert a line of TSV text into an HBase table row.
     */
    @Override
    public void map(LongWritable offset, Text value, Context context) throws IOException {
      try {
        byte[] lineBytes = value.getBytes();
        TsvParser.ParsedLine parsed = parser.parse(lineBytes, value.getLength());

        ImmutableBytesWritable rowKey = new ImmutableBytesWritable(lineBytes,
            parsed.getRowKeyOffset(), parsed.getRowKeyLength());
        if (rowKey.getLength() == 0) {
          context.getCounter("ImportTsv", "0 length rowkey").increment(1);
          return;
        }

        String keyStr = new String(rowKey.copyBytes());
        byte[] actualRowkey = rowkeyGenerator.genRowKey(keyStr, rowkeyParam);
        if (actualRowkey.length == 0) {
          context.getCounter("ImportTsv", "0 length rowkey").increment(1);
          return;
        }

        Put put = new Put(actualRowkey);
        byte[] qualifier = null;
        for (int i = 0; i < parsed.getColumnCount(); i++) {
          if (parsed.getColumnLength(i) == -1) {
            continue;
          }
          // nested table field
          if (parser.getPostFix(i) != HConstants.EMPTY_BYTE_ARRAY) {
            baos.reset();
            baos.write(parser.getQualifier(i));
            baos.write(':');
            baos.write(lineBytes, parsed.getColumnOffsetByQualifier(parser.getPostFix(i)),
                parsed.getColumnLengthByQualifier(parser.getPostFix(i)));
            qualifier = baos.toByteArray();
          } else { // normal table field
            qualifier = parser.getQualifier(i);
          }
          KeyValue kv = new KeyValue(actualRowkey, parser.getFamily(i), qualifier, ts,
              KeyValue.Type.Put, Arrays.copyOfRange(lineBytes, parsed.getColumnOffset(i),
                  parsed.getColumnOffset(i) + parsed.getColumnLength(i)));
          put.add(kv);
        }
        context.write(new ImmutableBytesWritable(actualRowkey), put);
      } catch (TsvParser.BadTsvLineException badLine) {
        if (skipBadLines) {
          System.err.println("Bad line at offset: " + offset.get() + ":\n" + badLine.getMessage());
          badLineCount.increment(1);
          return;
        } else {
          throw new IOException(badLine);
        }
      } catch (InterruptedException e) {
        e.printStackTrace();
      }
    }
  }

  /**
   * Sets up the actual job.
   *
   * @param conf
   *          The current configuration.
   * @param args
   *          The command line parameters.
   * @return The newly created job.
   * @throws IOException
   *           When setting up the job fails.
   */
  public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    Job job = null;
    try (Connection connection = ConnectionFactory.createConnection(conf)) {
      try (Admin admin = connection.getAdmin()) {
        // Support non-XML supported characters
        // by re-encoding the passed separator as a Base64 string.
        String actualSeparator = conf.get(SEPARATOR_CONF_KEY);
        if (actualSeparator != null) {
          conf.set(SEPARATOR_CONF_KEY, Base64.encodeBytes(actualSeparator.getBytes()));
        }

        TableName tableName = TableName.valueOf(args[0]);
        if (!admin.tableExists(tableName)) {
          String errorMsg = format("Table '%s' does not exist.", tableName);
          LOG.error(errorMsg);
          throw new TableNotFoundException(errorMsg);
        }

        Path inputDir = new Path(args[1]);
        String jobName = conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName.getNameAsString());
        job = Job.getInstance(conf, jobName);
        job.setJarByClass(TsvImporter.class);
        FileInputFormat.setInputPaths(job, inputDir);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(TsvImporter.class);

        String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);
        if (hfileOutPath != null) {
          try (HTable table = (HTable) connection.getTable(tableName)) {
            Path outputDir = new Path(hfileOutPath);
            FileSystem fs = FileSystem.get(conf);
            if (fs.exists(outputDir)) {
              if (!fs.delete(outputDir, true)) {
                throw new IllegalStateException("delete path:" + outputDir + " failed");
              }
            }
            FileOutputFormat.setOutputPath(job, outputDir);
            job.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job.setMapOutputValueClass(Put.class);
            job.setReducerClass(PutSortReducer.class);
            HFileOutputFormat2.configureIncrementalLoad(job, table, table);
          }
        } else {
          // No reducers. Just write straight to table. Call
          // initTableReducerJob to set up the TableOutputFormat.
          TableMapReduceUtil.initTableReducerJob(tableName.getNameAsString(), null, job);
          job.setNumReduceTasks(0);
          // TableMapReduceUtil.addDependencyJars(job);
          // TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
          //     com.google.common.base.Function.class /* Guava used by TsvParser */);
        }

        // Workaround to remove unnecessary hadoop dependencies
        String[] jars = job.getConfiguration().get("tmpjars").split(",", -1);
        StringBuilder filteredJars = new StringBuilder();
        for (String j : jars) {
          String[] parts = j.split("/", -1);
          String fileName = parts[parts.length - 1];
          if (fileName.indexOf("hadoop-") != 0) {
            filteredJars.append(j);
            filteredJars.append(",");
          }
        }
        job.getConfiguration().set("tmpjars", filteredJars.toString());
      }
    }
    return job;
  }

  /*
   * @param errorMsg Error message. Can be null.
   */
  private static void usage(final String errorMsg) {
    if (errorMsg != null && errorMsg.length() > 0) {
      System.err.println("ERROR: " + errorMsg);
    }
    String usage = "Usage: " + NAME + " -Dimporttsv.columns=a,b,c <tablename> <inputdir>\n"
        + "\n"
        + "Imports the given input directory of TSV data into the specified table.\n"
        + "\n"
        + "The column names of the TSV data must be specified using the -Dimporttsv.columns\n"
        + "option. This option takes the form of comma-separated column names, where each\n"
        + "column name is either a simple column family, or a columnfamily:qualifier. Exactly one\n"
        + "column qualifier must match the value passed with -D" + ROWKEY_CONF_KEY + "; that column\n"
        + "is used as the row key for each imported record.\n"
        + "\n"
        + "In order to prepare data for a bulk data load, pass the option:\n"
        + "  -D" + BULK_OUTPUT_CONF_KEY + "=/path/for/output\n"
        + "\n"
        + "Other options that may be specified with -D include:\n"
        + "  -D" + ROWKEY_CONF_KEY + "=... - qualifier of the row key column (required)\n"
        + "  -D" + ROWKEY_GENERATOR_CONF_KEY + "=... - IRowKeyGenerator implementation class (required)\n"
        + "  -D" + ROWKEY_GENERATOR_PARAM_CONF_KEY + "=... - parameter passed to the row key generator\n"
        + "  -D" + TIMESTAMP + "=... - timestamp in ms to use for all cells, defaults to current time\n"
        + "  -D" + SKIP_LINES_CONF_KEY + "=false - fail if encountering an invalid line\n"
        + "  '-D" + SEPARATOR_CONF_KEY + "=|' - eg separate on pipes instead of tabs";
    System.err.println(usage);
  }

  /**
   * Main entry point.
   *
   * @param args
   *          The command line parameters.
   * @throws Exception
   *           When running the job fails.
   */
  public static void main(String[] args) throws Exception {
    int status = ToolRunner.run(new ImportTsv(), args);
    System.exit(status);
  }
}
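
A note on the extended column specification used above: besides family and family:qualifier, a column may be given as family:qualifier:postfix. For such a column, TsvImporter builds the qualifier at write time as qualifier + ':' + the value of the column whose own qualifier equals the postfix (the "nested table field" branch in map()). As a hypothetical illustration (the column names and values are not from the original): with -Dimporttsv.columns=cf:id,cf:score:id and the input line "alice<TAB>90", the row stores cf:id=alice and cf:score:alice=90, and the row key is derived from "alice" by the configured IRowKeyGenerator.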
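
The IRowKeyGenerator implementation is not shown in the listing. From the calls in TsvImporter.setup() and map(), it must have a public no-argument constructor and a method that turns the raw key column plus the importtsv.hbase.rowkey.generator.param value into the actual row key bytes. Below is a minimal sketch of such a generator, assuming the interface declares byte[] genRowKey(String key, String param); the hash-prefix logic and the class name HashPrefixRowKeyGenerator are only illustrative, not the generator used by the original author.

package bulkload.rowkeygenerator;

import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

import org.apache.hadoop.hbase.util.Bytes;

/**
 * Illustrative row key generator: prefixes the original key with the first
 * bytes of its MD5 hash to spread rows across regions. The interface shape
 * (byte[] genRowKey(String, String)) is inferred from how ImportTsv calls it.
 */
public class HashPrefixRowKeyGenerator implements IRowKeyGenerator {

  @Override
  public byte[] genRowKey(String key, String param) {
    // param is assumed to carry the prefix length in bytes; default to 4.
    int prefixLen = 4;
    if (param != null && !param.isEmpty()) {
      prefixLen = Integer.parseInt(param);
    }
    try {
      byte[] hash = MessageDigest.getInstance("MD5").digest(Bytes.toBytes(key));
      prefixLen = Math.min(prefixLen, hash.length);
      byte[] prefix = new byte[prefixLen];
      System.arraycopy(hash, 0, prefix, 0, prefixLen);
      // Resulting key: <hash prefix><original key>
      return Bytes.add(prefix, Bytes.toBytes(key));
    } catch (NoSuchAlgorithmException e) {
      throw new RuntimeException(e);
    }
  }
}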
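
For completeness, the tool can be driven through ToolRunner, which parses the -D options into the job configuration before run() is called. The sketch below shows one possible invocation; the table name, column spec, paths and the generator class are placeholders, and only the -D keys themselves come from the ImportTsv code above.

package bulkload;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

/** Hypothetical driver showing how the customized ImportTsv might be launched. */
public class ImportTsvDriver {
  public static void main(String[] args) throws Exception {
    String[] toolArgs = new String[] {
        "-Dimporttsv.columns=cf:id,cf:name,cf:age",        // TSV column -> family:qualifier mapping
        "-Dimporttsv.hbase.rowkey=id",                     // qualifier of the row key column
        "-Dimporttsv.hbase.rowkey.generator=bulkload.rowkeygenerator.HashPrefixRowKeyGenerator",
        "-Dimporttsv.hbase.rowkey.generator.param=4",      // passed to genRowKey()
        "-Dimporttsv.bulk.output=/tmp/importtsv-hfiles",   // write HFiles instead of direct Puts
        "webtable",                                        // target table
        "/data/input/tsv"                                  // input directory
    };
    int exit = ToolRunner.run(new Configuration(), new ImportTsv(), toolArgs);
    System.exit(exit);
  }
}

When importtsv.bulk.output is set, the job only produces HFiles; they still have to be loaded into the table afterwards, for example with HBase's completebulkload (LoadIncrementalHFiles) tool.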