Java tutorial: a custom HBase InputFormat (RHHBaseRecorder) for RHIPE, built with a RecordReader
/**
 * Copyright 2010 Mozilla Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Description: Creating a custom InputFormat with the help of a RecordReader.
 * Details can be found at the links below:
 *   https://www.inkling.com/read/hadoop-definitive-guide-tom-white-3rd/chapter-7/input-formats
 *   http://hadoopi.wordpress.com/2013/05/27/understand-recordreader-inputsplit/
 *
 * Implemented using RecordReader concepts.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.RegexStringComparator;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.StringUtils;
import org.godhuli.rhipe.RHRaw;

// RHResult and Fun are companion classes from the RHIPE sources and are assumed
// to be available on the classpath (in the same package as this class).
public class RHHBaseRecorder extends org.apache.hadoop.mapreduce.InputFormat<RHRaw, RHResult>
        implements Configurable {

    private final static Log LOG = LogFactory.getLog(RHHBaseRecorder.class);

    public static boolean ValueIsString = false;
    public static boolean SingleCFQ = false;
    public static byte[][][] CFQ;

    /** Job parameter that specifies the input table. */
    public static final String INPUT_TABLE = "rhipe.hbase.tablename";

    /** Job parameter that specifies the row filter for the table. */
    public static final String Filter = "rhipe.hbase.filter";

    /** Job parameter that sets the scanner batch size (set it for better performance). */
    public static final String batch = "rhipe.hbase.set.batch";

    /** Job parameter used for client-side size estimation (see getSizeofFile). */
    public static final String sizecal = "rhipe.hbase.set.size";

    /** Job parameter that specifies the column spec: comma-separated
     *  "family" or "family:qualifier" entries.
     */
    public static final String RHIPE_COLSPEC = "rhipe.hbase.colspec";

    private Configuration conf = null;
    private HTable table = null;
    private Scan[] scans = null;
    private TableRecordReader trr = null;

    /*
     * (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(org.apache.hadoop.mapreduce.InputSplit,
     *      org.apache.hadoop.mapreduce.TaskAttemptContext)
     * http://hadoop.apache.org/docs/current2/api/org/apache/hadoop/mapreduce/InputFormat.html
     *
     * RHRaw and RHResult are the immutable raw-bytes formats used in RHIPE. For more details see RHIPE.
     */
    @Override
    public RecordReader<RHRaw, RHResult> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        if (scans == null) {
            throw new IOException("No scans were provided");
        }
        if (table == null) {
            throw new IOException("No table was provided.");
        }
        if (trr == null) {
            trr = new TableRecordReader();
        }
        TableSplit tSplit = (TableSplit) split;
        LOG.info("Split in Record Reader " + tSplit);

        // Narrow the first scan to the row range covered by this split.
        Scan scan = new Scan(scans[0]);
        scan.setStartRow(tSplit.getStartRow());
        scan.setStopRow(tSplit.getEndRow());
        //LOG.info("Table in Record Reader " + Bytes.toStringBinary(tSplit.getTableName()));
        trr.setScan(scan);
        trr.setHTable(table);
        trr.init();
        return trr;
    }

    /*
     * (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
     * http://hadoop.apache.org/docs/current2/api/org/apache/hadoop/mapreduce/InputFormat.html
     *
     * Compute the splits between the start and end row keys (one split per region/scan overlap).
     */
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        if (table == null) {
            throw new IOException("No table was provided.");
        }
        Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
        if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
            throw new IOException("Expecting at least one region.");
        }
        Set<InputSplit> splits = new HashSet<InputSplit>();
        for (int i = 0; i < keys.getFirst().length; i++) {
            String regionLocation =
                    table.getRegionLocation(keys.getFirst()[i]).getServerAddress().getHostname();
            //LOG.info("Split length " + keys.getFirst().length);
            for (Scan s : scans) {
                byte[] startRow = s.getStartRow();
                byte[] stopRow = s.getStopRow();
                // determine if the given start and stop keys fall into the region
                if ((startRow.length == 0 || keys.getSecond()[i].length == 0
                        || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0)
                        && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
                    byte[] splitStart =
                            startRow.length == 0 || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0
                                    ? keys.getFirst()[i] : startRow;
                    byte[] splitStop =
                            (stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0)
                                    && keys.getSecond()[i].length > 0
                                    ? keys.getSecond()[i] : stopRow;
                    //LOG.info("Start Split now " + Bytes.toStringBinary(splitStart));
                    //LOG.info("Start stop now " + Bytes.toStringBinary(splitStop));
                    //System.out.println("\n================" + Bytes.toLong(splitStart, 0));
                    //System.out.println("\n================" + splitStart.length);
                    //LOG.info(" Start row-***- " + Bytes.toLong(startRow, 0));
                    //LOG.info(" End row-***- " + Bytes.toLong(stopRow, 0));
                    InputSplit split =
                            new TableSplit(table.getTableName(), splitStart, splitStop, regionLocation);
                    LOG.info("the current regionInfo's startKey is :" + Bytes.toStringBinary(splitStart)
                            + " , the current regionInfo's endkey is : " + Bytes.toStringBinary(splitStop)
                            + " , the current regionInfo's table is " + Bytes.toStringBinary(table.getTableName())
                            + " , the current regionInfo's regionLocation is :" + regionLocation);
                    //LOG.info("Table Name " + table.getTableName());
                    //LOG.info("Split server =>" + " " + split);
                    splits.add(split);
                }
            }
        }
        return new ArrayList<InputSplit>(splits);
    }

    /*
     * (non-Javadoc)
     * @see org.apache.hadoop.conf.Configurable#getConf()
     */
    @Override
    public Configuration getConf() {
        return conf;
    }

    /*
     * (non-Javadoc)
     * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration)
     */
    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
        RHHBaseRecorder.ValueIsString = conf.get("rhipe_hbase_values_are_string") != null
                && conf.get("rhipe_hbase_values_are_string").equals("TRUE");
        RHHBaseRecorder.SingleCFQ = conf.get("rhipe.hbase.single.cfq") != null
                && conf.get("rhipe.hbase.single.cfq").equals("TRUE");
        String tableName = conf.get(INPUT_TABLE);
        try {
            setHTable(new HTable(HBaseConfiguration.create(conf), tableName));
        } catch (Exception e) {
            LOG.error(StringUtils.stringifyException(e));
        }
        Scan[] scans = null;
        if (conf.get(RHIPE_COLSPEC) != null) {
            try {
                // The column spec is a comma-separated list of "family" or "family:qualifier" entries.
                String[] cols = conf.get(RHIPE_COLSPEC).split(",");
                ArrayList<Pair<String, String>> l = null;
                if (cols.length > 0) {
                    l = new ArrayList<Pair<String, String>>(cols.length);
                    for (int i = 0; i < cols.length; i++) {
                        String[] x = cols[i].split(":");
                        if (x.length == 1) {
                            l.add(new Pair<String, String>(x[0], null));
                            LOG.info("Added family: " + x[0]);
                        } else {
                            l.add(new Pair<String, String>(x[0], x[1]));
                            LOG.info("Added " + x[0] + ":" + x[1]);
                        }
                    }
                }
                // "caching:cacheBlocks" pair, e.g. "1000:0".
                String[] x = conf.get("rhipe.hbase.mozilla.cacheblocks").split(":");
                scans = Fun.generateScans(conf.get("rhipe.hbase.rowlim.start"),
                        conf.get("rhipe.hbase.rowlim.end"), l,
                        Integer.parseInt(x[0]), Integer.parseInt(x[1]) == 1 ? true : false);
            } catch (Exception e) {
                LOG.error("An error occurred.", e);
            }
        } else {
            //Scan[] scans = null;
            scans = new Scan[] { new Scan() };
            LOG.info("Start Row Key" + Bytes.toStringBinary(
                    org.apache.commons.codec.binary.Base64.decodeBase64(conf.get("rhipe.hbase.rowlim.start"))));
            LOG.info("End Row Key" + Bytes.toStringBinary(
                    org.apache.commons.codec.binary.Base64.decodeBase64(conf.get("rhipe.hbase.rowlim.end"))));
            //LOG.info("Filter in " + Bytes.toStringBinary(org.apache.commons.codec.binary.Base64.decodeBase64(conf.get("rhipe.hbase.filter"))));
            //LOG.info("Filter out " + conf.get("rhipe.hbase.filter"));
            String[] x = conf.get("rhipe.hbase.mozilla.cacheblocks").split(":");
            LOG.info("cache " + Integer.parseInt(x[0]) + " block " + Integer.parseInt(x[1]));
            scans = Fun.generateScansRows(conf.get("rhipe.hbase.rowlim.start"),
                    conf.get("rhipe.hbase.rowlim.end"),
                    Integer.parseInt(x[0]), Integer.parseInt(x[1]) == 1
true : false, conf.get("rhipe.hbase.filter"), Integer.parseInt(conf.get("rhipe.hbase.set.batch"))); //scans = getAllColumnQualifier(table); } setScans(scans); } public static int getSizeofFile(String st, String en, int caching, boolean cacheBlocks, String filter, int batch) throws IOException { Scan s = new Scan(); s.setCacheBlocks(false); // don't set to true for MR jobs LOG.info(" Calculation of File size in hbase at client side. ------ "); if (st != null) { byte[] stb1 = org.apache.commons.codec.binary.Base64.decodeBase64(st); //LOG.info(" Start row in ------ " +Bytes.toStringBinary(stb1)); s.setStartRow(stb1); } if (en != null) { byte[] enb2 = org.apache.commons.codec.binary.Base64.decodeBase64(en); //LOG.info(" End row in------ " +Bytes.toStringBinary(enb2)); s.setStopRow(enb2); } //LOG.info(" Filter- ----- " + filter); RowFilter rowFilterRegex = new RowFilter(CompareFilter.CompareOp.EQUAL, new RegexStringComparator( Bytes.toString(org.apache.commons.codec.binary.Base64.decodeBase64(filter)))); s.setFilter(rowFilterRegex); HTable tt = new HTable(HBaseConfiguration.create(), "tsdb"); ResultScanner ss = tt.getScanner(s); int col = 0; int size = 0; for (Result r : ss) { col = 0; for (KeyValue kv : r.raw()) { col = col + kv.getLength(); //System.out.print("\n Length keyValue " +kv.getLength() + "\n"); } size = size + col / 1000; } System.out.print("\n Size " + size + "\n"); return size; } public static Scan[] getAllColumnQualifier(HTable table) { ArrayList<Scan> scans = new ArrayList<Scan>(); Scan scans2 = new Scan(); try { int count = 0; ResultScanner ss = table.getScanner(scans2); for (Result r : ss) { for (KeyValue kv : r.raw()) { //System.out.print("\n Rowkey " +org.apache.commons.codec.binary.Base64.encodeBase64String(kv.getRow()) ); // System.out.print(new String(kv.getFamily()) + ":"); //System.out.print(org.apache.commons.codec.binary.Base64.encodeBase64String(kv.getQualifier() )+ " "); //s.addFamily(kv.getFamily()); scans2.addColumn(kv.getFamily(), kv.getQualifier()); } } //return s; } catch (IOException e) { e.printStackTrace(); } scans.add(scans2); return scans.toArray(new Scan[scans.size()]); } /** * Allows subclasses to get the {@link HTable}. * Get HBase Table */ protected HTable getHTable() { return this.table; } /** * Allows subclasses to set the {@link HTable}. * * @param table * The table to get the data from. Set Hbase table For our case it is normally tsdb. */ protected void setHTable(HTable table) { this.table = table; } /** * @return */ public Scan[] getScans() { return scans; } /** * @param scans The scans to use as boundaries. */ public void setScans(Scan[] scans) { this.scans = scans; } /** * Iterate over an HBase table data, return (ImmutableBytesWritable, Result) * pairs. 
     */
    protected class TableRecordReader extends RecordReader<RHRaw, RHResult> {

        private ResultScanner scanner = null;
        private Scan scan = null;
        private HTable htable = null;
        private byte[] lastRow = null;
        private RHRaw key = null;
        private RHResult value = null;
        private Result oldresult = null;

        public void restart(byte[] firstRow) throws IOException {
            Scan newScan = new Scan(scan);
            newScan.setStartRow(firstRow);
            this.scanner = this.htable.getScanner(newScan);
        }

        public void init() throws IOException {
            restart(scan.getStartRow());
        }

        public void setHTable(HTable htable) {
            this.htable = htable;
        }

        public void setScan(Scan scan) {
            this.scan = scan;
        }

        public void close() {
            this.scanner.close();
        }

        public RHRaw getCurrentKey() throws IOException, InterruptedException {
            return key;
        }

        public RHResult getCurrentValue() throws IOException, InterruptedException {
            return value;
        }

        public void initialize(InputSplit inputsplit, TaskAttemptContext context)
                throws IOException, InterruptedException {
        }

        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (key == null) {
                key = new RHRaw();
            }
            if (value == null) {
                value = new RHResult();
                oldresult = new Result();
            }
            try {
                oldresult = this.scanner.next();
                if (oldresult != null) {
                    value.set(oldresult);
                }
            } catch (IOException e) {
                // The scanner may have expired; restart from the last row that was read.
                //LOG.debug("recovered from " + StringUtils.stringifyException(e));
                restart(lastRow);
                scanner.next(); // skip presumed already mapped row
                oldresult = scanner.next();
                value.set(oldresult);
            }
            if (oldresult != null && oldresult.size() > 0) {
                byte[] _b = oldresult.getRow();
                key.set(_b);
                lastRow = _b;
                return true;
            }
            return false;
        }

        public float getProgress() {
            return 0;
        }
    }
}
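
For reference, below is a minimal driver sketch showing how a MapReduce job might be wired to this InputFormat. The property names (rhipe.hbase.tablename, rhipe.hbase.colspec, rhipe.hbase.rowlim.start/end, rhipe.hbase.mozilla.cacheblocks, rhipe.hbase.set.batch) are the ones read by setConf above; the concrete values, the "t:metric" column spec, the driver class name, and the commented-out mapper are assumptions for illustration, not part of the original code.

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

public class RHHBaseRecorderDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();

        // Properties read by RHHBaseRecorder.setConf(...)
        conf.set(RHHBaseRecorder.INPUT_TABLE, "tsdb");          // table to scan
        conf.set(RHHBaseRecorder.RHIPE_COLSPEC, "t:metric,t");  // hypothetical "family:qualifier" / "family" entries

        // Row-key limits: elsewhere in RHHBaseRecorder they are decoded with
        // Base64.decodeBase64, so they are passed Base64-encoded here.
        conf.set("rhipe.hbase.rowlim.start", Base64.encodeBase64String(Bytes.toBytes("row-000")));
        conf.set("rhipe.hbase.rowlim.end", Base64.encodeBase64String(Bytes.toBytes("row-999")));

        // "caching:cacheBlocks" pair ("1000:0" = scanner caching of 1000, block cache off),
        // plus the scanner batch size used on the non-colspec path.
        conf.set("rhipe.hbase.mozilla.cacheblocks", "1000:0");
        conf.set(RHHBaseRecorder.batch, "100");

        Job job = new Job(conf, "rhipe-hbase-scan");
        job.setJarByClass(RHHBaseRecorderDriver.class);
        job.setInputFormatClass(RHHBaseRecorder.class);
        // job.setMapperClass(MyMapper.class);   // hypothetical Mapper<RHRaw, RHResult, ?, ?>
        job.setNumReduceTasks(0);                // map-only job
        job.setOutputFormatClass(NullOutputFormat.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

With the colspec set, setConf builds the scans via Fun.generateScans; if rhipe.hbase.colspec is left unset, the other branch is taken and rhipe.hbase.filter plus rhipe.hbase.set.batch are also required.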