List of usage examples for org.apache.hadoop.io.Text.getBytes()
@Override public byte[] getBytes()
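The key contract in every example below: getBytes() returns the Text object's internal backing array, which is often longer than the encoded content, so only the first getLength() bytes are valid. A minimal standalone sketch of that contract (not taken from the Kylin sources):

import org.apache.hadoop.io.Text;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class TextGetBytesContract {
    public static void main(String[] args) {
        Text t = new Text("hello world"); // backing array sized for 11 bytes of UTF-8

        byte[] shorter = "hi".getBytes(StandardCharsets.UTF_8);
        t.set(shorter, 0, shorter.length); // typically reuses the larger buffer; only 2 bytes are now valid

        byte[] backing = t.getBytes(); // raw buffer, may carry stale bytes past the content
        int len = t.getLength();       // number of valid bytes

        System.out.println(backing.length + " vs " + len); // backing.length is typically > 2 here

        // Correct way to materialize the content: copy only the valid prefix.
        byte[] copy = Arrays.copyOf(backing, len);
        System.out.println(new String(copy, StandardCharsets.UTF_8)); // "hi"
    }
}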
From source file:com.kylinolap.job.hadoop.cube.FactDistinctColumnsMapper.java
License:Apache License
@Override
public void map(KEYIN key, Text value, Context context) throws IOException, InterruptedException {
    try {
        bytesSplitter.split(value.getBytes(), value.getLength(), byteRowDelimiter);
        intermediateTableDesc.sanityCheck(bytesSplitter);

        SplittedBytes[] splitBuffers = bytesSplitter.getSplitBuffers();
        int[] flatTableIndexes = intermediateTableDesc.getRowKeyColumnIndexes();
        for (int i : factDictCols) {
            outputKey.set((short) i);
            SplittedBytes bytes = splitBuffers[flatTableIndexes[i]];
            outputValue.set(bytes.value, 0, bytes.length);
            context.write(outputKey, outputValue);
        }
    } catch (Exception ex) {
        handleErrorRecord(bytesSplitter, ex);
    }
}
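The mapper above never materializes a String: bytesSplitter.split(value.getBytes(), value.getLength(), byteRowDelimiter) tokenizes the raw bytes in place. A minimal sketch of the same idea with a hand-rolled splitter (splitRow is an illustrative stand-in, not Kylin's BytesSplitter):

import org.apache.hadoop.io.Text;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class RowSplitSketch {
    // Splits the first `len` bytes of `row` on `delim`, copying each field out.
    static List<byte[]> splitRow(byte[] row, int len, byte delim) {
        List<byte[]> fields = new ArrayList<byte[]>();
        int start = 0;
        for (int i = 0; i <= len; i++) {
            if (i == len || row[i] == delim) {
                fields.add(Arrays.copyOfRange(row, start, i));
                start = i + 1;
            }
        }
        return fields;
    }

    public static void main(String[] args) {
        Text value = new Text("a,bb,ccc");
        // Always bound the scan by getLength(), never by getBytes().length.
        for (byte[] f : splitRow(value.getBytes(), value.getLength(), (byte) ',')) {
            System.out.println(new String(f)); // a / bb / ccc
        }
    }
}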
From source file:com.kylinolap.job.hadoop.cube.FactDistinctColumnsReducer.java
License:Apache License
@Override
public void reduce(ShortWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    TblColRef col = columnList.get(key.get());

    HashSet<ByteArray> set = new HashSet<ByteArray>();
    for (Text textValue : values) {
        ByteArray value = new ByteArray(Bytes.copy(textValue.getBytes(), 0, textValue.getLength()));
        set.add(value);
    }

    Configuration conf = context.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    String outputPath = conf.get(BatchConstants.OUTPUT_PATH);
    FSDataOutputStream out = fs.create(new Path(outputPath, col.getName()));

    try {
        for (ByteArray value : set) {
            out.write(value.data);
            out.write('\n');
        }
    } finally {
        out.close();
    }
}
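Note the Bytes.copy(textValue.getBytes(), 0, textValue.getLength()) inside the loop: Hadoop reuses a single Text instance across the reducer's value iterator, so storing the backing array directly would leave every set entry aliasing one mutating buffer. A standalone sketch of that hazard (the loop below stands in for the framework's iterator; it is not Kylin code):

import org.apache.hadoop.io.Text;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class TextReuseSketch {
    public static void main(String[] args) {
        // Simulate the framework reusing one Text across reducer iterations.
        Text reused = new Text();
        byte[][] inputs = { "aaa".getBytes(StandardCharsets.UTF_8), "bb".getBytes(StandardCharsets.UTF_8) };

        List<byte[]> aliased = new ArrayList<byte[]>();
        List<byte[]> copied = new ArrayList<byte[]>();
        for (byte[] in : inputs) {
            reused.set(in, 0, in.length);   // overwrites the same backing buffer in place
            aliased.add(reused.getBytes()); // stores a reference to the shared buffer
            copied.add(Arrays.copyOfRange(reused.getBytes(), 0, reused.getLength())); // defensive copy
        }

        System.out.println(aliased.get(0) == aliased.get(1)); // typically true: Text keeps its buffer when the new value fits
        System.out.println(new String(copied.get(0), StandardCharsets.UTF_8)); // "aaa"
        System.out.println(new String(copied.get(1), StandardCharsets.UTF_8)); // "bb"
    }
}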
From source file:com.kylinolap.job.hadoop.cube.MergeCuboidMapper.java
License:Apache License
@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    // Bound the split by getLength(), not getBytes().length: the backing array may be
    // longer than the content (compare NDCuboidMapper below, which does this correctly).
    long cuboidID = rowKeySplitter.split(key.getBytes(), key.getLength());
    Cuboid cuboid = Cuboid.findById(cubeDesc, cuboidID);

    SplittedBytes[] splittedByteses = rowKeySplitter.getSplitBuffers();
    int bufOffset = 0;
    BytesUtil.writeLong(cuboidID, newKeyBuf, bufOffset, RowConstants.ROWKEY_CUBOIDID_LEN);
    bufOffset += RowConstants.ROWKEY_CUBOIDID_LEN;

    for (int i = 0; i < cuboid.getColumns().size(); ++i) {
        TblColRef col = cuboid.getColumns().get(i);

        if (this.checkNeedMerging(col)) {
            // if dictionary on fact table column, needs rewrite
            DictionaryManager dictMgr = DictionaryManager.getInstance(config);
            Dictionary<?> sourceDict = dictMgr.getDictionary(sourceCubeSegment.getDictResPath(col));
            Dictionary<?> mergedDict = dictMgr.getDictionary(mergedCubeSegment.getDictResPath(col));

            while (sourceDict.getSizeOfValue() > newKeyBuf.length - bufOffset
                    || mergedDict.getSizeOfValue() > newKeyBuf.length - bufOffset) {
                byte[] oldBuf = newKeyBuf;
                newKeyBuf = new byte[2 * newKeyBuf.length];
                System.arraycopy(oldBuf, 0, newKeyBuf, 0, oldBuf.length);
            }

            int idInSourceDict = BytesUtil.readUnsigned(splittedByteses[i + 1].value, 0, splittedByteses[i + 1].length);
            int size = sourceDict.getValueBytesFromId(idInSourceDict, newKeyBuf, bufOffset);
            int idInMergedDict = mergedDict.getIdFromValueBytes(newKeyBuf, bufOffset, size);
            BytesUtil.writeUnsigned(idInMergedDict, newKeyBuf, bufOffset, mergedDict.getSizeOfId());
            bufOffset += mergedDict.getSizeOfId();
        } else {
            // keep as it is
            while (splittedByteses[i + 1].length > newKeyBuf.length - bufOffset) {
                byte[] oldBuf = newKeyBuf;
                newKeyBuf = new byte[2 * newKeyBuf.length];
                System.arraycopy(oldBuf, 0, newKeyBuf, 0, oldBuf.length);
            }
            System.arraycopy(splittedByteses[i + 1].value, 0, newKeyBuf, bufOffset, splittedByteses[i + 1].length);
            bufOffset += splittedByteses[i + 1].length;
        }
    }

    byte[] newKey = Arrays.copyOf(newKeyBuf, bufOffset);
    outputKey.set(newKey, 0, newKey.length);
    context.write(outputKey, value);
}
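Both while loops above are the same grow-by-doubling idiom for making room in newKeyBuf. A hedged sketch of the idiom factored into a helper (ensureCapacity is an illustrative name, not part of Kylin):

import java.util.Arrays;

public class GrowBufferSketch {
    // Doubles buf until at least `needed` bytes remain free past `offset`.
    static byte[] ensureCapacity(byte[] buf, int offset, int needed) {
        while (needed > buf.length - offset) {
            buf = Arrays.copyOf(buf, 2 * buf.length); // copies old content, zero-fills the rest
        }
        return buf;
    }

    public static void main(String[] args) {
        byte[] buf = new byte[4];
        buf = ensureCapacity(buf, 3, 10); // needs 10 free bytes past offset 3
        System.out.println(buf.length);   // 16: doubled 4 -> 8 -> 16
    }
}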
From source file:com.kylinolap.job.hadoop.cube.NDCuboidMapper.java
License:Apache License
@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    long cuboidId = rowKeySplitter.split(key.getBytes(), key.getLength());
    Cuboid parentCuboid = Cuboid.findById(cubeDesc, cuboidId);

    Collection<Long> myChildren = cuboidScheduler.getSpanningCuboid(cuboidId);

    // if still empty or null
    if (myChildren == null || myChildren.size() == 0) {
        context.getCounter(BatchConstants.MAPREDUCE_COUTNER_GROUP_NAME, "Skipped records").increment(1L);
        skipCounter++;
        if (skipCounter % BatchConstants.COUNTER_MAX == 0) {
            logger.info("Skipped " + skipCounter + " records!");
        }
        return;
    }

    context.getCounter(BatchConstants.MAPREDUCE_COUTNER_GROUP_NAME, "Processed records").increment(1L);
    handleCounter++;
    if (handleCounter % BatchConstants.COUNTER_MAX == 0) {
        logger.info("Handled " + handleCounter + " records!");
    }

    for (Long child : myChildren) {
        Cuboid childCuboid = Cuboid.findById(cubeDesc, child);
        int keyLength = buildKey(parentCuboid, childCuboid, rowKeySplitter.getSplitBuffers());
        outputKey.set(keyBuf, 0, keyLength);
        context.write(outputKey, value);
    }
}
From source file:com.kylinolap.job.hadoop.cube.NewBaseCuboidMapper.java
License:Apache License
@Override
public void map(KEYIN key, Text value, Context context) throws IOException, InterruptedException {
    // combining the hive table flattening logic into base cuboid building.
    // the input of this mapper is the fact table rows

    counter++;
    if (counter % BatchConstants.COUNTER_MAX == 0) {
        logger.info("Handled " + counter + " records!");
    }

    if (!byteRowDelimiterInferred)
        byteRowDelimiter = bytesSplitter.inferByteRowDelimiter(value.getBytes(), value.getLength(), factTableDesc.getColumns().length);

    bytesSplitter.split(value.getBytes(), value.getLength(), byteRowDelimiter);

    try {
        byte[] rowKey = buildKey(bytesSplitter.getSplitBuffers());
        if (rowKey == null)
            return; // skip this fact table row

        outputKey.set(rowKey, 0, rowKey.length);

        buildValue(bytesSplitter.getSplitBuffers());
        outputValue.set(valueBuf.array(), 0, valueBuf.position());

        context.write(outputKey, outputValue);
    } catch (Throwable t) {
        logger.error("", t);
        context.getCounter(BatchConstants.MAPREDUCE_COUTNER_GROUP_NAME, "Error records").increment(1L);
        return;
    }
}
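The line outputValue.set(valueBuf.array(), 0, valueBuf.position()) hands a ByteBuffer's contents to a Writable without an extra copy: position() marks how many bytes buildValue(...) wrote. A minimal standalone sketch of that hand-off (using Text as the Writable purely for illustration):

import org.apache.hadoop.io.Text;
import java.nio.ByteBuffer;

public class ByteBufferToWritableSketch {
    public static void main(String[] args) {
        ByteBuffer valueBuf = ByteBuffer.allocate(64);
        valueBuf.put((byte) 1); // 1 byte written
        valueBuf.putInt(42);    // 5 bytes written so far

        // array() is the backing buffer; position() bounds the valid prefix.
        Text outputValue = new Text();
        outputValue.set(valueBuf.array(), 0, valueBuf.position());

        System.out.println(outputValue.getLength()); // 5
    }
}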
From source file:com.kylinolap.job.hadoop.cube.NewBaseCuboidMapperTest.java
License:Apache License
@Test
@Ignore
public void testMapperWithHeader() throws Exception {
    String cubeName = "test_kylin_cube_with_slr_ready";
    mapDriver.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
    // mapDriver.getConfiguration().set(BatchConstants.CFG_METADATA_URL, metadata);
    mapDriver.withInput(new Text("key"), new Text("0,2013-05-05,Auction,80053,0,5,41.204172263562,0,10000638"));
    List<Pair<Text, Text>> result = mapDriver.run();

    CubeManager cubeMgr = CubeManager.getInstance(this.getTestConfig());
    CubeInstance cube = cubeMgr.getCube(cubeName);

    assertEquals(1, result.size());

    Text rowkey = result.get(0).getFirst();
    byte[] key = rowkey.getBytes();
    byte[] header = Bytes.head(key, 26);
    byte[] sellerId = Bytes.tail(header, 18);
    byte[] cuboidId = Bytes.head(header, 8);
    byte[] restKey = Bytes.tail(key, rowkey.getLength() - 26);

    RowKeyDecoder decoder = new RowKeyDecoder(cube.getFirstSegment());
    decoder.decode(key);
    assertEquals("[10000638, 2013-05-05, Computers/Tablets & Networking, MonitorProjectors & Accs, Monitors, Auction, 0, 5]",
            decoder.getValues().toString());

    assertTrue(Bytes.toString(sellerId).startsWith("10000638"));
    assertEquals(255, Bytes.toLong(cuboidId));
    assertEquals(21, restKey.length);

    verifyMeasures(cube.getDescriptor().getMeasures(), result.get(0).getSecond(), "41.204172263562", "41.204172263562", "41.204172263562", 1);
}
From source file:com.kylinolap.job.hadoop.cube.RangeKeyDistributionReducer.java
License:Apache License
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    int nRegion = Math.round((float) gbPoints.size() / (float) cut);
    nRegion = Math.max(1, nRegion);
    nRegion = Math.min(MAX_REGION, nRegion);

    int gbPerRegion = gbPoints.size() / nRegion;
    gbPerRegion = Math.max(1, gbPerRegion);

    System.out.println(nRegion + " regions");
    System.out.println(gbPerRegion + " GB per region");

    for (int i = gbPerRegion; i < gbPoints.size(); i += gbPerRegion) {
        Text key = gbPoints.get(i);
        outputValue.set(i);
        System.out.println(StringUtils.byteToHexString(key.getBytes()) + "\t" + outputValue.get());
        context.write(key, outputValue);
    }
}
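To make the sizing arithmetic concrete: with gbPoints.size() = 100 one-GB split candidates and cut = 10, nRegion = round(100 / 10) = 10 and gbPerRegion = 100 / 10 = 10, so the loop emits the candidate keys at indexes 10, 20, ..., 90 as region boundaries. A standalone sketch of just that sizing logic (the constants below are illustrative):

public class RegionSplitSketch {
    static final int MAX_REGION = 1000; // illustrative cap, mirroring the reducer's constant

    public static void main(String[] args) {
        int points = 100; // number of 1-GB sample points collected
        int cut = 10;     // target GB per region

        int nRegion = Math.round((float) points / (float) cut);
        nRegion = Math.max(1, nRegion);
        nRegion = Math.min(MAX_REGION, nRegion);

        int gbPerRegion = Math.max(1, points / nRegion);

        System.out.println(nRegion + " regions, " + gbPerRegion + " GB per region");
        for (int i = gbPerRegion; i < points; i += gbPerRegion) {
            System.out.println("split at sample point " + i); // 10, 20, ..., 90
        }
    }
}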
From source file:com.kylinolap.job.hadoop.invertedindex.IIDistinctColumnsMapper.java
License:Apache License
@Override
public void map(KEYIN key, Text value, Context context) throws IOException, InterruptedException {
    if (delim == -1) {
        delim = splitter.detectDelim(value, columns.length);
    }

    int nParts = splitter.split(value.getBytes(), value.getLength(), (byte) delim);
    SplittedBytes[] parts = splitter.getSplitBuffers();

    if (nParts != columns.length) {
        // report nParts (the actual split count), not parts.length (the buffer capacity)
        throw new RuntimeException("Got " + nParts + " fields from -- " + value.toString() + " -- but " + columns.length + " expected");
    }

    for (short i = 0; i < nParts; i++) {
        outputKey.set(i);
        outputValue.set(parts[i].value, 0, parts[i].length);
        context.write(outputKey, outputValue);
    }
}
From source file:com.kylinolap.job.hadoop.invertedindex.IIDistinctColumnsReducer.java
License:Apache License
@Override
public void reduce(ShortWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    String columnName = columns[key.get()];

    HashSet<ByteArray> set = new HashSet<ByteArray>();
    for (Text textValue : values) {
        ByteArray value = new ByteArray(Bytes.copy(textValue.getBytes(), 0, textValue.getLength()));
        set.add(value);
    }

    Configuration conf = context.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    String outputPath = conf.get(BatchConstants.OUTPUT_PATH);
    FSDataOutputStream out = fs.create(new Path(outputPath, columnName));

    try {
        for (ByteArray value : set) {
            out.write(value.data);
            out.write('\n');
        }
    } finally {
        out.close();
    }
}
From source file:com.kylinolap.job.hadoop.invertedindex.InvertedIndexMapper.java
License:Apache License
@Override
public void map(KEYIN key, Text value, Context context) throws IOException, InterruptedException {
    if (delim == -1) {
        delim = splitter.detectDelim(value, info.getColumnCount());
    }

    int nParts = splitter.split(value.getBytes(), value.getLength(), (byte) delim);
    SplittedBytes[] parts = splitter.getSplitBuffers();

    if (nParts != info.getColumnCount()) {
        // report nParts (the actual split count), not parts.length (the buffer capacity)
        throw new RuntimeException("Got " + nParts + " fields from -- " + value.toString() + " -- but " + info.getColumnCount() + " expected");
    }

    rec.reset();
    for (int i = 0; i < nParts; i++) {
        rec.setValueString(i, Bytes.toString(parts[i].value, 0, parts[i].length));
    }

    outputKey.set(rec.getTimestamp());
    // outputValue's backing bytes array is the same as rec
    context.write(outputKey, outputValue);
}