List of usage examples for org.apache.hadoop.io Text getBytes
@Override public byte[] getBytes()
From source file:com.cloudera.bigdata.analysis.dataload.mapreduce.SplitableRecordReader.java
License:Apache License
/** * Decide the start of the reader.//from w w w . ja va 2 s. c om */ public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); start = split.getStart(); end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); codec = compressionCodecs.getCodec(file); // if (codec instanceof CryptoCodec && job instanceof JobConf) // CryptoContextHelper.resetInputCryptoContext((CryptoCodec) codec, // (JobConf) job, file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); if (isCompressedInput()) { decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); if (null == this.recordDelimiterBytes) { in = new LineReader(cIn, job); } else { in = new LineReader(cIn, job, this.recordDelimiterBytes); } start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; } else { if (null == this.recordDelimiterBytes) { in = new LineReader(codec.createInputStream(fileIn), job); } else { in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes); } filePosition = fileIn; } } else { fileIn.seek(start); if (null == this.recordDelimiterBytes) { in = new LineReader(fileIn, job); } else { in = new LineReader(fileIn, job, this.recordDelimiterBytes); } filePosition = fileIn; } LOG.info("Read from " + split.getPath().toString()); // If this is not the first split, we always throw away first record // because we always (except the last split) read one extra line in // next() 
method. if (start != 0) { start += in.readLine(new Text(), 0, maxBytesToConsume(start)); // Read another line as previous. Text current = new Text(); int newSize = in.readLine(previous, maxLineLength, maxBytesToConsume(start)); LOG.info("Skip line " + previous + " for last split."); start += newSize; // Keep reading until a splitable point is found. while (start <= end) { newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start)); if (canSplit(previous.getBytes(), current.getBytes())) { break; } start += newSize; previous.set(current.getBytes()); LOG.info("Skip line " + previous + " for last split."); } // If exceed the end, still read one extra line. if (start > end) { if (isContinue) { newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start)); if (!canSplit(previous.getBytes(), current.getBytes())) { // Still not splitable. So skip the block. start += newSize; isContinue = false; } } } LOG.info("Split between: \n" + previous + "\n" + current); // Restart at the last read line. fileIn.seek(start); if (null == this.recordDelimiterBytes) { in = new LineReader(fileIn, job); } else { in = new LineReader(fileIn, job, this.recordDelimiterBytes); } this.pos = start; } else { Text skip = new Text(); start += in.readLine(skip, maxLineLength, maxBytesToConsume(start)); // start += in.readLine(skip, 0, maxBytesToConsume(start)); LOG.info("Skip line " + skip + ". Start at " + start); } // Restart at the start index. }
From source file:com.cloudera.castagna.logparser.pig.LogLoader.java
License:Apache License
@Override public Tuple getNext() throws IOException { try {/*from w w w . ja v a2s . c om*/ boolean notDone = in.nextKeyValue(); if (!notDone) { return null; } Text value = (Text) in.getCurrentValue(); byte[] ba = value.getBytes(); // make a copy of the bytes representing the input since // TextInputFormat will reuse the byte array return mTupleFactory.newTuple(new DataByteArray(ba, 0, value.getLength())); } catch (InterruptedException e) { throw new IOException("Error getting input"); } }
From source file:com.cloudera.impala.hive.executor.TestUdf.java
License:Apache License
/**
 * Returns a copy of the given text.
 *
 * <p>The copy is made with the {@code Text} copy constructor rather than from
 * {@code getBytes()}: the raw backing array may be longer than the value, and
 * only its first {@code getLength()} bytes are valid.
 *
 * @param a the input value, may be {@code null}
 * @return a new {@code Text} with the same content, or {@code null} if {@code a} is {@code null}
 */
public Text evaluate(Text a) {
    if (a == null) {
        return null;
    }
    return new Text(a);
}
From source file:com.cloudera.recordservice.examples.terasort.TeraValidate.java
License:Apache License
/**
 * Renders the valid bytes of a {@code Text} using the string form of
 * {@code BytesWritable}.
 *
 * @param t the text whose bytes are rendered
 * @return {@code BytesWritable.toString()} of the first {@code t.getLength()} bytes
 */
private static String textifyBytes(Text t) {
    final byte[] raw = t.getBytes();
    final int validLength = t.getLength();
    final BytesWritable wrapper = new BytesWritable();
    // Bound the range by getLength(); the backing array may be larger.
    wrapper.set(raw, 0, validLength);
    return wrapper.toString();
}
From source file:com.dappervision.hbase.mapred.TypedBytesTableReducer.java
License:Apache License
@Override public void reduce(Text key, Iterator<Text> values, OutputCollector<TypedBytesWritable, TypedBytesWritable> outputCollector, Reporter arg3) throws IOException { byte[] keyBytes = key.getBytes(); TypedBytesWritable keyWritable = new TypedBytesWritable(); TypedBytesWritable valueWritable = new TypedBytesWritable(); keyWritable.setValue(new Buffer(keyBytes)); //merge the column family and qualifier HashMap<String, HashMap<String, String>> cfMap = new HashMap<String, HashMap<String, String>>(); while (values.hasNext()) { Text value = values.next(); String strVal = value.toString(); //Separate column family with comma (:) //Separate the qualifier and value with equity String[] cf_qual_val_parts = strVal.split(":"); String cf = cf_qual_val_parts[0]; String qual_val = cf_qual_val_parts[1]; String[] qual_val_parts = qual_val.split("="); String qual = qual_val_parts[0]; String val = qual_val_parts[1]; if (cfMap.get(cf) != null) { HashMap<String, String> qualMap = cfMap.get(cf); if (qualMap == null) { qualMap = new HashMap<String, String>(); }//from w w w .ja va2s .c om qualMap.put(qual, val); // the duplicated key will be replaced, if using Buffer, we should do it ourselves } else { HashMap<String, String> qualMap = new HashMap<String, String>(); qualMap.put(qual, val); cfMap.put(cf, qualMap); } } HashMap<Buffer, HashMap<Buffer, Buffer>> bufMap = new HashMap<Buffer, HashMap<Buffer, Buffer>>(); Set<Entry<String, HashMap<String, String>>> entrySet = cfMap.entrySet(); for (Entry<String, HashMap<String, String>> entry : entrySet) { HashMap<String, String> qualValMap = entry.getValue(); HashMap<Buffer, Buffer> qualValBufMap = new HashMap<Buffer, Buffer>(); for (Entry<String, String> qualValEntry : qualValMap.entrySet()) { qualValBufMap.put(new Buffer(qualValEntry.getKey().getBytes()), new Buffer(qualValEntry.getValue().getBytes())); } bufMap.put(new Buffer(entry.getKey().getBytes()), qualValBufMap); } valueWritable.setValue(bufMap); outputCollector.collect(keyWritable, 
valueWritable); }
From source file:com.datasalt.utils.mapred.joiner.MultiJoinChanneledMapper.java
License:Apache License
/**
 * Emits a datum under the given grouping key together with a secondary-sort key.
 * Delegates to {@code emitBytes} with the valid byte range of {@code grouping}.
 *
 * @param grouping      grouping key; only its first {@code getLength()} bytes are passed on
 * @param secondarySort secondary-sort key handed through unchanged
 * @param datum         the value to emit
 */
protected void emit(Text grouping, WritableComparable secondarySort, OUTPUT_VALUE datum)
        throws IOException, InterruptedException {
    final byte[] groupBytes = grouping.getBytes();
    final int groupLength = grouping.getLength();
    emitBytes(groupBytes, 0, groupLength, secondarySort, datum);
}
From source file:com.datasalt.utils.mapred.joiner.MultiJoinChanneledMapper.java
License:Apache License
/**
 * Emits a datum under the given grouping key without a secondary-sort key.
 * Delegates to {@code emitBytes} with the valid byte range of {@code grouping}
 * and {@code null} in the secondary-sort position.
 *
 * @param grouping grouping key; only its first {@code getLength()} bytes are passed on
 * @param datum    the value to emit
 */
protected void emit(Text grouping, OUTPUT_VALUE datum) throws IOException, InterruptedException {
    final byte[] groupBytes = grouping.getBytes();
    final int groupLength = grouping.getLength();
    emitBytes(groupBytes, 0, groupLength, null, datum);
}
From source file:com.datasalt.utils.mapred.joiner.MultiJoinMultiChannelMapper.java
License:Apache License
/**
 * Emits a datum on a channel under the given grouping key together with a
 * secondary-sort key. Delegates to {@code emitBytes} with the valid byte range
 * of {@code grouping}.
 *
 * @param grouping      grouping key; only its first {@code getLength()} bytes are passed on
 * @param secondarySort secondary-sort key handed through unchanged
 * @param datum         the value to emit
 * @param channel       channel number handed through unchanged
 */
protected void emit(Text grouping, WritableComparable secondarySort, Object datum, int channel)
        throws IOException, InterruptedException {
    final byte[] groupBytes = grouping.getBytes();
    final int groupLength = grouping.getLength();
    emitBytes(groupBytes, 0, groupLength, secondarySort, datum, channel);
}
From source file:com.datasalt.utils.mapred.joiner.MultiJoinMultiChannelMapper.java
License:Apache License
/**
 * Emits a datum on a channel under the given grouping key without a
 * secondary-sort key. Delegates to {@code emitBytes} with the valid byte range
 * of {@code grouping} and {@code null} in the secondary-sort position.
 *
 * @param grouping grouping key; only its first {@code getLength()} bytes are passed on
 * @param datum    the value to emit
 * @param channel  channel number handed through unchanged
 */
protected void emit(Text grouping, Object datum, int channel) throws IOException, InterruptedException {
    final byte[] groupBytes = grouping.getBytes();
    final int groupLength = grouping.getLength();
    emitBytes(groupBytes, 0, groupLength, null, datum, channel);
}
From source file:com.ebay.nest.io.sede.binarysortable.BinarySortableSerDe.java
License:Apache License
static Text deserializeText(InputByteBuffer buffer, boolean invert, Text r) throws IOException { // Get the actual length first int start = buffer.tell(); int length = 0; do {/* w w w. java2s . co m*/ byte b = buffer.read(invert); if (b == 0) { // end of string break; } if (b == 1) { // the last char is an escape char. read the actual char buffer.read(invert); } length++; } while (true); if (length == buffer.tell() - start) { // No escaping happened, so we are already done. r.set(buffer.getData(), start, length); } else { // Escaping happened, we need to copy byte-by-byte. // 1. Set the length first. r.set(buffer.getData(), start, length); // 2. Reset the pointer. buffer.seek(start); // 3. Copy the data. byte[] rdata = r.getBytes(); for (int i = 0; i < length; i++) { byte b = buffer.read(invert); if (b == 1) { // The last char is an escape char, read the actual char. // The serialization format escape \0 to \1, and \1 to \2, // to make sure the string is null-terminated. b = (byte) (buffer.read(invert) - 1); } rdata[i] = b; } // 4. Read the null terminator. byte b = buffer.read(invert); assert (b == 0); } return r; }