List of usage examples for org.apache.hadoop.io.Text.getBytes()
@Override public byte[] getBytes()
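All of the examples below lean on one property of Text: getBytes() returns the object's backing byte array, which can be longer than the actual content, so callers pair it with getLength() (or an explicit offset/length) rather than relying on the array's own length. A minimal, self-contained sketch of that contract (class and variable names are ours, not from any of the projects below):

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.Text;

public class TextGetBytesSketch {
    public static void main(String[] args) {
        Text t = new Text("hello world");
        // Reuse the same object with shorter content; Text does not shrink its backing array here.
        t.set("hi".getBytes(StandardCharsets.UTF_8), 0, 2);
        byte[] raw = t.getBytes();   // backing buffer, possibly longer than the content
        int len = t.getLength();     // number of valid bytes
        // Decode only the valid range; new String(raw) could pick up stale trailing bytes.
        String s = new String(raw, 0, len, StandardCharsets.UTF_8);
        System.out.println(raw.length + " backing bytes, " + len + " valid bytes: " + s);
    }
}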
From source file:org.apache.pig.impl.util.StorageUtil.java
License:Apache License
/**
 * Transform a line of <code>Text</code> to a <code>Tuple</code>
 *
 * @param val a line of text
 * @param fieldDel the field delimiter
 * @return tuple constructed from the text
 */
public static Tuple textToTuple(Text val, byte fieldDel) {
    return bytesToTuple(val.getBytes(), 0, val.getLength(), fieldDel);
}
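The bytesToTuple call receives the backing array, a start offset of 0, and the valid length. A hypothetical, much-simplified stand-in for that helper (not Pig's actual implementation) shows why the (bytes, offset, length) triple matters when splitting on the field delimiter:

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.Text;

public class FieldSplitSketch {
    // Split the valid bytes of a Text on a single-byte field delimiter.
    static List<String> splitFields(Text val, byte fieldDel) {
        byte[] buf = val.getBytes();   // backing array; only [0, getLength()) is valid
        int len = val.getLength();
        List<String> fields = new ArrayList<>();
        int fieldStart = 0;
        for (int i = 0; i < len; i++) {
            if (buf[i] == fieldDel) {
                fields.add(new String(buf, fieldStart, i - fieldStart, StandardCharsets.UTF_8));
                fieldStart = i + 1;
            }
        }
        fields.add(new String(buf, fieldStart, len - fieldStart, StandardCharsets.UTF_8));
        return fields;
    }

    public static void main(String[] args) {
        System.out.println(splitFields(new Text("a\tb\tc"), (byte) '\t'));  // prints [a, b, c]
    }
}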
From source file:org.apache.pig.piggybank.storage.CSVExcelStorage.java
License:Apache License
@Override
public Tuple getNext() throws IOException {
    // If SKIP_INPUT_HEADER and this is the first input split, skip header record
    // We store its value as a string though, so we can compare
    // further records to it. If they are the same (this would
    // happen if multiple small files each with a header were combined
    // into one split), we know to skip the duplicate header record as well.
    if (loadingFirstRecord && headerTreatment == Headers.SKIP_INPUT_HEADER
            && (splitIndex == 0 || splitIndex == -1)) {
        try {
            if (!in.nextKeyValue())
                return null;
            header = ((Text) in.getCurrentValue()).toString();
        } catch (InterruptedException e) {
            int errCode = 6018;
            String errMsg = "Error while reading input";
            throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
        }
    }
    loadingFirstRecord = false;

    mProtoTuple = new ArrayList<Object>();
    getNextInQuotedField = false;
    boolean evenQuotesSeen = true;
    boolean sawEmbeddedRecordDelimiter = false;
    byte[] buf = null;

    if (!mRequiredColumnsInitialized) {
        if (udfContextSignature != null) {
            Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
            mRequiredColumns = (boolean[]) ObjectSerializer.deserialize(p.getProperty(udfContextSignature));
        }
        mRequiredColumnsInitialized = true;
    }

    // Note: we cannot factor out the check for nextKeyValue() being null,
    // because that call overwrites buf with the new line, which is
    // bad if we have a field with a newline.
    try {
        int recordLen = 0;
        getNextFieldID = 0;

        while (sawEmbeddedRecordDelimiter || getNextFieldID == 0) {
            Text value = null;
            if (sawEmbeddedRecordDelimiter) {
                // Deal with pulling more records from the input, because
                // a double quoted embedded newline was encountered in a field.
                // Save the length of the record so far, plus one byte for the
                // record delimiter (usually newline) that's embedded in the field
                // we were working on before falling into this branch:
                int prevLineLen = recordLen + 1;

                // Save previous line (the one with the field that has the newline) in a new array.
                // The last byte will be random; we'll fill in the embedded
                // record delimiter (usually newline) below:
                byte[] prevLineSaved = Arrays.copyOf(buf, prevLineLen);
                prevLineSaved[prevLineLen - 1] = RECORD_DEL;

                // Read the continuation of the record, unless EOF:
                if (!in.nextKeyValue()) {
                    return null;
                }
                value = (Text) in.getCurrentValue();
                recordLen = value.getLength();
                // Grab the continuation's bytes:
                buf = value.getBytes();

                // Combine the previous line and the continuation into a new array.
                // The following copyOf() does half the job: it allocates all the
                // space, and also copies the previous line into that space:
                byte[] prevLineAndContinuation = Arrays.copyOf(prevLineSaved, prevLineLen + recordLen);

                // Now append the continuation.
                // Parms: fromBuf, fromStartPos, toBuf, toStartPos, lengthToCopy:
                System.arraycopy(buf, 0, prevLineAndContinuation, prevLineLen, recordLen);

                // We'll work with the combination now:
                buf = prevLineAndContinuation;

                // Do the whole record over from the start:
                mProtoTuple.clear();
                getNextInQuotedField = false;
                evenQuotesSeen = true;
                getNextFieldID = 0;
                recordLen = prevLineAndContinuation.length;
            } else {
                // Previous record finished cleanly: start with the next record,
                // unless EOF:
                if (!in.nextKeyValue()) {
                    return null;
                }
                value = (Text) in.getCurrentValue();

                // if the line is a duplicate header and 'SKIP_INPUT_HEADER' is set, ignore it
                // (this might happen if multiple files each with a header are combined into a single split)
                if (headerTreatment == Headers.SKIP_INPUT_HEADER && value.toString().equals(header)) {
                    if (!in.nextKeyValue())
                        return null;
                    value = (Text) in.getCurrentValue();
                }

                buf = value.getBytes();
                getNextFieldID = 0;
                recordLen = value.getLength();
            }

            nextTupleSkipChar = false;

            ByteBuffer fieldBuffer = ByteBuffer.allocate(recordLen);

            sawEmbeddedRecordDelimiter = processOneInRecord(evenQuotesSeen, buf, recordLen, fieldBuffer);

            // The last field is never delimited by a FIELD_DEL, but by
            // the end of the record. So we need to add that last field.
            // The '!sawEmbeddedRecordDelimiter' handles the case of
            // embedded newlines; we are amidst a field, not at
            // the final record:
            if (!sawEmbeddedRecordDelimiter)
                readField(fieldBuffer, getNextFieldID++);
        } // end while
    } catch (InterruptedException e) {
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }

    Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
    return t;
}
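The comment inside getNext() points out the central hazard of getBytes(): the array belongs to the Text that the record reader reuses, so the next nextKeyValue() call overwrites it. That is why the code above copies the previous line with Arrays.copyOf before reading the continuation. A small sketch of that defensive-copy step in isolation (helper name is ours):

import java.util.Arrays;
import org.apache.hadoop.io.Text;

public class TextCopySketch {
    // Snapshot the valid bytes of a reader-owned Text before the reader reuses it.
    static byte[] copyValidBytes(Text value) {
        // Bounding the copy by getLength() also drops any stale bytes past the content.
        return Arrays.copyOf(value.getBytes(), value.getLength());
    }
}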
From source file:org.apache.rya.accumulo.query.RangeBindingSetEntries.java
License:Apache License
/**
 * @param colFamily
 * @param startColFamily
 * @param stopColFamily
 * @return true if colFamily lies between startColFamily and stopColFamily
 */
private boolean validateContext(Text colFamily, Text startColFamily, Text stopColFamily) {
    byte[] cfBytes = colFamily.getBytes();
    byte[] start = startColFamily.getBytes();
    byte[] stop = stopColFamily.getBytes();
    // range has empty column family, so all Keys falling within Range Row
    // constraints should match
    if (start.length == 0 && stop.length == 0) {
        return true;
    }
    int result1 = WritableComparator.compareBytes(cfBytes, 0, cfBytes.length, start, 0, start.length);
    int result2 = WritableComparator.compareBytes(cfBytes, 0, cfBytes.length, stop, 0, stop.length);
    return result1 >= 0 && result2 <= 0;
}
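This method compares the full backing arrays (cfBytes.length, start.length, stop.length). Since getBytes() can in general return an array longer than the content, a slightly more defensive variant of the same range check bounds each side by getLength(); a sketch under that assumption, with the empty start/stop special case omitted (names are ours):

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;

public class RangeCheckSketch {
    // Lexicographic "start <= value <= stop" over the valid byte ranges of three Texts.
    static boolean inRange(Text value, Text start, Text stop) {
        int cmpStart = WritableComparator.compareBytes(value.getBytes(), 0, value.getLength(),
                start.getBytes(), 0, start.getLength());
        int cmpStop = WritableComparator.compareBytes(value.getBytes(), 0, value.getLength(),
                stop.getBytes(), 0, stop.getLength());
        return cmpStart >= 0 && cmpStop <= 0;
    }
}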
From source file:org.apache.rya.indexing.accumulo.entity.AccumuloDocIdIndexer.java
License:Apache License
private QueryBindingSet deserializeKey(final Key key, final StarQuery sq, final BindingSet currentBs,
        final Set<String> unCommonVar) {

    final QueryBindingSet currentSolutionBs = new QueryBindingSet();
    final Text row = key.getRow();
    final Text cq = key.getColumnQualifier();

    final String[] cqArray = cq.toString().split(DocIndexIteratorUtil.DOC_ID_INDEX_DELIM);

    boolean commonVarSet = false;

    // if common Var is constant there is no common variable to assign a value to
    if (sq.commonVarConstant()) {
        commonVarSet = true;
    }

    if (!commonVarSet && sq.isCommonVarURI()) {
        final RyaURI rURI = new RyaURI(row.toString());
        currentSolutionBs.addBinding(sq.getCommonVarName(), RyaToRdfConversions.convertValue(rURI));
        commonVarSet = true;
    }

    for (final String s : sq.getUnCommonVars()) {
        final byte[] cqBytes = cqArray[sq.getVarPos().get(s)].getBytes(StandardCharsets.UTF_8);
        final int firstIndex = Bytes.indexOf(cqBytes, DELIM_BYTE);
        final int secondIndex = Bytes.lastIndexOf(cqBytes, DELIM_BYTE);
        final int typeIndex = Bytes.indexOf(cqBytes, TYPE_DELIM_BYTE);
        final String tripleComponent = new String(Arrays.copyOfRange(cqBytes, firstIndex + 1, secondIndex),
                StandardCharsets.UTF_8);
        final byte[] cqContent = Arrays.copyOfRange(cqBytes, secondIndex + 1, typeIndex);
        final byte[] objType = Arrays.copyOfRange(cqBytes, typeIndex, cqBytes.length);

        if (tripleComponent.equals("object")) {
            final byte[] object = Bytes.concat(cqContent, objType);
            org.openrdf.model.Value v = null;
            try {
                v = RyaToRdfConversions.convertValue(RyaContext.getInstance().deserialize(object));
            } catch (final RyaTypeResolverException e) {
                e.printStackTrace();
            }
            currentSolutionBs.addBinding(s, v);
        } else if (tripleComponent.equals("subject")) {
            if (!commonVarSet) {
                final byte[] object = Bytes.concat(row.getBytes(), objType);
                org.openrdf.model.Value v = null;
                try {
                    v = RyaToRdfConversions.convertValue(RyaContext.getInstance().deserialize(object));
                } catch (final RyaTypeResolverException e) {
                    e.printStackTrace();
                }
                currentSolutionBs.addBinding(sq.getCommonVarName(), v);
                commonVarSet = true;
            }
            final RyaURI rURI = new RyaURI(new String(cqContent, StandardCharsets.UTF_8));
            currentSolutionBs.addBinding(s, RyaToRdfConversions.convertValue(rURI));
        } else {
            throw new IllegalArgumentException("Invalid row.");
        }
    }

    for (final String s : unCommonVar) {
        currentSolutionBs.addBinding(s, currentBs.getValue(s));
    }

    return currentSolutionBs;
}
From source file:org.apache.tajo.storage.sequencefile.SequenceFileScanner.java
License:Apache License
@Override
public Tuple next() throws IOException {
    if (!more)
        return null;

    long pos = reader.getPosition();
    boolean remaining = reader.next(EMPTY_KEY);

    if (pos >= end && reader.syncSeen()) {
        more = false;
    } else {
        more = remaining;
    }

    if (more) {
        Tuple tuple = null;
        byte[][] cells;

        if (hasBinarySerDe) {
            BytesWritable bytesWritable = new BytesWritable();
            reader.getCurrentValue(bytesWritable);
            tuple = makeTuple(bytesWritable);
            totalBytes += (long) bytesWritable.getBytes().length;
        } else {
            Text text = new Text();
            reader.getCurrentValue(text);
            cells = BytesUtils.splitPreserveAllTokens(text.getBytes(), delimiter, projectionMap,
                    schema.getColumns().size());
            totalBytes += (long) text.getBytes().length;
            tuple = new LazyTuple(schema, cells, 0, nullChars, serde);
        }
        currentIdx++;
        return tuple;
    } else {
        return null;
    }
}
From source file:org.archive.jbs.Merge.java
License:Apache License
/**
 * Utility method to construct a JSON Object from a Text
 */
public static Document fromText(Text text) throws IOException {
    return new Document(new InputStreamReader(new ByteArrayInputStream(text.getBytes()), "utf-8"));
}
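Here the entire backing array is wrapped in a ByteArrayInputStream. Whether that array is exactly the content length depends on how the Text was populated; a bounded variant only exposes the valid range by using the (byte[], offset, length) constructor. A sketch of that safer wrapping (method and class names are ours):

import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.Text;

public class TextReaderSketch {
    // Wrap only the valid bytes of a Text in a UTF-8 character Reader.
    static Reader toReader(Text text) {
        return new InputStreamReader(
                new ByteArrayInputStream(text.getBytes(), 0, text.getLength()),
                StandardCharsets.UTF_8);
    }
}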
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/**
 * Position the input stream at the start of the first record.
 *
 * @param stream The stream to reposition.
 */
protected final int positionAtFirstRecord(final FSDataInputStream stream, final CompressionCodec codec)
        throws IOException {
    Text buffer = new Text();
    long originalStart = start;

    LineReader reader;
    if (codec == null) {
        // Advance to the start of the first record that ends with /1
        // We use a temporary LineReader to read lines until we find the
        // position of the right one. We then seek the file to that position.
        stream.seek(start);
        reader = new LineReader(stream);
    } else {
        // Unlike the codec == null case, we don't seek before creating the
        // reader, SplittableCompressionCodec.createInputStream places the
        // stream at the start of the first compression block after our
        // split start
        //
        // as noted above, we need to be at pos 0 in the stream before
        // calling this
        reader = new LineReader(((SplittableCompressionCodec) codec).createInputStream(stream, null, start,
                end, SplittableCompressionCodec.READ_MODE.BYBLOCK));
    }

    int bytesRead = 0;
    do {
        bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
        int bufferLength = buffer.getLength();
        if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) {
            start += bytesRead;
        } else {
            // line starts with @. Read two more and verify that it starts
            // with a +:
            //
            // @<readname>
            // <sequence>
            // +[readname]
            //
            // if the second line we read starts with a @, we know that
            // we've read:
            //
            // <qualities> <-- @ is a valid ASCII phred encoding
            // @<readname>
            //
            // and thus, the second read is the delimiter and we can break
            long trackForwardPosition = start + bytesRead;

            bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
            if (buffer.getLength() > 0 && buffer.getBytes()[0] == '@') {
                start = trackForwardPosition;
                break;
            } else {
                trackForwardPosition += bytesRead;
            }

            bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
            trackForwardPosition += bytesRead;
            if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                break; // all good!
            } else {
                start = trackForwardPosition;
            }
        }
    } while (bytesRead > 0);

    pos = start;
    start = originalStart;
    stream.seek(start);
    return (int) (pos - originalStart);
}
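The byte-level peeks above (buffer.getBytes()[0] == '@' and == '+') are guarded by a getLength() > 0 check, since index 0 of the backing array is meaningless for an empty Text. The same guard as a tiny reusable helper (illustrative only, not part of the ADAM code):

import org.apache.hadoop.io.Text;

public class TextPeekSketch {
    // True if the Text has content and its first valid byte equals the marker,
    // e.g. startsWith(buffer, (byte) '@') mirrors the FASTQ header check above.
    static boolean startsWith(Text line, byte marker) {
        return line.getLength() > 0 && line.getBytes()[0] == marker;
    }
}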
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/**
 * Parses a read from an interleaved FASTQ file.
 *
 * Only reads a single record.
 *
 * @param readName Text record containing read name. Output parameter.
 * @param value Text record containing full record. Output parameter.
 * @return Returns true if read was successful (did not hit EOF).
 *
 * @throws RuntimeException Throws exception if FASTQ record doesn't
 *   have proper formatting (e.g., record doesn't start with @).
 */
protected final boolean lowLevelFastqRead(final Text readName, final Text value) throws IOException {
    if (endOfCompressedSplit) {
        return false;
    }

    // ID line
    readName.clear();
    long skipped = appendLineInto(readName, true);
    if (skipped == 0) {
        return false; // EOF
    }

    if (readName.getBytes()[0] != '@') {
        throw new RuntimeException("unexpected fastq record didn't start with '@' at " + makePositionMessage()
                + ". Line: " + readName + ". \n");
    }
    value.append(readName.getBytes(), 0, readName.getLength());

    // sequence
    appendLineInto(value, false);

    // separator line
    appendLineInto(value, false);

    // quality
    appendLineInto(value, false);

    return true;
}
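lowLevelFastqRead builds the full record by appending the valid byte range of each line-holding Text into value, one line at a time. A condensed sketch of that append pattern outside the reader (helper and constant names are ours):

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.Text;

public class TextAppendSketch {
    private static final byte[] NEWLINE = "\n".getBytes(StandardCharsets.UTF_8);

    // Concatenate several lines into one newline-separated Text record.
    static Text joinLines(Text... lines) {
        Text record = new Text();
        for (Text line : lines) {
            record.append(line.getBytes(), 0, line.getLength());  // only the valid bytes
            record.append(NEWLINE, 0, 1);
        }
        return record;
    }
}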
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/**
 * Reads a newline into a text record from the underlying line reader.
 *
 * @param dest Text record to read line into.
 * @param eofOk Whether an EOF is acceptable in this line.
 * @return Returns the number of bytes read.
 *
 * @throws EOFException Throws if eofOk was false and we hit an EOF in
 *   the current line.
 */
private int appendLineInto(final Text dest, final boolean eofOk) throws EOFException, IOException {
    Text buf = new Text();
    int bytesRead = lineReader.readLine(buf, (int) Math.min(maxLineLength, end - start));

    // ok, so first, split/unsplit, compressed/uncompressed notwithstanding,
    // there are three cases we can run into:
    //
    // 1. we read data
    // 2. we are at an acceptable eof/end-of-split and don't read data
    // 3. we are at an unacceptable eof/end-of-split and don't read data
    //
    // cases 1 and 2 are consistent across split/unsplit, compressed/uncompressed.
    //
    // case 3 is simple in the unsplit or uncompressed cases; something has
    // gone wrong, we throw an EOFException, and move on with our lives
    //
    // case 3 is where working with split compressed files gets fun.
    //
    // with the split compression stream, the first time we read past the
    // end of the last compression block within a file split, we get no
    // bytes back. the BZip2Codec and BGZFCodec's actually tell us that
    // we'll get -2 back in this case, but we'll cast a wider net yet.
    //
    // this is important information---if we don't know this, we'll keep reading
    // past the end of the split to the end of the file---but we still need to
    // finish reading our multiline record, so we set some state to let us know
    // that we're reading the last record in the split (endOfCompressedSplit)
    // and repeat the read. if the read fails again, then that means that
    // something has actually gone wrong, and we want to fall through and
    // throw an EOFException or return no bytes read (depending on eofOk).
    // that's why we have the lastReadWasZeroBytes flag around. we set this
    // to true on the first read that gets bytesRead <= 0, and clear it on
    // any read that reads more than 0 bytes.
    if (isSplittable && isCompressed && !lastReadWasZeroBytes && bytesRead <= 0 && !eofOk) {
        // we need to clear the reader state so we can continue reading
        ((ResettableCompressedSplitLineReader) lineReader).reset();

        // set the state to stop us from reading another record and
        // to catch back-to-back failed reads
        lastReadWasZeroBytes = true;
        endOfCompressedSplit = true;

        // recursively call to redo the read
        return appendLineInto(dest, eofOk);
    } else if (bytesRead < 0 || (bytesRead == 0 && !eofOk)) {
        throw new EOFException();
    } else {
        lastReadWasZeroBytes = false;
    }

    dest.append(buf.getBytes(), 0, buf.getLength());
    dest.append(newline, 0, 1);

    if (isSplittable && isCompressed) {
        pos = ((SplitCompressionInputStream) inputStream).getPos();
    } else {
        pos += bytesRead;
    }
    return bytesRead;
}
From source file:org.cloudata.examples.web.DocFreqReduce.java
License:Apache License
public void reduce(WritableComparable key, Iterator<Writable> values,
        OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException {
    if (exception != null) {
        throw exception;
    }
    Text tKey = (Text) key;
    Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, tKey.getLength());

    int docFreq = 0;
    while (values.hasNext()) {
        values.next(); // advance the iterator; only the count of values is needed
        docFreq++;
    }

    Row row = new Row(rowKey);
    try {
        row.addCell("df", new Cell(Cell.Key.EMPTY_KEY, Long.toString(docFreq).getBytes()));
        termTable.put(row);
    } catch (Exception e) {
        LOG.error(e.getMessage(), e);
    }
}