List of usage examples for the org.apache.hadoop.io.Text method clear()
public void clear()
From source file:accumulo.ingest.AbstractAccumuloCsvIngest.java
License:Apache License
/**
 * Rebuilds the row id held in {@code buffer}: the valid bytes of {@code fileName}
 * followed by the encoded form of {@code recordCount}.
 *
 * @param buffer      reusable output; whatever it held before is discarded
 * @param fileName    supplies the row id prefix (only its first {@code getLength()} bytes are used)
 * @param recordCount record number; its encoding, produced by {@code lex}, becomes the suffix
 */
protected void setRowId(Text buffer, Text fileName, long recordCount) {
    // Encode before touching the buffer so the statement order matches the original.
    final byte[] encodedCount = lex.encode(recordCount);

    buffer.clear();

    // The file name is read only after the clear, exactly as before.
    final byte[] nameBytes = fileName.getBytes();
    buffer.append(nameBytes, 0, fileName.getLength());
    buffer.append(encodedCount, 0, encodedCount.length);
}
From source file:brush.FastqRecordReader.java
License:Apache License
/** * Parses a read from an interleaved FASTQ file. * * Only reads a single record.//from w ww .j av a 2 s. c o m * * @param readName Text record containing read name. Output parameter. * @param value Text record containing full record. Output parameter. * @return Returns true if read was successful (did not hit EOF). * * @throws RuntimeException Throws exception if FASTQ record doesn't * have proper formatting (e.g., record doesn't start with @). */ protected boolean lowLevelFastqRead(Text readName, Text value) throws IOException { // ID line readName.clear(); long skipped = appendLineInto(readName, true); pos += skipped; if (skipped == 0) { return false; // EOF } if (readName.getBytes()[0] != '@') { throw new RuntimeException("unexpected fastq record didn't start with '@' at " + makePositionMessage() + ". Line: " + readName + ". \n"); } value.append(readName.getBytes(), 0, readName.getLength()); // sequence appendLineInto(value, false); // separator line appendLineInto(value, false); // quality appendLineInto(value, false); return true; }
From source file:cn.ac.ncic.mastiff.io.coding.ORCStringEcnodingUtil.java
License:Apache License
public String readEachValue(Text previous) throws IOException { Text result = null; int entry = (int) reader.next(); if (previous == null) { result = new Text(); } else {/*from ww w.j a v a2 s.c om*/ result = (Text) previous; } int offset = dictionaryOffsets[entry]; int length; // if it isn't the last entry, subtract the offsets otherwise use // the buffer length. if (entry < dictionaryOffsets.length - 1) { length = dictionaryOffsets[entry + 1] - offset; } else { length = dictionaryBuffer.size() - offset; } // If the column is just empty strings, the size will be zero, // so the buffer will be null, in that case just return result // as it will default to empty if (dictionaryBuffer != null) { dictionaryBuffer.setText(result, offset, length); } else { result.clear(); } return result.toString(); }
From source file:cn.ac.ncic.mastiff.io.coding.RedBlackTreeStringReader.java
License:Apache License
public String readEachValue(Text previous) throws IOException { Text result = null; int entry = (int) reader.next(); if (previous == null) { result = new Text(); } else {/*from w w w .java2 s . c o m*/ result = (Text) previous; } int offset = dictionaryOffsets[entry]; int length; if (entry < dictionaryOffsets.length - 1) { length = dictionaryOffsets[entry + 1] - offset; } else { length = dictionaryBuffer.size() - offset; } // If the column is just empty strings, the size will be zero, // so the buffer will be null, in that case just return result // as it will default to empty if (dictionaryBuffer != null) { dictionaryBuffer.setText(result, offset, length); } else { result.clear(); } // } return result.toString(); }
From source file:com.alexholmes.hadooputils.sort.DelimitedLineReader.java
License:Apache License
/** * Read a line terminated by one of CR, LF, or CRLF. *//* w ww . j a v a2 s. c om*/ private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from in, but the head of the stream may be * already buffered in buffer, so we have several cases: * 1. No newline characters are in the buffer, so we need to copy * everything and read another buffer from the stream. * 2. An unambiguously terminated line is in buffer, so we just * copy to str. * 3. Ambiguously terminated line is in buffer, i.e. buffer ends * in CR. In this case we copy everything up to CR to str, but * we also need to see what follows CR: if it's LF, then we * need consume LF as well, so next call to readLine will read * from after that. * We use a flag prevCharCR to signal if previous character was CR * and, if it happens to be at the end of the buffer, delay * consuming it until we have a chance to look at the char that * follows. */ str.clear(); int txtLength = 0; //tracks str.getLength(), as an optimization int newlineLength = 0; //length of terminating newline boolean prevCharCR = false; //true of prev char was CR long bytesConsumed = 0; do { int startPosn = bufferPosn; //starting from where we left off the last time if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; if (prevCharCR) { ++bytesConsumed; //account for CR from previous read } bufferLength = fillBuffer(in, buffer, prevCharCR); if (bufferLength <= 0) { break; // EOF } } for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline if (buffer[bufferPosn] == LF) { newlineLength = (prevCharCR) ? 
2 : 1; ++bufferPosn; // at next invocation proceed from following byte break; } if (prevCharCR) { //CR + notLF, we are at notLF newlineLength = 1; break; } prevCharCR = (buffer[bufferPosn] == CR); } int readLength = bufferPosn - startPosn; if (prevCharCR && newlineLength == 0) { --readLength; //CR at the end of the buffer } bytesConsumed += readLength; int appendLength = readLength - newlineLength; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { str.append(buffer, startPosn, appendLength); txtLength += appendLength; } } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume); if (bytesConsumed > (long) Integer.MAX_VALUE) { throw new IOException("Too many bytes before newline: " + bytesConsumed); } return (int) bytesConsumed; }
From source file:com.alexholmes.hadooputils.sort.DelimitedLineReader.java
License:Apache License
/** * Read a line terminated by a custom delimiter. *///from ww w .ja v a 2 s. c o m private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from inputStream, but the head of the stream may be * already captured in the previous buffer, so we have several cases: * * 1. The buffer tail does not contain any character sequence which * matches with the head of delimiter. We count it as a * ambiguous byte count = 0 * * 2. The buffer tail contains a X number of characters, * that forms a sequence, which matches with the * head of delimiter. We count ambiguous byte count = X * * // *** eg: A segment of input file is as follows * * " record 1792: I found this bug very interesting and * I have completely read about it. record 1793: This bug * can be solved easily record 1794: This ." * * delimiter = "record"; * * supposing:- String at the end of buffer = * "I found this bug very interesting and I have completely re" * There for next buffer = "ad about it. record 179 ...." * * The matching characters in the input * buffer tail and delimiter head = "re" * Therefore, ambiguous byte count = 2 **** // * * 2.1 If the following bytes are the remaining characters of * the delimiter, then we have to capture only up to the starting * position of delimiter. That means, we need not include the * ambiguous characters in str. * * 2.2 If the following bytes are not the remaining characters of * the delimiter ( as mentioned in the example ), * then we have to include the ambiguous characters in str. 
*/ str.clear(); int txtLength = 0; // tracks str.getLength(), as an optimization long bytesConsumed = 0; int delPosn = 0; int ambiguousByteCount = 0; // To capture the ambiguous characters count do { int startPosn = bufferPosn; // Start from previous end position if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0); if (bufferLength <= 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); break; // EOF } } for (; bufferPosn < bufferLength; ++bufferPosn) { if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) { delPosn++; if (delPosn >= recordDelimiterBytes.length) { bufferPosn++; break; } } else if (delPosn != 0) { bufferPosn--; delPosn = 0; } } int readLength = bufferPosn - startPosn; bytesConsumed += readLength; int appendLength = readLength - delPosn; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { if (ambiguousByteCount > 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); //appending the ambiguous characters (refer case 2.2) bytesConsumed += ambiguousByteCount; ambiguousByteCount = 0; } str.append(buffer, startPosn, appendLength); txtLength += appendLength; } if (bufferPosn >= bufferLength) { if (delPosn > 0 && delPosn < recordDelimiterBytes.length) { ambiguousByteCount = delPosn; bytesConsumed -= ambiguousByteCount; //to be consumed in next } } } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume); if (bytesConsumed > (long) Integer.MAX_VALUE) { throw new IOException("Too many bytes before delimiter: " + bytesConsumed); } return (int) bytesConsumed; }
From source file:com.asakusafw.runtime.io.sequencefile.SequenceFileUtilTest.java
License:Apache License
/** * Reads a sequence file./*from w w w . ja v a 2s . c o m*/ * @throws Exception if failed */ @Test public void read() throws Exception { Path path = new Path("testing"); Text key = new Text(); Text value = new Text(); try (SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass())) { key.set("Hello"); value.set("World"); writer.append(key, value); } key.clear(); value.clear(); FileStatus status = fs.getFileStatus(path); try (InputStream in = new FileInputStream(fs.pathToFile(path)); SequenceFile.Reader reader = SequenceFileUtil.openReader(in, status, conf)) { assertThat(reader.next(key, value), is(true)); assertThat(key.toString(), is("Hello")); assertThat(value.toString(), is("World")); assertThat(reader.next(key, value), is(false)); } }
From source file:com.asakusafw.runtime.io.sequencefile.SequenceFileUtilTest.java
License:Apache License
/** * Reads a sequence file./*from w w w. j a v a 2 s . com*/ * @throws Exception if failed */ @Test public void read_new() throws Exception { Path path = new Path("testing"); Text key = new Text(); Text value = new Text(); try (SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass())) { key.set("Hello"); value.set("World"); writer.append(key, value); } key.clear(); value.clear(); FileStatus status = fs.getFileStatus(path); try (InputStream in = new FileInputStream(fs.pathToFile(path)); SequenceFile.Reader reader = SequenceFileUtil.openReader(in, status.getLen(), conf)) { assertThat(reader.next(key, value), is(true)); assertThat(key.toString(), is("Hello")); assertThat(value.toString(), is("World")); assertThat(reader.next(key, value), is(false)); } }
From source file:com.asakusafw.runtime.io.sequencefile.SequenceFileUtilTest.java
License:Apache License
/** * Creates a sequence file./*from w w w . j a v a 2 s. c om*/ * @throws Exception if failed */ @Test public void write() throws Exception { Path path = new Path("testing"); Text key = new Text(); Text value = new Text(); try (OutputStream out = new FileOutputStream(fs.pathToFile(path)); SequenceFile.Writer writer = SequenceFileUtil.openWriter(new BufferedOutputStream(out), conf, key.getClass(), value.getClass(), null)) { key.set("Hello"); value.set("World"); writer.append(key, value); } key.clear(); value.clear(); try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf)) { assertThat(reader.next(key, value), is(true)); assertThat(key.toString(), is("Hello")); assertThat(value.toString(), is("World")); assertThat(reader.next(key, value), is(false)); } }
From source file:com.ashishpaliwal.hadoop.utils.inputformat.CsvLineReader.java
License:Apache License
/** * Read from the InputStream into the given Text. * * @param txt the object to store the given line * @param maxLineLength the maximum number of bytes to store into txt. * @param maxBytesToConsume the maximum number of bytes to consume in this * call.//from w w w .j a v a2s . c om * @return the number of bytes read including the newline * @throws IOException if the underlying stream throws */ public int readLine(Text txt, int maxLineLength, int maxBytesToConsume) throws IOException { txt.clear(); boolean hadFinalNewline = false; boolean hadFinalReturn = false; boolean hitEndOfFile = false; int startPosn = bufferPosn; long bytesConsumed = 0; boolean inQuote = false; boolean isLastCharEscapeChar = false; outerLoop: while (true) { if (bufferPosn >= bufferLength) { if (!backfill()) { hitEndOfFile = true; break; } } startPosn = bufferPosn; for (; bufferPosn < bufferLength; ++bufferPosn) { switch (buffer[bufferPosn]) { case '\\': isLastCharEscapeChar = !isLastCharEscapeChar; break; case '"': if (!inQuote && hadFinalReturn) { break outerLoop; } if (!isLastCharEscapeChar) { inQuote = !inQuote; } isLastCharEscapeChar = false; break; case '\n': isLastCharEscapeChar = false; if (!inQuote) { hadFinalNewline = true; bufferPosn += 1; break outerLoop; } break; case '\r': isLastCharEscapeChar = false; if (!inQuote) { if (hadFinalReturn) { // leave this \r in the stream, so we'll get it next time break outerLoop; } hadFinalReturn = true; } break; default: isLastCharEscapeChar = false; if (!inQuote && hadFinalReturn) { break outerLoop; } } } bytesConsumed += bufferPosn - startPosn; int length = bufferPosn - startPosn - (hadFinalReturn ? 1 : 0); length = Math.min(length, maxLineLength - txt.getLength()); if (length >= 0) txt.append(buffer, startPosn, length); if (bytesConsumed >= maxBytesToConsume) return (int) Math.min(bytesConsumed, (long) Integer.MAX_VALUE); } int newlineLength = (hadFinalNewline ? 1 : 0) + (hadFinalReturn ? 
1 : 0); if (!hitEndOfFile) { bytesConsumed += bufferPosn - startPosn; int length = bufferPosn - startPosn - newlineLength; length = Math.min(length, maxLineLength - txt.getLength()); if (length > 0) txt.append(buffer, startPosn, length); } return (int) Math.min(bytesConsumed, (long) Integer.MAX_VALUE); }