List of usage examples for org.apache.hadoop.io Text append
public void append(byte[] utf8, int start, int len)
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/** * Reads a newline into a text record from the underlying line reader. * * @param dest Text record to read line into. * @param eofOk Whether an EOF is acceptable in this line. * @return Returns the number of bytes read. * * @throws EOFException Throws if eofOk was false and we hit an EOF in * the current line.//from w w w .j a v a 2 s .com */ private int appendLineInto(final Text dest, final boolean eofOk) throws EOFException, IOException { Text buf = new Text(); int bytesRead = lineReader.readLine(buf, (int) Math.min(maxLineLength, end - start)); // ok, so first, split/unsplit, compressed/uncompressed notwithstanding, // there are three cases we can run into: // // 1. we read data // 2. we are at an acceptable eof/end-of-split and don't read data // 3. we are at an unacceptable eof/end-of-split and don't read data // // cases 1 and 2 are consistent across split/unsplit, compressed/uncompressed. // // case 3 is simple in the unsplit or uncompressed cases; something has // gone wrong, we throw an EOFException, and move on with our lives // // case 3 is where working with split compressed files gets fun. // // with the split compression stream, the first time we read past the // end of the last compression block within a file split, we get no // bytes back. the BZip2Codec and BGZFCodec's actually tell us that // we'll get -2 back in this case, but we'll cast a wider net yet. // // this is important information---if we don't know this, we'll keep reading // past the end of the split to the end of the file---but we still need to // finish reading our multiline record, so we set some state to let us know // that we're reading the last record in the split (endOfCompressedSplit) // and repeat the read. if the read fails again, then that means that // something has actually gone wrong, and we want to fall through and // throw an EOFException or return no bytes read (depending on eofOk). // that's why we have the lastReadWasZeroBytes flag around. 
we set this // to true on the first read that gets bytesRead <= 0, and clear it on // any read that reads more than 0 bytes. if (isSplittable && isCompressed && !lastReadWasZeroBytes && bytesRead <= 0 && !eofOk) { // we need to clear the reader state so we can continue reading ((ResettableCompressedSplitLineReader) lineReader).reset(); // set the state to stop us from reading another record and // to catch back-to-back failed reads lastReadWasZeroBytes = true; endOfCompressedSplit = true; // recursively call to redo the read return appendLineInto(dest, eofOk); } else if (bytesRead < 0 || (bytesRead == 0 && !eofOk)) { throw new EOFException(); } else { lastReadWasZeroBytes = false; } dest.append(buf.getBytes(), 0, buf.getLength()); dest.append(newline, 0, 1); if (isSplittable && isCompressed) { pos = ((SplitCompressionInputStream) inputStream).getPos(); } else { pos += bytesRead; } return bytesRead; }
From source file:org.cloudata.core.common.util.CloudataLineReader.java
License:Apache License
/** * Read from the InputStream into the given Text. * /*from w w w. ja v a2s. c om*/ * @param str * the object to store the given line * @return the number of bytes read including the newline * @throws IOException * if the underlying stream throws */ public int readLine(Text str) throws IOException { str.clear(); boolean hadFinalNewline = false; boolean hadFinalReturn = false; boolean hitEndOfFile = false; int startPosn = bufferPosn; outerLoop: while (true) { if (bufferPosn >= bufferLength) { if (!backfill()) { hitEndOfFile = true; break; } } startPosn = bufferPosn; for (; bufferPosn < bufferLength; ++bufferPosn) { switch (buffer[bufferPosn]) { case '\n': hadFinalNewline = true; bufferPosn += 1; break outerLoop; case '\r': if (hadFinalReturn) { // leave this \n in the stream, so we'll get it next time break outerLoop; } hadFinalReturn = true; break; default: if (hadFinalReturn) { break outerLoop; } } } int length = bufferPosn - startPosn - (hadFinalReturn ? 1 : 0); if (length >= 0) { str.append(buffer, startPosn, length); } } int newlineLength = (hadFinalNewline ? 1 : 0) + (hadFinalReturn ? 1 : 0); if (!hitEndOfFile) { int length = bufferPosn - startPosn - newlineLength; if (length > 0) { str.append(buffer, startPosn, length); } } return str.getLength() + newlineLength; }
From source file:org.gestore.hadoop.LongRecordReader.java
License:Apache License
/****** * Gets one complete entry//w w w .j ava2s.c o m */ private int getEntry(Pattern matcherStart, Pattern matcherStop) throws IOException { boolean started = false; boolean done = false; ByteBuffer newLine = ByteBuffer.allocate(2); newLine.putChar('\n'); byte[] newLineBytes = newLine.array(); Text tempLine = new Text(); int totalRead = 0; int newRead = 0; // Discard lines before start record match, save first line that matches regex while (!started) { if (lastLine.getLength() <= 0) { newRead = in.readLine(tempLine, maxLineLength, Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength)); } else { tempLine = lastLine; newRead = lastLine.getLength(); lastLine = new Text(); } if (newRead == 0) { return 0; } totalRead += newRead; Matcher m = matcherStart.matcher(tempLine.toString()); if (m.matches()) { started = true; tempLine.append(newLineBytes, 0, newLineBytes.length); value.append(tempLine.getBytes(), 0, tempLine.getLength()); break; } } // Save lines until end record match, save last line while (!done) { newRead = in.readLine(tempLine, maxLineLength, Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength)); if (newRead == 0) { return totalRead; } totalRead += newRead; Matcher m = matcherStop.matcher(tempLine.toString()); if (m.matches()) { done = true; lastLine = tempLine; return totalRead -= newRead; } tempLine.append(newLineBytes, 0, newLineBytes.length); value.append(tempLine.getBytes(), 0, tempLine.getLength()); } return totalRead; }
From source file:org.gistic.spatialHadoop.NYCTripData.Trip.java
/**
 * Appends the encoded {@code attributes} to the given text and returns it.
 *
 * @param text the text to append to
 * @return the same {@code text} instance, after the append
 */
@Override
public Text toText(Text text) {
    // Encode once, and as UTF-8 (the encoding Hadoop Text stores) rather than
    // the platform default charset.
    // NOTE(review): assumes attributes is a java.lang.String - confirm.
    final byte[] encoded = attributes.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    text.append(encoded, 0, encoded.length);
    return text;
}
From source file:org.hypertable.hadoop.mapred.HypertableRecordReader.java
License:Open Source License
/**
 * Rebuilds {@code key} from the fields that are set on {@code cell_key}.
 *
 * Each set field is re-encoded as UTF-8 into the matching cached byte array
 * (t_timestamp, t_row, t_column_family, t_column_qualifier). If at least one
 * field was set, the key is cleared and rewritten as
 * {@code [timestamp TAB] row TAB family[:qualifier]}; otherwise the key is
 * left as it is.
 *
 * !!
 * If the key format changes, the code which invokes fill_key()
 * will need to be adjusted because it uses a hard-coded length
 * of 24 + cell.key.row.length()!
 *
 * @param key the Text to overwrite
 * @param cell_key the cell key to read fields from
 */
private void fill_key(Text key, Key cell_key) {
    /* XXX not sure if this flag is necessary */
    boolean anyFieldUpdated = false;

    try {
        if (m_include_timestamps && cell_key.isSetTimestamp()) {
            t_timestamp = Long.toString(cell_key.timestamp).getBytes("UTF-8");
            anyFieldUpdated = true;
        }
        if (cell_key.isSetRow()) {
            t_row = cell_key.row.getBytes("UTF-8");
            anyFieldUpdated = true;
        }
        if (cell_key.isSetColumn_family()) {
            t_column_family = cell_key.column_family.getBytes("UTF-8");
            anyFieldUpdated = true;
        }
        if (cell_key.isSetColumn_qualifier()) {
            t_column_qualifier = cell_key.column_qualifier.getBytes("UTF-8");
            anyFieldUpdated = true;
        }
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
        System.exit(-1);
    }

    if (!anyFieldUpdated) {
        return;
    }

    key.clear();
    if (m_include_timestamps) {
        key.append(t_timestamp, 0, t_timestamp.length);
        key.append(tab, 0, tab.length);
    }
    key.append(t_row, 0, t_row.length);
    key.append(tab, 0, tab.length);
    key.append(t_column_family, 0, t_column_family.length);
    // the qualifier, when non-empty, follows the family after a colon
    if (t_column_qualifier.length > 0) {
        key.append(colon, 0, colon.length);
        key.append(t_column_qualifier, 0, t_column_qualifier.length);
    }
}
From source file:org.hypertable.hadoop.mapred.HypertableRecordWriter.java
License:Open Source License
/** * Write data to HT// www .ja v a 2s .c o m */ public void write(Text key, Text value) throws IOException { try { key.append(tab, 0, tab.length); m_line.clear(); m_line.append(key.getBytes(), 0, key.getLength()); m_line.append(value.getBytes(), 0, value.getLength()); int len = m_line.getLength(); int tab_count = 0; int tab_pos = 0; int found = 0; while (found != -1) { found = m_line.find(tab_str, found + 1); if (found > 0) { tab_count++; if (tab_count == 1) tab_pos = found; } } boolean has_timestamp; if (tab_count >= 3) { has_timestamp = true; } else if (tab_count == 2) { has_timestamp = false; } else { throw new Exception("incorrect output line format only " + tab_count + " tabs"); } byte[] byte_array = m_line.getBytes(); int row_offset, row_length; int family_offset = 0, family_length = 0; int qualifier_offset = 0, qualifier_length = 0; int value_offset = 0, value_length = 0; long timestamp = SerializedCellsFlag.AUTO_ASSIGN; int offset = 0; if (has_timestamp) { timestamp = Long.parseLong(m_line.decode(byte_array, 0, tab_pos)); offset = tab_pos + 1; } row_offset = offset; tab_pos = m_line.find(tab_str, offset); row_length = tab_pos - row_offset; offset = tab_pos + 1; family_offset = offset; tab_pos = m_line.find(tab_str, offset); for (int i = family_offset; i < tab_pos; i++) { if (byte_array[i] == ':' && qualifier_offset == 0) { family_length = i - family_offset; qualifier_offset = i + 1; } } // no qualifier if (qualifier_offset == 0) family_length = tab_pos - family_offset; else qualifier_length = tab_pos - qualifier_offset; offset = tab_pos + 1; value_offset = offset; value_length = len - value_offset; if (!mCellsWriter.add(byte_array, row_offset, row_length, byte_array, family_offset, family_length, byte_array, qualifier_offset, qualifier_length, timestamp, byte_array, value_offset, value_length, SerializedCellsFlag.FLAG_INSERT)) { mClient.mutator_set_cells_serialized(mMutator, mCellsWriter.buffer(), false); mCellsWriter.clear(); if ((row_length + 
family_length + qualifier_length + value_length + 32) > mCellsWriter.capacity()) mCellsWriter = new SerializedCellsWriter( row_length + family_length + qualifier_length + value_length + 32); if (!mCellsWriter.add(byte_array, row_offset, row_length, byte_array, family_offset, family_length, byte_array, qualifier_offset, qualifier_length, timestamp, byte_array, value_offset, value_length, SerializedCellsFlag.FLAG_INSERT)) throw new IOException("Unable to add cell to SerializedCellsWriter " + "(row='" + new String(byte_array, row_offset, row_length, "UTF-8") + "'"); } } catch (Exception e) { log.error(e); throw new IOException("Unable to write cell - " + e.toString()); } }
From source file:org.rassee.omniture.hadoop.util.EscapedLineReader.java
License:Open Source License
/** * Read one line from the InputStream into the given Text. A line * can be terminated by one of the following: '\n' (LF), '\r' (CR), * or '\r\n' (CR+LF). Will ignore any of these termination characters * if they are proceeded by a designated escape character. EOF also * terminates an otherwise unterminated line. * * @param str the object to store the given line (without the newline) * @param maxLineLength the maximum number of bytes to store into str; * the rest will be silently discarded. * @param maxBytesToConsume the maximum number of bytes to consume in * this call. This is only a hint, because if the line crosses this * threshold, we allow it to happen. It can overshoot potentially by * as much as one buffer length. * @return the number of bytes read including the (longest) newline * found/* w w w . java 2s. c o m*/ * @throws IOException */ public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from in, but the head of the stream may be * already buffered in buffer, so we have several cases: * 1. No newline characters are in the buffer, so we need to copy * everything and read another buffer from the stream. * 2. An unambiguously terminated line is in buffer, so we just * copy to str. * 3. Ambiguously terminated line is in buffer, i.e. buffer ends * in CR. In this case we copy everything up to CR to str, but * we also need to see what follows CR: if it's LF, then we * need consume LF as well, so next call to readLine will read * from after that. * We use a flag prevCharCR to signal if previous character was CR * and, if it happens to be at the end of the buffer, delay * consuming it until we have a chance to look at the char that * follows. 
*/ str.clear(); int txtLength = 0; // tracks str.getLength() as an optimization int newLineLength = 0; // length of the terminating newline boolean prevCharCR = false; // true if prev char was \r long bytesConsumed = 0; do { int startPos = bufferPos; // starting from where we left off if (bufferPos >= bufferLength) { startPos = bufferPos = 0; if (prevCharCR) ++bytesConsumed; // account for CR from previous read bufferLength = in.read(buffer); if (bufferLength <= 0) break; // EOF } for (; bufferPos < bufferLength; ++bufferPos) { boolean escaped = false; if (prevCharCR && bufferPos > 1) escaped = (buffer[bufferPos - 2] == escapeChar); if (!prevCharCR && bufferPos > 0) escaped = (buffer[bufferPos - 1] == escapeChar); if (buffer[bufferPos] == LF && !escaped) { newLineLength = prevCharCR ? 2 : 1; ++bufferPos; // at next loop proceed from following byte break; } if (prevCharCR && !escaped) { // CR + notLF, we are at notLF newLineLength = 1; break; } prevCharCR = (buffer[bufferPos] == CR); //prevCharCR = (buffer[bufferPos] == CR && !escaped); } int readLength = bufferPos - startPos; if (prevCharCR && newLineLength == 0) --readLength; bytesConsumed += readLength; int appendLength = readLength - newLineLength; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { str.append(buffer, startPos, appendLength); txtLength += appendLength; } } while (newLineLength == 0 && bytesConsumed < maxBytesToConsume); if (bytesConsumed > (long) Integer.MAX_VALUE) throw new IOException("Too many bytes before newline: " + bytesConsumed); return (int) bytesConsumed; }
From source file:org.springframework.yarn.batch.item.LineReader.java
License:Apache License
/** * Read a line terminated by one of CR, LF, or CRLF. *//*from w w w. java 2 s. c om*/ private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* * We're reading data from in, but the head of the stream may be already * buffered in buffer, so we have several cases: 1. No newline * characters are in the buffer, so we need to copy everything and read * another buffer from the stream. 2. An unambiguously terminated line * is in buffer, so we just copy to str. 3. Ambiguously terminated line * is in buffer, i.e. buffer ends in CR. In this case we copy everything * up to CR to str, but we also need to see what follows CR: if it's LF, * then we need consume LF as well, so next call to readLine will read * from after that. We use a flag prevCharCR to signal if previous * character was CR and, if it happens to be at the end of the buffer, * delay consuming it until we have a chance to look at the char that * follows. */ str.clear(); int txtLength = 0; // tracks str.getLength(), as an optimization int newlineLength = 0; // length of terminating newline boolean prevCharCR = false; // true of prev char was CR long bytesConsumed = 0; do { int startPosn = bufferPosn; // starting from where we left off the // last time if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; if (prevCharCR) { ++bytesConsumed; // account for CR from previous read } bufferLength = in.read(buffer); if (bufferLength <= 0) { break; // EOF } } for (; bufferPosn < bufferLength; ++bufferPosn) { // search for // newline if (buffer[bufferPosn] == LF) { newlineLength = (prevCharCR) ? 
2 : 1; ++bufferPosn; // at next invocation proceed from following // byte break; } if (prevCharCR) { // CR + notLF, we are at notLF newlineLength = 1; break; } prevCharCR = (buffer[bufferPosn] == CR); } int readLength = bufferPosn - startPosn; if (prevCharCR && newlineLength == 0) { --readLength; // CR at the end of the buffer } bytesConsumed += readLength; int appendLength = readLength - newlineLength; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { str.append(buffer, startPosn, appendLength); txtLength += appendLength; } } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume); if (bytesConsumed > (long) Integer.MAX_VALUE) { throw new IOException("Too many bytes before newline: " + bytesConsumed); } return (int) bytesConsumed; }
From source file:org.springframework.yarn.batch.item.LineReader.java
License:Apache License
/** * Read a line terminated by a custom delimiter. *//* w w w.java2s . co m*/ private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* * We're reading data from inputStream, but the head of the stream may * be already captured in the previous buffer, so we have several cases: * * 1. The buffer tail does not contain any character sequence which * matches with the head of delimiter. We count it as a ambiguous byte * count = 0 * * 2. The buffer tail contains a X number of characters, that forms a * sequence, which matches with the head of delimiter. We count * ambiguous byte count = X * * // *** eg: A segment of input file is as follows * * " record 1792: I found this bug very interesting and I have * completely read about it. record 1793: This bug can be solved easily * record 1794: This ." * * delimiter = "record"; * * supposing:- String at the end of buffer = * "I found this bug very interesting and I have completely re" There * for next buffer = "ad about it. record 179 ...." * * The matching characters in the input buffer tail and delimiter head = * "re" Therefore, ambiguous byte count = 2 **** // * * 2.1 If the following bytes are the remaining characters of the * delimiter, then we have to capture only up to the starting position * of delimiter. That means, we need not include the ambiguous * characters in str. * * 2.2 If the following bytes are not the remaining characters of the * delimiter ( as mentioned in the example ), then we have to include * the ambiguous characters in str. 
*/ str.clear(); int txtLength = 0; // tracks str.getLength(), as an optimization long bytesConsumed = 0; int delPosn = 0; int ambiguousByteCount = 0; // To capture the ambiguous characters count do { int startPosn = bufferPosn; // Start from previous end position if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; bufferLength = in.read(buffer); if (bufferLength <= 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); break; // EOF } } for (; bufferPosn < bufferLength; ++bufferPosn) { if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) { delPosn++; if (delPosn >= recordDelimiterBytes.length) { bufferPosn++; break; } } else if (delPosn != 0) { bufferPosn--; delPosn = 0; } } int readLength = bufferPosn - startPosn; bytesConsumed += readLength; int appendLength = readLength - delPosn; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { if (ambiguousByteCount > 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); // appending the ambiguous characters (refer case 2.2) bytesConsumed += ambiguousByteCount; ambiguousByteCount = 0; } str.append(buffer, startPosn, appendLength); txtLength += appendLength; } if (bufferPosn >= bufferLength) { if (delPosn > 0 && delPosn < recordDelimiterBytes.length) { ambiguousByteCount = delPosn; bytesConsumed -= ambiguousByteCount; // to be consumed in // next } } } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume); if (bytesConsumed > (long) Integer.MAX_VALUE) { throw new IOException("Too many bytes before delimiter: " + bytesConsumed); } return (int) bytesConsumed; }
From source file:StorageEngineClient.MyLineReader.java
License:Open Source License
/**
 * Reads one line from the input stream into the given Text.
 *
 * An LF always ends the line. When {@code lineendmode} is 0, a CR followed
 * by a byte other than LF also ends the line. When {@code lineendmode} is 2,
 * a CR directly before the LF is not counted in the terminator length.
 *
 * @param str the object the line is stored in; it is cleared first
 * @param maxLineLength the maximum number of bytes stored into str
 * @param maxBytesToConsume scanning stops once at least this many bytes have
 *        been consumed, even if no terminator was found
 * @return the number of bytes consumed
 * @throws IOException if reading from the stream fails, or if more than
 *         Integer.MAX_VALUE bytes were consumed before a newline
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    str.clear();
    int storedLength = 0;      // bytes appended to str so far
    int terminatorLength = 0;  // length of the terminator that ended the line
    boolean sawCR = false;     // true if the previous byte was CR
    long consumed = 0;

    do {
        int chunkStart = bufferPosn;
        if (bufferPosn >= bufferLength) {
            // buffer exhausted: refill from the stream
            chunkStart = bufferPosn = 0;
            if (sawCR) {
                ++consumed; // the CR held back at the end of the previous buffer
            }
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }

        for (; bufferPosn < bufferLength; ++bufferPosn) {
            if (buffer[bufferPosn] == LF) {
                terminatorLength = (sawCR && lineendmode != 2) ? 2 : 1;
                ++bufferPosn;
                break;
            }
            if (sawCR && lineendmode == 0) {
                // CR followed by something other than LF
                terminatorLength = 1;
                break;
            }
            sawCR = (buffer[bufferPosn] == CR);
        }

        int chunkLength = bufferPosn - chunkStart;
        if (sawCR && terminatorLength == 0) {
            --chunkLength; // hold back a CR seen without a terminator yet
        }
        consumed += chunkLength;

        // never store more than maxLineLength bytes in total
        final int appendLength =
                Math.min(chunkLength - terminatorLength, maxLineLength - storedLength);
        if (appendLength > 0) {
            str.append(buffer, chunkStart, appendLength);
            storedLength += appendLength;
        }
    } while (terminatorLength == 0 && consumed < maxBytesToConsume);

    if (consumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + consumed);
    }
    return (int) consumed;
}