Example usage for org.apache.hadoop.io Text append

Introduction

This page collects example usages of the append method of org.apache.hadoop.io.Text.

Prototype

public void append(byte[] utf8, int start, int len)

Source Link

Document

Append a range of bytes to the end of the given text

Usage

From source file:org.bdgenomics.adam.io.FastqRecordReader.java

License:Apache License

/**
 * Reads one line from the underlying line reader and appends it, followed
 * by a single newline byte, to a text record.
 *
 * @param dest Text record to append the line to.
 * @param eofOk Whether an EOF is acceptable in this line.
 * @return Returns the number of bytes read.
 *
 * @throws EOFException Throws if eofOk was false and we hit an EOF in
 *    the current line.
 * @throws IOException Propagated from the underlying reader or stream.
 */
private int appendLineInto(final Text dest, final boolean eofOk) throws EOFException, IOException {
    final Text line = new Text();
    final int bytesRead = lineReader.readLine(line, (int) Math.min(maxLineLength, end - start));
    final boolean compressedSplit = isSplittable && isCompressed;

    // Regardless of split/unsplit and compressed/uncompressed input, a read
    // ends in one of three ways:
    //
    //   1. data was read
    //   2. nothing was read, at an acceptable eof/end-of-split
    //   3. nothing was read, at an unacceptable eof/end-of-split
    //
    // Cases 1 and 2 behave the same everywhere. Case 3 is an EOFException
    // for unsplit or uncompressed input. For split compressed input it
    // needs one extra step: the first read past the last compression block
    // of a file split returns no bytes (BZip2Codec and BGZFCodec report -2,
    // but anything <= 0 is treated alike here). That tells us the split has
    // ended, yet the multiline record in progress must still be completed.
    // So we remember that this is the last record of the split
    // (endOfCompressedSplit), reset the reader, and read once more.
    // lastReadWasZeroBytes is set on that first empty read and cleared by
    // any successful read; if the retry is empty as well, something really
    // is wrong and we fall through to the EOF handling below.
    if (compressedSplit && bytesRead <= 0 && !eofOk && !lastReadWasZeroBytes) {
        // clear the reader state so reading can continue
        ((ResettableCompressedSplitLineReader) lineReader).reset();

        // stop further records from being read and catch back-to-back
        // failed reads
        lastReadWasZeroBytes = true;
        endOfCompressedSplit = true;

        // redo the read
        return appendLineInto(dest, eofOk);
    }

    if (bytesRead < 0 || (bytesRead == 0 && !eofOk)) {
        throw new EOFException();
    }
    lastReadWasZeroBytes = false;

    dest.append(line.getBytes(), 0, line.getLength());
    dest.append(newline, 0, 1);

    if (!compressedSplit) {
        pos += bytesRead;
    } else {
        // for split compressed input the position comes from the stream
        pos = ((SplitCompressionInputStream) inputStream).getPos();
    }

    return bytesRead;
}

From source file:org.cloudata.core.common.util.CloudataLineReader.java

License:Apache License

/**
 * Read from the InputStream into the given Text.
 * /*from   w  w w. ja  v a2s.  c  om*/
 * @param str
 *          the object to store the given line
 * @return the number of bytes read including the newline
 * @throws IOException
 *           if the underlying stream throws
 */
public int readLine(Text str) throws IOException {
    str.clear();
    boolean hadFinalNewline = false;
    boolean hadFinalReturn = false;
    boolean hitEndOfFile = false;
    int startPosn = bufferPosn;
    outerLoop: while (true) {
        if (bufferPosn >= bufferLength) {
            if (!backfill()) {
                hitEndOfFile = true;
                break;
            }
        }
        startPosn = bufferPosn;
        for (; bufferPosn < bufferLength; ++bufferPosn) {
            switch (buffer[bufferPosn]) {
            case '\n':
                hadFinalNewline = true;
                bufferPosn += 1;
                break outerLoop;
            case '\r':
                if (hadFinalReturn) {
                    // leave this \n in the stream, so we'll get it next time
                    break outerLoop;
                }
                hadFinalReturn = true;
                break;
            default:
                if (hadFinalReturn) {
                    break outerLoop;
                }
            }
        }
        int length = bufferPosn - startPosn - (hadFinalReturn ? 1 : 0);
        if (length >= 0) {
            str.append(buffer, startPosn, length);
        }
    }
    int newlineLength = (hadFinalNewline ? 1 : 0) + (hadFinalReturn ? 1 : 0);
    if (!hitEndOfFile) {
        int length = bufferPosn - startPosn - newlineLength;
        if (length > 0) {
            str.append(buffer, startPosn, length);
        }
    }
    return str.getLength() + newlineLength;
}

From source file:org.gestore.hadoop.LongRecordReader.java

License:Apache License

/**
 * Gets one complete entry.
 *
 * Lines are skipped until one matches {@code matcherStart}; that line and
 * every following line are appended to {@code value}, each followed by a
 * single '\n' byte, until a line matches {@code matcherStop} or no more
 * data is read. The stop line itself is not appended; it is kept in
 * {@code lastLine} and becomes the first candidate line of the next call.
 *
 * @param matcherStart pattern a whole line must match to start an entry
 * @param matcherStop pattern a whole line must match to end an entry
 * @return 0 if input ran out before a start line was found; otherwise the
 *         number of bytes counted for this entry, excluding the stop line
 * @throws IOException if reading from the underlying line reader fails
 */
private int getEntry(Pattern matcherStart, Pattern matcherStop) throws IOException {
    // A single LF byte. Building this via ByteBuffer.putChar('\n') produced
    // the two bytes {0x00, 0x0A}, which put a stray NUL before every newline.
    final byte[] newLineBytes = { (byte) '\n' };

    Text tempLine = new Text();
    int totalRead = 0;
    int newRead;

    // Discard lines before start record match, save first line that matches regex
    while (true) {
        if (lastLine.getLength() <= 0) {
            newRead = in.readLine(tempLine, maxLineLength,
                    Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
        } else {
            // reuse the stop line left over from the previous call
            tempLine = lastLine;
            newRead = lastLine.getLength();
            lastLine = new Text();
        }
        if (newRead == 0) {
            return 0;
        }
        totalRead += newRead;
        if (matcherStart.matcher(tempLine.toString()).matches()) {
            tempLine.append(newLineBytes, 0, newLineBytes.length);
            value.append(tempLine.getBytes(), 0, tempLine.getLength());
            break;
        }
    }

    // Save lines until end record match, save last line
    while (true) {
        newRead = in.readLine(tempLine, maxLineLength,
                Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
        if (newRead == 0) {
            return totalRead;
        }
        if (matcherStop.matcher(tempLine.toString()).matches()) {
            // the stop line belongs to the next entry, so it is not counted
            lastLine = tempLine;
            return totalRead;
        }
        totalRead += newRead;
        tempLine.append(newLineBytes, 0, newLineBytes.length);
        value.append(tempLine.getBytes(), 0, tempLine.getLength());
    }
}

From source file:org.gistic.spatialHadoop.NYCTripData.Trip.java

/**
 * Appends the bytes of {@code attributes} to the given text.
 *
 * @param text the text to append to
 * @return the same {@code text} instance, for chaining
 */
@Override
public Text toText(Text text) {
    // fetch the byte array once and append all of it
    final byte[] attributeBytes = attributes.getBytes();
    text.append(attributeBytes, 0, attributeBytes.length);
    return text;
}

From source file:org.hypertable.hadoop.mapred.HypertableRecordReader.java

License:Open Source License

/**
 * Rebuilds {@code key} from the parts of {@code cell_key} that are set.
 *
 * Each part that is set is encoded as UTF-8 into the matching cached
 * {@code t_*} byte array. If at least one part was set, {@code key} is
 * cleared and refilled as: optional timestamp and {@code tab}, then row,
 * {@code tab}, column family, and (when the qualifier is non-empty)
 * {@code colon} and the qualifier. Parts that were not set on this call
 * keep whatever the cached arrays already held.
 *
 * @param key the text to overwrite
 * @param cell_key the cell key to read from
 */
private void fill_key(Text key, Key cell_key) {
    /* XXX not sure if this flag is necessary */
    boolean anyPartSet = false;

    /*
     * !!
     * If the key format changes, the code which invokes fill_key()
     * will need to be adjusted because it uses a hard-coded length
     * of 24 + cell.key.row.length()!
     */

    try {
        if (m_include_timestamps && cell_key.isSetTimestamp()) {
            t_timestamp = Long.toString(cell_key.timestamp).getBytes("UTF-8");
            anyPartSet = true;
        }
        if (cell_key.isSetRow()) {
            t_row = cell_key.row.getBytes("UTF-8");
            anyPartSet = true;
        }
        if (cell_key.isSetColumn_family()) {
            t_column_family = cell_key.column_family.getBytes("UTF-8");
            anyPartSet = true;
        }
        if (cell_key.isSetColumn_qualifier()) {
            t_column_qualifier = cell_key.column_qualifier.getBytes("UTF-8");
            anyPartSet = true;
        }
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
        System.exit(-1);
    }

    // nothing was set on this cell key: leave the output key untouched
    if (!anyPartSet) {
        return;
    }

    key.clear();
    if (m_include_timestamps) {
        key.append(t_timestamp, 0, t_timestamp.length);
        key.append(tab, 0, tab.length);
    }
    key.append(t_row, 0, t_row.length);
    key.append(tab, 0, tab.length);
    key.append(t_column_family, 0, t_column_family.length);
    if (t_column_qualifier.length > 0) {
        key.append(colon, 0, colon.length);
        key.append(t_column_qualifier, 0, t_column_qualifier.length);
    }
}

From source file:org.hypertable.hadoop.mapred.HypertableRecordWriter.java

License:Open Source License

/**
 * Write data to HT.
 *
 * The key and value are joined as {@code key TAB value} and parsed as
 * {@code [timestamp TAB] row TAB family[:qualifier] TAB value}. A line
 * with three or more tabs is taken to start with a timestamp; a line with
 * exactly two has none and gets {@code SerializedCellsFlag.AUTO_ASSIGN}.
 * The parsed cell is added to the serialized cells buffer; when the buffer
 * is full it is sent to the mutator, cleared (and enlarged if the cell
 * would not fit), and the cell is added again.
 *
 * The caller's {@code key} is not modified.
 *
 * @param key row key, optionally preceded by a timestamp, plus column
 * @param value cell value
 * @throws IOException if the line is malformed or the cell cannot be
 *           written; the original failure is attached as the cause
 */
public void write(Text key, Text value) throws IOException {
    try {
        // Build "key TAB value" in the scratch line. The separator goes
        // straight into m_line rather than onto key, so writing the same
        // key object again does not accumulate tabs.
        m_line.clear();
        m_line.append(key.getBytes(), 0, key.getLength());
        m_line.append(tab, 0, tab.length);
        m_line.append(value.getBytes(), 0, value.getLength());
        int len = m_line.getLength();

        // Count tabs (searching from offset 1) and remember the first one.
        int tab_count = 0;
        int tab_pos = 0;
        int found = 0;
        while (found != -1) {
            found = m_line.find(tab_str, found + 1);
            if (found > 0) {
                tab_count++;
                if (tab_count == 1)
                    tab_pos = found;
            }
        }

        boolean has_timestamp;
        if (tab_count >= 3) {
            has_timestamp = true;
        } else if (tab_count == 2) {
            has_timestamp = false;
        } else {
            throw new Exception("incorrect output line format only " + tab_count + " tabs");
        }

        byte[] byte_array = m_line.getBytes();
        int row_offset, row_length;
        int family_offset = 0, family_length = 0;
        int qualifier_offset = 0, qualifier_length = 0;
        int value_offset = 0, value_length = 0;
        long timestamp = SerializedCellsFlag.AUTO_ASSIGN;

        int offset = 0;
        if (has_timestamp) {
            // everything before the first tab is the timestamp
            timestamp = Long.parseLong(m_line.decode(byte_array, 0, tab_pos));
            offset = tab_pos + 1;
        }

        row_offset = offset;
        tab_pos = m_line.find(tab_str, offset);
        row_length = tab_pos - row_offset;

        offset = tab_pos + 1;
        family_offset = offset;

        // split "family[:qualifier]" at the first ':' before the next tab
        tab_pos = m_line.find(tab_str, offset);
        for (int i = family_offset; i < tab_pos; i++) {
            if (byte_array[i] == ':' && qualifier_offset == 0) {
                family_length = i - family_offset;
                qualifier_offset = i + 1;
            }
        }
        // no qualifier
        if (qualifier_offset == 0)
            family_length = tab_pos - family_offset;
        else
            qualifier_length = tab_pos - qualifier_offset;

        // the rest of the line is the value
        offset = tab_pos + 1;
        value_offset = offset;
        value_length = len - value_offset;

        if (!mCellsWriter.add(byte_array, row_offset, row_length, byte_array, family_offset, family_length,
                byte_array, qualifier_offset, qualifier_length, timestamp, byte_array, value_offset,
                value_length, SerializedCellsFlag.FLAG_INSERT)) {
            // buffer full: flush it, then retry with an empty (possibly larger) buffer
            mClient.mutator_set_cells_serialized(mMutator, mCellsWriter.buffer(), false);
            mCellsWriter.clear();
            if ((row_length + family_length + qualifier_length + value_length + 32) > mCellsWriter.capacity())
                mCellsWriter = new SerializedCellsWriter(
                        row_length + family_length + qualifier_length + value_length + 32);
            if (!mCellsWriter.add(byte_array, row_offset, row_length, byte_array, family_offset, family_length,
                    byte_array, qualifier_offset, qualifier_length, timestamp, byte_array, value_offset,
                    value_length, SerializedCellsFlag.FLAG_INSERT))
                throw new IOException("Unable to add cell to SerializedCellsWriter " + "(row='"
                        + new String(byte_array, row_offset, row_length, "UTF-8") + "')");
        }
    } catch (Exception e) {
        log.error(e);
        // keep the original exception as the cause so the stack trace survives
        throw new IOException("Unable to write cell - " + e.toString(), e);
    }
}

From source file:org.rassee.omniture.hadoop.util.EscapedLineReader.java

License:Open Source License

/**
 * Read one line from the InputStream into the given Text. A line
 * can be terminated by one of the following: '\n' (LF), '\r' (CR),
 * or '\r\n' (CR+LF).  Will ignore any of these termination characters
 * if they are proceeded by a designated escape character. EOF also
 * terminates an otherwise unterminated line.
 *
 * @param str               the object to store the given line (without the newline)
 * @param maxLineLength     the maximum number of bytes to store into str;
 *                          the rest will be silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume in
 *                          this call.  This is only a hint, because if the line crosses this
 *                          threshold, we allow it to happen.  It can overshoot potentially by
 *                          as much as one buffer length.
 * @return the number of bytes read including the (longest) newline
 * found/* w  w  w  .  java 2s.  c o m*/
 * @throws IOException
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
    * 1. No newline characters are in the buffer, so we need to copy
    *    everything and read another buffer from the stream.
    * 2. An unambiguously terminated line is in buffer, so we just
    *    copy to str.
    * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
    *    in CR.  In this case we copy everything up to CR to str, but
    *    we also need to see what follows CR: if it's LF, then we
    *    need consume LF as well, so next call to readLine will read
    *    from after that.
    * We use a flag prevCharCR to signal if previous character was CR
    * and, if it happens to be at the end of the buffer, delay
    * consuming it until we have a chance to look at the char that
    * follows.
    */
    str.clear();
    int txtLength = 0; // tracks str.getLength() as an optimization
    int newLineLength = 0; // length of the terminating newline
    boolean prevCharCR = false; // true if prev char was \r
    long bytesConsumed = 0;

    do {
        int startPos = bufferPos; // starting from where we left off
        if (bufferPos >= bufferLength) {
            startPos = bufferPos = 0;
            if (prevCharCR)
                ++bytesConsumed; // account for CR from previous read
            bufferLength = in.read(buffer);
            if (bufferLength <= 0)
                break; // EOF
        }
        for (; bufferPos < bufferLength; ++bufferPos) {
            boolean escaped = false;
            if (prevCharCR && bufferPos > 1)
                escaped = (buffer[bufferPos - 2] == escapeChar);
            if (!prevCharCR && bufferPos > 0)
                escaped = (buffer[bufferPos - 1] == escapeChar);

            if (buffer[bufferPos] == LF && !escaped) {
                newLineLength = prevCharCR ? 2 : 1;
                ++bufferPos; // at next loop proceed from following byte
                break;
            }
            if (prevCharCR && !escaped) { // CR + notLF, we are at notLF
                newLineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPos] == CR);
            //prevCharCR = (buffer[bufferPos] == CR && !escaped);
        }
        int readLength = bufferPos - startPos;
        if (prevCharCR && newLineLength == 0)
            --readLength;
        bytesConsumed += readLength;
        int appendLength = readLength - newLineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPos, appendLength);
            txtLength += appendLength;
        }
    } while (newLineLength == 0 && bytesConsumed < maxBytesToConsume);

    if (bytesConsumed > (long) Integer.MAX_VALUE)
        throw new IOException("Too many bytes before newline: " + bytesConsumed);

    return (int) bytesConsumed;
}

From source file:org.springframework.yarn.batch.item.LineReader.java

License:Apache License

/**
 * Read a line terminated by one of CR, LF, or CRLF.
 *
 * @param str the text that receives the line, without its terminator
 * @param maxLineLength the maximum number of bytes stored into str; bytes
 *          beyond that are consumed but not stored
 * @param maxBytesToConsume reading stops once at least this many bytes
 *          have been consumed, even if no terminator was found
 * @return the number of bytes consumed, including the terminator
 * @throws IOException if the underlying stream throws, or if more than
 *           Integer.MAX_VALUE bytes were consumed
 */
private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /*
     * Data comes from in, but the head of the stream may already sit in
     * buffer. Three situations arise:
     *
     * 1. The buffer holds no newline characters: copy everything and read
     *    another buffer from the stream.
     * 2. The buffer holds an unambiguously terminated line: copy it to str.
     * 3. The buffer ends in CR, which is ambiguous: copy everything up to
     *    the CR, then look at the next byte. If it is LF it is consumed
     *    too, so the next call starts after it.
     *
     * sawCR records that the previous byte was CR; when that CR is the
     * last byte of the buffer, counting it is delayed until the following
     * byte has been seen.
     */
    str.clear();
    int storedLength = 0; // mirrors str.getLength()
    int terminatorLength = 0; // 0 until a line terminator is found
    boolean sawCR = false;
    long consumed = 0;
    do {
        int chunkStart = bufferPosn; // continue where the last pass stopped
        if (bufferPosn >= bufferLength) {
            chunkStart = bufferPosn = 0;
            if (sawCR) {
                ++consumed; // the CR held back from the previous buffer
            }
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }
        // scan for a terminator
        while (bufferPosn < bufferLength) {
            if (buffer[bufferPosn] == LF) {
                terminatorLength = sawCR ? 2 : 1;
                ++bufferPosn; // the next call starts after the LF
                break;
            }
            if (sawCR) { // CR followed by something other than LF
                terminatorLength = 1;
                break;
            }
            sawCR = (buffer[bufferPosn] == CR);
            ++bufferPosn;
        }
        int chunkLength = bufferPosn - chunkStart;
        if (sawCR && terminatorLength == 0) {
            --chunkLength; // CR at the end of the buffer
        }
        consumed += chunkLength;
        // store the chunk minus its terminator, capped at maxLineLength overall
        int toAppend = Math.min(chunkLength - terminatorLength, maxLineLength - storedLength);
        if (toAppend > 0) {
            str.append(buffer, chunkStart, toAppend);
            storedLength += toAppend;
        }
    } while (terminatorLength == 0 && consumed < maxBytesToConsume);

    if (consumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + consumed);
    }
    return (int) consumed;
}

From source file:org.springframework.yarn.batch.item.LineReader.java

License:Apache License

/**
 * Read a line terminated by a custom delimiter.
 *
 * @param str the text that receives the record, without the delimiter
 * @param maxLineLength the maximum number of bytes stored into str
 * @param maxBytesToConsume reading stops once at least this many bytes
 *          have been consumed, even if no delimiter was found
 * @return the number of bytes consumed
 * @throws IOException if the underlying stream throws, or if more than
 *           Integer.MAX_VALUE bytes were consumed
 */
private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /*
     * We're reading data from inputStream, but the head of the stream may
     * be already captured in the previous buffer, so we have several cases:
     *
     * 1. The buffer tail does not contain any character sequence which
     * matches the head of the delimiter. We count it as ambiguous byte
     * count = 0.
     *
     * 2. The buffer tail contains X characters that form a sequence which
     * matches the head of the delimiter. We count ambiguous byte
     * count = X.
     *
     * Example: a segment of the input file is as follows
     *
     * " record 1792: I found this bug very interesting and I have
     * completely read about it. record 1793: This bug can be solved easily
     * record 1794: This ."
     *
     * delimiter = "record";
     *
     * Suppose the string at the end of the buffer is
     * "I found this bug very interesting and I have completely re".
     * Then the next buffer is "ad about it. record 179       ....".
     *
     * The matching characters in the input buffer tail and delimiter head
     * are "re". Therefore, ambiguous byte count = 2.
     *
     * 2.1 If the following bytes are the remaining characters of the
     * delimiter, then we have to capture only up to the starting position
     * of the delimiter. That means we need not include the ambiguous
     * characters in str.
     *
     * 2.2 If the following bytes are not the remaining characters of the
     * delimiter (as in the example), then we have to include the
     * ambiguous characters in str.
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    long bytesConsumed = 0;
    int delPosn = 0; // number of delimiter bytes matched so far
    int ambiguousByteCount = 0; // To capture the ambiguous characters count
    do {
        int startPosn = bufferPosn; // Start from previous end position
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                // EOF: a partial delimiter match held back from the
                // previous buffer turned out to be ordinary data
                str.append(recordDelimiterBytes, 0, ambiguousByteCount);
                break; // EOF
            }
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) {
            if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
                delPosn++;
                if (delPosn >= recordDelimiterBytes.length) {
                    // whole delimiter matched; continue after it next time
                    bufferPosn++;
                    break;
                }
            } else if (delPosn != 0) {
                // partial match broken: step back one position so that,
                // after the loop increment, this byte is compared again
                // against the first delimiter byte
                bufferPosn--;
                delPosn = 0;
            }
        }
        int readLength = bufferPosn - startPosn;
        bytesConsumed += readLength;
        // leave out the delimiter bytes matched so far
        int appendLength = readLength - delPosn;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            if (ambiguousByteCount > 0) {
                str.append(recordDelimiterBytes, 0, ambiguousByteCount);
                // appending the ambiguous characters (refer case 2.2)
                bytesConsumed += ambiguousByteCount;
                ambiguousByteCount = 0;
            }
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
        if (bufferPosn >= bufferLength) {
            if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
                // the buffer ended in the middle of a possible delimiter:
                // remember how many bytes are undecided and do not count
                // them as consumed until the next pass resolves them
                ambiguousByteCount = delPosn;
                bytesConsumed -= ambiguousByteCount; // to be consumed in
                // next
            }
        }
    } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume);
    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}

From source file:StorageEngineClient.MyLineReader.java

License:Open Source License

/**
 * Reads one line from the stream into the given text.
 *
 * LF always ends the line. When the previous byte was CR, the terminator
 * is counted as two bytes (CR+LF) unless {@code lineendmode} is 2, in
 * which case only the LF is counted and the CR stays in the line. A CR
 * followed by any other byte ends the line only when {@code lineendmode}
 * is 0.
 *
 * @param str the text that receives the line, without its terminator
 * @param maxLineLength the maximum number of bytes stored into str
 * @param maxBytesToConsume reading stops once at least this many bytes
 *          have been consumed, even if no terminator was found
 * @return the number of bytes consumed, including the terminator
 * @throws IOException if the underlying stream throws, or if more than
 *           Integer.MAX_VALUE bytes were consumed
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    str.clear();
    int storedLength = 0; // mirrors str.getLength()
    int terminatorLength = 0; // 0 until a line terminator is found
    boolean sawCR = false; // previous byte was CR
    long consumed = 0;
    do {
        int chunkStart = bufferPosn;
        if (bufferPosn >= bufferLength) {
            // buffer used up: refill it from the stream
            chunkStart = bufferPosn = 0;
            if (sawCR) {
                ++consumed; // the CR held back from the previous buffer
            }
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }
        while (bufferPosn < bufferLength) {
            if (buffer[bufferPosn] == LF) {
                terminatorLength = (sawCR && lineendmode != 2) ? 2 : 1;
                ++bufferPosn; // the next call starts after the LF
                break;
            }
            if (sawCR && lineendmode == 0) {
                // CR followed by something other than LF
                terminatorLength = 1;
                break;
            }
            sawCR = (buffer[bufferPosn] == CR);
            ++bufferPosn;
        }
        int chunkLength = bufferPosn - chunkStart;
        if (sawCR && terminatorLength == 0) {
            --chunkLength; // trailing CR is not counted yet
        }
        consumed += chunkLength;
        // store the chunk minus its terminator, capped at maxLineLength overall
        int toAppend = Math.min(chunkLength - terminatorLength, maxLineLength - storedLength);
        if (toAppend > 0) {
            str.append(buffer, chunkStart, toAppend);
            storedLength += toAppend;
        }
    } while (terminatorLength == 0 && consumed < maxBytesToConsume);

    if (consumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + consumed);
    }
    return (int) consumed;
}