List of usage examples for org.apache.hadoop.io Text append
public void append(byte[] utf8, int start, int len)
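append copies len bytes of UTF-8 encoded data from utf8, starting at offset start, onto the end of the Text's current contents. Below is a minimal self-contained sketch of the call; the class and variable names are illustrative only (they do not come from the sources listed further down), and it assumes hadoop-common is on the classpath.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.Text;

public class TextAppendExample {
    public static void main(String[] args) {
        Text t = new Text("foo");
        byte[] suffix = "-bar".getBytes(StandardCharsets.UTF_8);
        // copy suffix[0..suffix.length) onto the end of t's current value
        t.append(suffix, 0, suffix.length);
        System.out.println(t); // prints "foo-bar"
    }
}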
From source file:com.thinkbiganalytics.inputformat.hadoop.mapred.EscapedLineReader.java
License:Open Source License
/**
 * Read one line from the InputStream into the given Text. A line
 * can be terminated by one of the following: '\n' (LF), '\r' (CR),
 * or '\r\n' (CR+LF). Will ignore any of these termination characters
 * if they are preceded by a designated escape character. EOF also
 * terminates an otherwise unterminated line.
 *
 * @param str the object to store the given line (without the newline)
 * @param maxLineLength the maximum number of bytes to store into str; the rest will be silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume in this call. This is only a hint,
 *        because if the line crosses this threshold, we allow it to happen. It can overshoot
 *        potentially by as much as one buffer length.
 * @return the number of bytes read including the (longest) newline found
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
     * 1. No newline characters are in the buffer, so we need to copy
     *    everything and read another buffer from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just
     *    copy to str.
     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
     *    in CR. In this case we copy everything up to CR to str, but
     *    we also need to see what follows CR: if it's LF, then we
     *    need consume LF as well, so next call to readLine will read
     *    from after that.
     * We use a flag prevCharCR to signal if previous character was CR
     * and, if it happens to be at the end of the buffer, delay
     * consuming it until we have a chance to look at the char that
     * follows.
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength() as an optimization
    int newLineLength = 0; // length of the terminating newline
    boolean prevCharCR = false; // true if prev char was \r
    long bytesConsumed = 0;
    do {
        int startPos = bufferPos; // starting from where we left off
        if (bufferPos >= bufferLength) {
            startPos = bufferPos = 0;
            if (prevCharCR) {
                ++bytesConsumed; // account for CR from previous read
            }
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }
        for (; bufferPos < bufferLength; ++bufferPos) {
            boolean escaped = false;
            if (prevCharCR && bufferPos > 1) {
                escaped = (buffer[bufferPos - 2] == escapeChar);
            }
            if (!prevCharCR && bufferPos > 0) {
                escaped = (buffer[bufferPos - 1] == escapeChar);
            }
            if (buffer[bufferPos] == LF && !escaped) {
                newLineLength = prevCharCR ? 2 : 1;
                ++bufferPos; // at next loop proceed from following byte
                break;
            }
            if (prevCharCR && !escaped) { // CR + notLF, we are at notLF
                newLineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPos] == CR);
            //prevCharCR = (buffer[bufferPos] == CR && !escaped);
        }
        int readLength = bufferPos - startPos;
        if (prevCharCR && newLineLength == 0) {
            --readLength; // CR at the end of the buffer
        }
        bytesConsumed += readLength;
        int appendLength = readLength - newLineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPos, appendLength);
            txtLength += appendLength;
        }
    } while (newLineLength == 0 && bytesConsumed < maxBytesToConsume);
    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}
From source file:cosmos.impl.CosmosImpl.java
License:Apache License
protected Mutation getDocumentPrefix(Store id, Record<?> record, byte[] suffix) {
    final Text t = new Text();
    byte[] b = id.uuid().getBytes();
    t.append(b, 0, b.length);
    t.append(new byte[] { 0 }, 0, 1); // zero byte separates the uuid from the suffix
    t.append(suffix, 0, suffix.length);
    return new Mutation(t);
}
From source file:cosmos.mapred.AggregatingRecordReader.java
License:Apache License
private void textAppend(Text t, String s) throws IOException {
    try {
        ByteBuffer buf = Text.encode(s, false);
        t.append(buf.array(), 0, buf.limit());
    } catch (CharacterCodingException e) {
        throw new IOException(e);
    }
}
From source file:cosmos.mapred.LfLineReader.java
License:Apache License
/**
 * Read one line from the InputStream into the given Text. A line can be terminated by '\n' (LF).
 * EOF also terminates an otherwise unterminated line.
 *
 * @param str
 *          the object to store the given line (without newline)
 * @param maxLineLength
 *          the maximum number of bytes to store into str; the rest of the line is silently discarded.
 * @param maxBytesToConsume
 *          the maximum number of bytes to consume in this call. This is only a hint, because if the
 *          line crosses this threshold, we allow it to happen. It can overshoot potentially by as much
 *          as one buffer length.
 *
 * @return the number of bytes read including the (longest) newline found.
 *
 * @throws IOException
 *           if the underlying stream throws
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /*
     * We're reading data from in, but the head of the stream may be already buffered in buffer, so we
     * have several cases:
     * 1. No newline characters are in the buffer, so we need to copy everything and read another buffer
     *    from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just copy to str.
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    int newlineLength = 0; // length of terminating newline
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; // starting from where we left off the last time
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0)
                break; // EOF
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline
            if (buffer[bufferPosn] == LF) {
                newlineLength = 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
            }
        }
        int readLength = bufferPosn - startPosn;
        bytesConsumed += readLength;
        int appendLength = readLength - newlineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);
    if (bytesConsumed > Integer.MAX_VALUE)
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    return (int) bytesConsumed;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step2DistinctDataJobTest.java
License:Apache License
@Test
public void testSplit() throws Exception {
    Text key = new Text("123_456789");

    // hard-split using array copy
    int i = key.find("_", 0);

    Text outputKey = new Text("");
    // note: getBytes() returns the backing array, whose length can exceed getLength()
    byte[] bytes = key.getBytes();
    outputKey.append(bytes, i + 1, bytes.length - i - 2);

    String fileName = new String(bytes, 0, i);

    assertEquals("123", fileName);
    assertEquals("456789", outputKey.toString());
}
From source file:eastcircle.terasort.TotalOrderPartitioner.java
License:Apache License
/**
 * Given a sorted set of cut points, build a trie that will find the correct
 * partition quickly.
 * @param splits the list of cut points
 * @param lower the lower bound of partitions 0..numPartitions-1
 * @param upper the upper bound of partitions 0..numPartitions-1
 * @param prefix the prefix that we have already checked against
 * @param maxDepth the maximum depth we will build a trie for
 * @return the trie node that will divide the splits correctly
 */
private static TrieNode buildTrie(Text[] splits, int lower, int upper, Text prefix, int maxDepth) {
    int depth = prefix.getLength();
    if (depth >= maxDepth || lower == upper) {
        return new LeafTrieNode(depth, splits, lower, upper);
    }
    InnerTrieNode result = new InnerTrieNode(depth);
    Text trial = new Text(prefix);
    // append an extra byte on to the prefix
    trial.append(new byte[1], 0, 1);
    int currentBound = lower;
    for (int ch = 0; ch < 255; ++ch) {
        trial.getBytes()[depth] = (byte) (ch + 1);
        lower = currentBound;
        while (currentBound < upper) {
            if (splits[currentBound].compareTo(trial) >= 0) {
                break;
            }
            currentBound += 1;
        }
        trial.getBytes()[depth] = (byte) ch;
        result.child[ch] = buildTrie(splits, lower, currentBound, trial, maxDepth);
    }
    // pick up the rest
    trial.getBytes()[depth] = (byte) 255;
    result.child[255] = buildTrie(splits, currentBound, upper, trial, maxDepth);
    return result;
}
From source file:edu.cshl.schatz.jnomics.util.TextCutter.java
License:Open Source License
/**
 * Sets the contents of <code>text</code> to the value of the requested cuts
 * (including any intermediate delimiters).
 * <p>
 * Negative cut indices are interpreted as
 * <code>{@link #getCutCount()} + cutIndex</code>. For example, a
 * <code>cutIndex</code> of -1 would return the last cut.
 * <p>
 * If <code>lastIndex < firstIndex</code> (after converting negative indices
 * to their positive equivalents), then the resulting cut order is reversed.
 * For example, given the input "0 1 2 3 4", <code>getCutRange(4,2)</code>
 * and <code>getCutRange(-1,-3)</code> would both return "4 3 2".
 *
 * @param text The {@link Text} instance to reset.
 * @param firstIndex The 0-based index of the first desired cut in the range
 *            (inclusive).
 * @param lastIndex The 0-based index of the last desired cut in the range
 *            (inclusive).
 * @return The passed {@link Text} instance text (not a copy).
 * @throws ArrayIndexOutOfBoundsException if cutIndex is greater than or
 *             equal to the number of cuts.
 */
public Text getCutRange(Text text, int firstIndex, int lastIndex) {
    if (modFlag) {
        reinitialize();
    }
    if (firstIndex < 0) {
        firstIndex = cutCount + firstIndex;
    }
    if (lastIndex < 0) {
        lastIndex = cutCount + lastIndex;
    }
    if (lastIndex >= cutCount) {
        throw new ArrayIndexOutOfBoundsException("Requested cut does not exist (cutCount=" + cutCount + ")");
    }
    int position, length;
    if (firstIndex <= lastIndex) {
        position = cutIndices[firstIndex][0];
        length = lastIndex - firstIndex;
        for (int i = firstIndex; i <= lastIndex; i++) {
            length += cutIndices[i][1];
        }
        text.set(sourceText.getBytes(), position, length);
    } else {
        final byte[] delimBytes = new byte[] { (byte) delimiterChar };
        position = cutIndices[firstIndex][0];
        length = cutIndices[firstIndex][1];
        text.set(sourceText.getBytes(), position, length);
        for (int i = firstIndex - 1; i >= lastIndex; i--) {
            position = cutIndices[i][0];
            length = cutIndices[i][1];
            text.append(delimBytes, 0, 1);
            text.append(sourceText.getBytes(), position, length);
        }
    }
    return text;
}
From source file:edu.isi.mavuno.extract.ChunkExtractor.java
License:Apache License
private void loadChunkPairs() {
    // clear chunk pairs
    mChunkPairs.clear();

    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

    // extract chunks from sentence
    mChunks.clear();
    mChunkTokens.clear();

    List<TratzParsedTokenWritable> tokens = sentence.getTokens();
    Text lastNETag = new Text();
    for (int i = 0; i < tokens.size(); i++) {
        TratzParsedTokenWritable t = tokens.get(i);
        byte chunkType = t.getChunkTag().getLength() > 0 ? t.getChunkTag().getBytes()[0] : 0;
        Text neTag = t.getNETag();
        if (neTag.compareTo(lastNETag.getBytes(), 0, lastNETag.getLength()) != 0
                || (neTag.getLength() == 1 && (neTag.getLength() > 0 && neTag.getBytes()[0] == 'O'))
                        && (chunkType == 'B' || chunkType == 'O')) {
            if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') {
                Text chunk = createChunk(mChunkTokens, mChunkType);
                mChunks.add(chunk);
            }
            mChunkTokens.clear();
            mChunkType.set(t.getChunkTag());
        }
        mChunkTokens.add(t.getToken());
        lastNETag.set(neTag);
    }

    // handle last chunk in sentence
    if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') {
        Text chunk = createChunk(mChunkTokens, mChunkType);
        mChunks.add(chunk);
    }

    // generate adjacent (context, pattern) pairs
    for (int patternPos = 0; patternPos < mChunks.size() - 1; patternPos++) {
        Text leftPattern = new Text();
        leftPattern.append(mChunks.get(patternPos).getBytes(), 0, mChunks.get(patternPos).getLength());
        leftPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        addPair(mChunks.get(patternPos), leftPattern, mChunks.get(patternPos + 1));

        Text rightPattern = new Text();
        rightPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        rightPattern.append(mChunks.get(patternPos + 1).getBytes(), 0, mChunks.get(patternPos + 1).getLength());
        addPair(mChunks.get(patternPos), rightPattern, mChunks.get(patternPos + 1));
    }

    // generate non-adjacent (context, pattern) pairs based on chunks
    for (int patternPos = 0; patternPos < mChunks.size(); patternPos++) {
        for (int leftSkip = 0; leftSkip <= mMaxSkipSize; leftSkip++) {
            if (patternPos - leftSkip - 1 < 0) {
                continue;
            }
            if (mOrContextStyle && !mRightOnlyContextStyle) {
                addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos),
                        ContextPatternWritable.ASTERISK);
            }
            if (mOrContextStyle && mLeftOnlyContextStyle) {
                continue;
            }
            for (int rightSkip = 0; rightSkip <= mMaxSkipSize; rightSkip++) {
                if (patternPos + rightSkip + 1 >= mChunks.size()) {
                    continue;
                }
                // construct (context, pattern) pair
                if (mOrContextStyle) {
                    addPair(ContextPatternWritable.ASTERISK, mChunks.get(patternPos),
                            mChunks.get(patternPos + rightSkip + 1));
                } else {
                    addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos),
                            mChunks.get(patternPos + rightSkip + 1));
                }
            }
        }
    }

    // get iterator
    mChunkPairsIter = mChunkPairs.iterator();
}
From source file:edu.isi.mavuno.extract.ChunkExtractor.java
License:Apache License
private Text createChunk(List<Text> terms, Text type) {
    Text t = new Text();
    for (int i = 0; i < terms.size(); i++) {
        Text term = terms.get(i);
        t.append(term.getBytes(), 0, term.getLength());
        if (i != terms.size() - 1) {
            t.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
        }
    }
    if (t.getLength() > 0 && !mSurfaceForms) {
        t.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        if (type.getLength() > 2) {
            t.append(type.getBytes(), 2, type.getLength() - 2);
        }
    }
    return t;
}
From source file:edu.isi.mavuno.extract.CooccurExtractor.java
License:Apache License
protected boolean getPattern(Text pattern, Text[] terms, int start, int len) {
    if (start < 0 || start + len > terms.length) {
        return false;
    }
    pattern.clear();
    for (int i = start; i < start + len; i++) {
        pattern.append(terms[i].getBytes(), 0, terms[i].getLength());
        if (i != start + len - 1) {
            pattern.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
        }
    }
    return true;
}