List of usage examples for org.apache.hadoop.io Text append
public void append(byte[] utf8, int start, int len)
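append copies len bytes of UTF-8 encoded data from utf8, starting at offset start, onto the end of the Text's current contents. Below is a minimal self-contained sketch of the call; the class and variable names are illustrative only (they do not come from the sources listed further down), and it assumes hadoop-common is on the classpath.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.Text;

public class TextAppendExample {
    public static void main(String[] args) {
        Text t = new Text("foo");
        byte[] suffix = "-bar".getBytes(StandardCharsets.UTF_8);
        // copy suffix[0..suffix.length) onto the end of t's current value
        t.append(suffix, 0, suffix.length);
        System.out.println(t); // prints "foo-bar"
    }
}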
From source file:com.thinkbiganalytics.inputformat.hadoop.mapred.EscapedLineReader.java
License:Open Source License
/**
 * Read one line from the InputStream into the given Text. A line
 * can be terminated by one of the following: '\n' (LF), '\r' (CR),
 * or '\r\n' (CR+LF). Will ignore any of these termination characters
 * if they are preceded by a designated escape character. EOF also
 * terminates an otherwise unterminated line.
 *
 * @param str the object to store the given line (without the newline)
 * @param maxLineLength the maximum number of bytes to store into str; the rest will be silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume in this call. This is only a hint,
 *        because if the line crosses this threshold, we allow it to happen. It can overshoot
 *        potentially by as much as one buffer length.
 * @return the number of bytes read including the (longest) newline found
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
     * 1. No newline characters are in the buffer, so we need to copy
     *    everything and read another buffer from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just
     *    copy to str.
     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
     *    in CR. In this case we copy everything up to CR to str, but
     *    we also need to see what follows CR: if it's LF, then we
     *    need consume LF as well, so next call to readLine will read
     *    from after that.
     * We use a flag prevCharCR to signal if previous character was CR
     * and, if it happens to be at the end of the buffer, delay
     * consuming it until we have a chance to look at the char that
     * follows.
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength() as an optimization
    int newLineLength = 0; // length of the terminating newline
    boolean prevCharCR = false; // true if prev char was \r
    long bytesConsumed = 0;
    do {
        int startPos = bufferPos; // starting from where we left off
        if (bufferPos >= bufferLength) {
            startPos = bufferPos = 0;
            if (prevCharCR) {
                ++bytesConsumed; // account for CR from previous read
            }
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }
        for (; bufferPos < bufferLength; ++bufferPos) {
            boolean escaped = false;
            if (prevCharCR && bufferPos > 1) {
                escaped = (buffer[bufferPos - 2] == escapeChar);
            }
            if (!prevCharCR && bufferPos > 0) {
                escaped = (buffer[bufferPos - 1] == escapeChar);
            }
            if (buffer[bufferPos] == LF && !escaped) {
                newLineLength = prevCharCR ? 2 : 1;
                ++bufferPos; // at next loop proceed from following byte
                break;
            }
            if (prevCharCR && !escaped) { // CR + notLF, we are at notLF
                newLineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPos] == CR);
            //prevCharCR = (buffer[bufferPos] == CR && !escaped);
        }
        int readLength = bufferPos - startPos;
        if (prevCharCR && newLineLength == 0) {
            --readLength; // CR at the end of the buffer
        }
        bytesConsumed += readLength;
        int appendLength = readLength - newLineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPos, appendLength);
            txtLength += appendLength;
        }
    } while (newLineLength == 0 && bytesConsumed < maxBytesToConsume);
    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}
From source file:cosmos.impl.CosmosImpl.java
License:Apache License
protected Mutation getDocumentPrefix(Store id, Record<?> record, byte[] suffix) {
    final Text t = new Text();
    byte[] b = id.uuid().getBytes();
    t.append(b, 0, b.length);
    t.append(new byte[] { 0 }, 0, 1); // zero byte separates the uuid from the suffix
    t.append(suffix, 0, suffix.length);
    return new Mutation(t);
}
From source file:cosmos.mapred.AggregatingRecordReader.java
License:Apache License
private void textAppend(Text t, String s) throws IOException {
    try {
        ByteBuffer buf = Text.encode(s, false);
        t.append(buf.array(), 0, buf.limit());
    } catch (CharacterCodingException e) {
        throw new IOException(e);
    }
}
From source file:cosmos.mapred.LfLineReader.java
License:Apache License
/**
 * Read one line from the InputStream into the given Text. A line can be terminated by '\n' (LF).
 * EOF also terminates an otherwise unterminated line.
 *
 * @param str
 *          the object to store the given line (without newline)
 * @param maxLineLength
 *          the maximum number of bytes to store into str; the rest of the line is silently discarded.
 * @param maxBytesToConsume
 *          the maximum number of bytes to consume in this call. This is only a hint, because if the
 *          line crosses this threshold, we allow it to happen. It can overshoot potentially by as much
 *          as one buffer length.
 *
 * @return the number of bytes read including the (longest) newline found.
 *
 * @throws IOException
 *           if the underlying stream throws
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /*
     * We're reading data from in, but the head of the stream may be already buffered in buffer, so we
     * have several cases:
     * 1. No newline characters are in the buffer, so we need to copy everything and read another buffer
     *    from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just copy to str.
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    int newlineLength = 0; // length of terminating newline
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; // starting from where we left off the last time
        if (bufferPosn >= bufferLength) {
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0)
                break; // EOF
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline
            if (buffer[bufferPosn] == LF) {
                newlineLength = 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
            }
        }
        int readLength = bufferPosn - startPosn;
        bytesConsumed += readLength;
        int appendLength = readLength - newlineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);
    if (bytesConsumed > Integer.MAX_VALUE)
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    return (int) bytesConsumed;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step2DistinctDataJobTest.java
License:Apache License
@Test
public void testSplit() throws Exception {
    Text key = new Text("123_456789");

    // hard-split using array copy
    int i = key.find("_", 0);

    Text outputKey = new Text("");
    // note: getBytes() returns the backing array, whose length can exceed getLength()
    byte[] bytes = key.getBytes();
    outputKey.append(bytes, i + 1, bytes.length - i - 2);

    String fileName = new String(bytes, 0, i);

    assertEquals("123", fileName);
    assertEquals("456789", outputKey.toString());
}
From source file:eastcircle.terasort.TotalOrderPartitioner.java
License:Apache License
/**
 * Given a sorted set of cut points, build a trie that will find the correct
 * partition quickly.
 * @param splits the list of cut points
 * @param lower the lower bound of partitions 0..numPartitions-1
 * @param upper the upper bound of partitions 0..numPartitions-1
 * @param prefix the prefix that we have already checked against
 * @param maxDepth the maximum depth we will build a trie for
 * @return the trie node that will divide the splits correctly
 */
private static TrieNode buildTrie(Text[] splits, int lower, int upper, Text prefix, int maxDepth) {
    int depth = prefix.getLength();
    if (depth >= maxDepth || lower == upper) {
        return new LeafTrieNode(depth, splits, lower, upper);
    }
    InnerTrieNode result = new InnerTrieNode(depth);
    Text trial = new Text(prefix);
    // append an extra byte on to the prefix
    trial.append(new byte[1], 0, 1);
    int currentBound = lower;
    for (int ch = 0; ch < 255; ++ch) {
        trial.getBytes()[depth] = (byte) (ch + 1);
        lower = currentBound;
        while (currentBound < upper) {
            if (splits[currentBound].compareTo(trial) >= 0) {
                break;
            }
            currentBound += 1;
        }
        trial.getBytes()[depth] = (byte) ch;
        result.child[ch] = buildTrie(splits, lower, currentBound, trial, maxDepth);
    }
    // pick up the rest
    trial.getBytes()[depth] = (byte) 255;
    result.child[255] = buildTrie(splits, currentBound, upper, trial, maxDepth);
    return result;
}
From source file:edu.cshl.schatz.jnomics.util.TextCutter.java
License:Open Source License
/**
 * Sets the contents of <code>text</code> to the value of the requested cuts
 * (including any intermediate delimiters).
 * <p>
 * Negative cut indices are interpreted as
 * <code>{@link #getCutCount()} + cutIndex</code>. For example, a
 * <code>cutIndex</code> of -1 would return the last cut.
 * <p>
 * If <code>lastIndex < firstIndex</code> (after converting negative indices
 * to their positive equivalents), then the resulting cut order is reversed.
 * For example, given the input "0 1 2 3 4", <code>getCutRange(4,2)</code>
 * and <code>getCutRange(-1,-3)</code> would both return "4 3 2".
 *
 * @param text The {@link Text} instance to reset.
 * @param firstIndex The 0-based index of the first desired cut in the range
 *            (inclusive).
 * @param lastIndex The 0-based index of the last desired cut in the range
 *            (inclusive).
 * @return The passed {@link Text} instance text (not a copy).
 * @throws ArrayIndexOutOfBoundsException if cutIndex is greater than or
 *             equal to the number of cuts.
 */
public Text getCutRange(Text text, int firstIndex, int lastIndex) {
    if (modFlag) {
        reinitialize();
    }
    if (firstIndex < 0) {
        firstIndex = cutCount + firstIndex;
    }
    if (lastIndex < 0) {
        lastIndex = cutCount + lastIndex;
    }
    if (lastIndex >= cutCount) {
        throw new ArrayIndexOutOfBoundsException("Requested cut does not exist (cutCount=" + cutCount + ")");
    }
    int position, length;
    if (firstIndex <= lastIndex) {
        position = cutIndices[firstIndex][0];
        length = lastIndex - firstIndex;
        for (int i = firstIndex; i <= lastIndex; i++) {
            length += cutIndices[i][1];
        }
        text.set(sourceText.getBytes(), position, length);
    } else {
        final byte[] delimBytes = new byte[] { (byte) delimiterChar };
        position = cutIndices[firstIndex][0];
        length = cutIndices[firstIndex][1];
        text.set(sourceText.getBytes(), position, length);
        for (int i = firstIndex - 1; i >= lastIndex; i--) {
            position = cutIndices[i][0];
            length = cutIndices[i][1];
            text.append(delimBytes, 0, 1);
            text.append(sourceText.getBytes(), position, length);
        }
    }
    return text;
}
From source file:edu.isi.mavuno.extract.ChunkExtractor.java
License:Apache License
private void loadChunkPairs() {
    // clear chunk pairs
    mChunkPairs.clear();

    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

    // extract chunks from sentence
    mChunks.clear();
    mChunkTokens.clear();

    List<TratzParsedTokenWritable> tokens = sentence.getTokens();
    Text lastNETag = new Text();
    for (int i = 0; i < tokens.size(); i++) {
        TratzParsedTokenWritable t = tokens.get(i);
        byte chunkType = t.getChunkTag().getLength() > 0 ? t.getChunkTag().getBytes()[0] : 0;
        Text neTag = t.getNETag();
        if (neTag.compareTo(lastNETag.getBytes(), 0, lastNETag.getLength()) != 0
                || (neTag.getLength() == 1 && (neTag.getLength() > 0 && neTag.getBytes()[0] == 'O'))
                        && (chunkType == 'B' || chunkType == 'O')) {
            if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') {
                Text chunk = createChunk(mChunkTokens, mChunkType);
                mChunks.add(chunk);
            }
            mChunkTokens.clear();
            mChunkType.set(t.getChunkTag());
        }
        mChunkTokens.add(t.getToken());
        lastNETag.set(neTag);
    }

    // handle last chunk in sentence
    if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') {
        Text chunk = createChunk(mChunkTokens, mChunkType);
        mChunks.add(chunk);
    }

    // generate adjacent (context, pattern) pairs
    for (int patternPos = 0; patternPos < mChunks.size() - 1; patternPos++) {
        Text leftPattern = new Text();
        leftPattern.append(mChunks.get(patternPos).getBytes(), 0, mChunks.get(patternPos).getLength());
        leftPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        addPair(mChunks.get(patternPos), leftPattern, mChunks.get(patternPos + 1));

        Text rightPattern = new Text();
        rightPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        rightPattern.append(mChunks.get(patternPos + 1).getBytes(), 0, mChunks.get(patternPos + 1).getLength());
        addPair(mChunks.get(patternPos), rightPattern, mChunks.get(patternPos + 1));
    }

    // generate non-adjacent (context, pattern) pairs based on chunks
    for (int patternPos = 0; patternPos < mChunks.size(); patternPos++) {
        for (int leftSkip = 0; leftSkip <= mMaxSkipSize; leftSkip++) {
            if (patternPos - leftSkip - 1 < 0) {
                continue;
            }
            if (mOrContextStyle && !mRightOnlyContextStyle) {
                addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos),
                        ContextPatternWritable.ASTERISK);
            }
            if (mOrContextStyle && mLeftOnlyContextStyle) {
                continue;
            }
            for (int rightSkip = 0; rightSkip <= mMaxSkipSize; rightSkip++) {
                if (patternPos + rightSkip + 1 >= mChunks.size()) {
                    continue;
                }
                // construct (context, pattern) pair
                if (mOrContextStyle) {
                    addPair(ContextPatternWritable.ASTERISK, mChunks.get(patternPos),
                            mChunks.get(patternPos + rightSkip + 1));
                } else {
                    addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos),
                            mChunks.get(patternPos + rightSkip + 1));
                }
            }
        }
    }

    // get iterator
    mChunkPairsIter = mChunkPairs.iterator();
}
From source file:edu.isi.mavuno.extract.ChunkExtractor.java
License:Apache License
private Text createChunk(List<Text> terms, Text type) {
    Text t = new Text();
    for (int i = 0; i < terms.size(); i++) {
        Text term = terms.get(i);
        t.append(term.getBytes(), 0, term.getLength());
        if (i != terms.size() - 1) {
            t.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
        }
    }
    if (t.getLength() > 0 && !mSurfaceForms) {
        t.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        if (type.getLength() > 2) {
            t.append(type.getBytes(), 2, type.getLength() - 2);
        }
    }
    return t;
}
From source file:edu.isi.mavuno.extract.CooccurExtractor.java
License:Apache License
protected boolean getPattern(Text pattern, Text[] terms, int start, int len) {
    if (start < 0 || start + len > terms.length) {
        return false;
    }
    pattern.clear();
    for (int i = start; i < start + len; i++) {
        pattern.append(terms[i].getBytes(), 0, terms[i].getLength());
        if (i != start + len - 1) {
            pattern.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
        }
    }
    return true;
}