List of usage examples for org.apache.hadoop.io.Text#getBytes()
@Override public byte[] getBytes()
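A note on the examples below: getBytes() exposes Text's internal backing array, which may be longer than the encoded string, so every correct caller pairs it with getLength(). A minimal standalone sketch of the pitfall (not taken from any of the source files below):

    import java.nio.charset.StandardCharsets;
    import java.util.Arrays;
    import org.apache.hadoop.io.Text;

    public class TextGetBytesDemo {
        public static void main(String[] args) {
            Text t = new Text("hello world");
            t.set("hi"); // Text reuses its buffer; the backing array still holds the old bytes
            byte[] raw = t.getBytes();                        // raw.length is NOT the string length
            byte[] exact = Arrays.copyOf(raw, t.getLength()); // copy only the valid prefix
            System.out.println(new String(exact, StandardCharsets.UTF_8)); // prints "hi"
        }
    }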
From source file: edu.cmu.cs.in.hadoop.HoopKeyComparer.java
License: Open Source License

    public int compare(Text o1, Text o2) {
        return compare(o1.getBytes(), 0, o1.getLength(), o2.getBytes(), 0, o2.getLength());
    }
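Comparing Text values byte-by-byte is sound because UTF-8's lexicographic byte order matches Unicode code point order. The three-argument overload called above is not shown in the listing; a plausible implementation (an assumption, not the file's actual code) would delegate to Hadoop's WritableComparator:

    import org.apache.hadoop.io.WritableComparator;

    // Hypothetical delegate for the overload used above.
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return WritableComparator.compareBytes(b1, s1, l1, b2, s2, l2);
    }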
From source file: edu.cshl.schatz.jnomics.util.TextCutter.java
License: Open Source License

    /**
     * Defines the text that the user wishes to get cuts of. The original
     * {@link Text} instance is copied, and isn't used directly.
     *
     * @param text   The text to get cuts of.
     * @param start  The 0-based byte position within the text at which the cut begins.
     * @param length The number of bytes to copy into the text cutter.
     * @return This instance.
     */
    public TextCutter set(Text text, int start, int length) {
        sourceText.set(text.getBytes(), start, length);
        modFlag = true;
        return this;
    }
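A hypothetical usage of the setter above (TextCutter's other methods are not shown in this listing): copy the first five bytes of a Text into the cutter.

    TextCutter cutter = new TextCutter();
    cutter.set(new Text("hello world"), 0, 5); // cutter now holds "hello"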
From source file: edu.isi.mavuno.extract.ChunkExtractor.java
License: Apache License

    private void loadChunkPairs() {
        // clear chunk pairs
        mChunkPairs.clear();

        // get sentence
        SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

        // extract chunks from sentence
        mChunks.clear();
        mChunkTokens.clear();

        List<TratzParsedTokenWritable> tokens = sentence.getTokens();

        Text lastNETag = new Text();
        for (int i = 0; i < tokens.size(); i++) {
            TratzParsedTokenWritable t = tokens.get(i);
            byte chunkType = t.getChunkTag().getLength() > 0 ? t.getChunkTag().getBytes()[0] : 0;
            Text neTag = t.getNETag();
            if (neTag.compareTo(lastNETag.getBytes(), 0, lastNETag.getLength()) != 0
                    || (neTag.getLength() == 1 && (neTag.getLength() > 0 && neTag.getBytes()[0] == 'O'))
                            && (chunkType == 'B' || chunkType == 'O')) {
                if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') {
                    Text chunk = createChunk(mChunkTokens, mChunkType);
                    mChunks.add(chunk);
                }
                mChunkTokens.clear();
                mChunkType.set(t.getChunkTag());
            }
            mChunkTokens.add(t.getToken());
            lastNETag.set(neTag);
        }

        // handle last chunk in sentence
        if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') {
            Text chunk = createChunk(mChunkTokens, mChunkType);
            mChunks.add(chunk);
        }

        // generate adjacent (context, pattern) pairs
        for (int patternPos = 0; patternPos < mChunks.size() - 1; patternPos++) {
            Text leftPattern = new Text();
            leftPattern.append(mChunks.get(patternPos).getBytes(), 0, mChunks.get(patternPos).getLength());
            leftPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
            addPair(mChunks.get(patternPos), leftPattern, mChunks.get(patternPos + 1));

            Text rightPattern = new Text();
            rightPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
            rightPattern.append(mChunks.get(patternPos + 1).getBytes(), 0, mChunks.get(patternPos + 1).getLength());
            addPair(mChunks.get(patternPos), rightPattern, mChunks.get(patternPos + 1));
        }

        // generate non-adjacent (context, pattern) pairs based on chunks
        for (int patternPos = 0; patternPos < mChunks.size(); patternPos++) {
            for (int leftSkip = 0; leftSkip <= mMaxSkipSize; leftSkip++) {
                if (patternPos - leftSkip - 1 < 0) {
                    continue;
                }
                if (mOrContextStyle && !mRightOnlyContextStyle) {
                    addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos),
                            ContextPatternWritable.ASTERISK);
                }
                if (mOrContextStyle && mLeftOnlyContextStyle) {
                    continue;
                }
                for (int rightSkip = 0; rightSkip <= mMaxSkipSize; rightSkip++) {
                    if (patternPos + rightSkip + 1 >= mChunks.size()) {
                        continue;
                    }
                    // construct (context, pattern) pair
                    if (mOrContextStyle) {
                        addPair(ContextPatternWritable.ASTERISK, mChunks.get(patternPos),
                                mChunks.get(patternPos + rightSkip + 1));
                    } else {
                        addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos),
                                mChunks.get(patternPos + rightSkip + 1));
                    }
                }
            }
        }

        // get iterator
        mChunkPairsIter = mChunkPairs.iterator();
    }
From source file: edu.isi.mavuno.extract.ChunkExtractor.java
License: Apache License

    private void addPair(Text left, Text pattern, Text right) {
        ContextPatternWritable c;

        // forward pattern
        c = new ContextPatternWritable();
        c.setContext(MavunoUtils.createContext(left, right));
        c.setPattern(pattern);

        // add to chunk pairs
        mChunkPairs.add(c);

        // reverse pattern
        c = new ContextPatternWritable();
        c.setContext(MavunoUtils.createContext(right, left));
        c.setPattern(REVERSE_PATTERN);
        c.getPattern().append(pattern.getBytes(), 0, pattern.getLength());

        // add to chunk pairs
        mChunkPairs.add(c);
    }
From source file: edu.isi.mavuno.extract.ChunkExtractor.java
License: Apache License

    private Text createChunk(List<Text> terms, Text type) {
        Text t = new Text();

        for (int i = 0; i < terms.size(); i++) {
            Text term = terms.get(i);
            t.append(term.getBytes(), 0, term.getLength());
            if (i != terms.size() - 1) {
                t.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
            }
        }

        if (t.getLength() > 0 && !mSurfaceForms) {
            t.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            if (type.getLength() > 2) {
                t.append(type.getBytes(), 2, type.getLength() - 2);
            }
        }

        return t;
    }
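The `type.getBytes(), 2, ...` append above skips the first two bytes of the chunk tag. Assuming standard BIO-style tags such as "B-NP" (an assumption; the tag format is not shown in this listing), that drops the "B-" prefix and keeps the bare phrase type:

    Text tag = new Text("B-NP"); // assumed BIO-style chunk tag
    Text type = new Text();
    type.set(tag.getBytes(), 2, tag.getLength() - 2);
    // type is now "NP"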
From source file: edu.isi.mavuno.extract.DIRTExtractor.java
License: Apache License

    private void loadDependPairs() {
        // clear dependency pairs
        mDependPairs.clear();

        // get sentence
        SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

        // get sentence tokens
        List<TratzParsedTokenWritable> tokens = sentence.getTokens();

        // get chunk ids
        int[] chunkIds = NLProcTools.getChunkIds(tokens);

        // get mapping from positions to chunks
        Text[] chunks = new Text[tokens.size()];
        Text curChunk = null;
        for (int i = 0; i < tokens.size(); i++) {
            Text text = tokens.get(i).getToken();
            if (i == 0 || (i > 0 && chunkIds[i] != chunkIds[i - 1])) {
                curChunk = new Text(text);
            } else {
                curChunk.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
                curChunk.append(text.getBytes(), 0, text.getLength());
            }
            chunks[i] = curChunk;
        }

        // populate parse tree
        ArrayListOfInts[] children = new ArrayListOfInts[tokens.size() + 1];
        for (int i = 0; i < tokens.size() + 1; i++) {
            children[i] = new ArrayListOfInts();
        }
        for (int i = 0; i < tokens.size(); i++) {
            TratzParsedTokenWritable t = tokens.get(i);
            // ignore punctuation
            if (!t.getDependType().equals(PUNCTUATION_TYPE)) {
                children[t.getDependIndex()].add(i + 1);
            }
        }

        // extract (context, pattern) pairs from parse tree
        for (int i = 0; i < children[0].size(); i++) {
            extractPairs(children, children[0].get(i), tokens, chunks);
        }

        // get iterator
        mDependPairsIter = mDependPairs.iterator();
    }
From source file: edu.isi.mavuno.extract.DIRTExtractor.java
License: Apache License

    private List<ContextPatternWritable> getContext(ArrayListOfInts leftPath, ArrayListOfInts rightPath,
            List<TratzParsedTokenWritable> tokens, Text[] chunks) { //, int leftContextSize, int rightContextSize
        // construct (context, pattern) pairs
        List<ContextPatternWritable> contexts = new ArrayList<ContextPatternWritable>();

        // make sure that the dimensions are feasible
        if (leftPath.size() < 1 || rightPath.size() < 1) {
            return contexts;
        }

        // make sure we don't split the left context's chunk
        Text leftChunk = chunks[leftPath.get(0) - 1];
        for (int i = 1; i <= leftPath.size() - 1; i++) {
            if (chunks[leftPath.get(i) - 1].equals(leftChunk)) {
                return contexts;
            }
        }

        // make sure we don't split the right context's chunk
        Text rightChunk = chunks[rightPath.get(0) - 1];
        for (int i = rightPath.size() - 1; i >= 1; i--) {
            if (chunks[rightPath.get(i) - 1].equals(rightChunk)) {
                return contexts;
            }
        }

        TratzParsedTokenWritable t;
        Text term, posTag, dependType;

        // construct pattern based on the parse tree path
        final Text pattern = new Text();

        // encode left context chunk type
        // TODO: replace this with a more robust way of checking if this is an actual named entity or not
        Text leftNETag = tokens.get(leftPath.get(0) - 1).getNETag();
        Text leftChunkTag = tokens.get(leftPath.get(0) - 1).getChunkTag();
        if (leftNETag.getLength() != 1 || (leftNETag.getLength() > 0 && leftNETag.getBytes()[0] != 'O')) {
            pattern.append(leftNETag.getBytes(), 0, leftNETag.getLength());
            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        } else {
            if (leftChunkTag.getLength() > 2) {
                pattern.append(leftChunkTag.getBytes(), 2, leftChunkTag.getLength() - 2);
            } else {
                pattern.append(leftChunkTag.getBytes(), 0, leftChunkTag.getLength());
            }
            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        }

        // left path portion of pattern
        if (!mHeadOnly) {
            for (int i = 0; i <= leftPath.size() - 2; i++) {
                t = tokens.get(leftPath.get(i) - 1);
                term = mUseLemmas ? t.getLemma() : t.getToken();
                dependType = t.getDependType();
                posTag = t.getPosTag();
                if (i != 0) {
                    pattern.append(term.getBytes(), 0, term.getLength());
                    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                    pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
                    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                }
                pattern.append(dependType.getBytes(), 0, dependType.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            }
        } else {
            dependType = tokens.get(leftPath.get(0) - 1).getDependType();
            posTag = tokens.get(leftPath.get(0) - 1).getPosTag();
            pattern.append(dependType.getBytes(), 0, dependType.getLength());
            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        }

        // root portion of pattern
        if (leftPath.get(leftPath.size() - 1) != rightPath.get(rightPath.size() - 1)) {
            throw new RuntimeException(
                    "Left and right paths do not share the same root! -- " + leftPath + " <--> " + rightPath);
        }
        t = tokens.get(leftPath.get(leftPath.size() - 1) - 1);
        term = mUseLemmas ? t.getLemma() : t.getToken();
        dependType = t.getDependType();
        posTag = t.getPosTag();
        pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        pattern.append(term.getBytes(), 0, term.getLength());
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());

        // right path portion of pattern
        if (!mHeadOnly) {
            for (int i = rightPath.size() - 2; i >= 0; i--) {
                t = tokens.get(rightPath.get(i) - 1);
                term = mUseLemmas ? t.getLemma() : t.getToken();
                dependType = t.getDependType();
                posTag = t.getPosTag();
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                pattern.append(dependType.getBytes(), 0, dependType.getLength());
                if (i != 0) {
                    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                    pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
                    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                    pattern.append(term.getBytes(), 0, term.getLength());
                }
            }
        } else {
            dependType = tokens.get(rightPath.get(0) - 1).getDependType();
            posTag = tokens.get(rightPath.get(0) - 1).getPosTag();
            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            pattern.append(dependType.getBytes(), 0, dependType.getLength());
        }

        // encode right context chunk type
        // TODO: replace this with a more robust way of checking if this is an actual named entity or not
        Text rightNETag = tokens.get(rightPath.get(0) - 1).getNETag();
        Text rightChunkTag = tokens.get(rightPath.get(0) - 1).getChunkTag();
        if (rightNETag.getLength() != 1 || (rightNETag.getLength() > 0 && rightNETag.getBytes()[0] != 'O')) {
            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            pattern.append(rightNETag.getBytes(), 0, rightNETag.getLength());
        } else {
            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            if (rightChunkTag.getLength() > 2) {
                pattern.append(rightChunkTag.getBytes(), 2, rightChunkTag.getLength() - 2);
            } else {
                pattern.append(rightChunkTag.getBytes(), 0, rightChunkTag.getLength());
            }
        }

        if (mOrContextStyle) {
            if (!mRightOnlyContextStyle) {
                ContextPatternWritable c = new ContextPatternWritable();
                c.setContext(MavunoUtils.createContext(leftChunk, MavunoUtils.ASTERISK));
                c.setPattern(pattern);
                contexts.add(c);
            }
            if (!mLeftOnlyContextStyle) {
                ContextPatternWritable c = new ContextPatternWritable();
                c.setContext(MavunoUtils.createContext(MavunoUtils.ASTERISK, rightChunk));
                c.setPattern(pattern);
                contexts.add(c);
            }
        } else {
            ContextPatternWritable c = new ContextPatternWritable();
            c.setContext(MavunoUtils.createContext(leftChunk, rightChunk));
            c.setPattern(pattern);
            contexts.add(c);
        }

        return contexts;
    }
From source file: edu.isi.mavuno.extract.NAryChunkExtractor.java
License: Apache License

    private void loadChunkPairs() {
        // clear chunk pairs
        mChunkPairs.clear();

        // get sentence
        SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

        // get chunk ids
        List<TratzParsedTokenWritable> sentenceTokens = sentence.getTokens();
        int[] chunkIds = NLProcTools.getChunkIds(sentenceTokens);

        mChunks.clear();
        mChunkTokens.clear();

        // extract chunks from sentence
        for (int i = 0; i < chunkIds.length; i++) {
            if (i > 0 && chunkIds[i] != chunkIds[i - 1]) {
                Chunk chunk = createChunk(mChunkTokens);
                mChunks.add(chunk);
                mChunkTokens.clear();
            }
            mChunkTokens.add(sentenceTokens.get(i));
        }

        // handle last chunk in sentence
        if (mChunkTokens.size() > 0) {
            Chunk chunk = createChunk(mChunkTokens);
            mChunks.add(chunk);
        }

        // there's nothing we can do if there aren't at least mArity chunks in the sentence
        if (mArity > mChunks.size()) {
            mChunkPairsIter = mChunkPairs.iterator();
            return;
        }

        // initialize context positions
        for (int i = 0; i < mArity; i++) {
            mContextPositions[i] = i;
        }

        // initialize pattern positions
        for (int i = 0; i < mArity - 1; i++) {
            mPatternPositions[i] = i;
        }

        // generate (context, pattern) pairs based on chunks
        final Text basePattern = new Text();
        while (true) {
            // construct context
            for (int i = 0; i < mArity; i++) {
                mContextChunks[i] = mChunks.get(mContextPositions[i]);
            }

            // construct pattern
            for (int i = 0; i < mArity - 1; i++) {
                mPatternChunks[i] = mChunks.get(mPatternPositions[i]);
            }

            // add (context, pattern) pair
            basePattern.clear();
            for (int i = 0; i < mArity - 1; i++) {
                // left chunk type
                basePattern.append(mContextChunks[i].type.getBytes(), 0, mContextChunks[i].type.getLength());
                basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

                if (mContextPositions[i + 1] - mPatternPositions[i] > 1
                        || mPatternPositions[i] - mContextPositions[i] > 1) {
                    if (mPatternPositions[i] == mContextPositions[i]) {
                        basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    } else if (mPatternPositions[i] == mContextPositions[i] + 1) {
                        basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                        basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    } else if (mPatternPositions[i] + 1 == mContextPositions[i + 1]) {
                        basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                        basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                    } else {
                        basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                        basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                        basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    }
                } else if (mPatternPositions[i] == mContextPositions[i]) {
                    basePattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
                } else {
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                }

                basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            }

            // last chunk type
            basePattern.append(mContextChunks[mArity - 1].type.getBytes(), 0,
                    mContextChunks[mArity - 1].type.getLength());
            basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

            int[] indices;
            mPermGen.reset();
            while (mPermGen.hasMore()) {
                // get next permutation
                indices = mPermGen.getNext();

                ContextPatternWritable c = new ContextPatternWritable();

                // pattern
                c.setPattern(basePattern);
                Text numLeftText = new Text(mPermGen.getNumLeft() + "/" + mArity);
                c.getPattern().append(numLeftText.getBytes(), 0, numLeftText.getLength());

                // context
                c.getContext().clear();
                for (int i = 0; i < mArity; i++) {
                    c.getContext().append(mContextChunks[indices[i]].text.getBytes(), 0,
                            mContextChunks[indices[i]].text.getLength());
                    if (i != mArity - 1) {
                        c.getContext().append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                    }
                }

                // add to chunk pairs
                mChunkPairs.add(c);
            }

            // get next set of context and pattern positions
            int pos = mArity - 2;
            while (pos >= 0) {
                if (mPatternPositions[pos] + 1 < mChunks.size()
                        && mPatternPositions[pos] + 1 < mContextPositions[pos + 1]) {
                    mPatternPositions[pos]++;
                    for (int i = pos + 1; i < mArity - 2; i++) {
                        mPatternPositions[i] = mContextPositions[i];
                    }
                    break;
                }
                pos--;
            }

            // update the context positions if the pattern positions can't be updated any further
            if (pos < 0) {
                pos = mArity - 1;
                while (pos >= 0) {
                    if (mContextPositions[pos] + 1 < mChunks.size()
                            && (pos + 1 >= mArity || mContextPositions[pos + 1] - (mContextPositions[pos] + 1) >= 1)
                            && (pos <= 0 || mContextPositions[pos] - mContextPositions[pos - 1] - 1 <= mMaxSkipSize)) {
                        mContextPositions[pos]++;
                        if (pos < mArity - 1) {
                            mPatternPositions[pos] = mContextPositions[pos];
                        }
                        for (int i = pos + 1; i < mArity; i++) {
                            mContextPositions[i] = mContextPositions[pos] + (i - pos);
                            if (i < mArity - 1) {
                                mPatternPositions[i] = mContextPositions[i];
                            }
                        }
                        break;
                    }
                    pos--;
                }

                // if neither the context nor the pattern positions can be updated then we're done
                if (pos < 0) {
                    // get iterator
                    mChunkPairsIter = mChunkPairs.iterator();
                    return;
                }
            }
        }
    }
From source file: edu.isi.mavuno.extract.NAryChunkExtractor.java
License: Apache License

    private Chunk createChunk(List<TratzParsedTokenWritable> terms) {
        Chunk chunk = new Chunk();

        // construct the chunk text and detect the chunk type
        Text chunkNEType = null;
        for (int i = 0; i < terms.size(); i++) {
            Text term = terms.get(i).getToken();
            Text neTag = terms.get(i).getNETag();

            chunk.text.append(term.getBytes(), 0, term.getLength());
            if (i != terms.size() - 1) {
                chunk.text.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
            }

            // TODO: replace this with a more robust way of checking if this is an actual named entity or not
            if (neTag.getLength() != 1 || (neTag.getLength() > 0 && neTag.getBytes()[0] != 'O')) {
                chunkNEType = neTag;
            }
        }

        // set the chunk type (note that this is somewhat heuristic)
        if (chunkNEType != null) {
            // chunk type = named entity type, if present
            chunk.type.set(chunkNEType);
        } else {
            // otherwise, chunk type = chunk tag, skipping the first two bytes
            // (presumably a BIO-style prefix, e.g. "B-NP" -> "NP")
            Text chunkTag = terms.get(0).getChunkTag();
            if (chunkTag.getLength() > 2) {
                chunk.type.set(chunkTag.getBytes(), 2, chunkTag.getLength() - 2);
            } else {
                chunk.type.set(chunkTag);
            }
        }

        return chunk;
    }
From source file: edu.jhuapl.accumulo.proxy.AbstractProxyScanner.java
License: Apache License

    public void fetchColumn(Text colFam, Text colQual) {
        ScanColumn sc = new ScanColumn();
        if (colFam != null) {
            sc.setColFamily(colFam.getBytes());
        }
        if (colQual != null) {
            sc.setColQualifier(colQual.getBytes());
        }
        addToFetchOptions(sc);
    }
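Note that this example passes the whole backing array to the setter; if the Text's buffer is larger than its logical length (see the note at the top of this list), the trailing garbage bytes go along with it. A defensive variant (a sketch under that assumption, not the file's actual code) trims to getLength() first:

    import java.util.Arrays;

    public void fetchColumn(Text colFam, Text colQual) {
        ScanColumn sc = new ScanColumn();
        if (colFam != null) {
            // copy only the valid prefix of the backing array
            sc.setColFamily(Arrays.copyOf(colFam.getBytes(), colFam.getLength()));
        }
        if (colQual != null) {
            sc.setColQualifier(Arrays.copyOf(colQual.getBytes(), colQual.getLength()));
        }
        addToFetchOptions(sc);
    }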