List of usage examples for org.apache.hadoop.io Text compareTo
public int compareTo(byte[] other, int off, int len)
From source file:edu.isi.mavuno.extract.ChunkExtractor.java
License:Apache License
private void loadChunkPairs() { // clear chunk pairs mChunkPairs.clear();/* w w w . ja v a 2 s. c om*/ // get sentence SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next(); // extract chunks from sentence mChunks.clear(); mChunkTokens.clear(); List<TratzParsedTokenWritable> tokens = sentence.getTokens(); Text lastNETag = new Text(); for (int i = 0; i < tokens.size(); i++) { TratzParsedTokenWritable t = tokens.get(i); byte chunkType = t.getChunkTag().getLength() > 0 ? t.getChunkTag().getBytes()[0] : 0; Text neTag = t.getNETag(); if (neTag.compareTo(lastNETag.getBytes(), 0, lastNETag.getLength()) != 0 || (neTag.getLength() == 1 && (neTag.getLength() > 0 && neTag.getBytes()[0] == 'O')) && (chunkType == 'B' || chunkType == 'O')) { if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') { Text chunk = createChunk(mChunkTokens, mChunkType); mChunks.add(chunk); } mChunkTokens.clear(); mChunkType.set(t.getChunkTag()); } mChunkTokens.add(t.getToken()); lastNETag.set(neTag); } // handle last chunk in sentence if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') { Text chunk = createChunk(mChunkTokens, mChunkType); mChunks.add(chunk); } // generate adjacent (context, pattern) pairs for (int patternPos = 0; patternPos < mChunks.size() - 1; patternPos++) { Text leftPattern = new Text(); leftPattern.append(mChunks.get(patternPos).getBytes(), 0, mChunks.get(patternPos).getLength()); leftPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength()); addPair(mChunks.get(patternPos), leftPattern, mChunks.get(patternPos + 1)); Text rightPattern = new Text(); rightPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength()); rightPattern.append(mChunks.get(patternPos + 1).getBytes(), 0, mChunks.get(patternPos + 1).getLength()); addPair(mChunks.get(patternPos), rightPattern, mChunks.get(patternPos + 1)); } // generate non-adjacent (context, pattern) pairs based on chunks for (int patternPos = 0; patternPos < mChunks.size(); patternPos++) { for (int leftSkip = 0; leftSkip <= mMaxSkipSize; leftSkip++) { if (patternPos - leftSkip - 1 < 0) { continue; } if (mOrContextStyle && !mRightOnlyContextStyle) { addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos), ContextPatternWritable.ASTERISK); } if (mOrContextStyle && mLeftOnlyContextStyle) { continue; } for (int rightSkip = 0; rightSkip <= mMaxSkipSize; rightSkip++) { if (patternPos + rightSkip + 1 >= mChunks.size()) { continue; } // construct (context, pattern) pair if (mOrContextStyle) { addPair(ContextPatternWritable.ASTERISK, mChunks.get(patternPos), mChunks.get(patternPos + rightSkip + 1)); } else { addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos), mChunks.get(patternPos + rightSkip + 1)); } } } } // get iterator mChunkPairsIter = mChunkPairs.iterator(); }