List of usage examples for org.apache.hadoop.io Text set
public void set(Text other)
From source file:edu.indiana.d2i.htrc.skmeans.StreamingKMeansReducer.java
License:Apache License
/**
 * Reducer step for streaming k-means: folds every incoming vector for this key
 * into the streaming clusterer, then emits one (identifier, cluster) record
 * per resulting centroid.
 *
 * @param key     partition key (centroid group id)
 * @param values  vectors assigned to this key
 * @param context MapReduce context the clusters are written to
 */
@Override public void reduce(IntWritable key, Iterable<VectorWritable> values, Context context)
        throws IOException, InterruptedException {
    // Feed every vector into the streaming k-means state.
    for (VectorWritable vectorWritable : values) {
        skmeans.cluster(vectorWritable.get());
    }
    // Reused output key: Text.set() avoids allocating a new Text per centroid.
    Text identifier = new Text();
    StreamingKMeansCluster cluster = null;
    Searcher centroids = skmeans.getCentroids();
    for (MatrixSlice matrixSlice : centroids) {
        // Wrap each centroid vector in a cluster object and emit it keyed by
        // its identifier.
        cluster = StreamingKMeansCluster.getStreamingKMeansCluster(matrixSlice.vector(), distance);
        identifier.set(cluster.getIdentifier());
        context.write(identifier, cluster);
    }
}
From source file:edu.indiana.d2i.htrc.util.Utilities.java
License:Apache License
/**
 * Converts a plain-text dictionary (one term per line) into a SequenceFile of
 * (Text term, IntWritable 0-based line index) pairs.
 *
 * @param input  path of the local dictionary file to read
 * @param output path of the SequenceFile to create
 * @throws IOException if either file cannot be read or written
 */
public static void Dictionary2SeqFile(String input, String output) throws IOException {
    Configuration conf = new Configuration();
    // try-with-resources closes both streams even on failure; the original
    // version leaked the reader and the writer if an IOException was thrown
    // mid-copy.
    // NOTE(review): FileReader uses the platform default charset — presumably
    // the dictionary is ASCII/UTF-8 on a UTF-8 platform; confirm if non-ASCII
    // terms are possible.
    try (BufferedReader reader = new BufferedReader(new FileReader(input));
            SequenceFile.Writer writer = new SequenceFile.Writer(FileSystem.get(conf), conf, new Path(output),
                    Text.class, IntWritable.class)) {
        Text key = new Text();
        IntWritable value = new IntWritable();
        int count = 0;
        String line = null;
        while ((line = reader.readLine()) != null) {
            key.set(line);
            value.set(count++); // value is the line number of the term
            writer.append(key, value);
        }
    }
}
From source file:edu.isi.mavuno.extract.ChunkExtractor.java
License:Apache License
/**
 * Consumes the next sentence from the sentence iterator, segments it into
 * chunks, and populates mChunkPairs with (context, pattern) pairs for both
 * adjacent and skip-separated chunk combinations. Resets mChunkPairsIter to
 * iterate over the freshly generated pairs.
 */
private void loadChunkPairs() {
    // clear chunk pairs
    mChunkPairs.clear();

    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

    // extract chunks from sentence
    mChunks.clear();
    mChunkTokens.clear();
    List<TratzParsedTokenWritable> tokens = sentence.getTokens();
    Text lastNETag = new Text();
    for (int i = 0; i < tokens.size(); i++) {
        TratzParsedTokenWritable t = tokens.get(i);
        // First byte of the chunk tag ('B', 'I', 'O', ...), or 0 when the tag is empty.
        byte chunkType = t.getChunkTag().getLength() > 0 ? t.getChunkTag().getBytes()[0] : 0;
        Text neTag = t.getNETag();
        // Chunk boundary test: the NE tag changed, or (roughly) the NE tag is
        // the single byte 'O' and the chunk tag starts a new chunk.
        // NOTE(review): '&&' binds tighter than '||' here, so the
        // (chunkType == 'B' || chunkType == 'O') test only applies to the
        // second disjunct; also (neTag.getLength() == 1 && neTag.getLength() > 0)
        // is redundant. Looks intentional in the original source but worth
        // confirming against upstream Mavuno.
        if (neTag.compareTo(lastNETag.getBytes(), 0, lastNETag.getLength()) != 0
                || (neTag.getLength() == 1 && (neTag.getLength() > 0 && neTag.getBytes()[0] == 'O'))
                        && (chunkType == 'B' || chunkType == 'O')) {
            // Flush the tokens accumulated for the previous chunk.
            if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') {
                Text chunk = createChunk(mChunkTokens, mChunkType);
                mChunks.add(chunk);
            }
            mChunkTokens.clear();
            mChunkType.set(t.getChunkTag());
        }
        mChunkTokens.add(t.getToken());
        lastNETag.set(neTag);
    }

    // handle last chunk in sentence
    if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') {
        Text chunk = createChunk(mChunkTokens, mChunkType);
        mChunks.add(chunk);
    }

    // generate adjacent (context, pattern) pairs
    for (int patternPos = 0; patternPos < mChunks.size() - 1; patternPos++) {
        // "chunk + ADJACENT" pattern, paired with the following chunk.
        Text leftPattern = new Text();
        leftPattern.append(mChunks.get(patternPos).getBytes(), 0, mChunks.get(patternPos).getLength());
        leftPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        addPair(mChunks.get(patternPos), leftPattern, mChunks.get(patternPos + 1));

        // "ADJACENT + next chunk" pattern, same (context, pattern) endpoints.
        Text rightPattern = new Text();
        rightPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        rightPattern.append(mChunks.get(patternPos + 1).getBytes(), 0, mChunks.get(patternPos + 1).getLength());
        addPair(mChunks.get(patternPos), rightPattern, mChunks.get(patternPos + 1));
    }

    // generate non-adjacent (context, pattern) pairs based on chunks,
    // allowing up to mMaxSkipSize skipped chunks on each side.
    for (int patternPos = 0; patternPos < mChunks.size(); patternPos++) {
        for (int leftSkip = 0; leftSkip <= mMaxSkipSize; leftSkip++) {
            if (patternPos - leftSkip - 1 < 0) {
                continue;
            }
            if (mOrContextStyle && !mRightOnlyContextStyle) {
                // Left-context-only pair; right side is a wildcard.
                addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos),
                        ContextPatternWritable.ASTERISK);
            }
            if (mOrContextStyle && mLeftOnlyContextStyle) {
                continue;
            }
            for (int rightSkip = 0; rightSkip <= mMaxSkipSize; rightSkip++) {
                if (patternPos + rightSkip + 1 >= mChunks.size()) {
                    continue;
                }
                // construct (context, pattern) pair
                if (mOrContextStyle) {
                    // Right-context-only pair; left side is a wildcard.
                    addPair(ContextPatternWritable.ASTERISK, mChunks.get(patternPos),
                            mChunks.get(patternPos + rightSkip + 1));
                } else {
                    // Two-sided context pair.
                    addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos),
                            mChunks.get(patternPos + rightSkip + 1));
                }
            }
        }
    }

    // get iterator
    mChunkPairsIter = mChunkPairs.iterator();
}
From source file:edu.isi.mavuno.nlp.NLProcTools.java
License:Apache License
private static Text extractGeneralizedChunk(int chunkStart, int chunkEnd, List<TratzParsedTokenWritable> tokens, boolean appendPOSTag) { Text chunk = new Text(); int startPos; for (startPos = chunkEnd; startPos >= chunkStart; startPos--) { if (!tokens.get(startPos).getPosTag().toString().startsWith("NN")) { break; }/* w ww.ja v a 2 s . com*/ } if (startPos != chunkStart - 1 && startPos != chunkEnd) { Text generalChunk = extractMainChunk(startPos + 1, chunkEnd + 1, tokens, appendPOSTag); if (generalChunk != null) { chunk.set(generalChunk); } } // return null if no valid terms are found if (chunk.getLength() == 0) { return null; } return chunk; }
From source file:edu.isi.mavuno.util.TokenWritable.java
License:Apache License
/**
 * Copies {@code s} into {@code t}, treating a null source as "clear the
 * destination" instead of letting Text.set() throw.
 *
 * @param t destination, always mutated
 * @param s source Text, may be null
 */
protected static void safeSet(Text t, Text s) {
    if (s != null) {
        t.set(s);
    } else {
        t.clear();
    }
}
From source file:edu.isi.mavuno.util.TokenWritable.java
License:Apache License
/**
 * Copies {@code s} into {@code t}, treating a null source as "clear the
 * destination" instead of letting Text.set() throw.
 *
 * @param t destination, always mutated
 * @param s source String, may be null
 */
protected static void safeSet(Text t, String s) {
    if (s != null) {
        t.set(s);
    } else {
        t.clear();
    }
}
From source file:edu.jhuapl.tinkerpop.AccumuloGraph.java
License:Apache License
/**
 * Looks up a vertex by id. Returns the cached instance when available;
 * otherwise optionally verifies existence against Accumulo (preloading any
 * configured properties in the same scan) and caches the result.
 *
 * @param id vertex id; must be a String, any other type yields null
 * @return the vertex, or null when the id is not a String or the vertex
 *         does not exist
 */
public Vertex getVertex(Object id) {
    if (id == null) {
        throw ExceptionFactory.vertexIdCanNotBeNull();
    }
    String myID;
    try {
        myID = (String) id;
    } catch (ClassCastException e) {
        // Non-String ids are silently treated as "not found".
        return null;
    }

    Vertex vertex = null;
    // Fast path: serve from the vertex cache when enabled.
    if (vertexCache != null) {
        vertex = vertexCache.retrieve(myID);
        if (vertex != null) {
            return vertex;
        }
    }

    vertex = new AccumuloVertex(this, myID);

    Scanner scan = null;
    try {
        if (!config.skipExistenceChecks()) {
            // in addition to just an "existence" check, we will also load
            // any "preloaded" properties now, which saves us a round-trip
            // to Accumulo later...
            scan = getElementScanner(Vertex.class);
            scan.setRange(new Range(myID));
            scan.fetchColumn(TLABEL, TEXISTS);

            String[] preload = config.getPreloadedProperties();
            if (preload != null) {
                // user has requested specific properties...
                // colf is reused for every fetched column family.
                Text colf = new Text("");
                for (String key : preload) {
                    if (StringFactory.LABEL.equals(key)) {
                        // The label is stored under a reserved column family.
                        colf.set(AccumuloGraph.LABEL);
                    } else {
                        colf.set(key);
                    }
                    scan.fetchColumnFamily(colf);
                }
            }

            Iterator<Entry<Key, Value>> iter = scan.iterator();
            // No rows at all means the vertex does not exist.
            if (!iter.hasNext()) {
                return null;
            }
            preloadProperties(iter, (AccumuloElement) vertex);
        }
    } finally {
        if (scan != null) {
            scan.close();
        }
    }

    // Cache the (now verified) vertex for subsequent lookups.
    if (vertexCache != null) {
        vertexCache.cache(vertex);
    }

    return vertex;
}
From source file:edu.jhuapl.tinkerpop.AccumuloGraph.java
License:Apache License
/**
 * Scans the element's row and caches each configured "preloaded" property on
 * the element, using the per-property cache timeout from the configuration.
 * No-op when no preloaded properties are configured.
 *
 * @param element element whose properties are fetched and cached
 * @param type    element class (Vertex/Edge) used to pick the scanner
 */
void preloadProperties(AccumuloElement element, Class<? extends Element> type) {
    String[] toPreload = config.getPreloadedProperties();
    if (toPreload == null) {
        return;
    }

    Scanner s = getElementScanner(type);
    s.setRange(new Range(element.getId().toString()));

    // user has requested specific properties...
    // colf is reused for every fetched column family.
    Text colf = new Text("");
    for (String key : toPreload) {
        if (StringFactory.LABEL.equals(key)) {
            // The label lives under a reserved column family.
            colf.set(AccumuloGraph.LABEL);
        } else {
            colf.set(key);
        }
        s.fetchColumnFamily(colf);
    }

    Iterator<Entry<Key, Value>> iter = s.iterator();
    // Integer timeout = config.getPropertyCacheTimeoutMillis(); // Change this
    // NOTE(review): the scanner is not closed in a finally block, so an
    // exception during iteration/deserialization leaks it — confirm whether
    // callers rely on exceptions propagating here.
    while (iter.hasNext()) {
        Entry<Key, Value> entry = iter.next();
        // Property name is the column family; value is deserialized from bytes.
        Object val = AccumuloByteSerializer.desserialize(entry.getValue().get());
        element.cacheProperty(entry.getKey().getColumnFamily().toString(), val,
                config.getPropertyCacheTimeoutMillis(entry.getKey().getColumnFamily().toString()));
    }
    s.close();
}
From source file:edu.mit.ll.graphulo.pig.backend.GraphuloOneTableStorage.java
License:Apache License
/**
 * Converts one WholeRowIterator-encoded row into a Pig tuple. Slot 0 holds
 * the row id; each subsequent slot corresponds to a configured column and is
 * filled according to its type: LITERAL (single value lookup), COLFAM_PREFIX
 * or COLQUAL_PREFIX (map of matching "colf[:colq]" -> value entries).
 *
 * @param key   key whose value encodes the whole row
 * @param value WholeRowIterator-encoded row contents
 * @return tuple with columns.size() + 1 fields
 * @throws IOException if the row cannot be decoded
 */
@Override protected Tuple getTuple(Key key, Value value) throws IOException {
    SortedMap<Key, Value> rowKVs = WholeRowIterator.decodeRow(key, value);
    Tuple tuple = TupleFactory.getInstance().newTuple(columns.size() + 1);

    // Reusable holders avoid a Text allocation per column.
    final Text cfHolder = new Text();
    final Text cqHolder = new Text();
    final Text row = key.getRow();

    // Slot 0: the row id, decoded as UTF-8.
    int tupleOffset = 0;
    tuple.set(tupleOffset, new DataByteArray(Text.decode(row.getBytes(), 0, row.getLength())));

    for (Column column : this.columns) {
        tupleOffset++;
        switch (column.getType()) {
        case LITERAL:
            cfHolder.set(column.getColumnFamily());
            if (null != column.getColumnQualifier()) {
                cqHolder.set(column.getColumnQualifier());
            } else {
                cqHolder.set(EMPTY_TEXT);
            }

            // Get the key where our literal would exist (accounting for
            // "colf:colq" or "colf:" empty colq)
            Key literalStartKey = new Key(row, cfHolder, cqHolder);

            SortedMap<Key, Value> tailMap = rowKVs.tailMap(literalStartKey);

            // Find the element
            if (tailMap.isEmpty()) {
                tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
            } else {
                Key actualKey = tailMap.firstKey();

                // Only place it in the tuple if it matches the user
                // request, avoid using a value from a
                // key with the wrong colqual
                if (0 == literalStartKey.compareTo(actualKey, PartialKey.ROW_COLFAM_COLQUAL)) {
                    tuple.set(tupleOffset, new DataByteArray(tailMap.get(actualKey).get()));
                } else {
                    // This row doesn't have the column we were looking for
                    tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
                }
            }

            break;
        case COLFAM_PREFIX:
            cfHolder.set(column.getColumnFamily());
            Range colfamPrefixRange = Range.prefix(row, cfHolder);
            Key colfamPrefixStartKey = new Key(row, cfHolder);

            SortedMap<Key, Value> cfTailMap = rowKVs.tailMap(colfamPrefixStartKey);

            // Find the element
            if (cfTailMap.isEmpty()) {
                tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
            } else {
                HashMap<String, DataByteArray> tupleMap = new HashMap<String, DataByteArray>();

                // Build up a map for all the entries in this row that match
                // the colfam prefix
                for (Entry<Key, Value> entry : cfTailMap.entrySet()) {
                    if (colfamPrefixRange.contains(entry.getKey())) {
                        entry.getKey().getColumnFamily(cfHolder);
                        entry.getKey().getColumnQualifier(cqHolder);
                        DataByteArray val = new DataByteArray(entry.getValue().get());

                        // Avoid adding an extra ':' when colqual is empty
                        if (0 == cqHolder.getLength()) {
                            tupleMap.put(cfHolder.toString(), val);
                        } else {
                            tupleMap.put(cfHolder.toString() + COLON + cqHolder.toString(), val);
                        }
                    } else {
                        // Entries are sorted, so the first non-match ends the prefix run.
                        break;
                    }
                }

                if (!tupleMap.isEmpty()) {
                    tuple.set(tupleOffset, tupleMap);
                }
            }

            break;
        case COLQUAL_PREFIX:
            cfHolder.set(column.getColumnFamily());
            cqHolder.set(column.getColumnQualifier());
            Range colqualPrefixRange = Range.prefix(row, cfHolder, cqHolder);
            Key colqualPrefixStartKey = new Key(row, cfHolder, cqHolder);

            SortedMap<Key, Value> cqTailMap = rowKVs.tailMap(colqualPrefixStartKey);
            if (cqTailMap.isEmpty()) {
                tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
            } else {
                HashMap<String, DataByteArray> tupleMap = new HashMap<String, DataByteArray>();

                // Build up a map for all the entries in this row that match
                // the colqual prefix
                for (Entry<Key, Value> entry : cqTailMap.entrySet()) {
                    if (colqualPrefixRange.contains(entry.getKey())) {
                        entry.getKey().getColumnFamily(cfHolder);
                        entry.getKey().getColumnQualifier(cqHolder);
                        DataByteArray val = new DataByteArray(entry.getValue().get());

                        // Avoid the extra ':' on empty colqual
                        if (0 == cqHolder.getLength()) {
                            tupleMap.put(cfHolder.toString(), val);
                        } else {
                            tupleMap.put(cfHolder.toString() + COLON + cqHolder.toString(), val);
                        }
                    } else {
                        // Entries are sorted, so the first non-match ends the prefix run.
                        break;
                    }
                }

                if (!tupleMap.isEmpty()) {
                    tuple.set(tupleOffset, tupleMap);
                }
            }

            break;
        default:
            break;
        }
    }

    return tuple;
}
From source file:edu.stolaf.cs.wmrserver.streaming.StreamKeyValUtil.java
License:Apache License
/** * split a UTF-8 byte array into key and value * assuming that the delimilator is at splitpos. * @param utf utf-8 encoded string/*from w w w .j a v a 2s. c o m*/ * @param start starting offset * @param length no. of bytes * @param key contains key upon the method is returned * @param val contains value upon the method is returned * @param splitPos the split pos * @param separatorLength the length of the separator between key and value * @throws IOException */ public static void splitKeyVal(byte[] utf, int start, int length, Text key, Text val, int splitPos, int separatorLength) throws IOException { if (splitPos < start || splitPos >= (start + length)) throw new IllegalArgumentException( "splitPos must be in the range " + "[" + start + ", " + (start + length) + "]: " + splitPos); int keyLen = (splitPos - start); byte[] keyBytes = new byte[keyLen]; System.arraycopy(utf, start, keyBytes, 0, keyLen); int valLen = (start + length) - splitPos - separatorLength; byte[] valBytes = new byte[valLen]; System.arraycopy(utf, splitPos + separatorLength, valBytes, 0, valLen); key.set(keyBytes); val.set(valBytes); }