List of usage examples for the org.apache.hadoop.io.Text constructor Text()
public Text()
From source file:cc.slda.AnnotateDocuments.java
License:Apache License
/**
 * Drains a sequence file of (IntWritable, Text) records into an in-memory map.
 *
 * @param sequenceFileReader an open reader positioned where reading should start;
 *                           it is not closed by this method
 * @return a map from each record's integer key to the string form of its value;
 *         a key read more than once keeps the value read last
 * @throws IOException if reading a record fails
 */
public static Map<Integer, String> importParameter(SequenceFile.Reader sequenceFileReader) throws IOException {
    // The two writables are reused for every record, so copy their contents out before the next read.
    final IntWritable key = new IntWritable();
    final Text value = new Text();
    final Map<Integer, String> parameters = new HashMap<Integer, String>();

    for (boolean more = sequenceFileReader.next(key, value); more; more = sequenceFileReader.next(key, value)) {
        parameters.put(key.get(), value.toString());
    }
    return parameters;
}
From source file:cc.slda.DisplayTopic.java
License:Apache License
@SuppressWarnings("unchecked") public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(Settings.HELP_OPTION, false, "print the help message"); options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR).hasArg() .withDescription("input beta file").create(Settings.INPUT_OPTION)); options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR).hasArg() .withDescription("term index file").create(ParseCorpus.INDEX)); options.addOption(OptionBuilder.withArgName(Settings.INTEGER_INDICATOR).hasArg() .withDescription("display top terms only (default - 10)").create(TOP_DISPLAY_OPTION)); String betaString = null;//from w ww. j a v a2 s . co m String indexString = null; int topDisplay = TOP_DISPLAY; CommandLineParser parser = new GnuParser(); HelpFormatter formatter = new HelpFormatter(); try { CommandLine line = parser.parse(options, args); if (line.hasOption(Settings.HELP_OPTION)) { formatter.printHelp(ParseCorpus.class.getName(), options); System.exit(0); } if (line.hasOption(Settings.INPUT_OPTION)) { betaString = line.getOptionValue(Settings.INPUT_OPTION); } else { throw new ParseException("Parsing failed due to " + Settings.INPUT_OPTION + " not initialized..."); } if (line.hasOption(ParseCorpus.INDEX)) { indexString = line.getOptionValue(ParseCorpus.INDEX); } else { throw new ParseException("Parsing failed due to " + ParseCorpus.INDEX + " not initialized..."); } if (line.hasOption(TOP_DISPLAY_OPTION)) { topDisplay = Integer.parseInt(line.getOptionValue(TOP_DISPLAY_OPTION)); } } catch (ParseException pe) { System.err.println(pe.getMessage()); formatter.printHelp(ParseCorpus.class.getName(), options); System.exit(0); } catch (NumberFormatException nfe) { System.err.println(nfe.getMessage()); System.exit(0); } JobConf conf = new JobConf(DisplayTopic.class); FileSystem fs = FileSystem.get(conf); Path indexPath = new Path(indexString); Preconditions.checkArgument(fs.exists(indexPath) && fs.isFile(indexPath), "Invalid 
index path..."); Path betaPath = new Path(betaString); Preconditions.checkArgument(fs.exists(betaPath) && fs.isFile(betaPath), "Invalid beta path..."); SequenceFile.Reader sequenceFileReader = null; try { IntWritable intWritable = new IntWritable(); Text text = new Text(); Map<Integer, String> termIndex = new HashMap<Integer, String>(); sequenceFileReader = new SequenceFile.Reader(fs, indexPath, conf); while (sequenceFileReader.next(intWritable, text)) { termIndex.put(intWritable.get(), text.toString()); } PairOfIntFloat pairOfIntFloat = new PairOfIntFloat(); // HMapIFW hmap = new HMapIFW(); HMapIDW hmap = new HMapIDW(); TreeMap<Double, Integer> treeMap = new TreeMap<Double, Integer>(); sequenceFileReader = new SequenceFile.Reader(fs, betaPath, conf); while (sequenceFileReader.next(pairOfIntFloat, hmap)) { treeMap.clear(); System.out.println("=============================="); System.out.println( "Top ranked " + topDisplay + " terms for Topic " + pairOfIntFloat.getLeftElement()); System.out.println("=============================="); Iterator<Integer> itr1 = hmap.keySet().iterator(); int temp1 = 0; while (itr1.hasNext()) { temp1 = itr1.next(); treeMap.put(-hmap.get(temp1), temp1); if (treeMap.size() > topDisplay) { treeMap.remove(treeMap.lastKey()); } } Iterator<Double> itr2 = treeMap.keySet().iterator(); double temp2 = 0; while (itr2.hasNext()) { temp2 = itr2.next(); if (termIndex.containsKey(treeMap.get(temp2))) { System.out.println(termIndex.get(treeMap.get(temp2)) + "\t\t" + -temp2); } else { System.out.println("How embarrassing! Term index not found..."); } } } } finally { IOUtils.closeStream(sequenceFileReader); } return 0; }
From source file:cereal.impl.ProtobufMessageMapping.java
License:Apache License
@Override public void update(Iterable<Entry<Key, Value>> iter, InstanceOrBuilder<T> obj) { checkNotNull(iter, "Iterable was null"); checkNotNull(obj, "InstanceOrBuilder was null"); checkArgument(Type.BUILDER == obj.getType(), "Expected argument to be a builder"); final GeneratedMessage.Builder<?> builder = (GeneratedMessage.Builder<?>) obj.get(); final List<Entry<Key, Value>> leftoverFields = new LinkedList<>(); for (Entry<Key, Value> entry : iter) { String fieldName = entry.getKey().getColumnQualifier().toString(); int index = fieldName.indexOf(PERIOD); if (0 <= index) { leftoverFields.add(entry);//from ww w .jav a 2 s. co m continue; } // Find the FieldDescriptor from the Key for (FieldDescriptor fieldDesc : builder.getDescriptorForType().getFields()) { if (fieldDesc.isRepeated()) { int offset = fieldName.lastIndexOf(DOLLAR); if (offset < 0) { throw new RuntimeException( "Could not find offset of separator for repeated field count in " + fieldName); } fieldName = fieldName.substring(0, offset); } if (fieldName.equals(fieldDesc.getName())) { Value value = entry.getValue(); switch (fieldDesc.getJavaType()) { case INT: Integer intVal = Integer.parseInt(value.toString()); if (fieldDesc.isRepeated()) { builder.addRepeatedField(fieldDesc, intVal); } else { builder.setField(fieldDesc, intVal); } break; case LONG: Long longVal = Long.parseLong(value.toString()); if (fieldDesc.isRepeated()) { builder.addRepeatedField(fieldDesc, longVal); } else { builder.setField(fieldDesc, longVal); } break; case FLOAT: Float floatVal = Float.parseFloat(value.toString()); if (fieldDesc.isRepeated()) { builder.addRepeatedField(fieldDesc, floatVal); } else { builder.setField(fieldDesc, floatVal); } break; case DOUBLE: Double doubleVal = Double.parseDouble(value.toString()); if (fieldDesc.isRepeated()) { builder.addRepeatedField(fieldDesc, doubleVal); } else { builder.setField(fieldDesc, doubleVal); } break; case BOOLEAN: Boolean booleanVal = Boolean.parseBoolean(value.toString()); if 
(fieldDesc.isRepeated()) { builder.addRepeatedField(fieldDesc, booleanVal); } else { builder.setField(fieldDesc, booleanVal); } break; case STRING: String strVal = value.toString(); if (fieldDesc.isRepeated()) { builder.addRepeatedField(fieldDesc, strVal); } else { builder.setField(fieldDesc, strVal); } break; case BYTE_STRING: ByteString byteStrVal = ByteString.copyFrom(entry.getValue().get()); if (fieldDesc.isRepeated()) { builder.addRepeatedField(fieldDesc, byteStrVal); } else { builder.setField(fieldDesc, byteStrVal); } break; default: log.warn("Ignoring unknown serialized type {}", fieldDesc.getJavaType()); break; } break; } } } // All primitives in object should be filled out. // Make sure nested messages get filled out too. if (!leftoverFields.isEmpty()) { for (FieldDescriptor fieldDesc : builder.getDescriptorForType().getFields()) { if (JavaType.MESSAGE == fieldDesc.getJavaType()) { // For each Key-Value pair which have this prefix as the fieldname (column qualifier) final String fieldName = fieldDesc.getName(); final String singularPrefix = fieldName + PERIOD, repeatedPrefix = fieldName + DOLLAR; log.debug("Extracting Key-Value pairs for {}", fieldDesc.getName()); // Use a TreeMap to ensure the correct repetition order is preserved Map<Integer, List<Entry<Key, Value>>> fieldsForNestedMessage = new TreeMap<>(); final Text _holder = new Text(); Iterator<Entry<Key, Value>> leftoverFieldsIter = leftoverFields.iterator(); while (leftoverFieldsIter.hasNext()) { final Entry<Key, Value> entry = leftoverFieldsIter.next(); final Key key = entry.getKey(); entry.getKey().getColumnQualifier(_holder); String colqual = _holder.toString(); if (colqual.startsWith(singularPrefix)) { // Make a copy of the original Key, stripping the prefix off of the qualifier Key copy = new Key(key.getRow(), key.getColumnFamily(), new Text(colqual.substring(singularPrefix.length())), key.getColumnVisibility(), key.getTimestamp()); List<Entry<Key, Value>> kvPairs = 
fieldsForNestedMessage.get(-1); if (null == kvPairs) { kvPairs = new LinkedList<>(); fieldsForNestedMessage.put(-1, kvPairs); } kvPairs.add(Maps.immutableEntry(copy, entry.getValue())); // Remove it from the list as we should never have to reread this one again leftoverFieldsIter.remove(); } else if (colqual.startsWith(repeatedPrefix)) { // Make a copy of the original Key, stripping the prefix off of the qualifier int index = colqual.indexOf(PERIOD, repeatedPrefix.length()); if (0 > index) { throw new RuntimeException("Could not find period after dollar sign: " + colqual); } Integer repetition = Integer .parseInt(colqual.substring(repeatedPrefix.length(), index)); Key copy = new Key(key.getRow(), key.getColumnFamily(), new Text(colqual.substring(index + 1)), key.getColumnVisibility(), key.getTimestamp()); List<Entry<Key, Value>> kvPairs = fieldsForNestedMessage.get(repetition); if (null == kvPairs) { kvPairs = new LinkedList<>(); fieldsForNestedMessage.put(repetition, kvPairs); } kvPairs.add(Maps.immutableEntry(copy, entry.getValue())); // Remove it from the list as we should never have to reread this one again leftoverFieldsIter.remove(); } } if (!fieldsForNestedMessage.isEmpty()) { // We have keys, pass them down to the nested message String nestedMsgClzName = getClassName(fieldDesc); log.debug("Found {} Key-Value pairs for {}. 
Reconstituting the message.", fieldsForNestedMessage.size(), nestedMsgClzName); try { @SuppressWarnings("unchecked") // Get the class, builder and InstanceOrBuilder for the nested message Class<GeneratedMessage> msgClz = (Class<GeneratedMessage>) Class .forName(nestedMsgClzName); Method newBuilderMethod = msgClz.getMethod("newBuilder"); for (Entry<Integer, List<Entry<Key, Value>>> pairsPerRepetition : fieldsForNestedMessage .entrySet()) { Message.Builder subBuilder = (Message.Builder) newBuilderMethod.invoke(null); InstanceOrBuilder<GeneratedMessage> subIob = new InstanceOrBuilderImpl<>(subBuilder, msgClz); // Get the mapping from the registry ProtobufMessageMapping<GeneratedMessage> subMapping = (ProtobufMessageMapping<GeneratedMessage>) registry .get(subIob); // Invoke update on the mapping with the subset of Key-Values subMapping.update(pairsPerRepetition.getValue(), subIob); // Set the result on the top-level obj if (fieldDesc.isRepeated()) { builder.addRepeatedField(fieldDesc, subBuilder.build()); } else { builder.setField(fieldDesc, subBuilder.build()); } } } catch (Exception e) { throw new RuntimeException(e); } } // No fields for the sub message, therefore it's empty log.debug("Found no Key-Value pairs for {}", fieldName); } // Not a message, so we can ignore it } if (!leftoverFields.isEmpty()) { log.warn("Found {} leftover Key-Value pairs that were not consumed", leftoverFields.size()); } } }
From source file:clustering.link_back.io.Step2KeyWritable.java
License:Apache License
/**
 * Creates a key with an empty join key and a default-constructed tag.
 *
 * <p>{@code joinKey} holds {@code entry_id@@g_no}; {@code tag} is the secondary
 * sort field, where 1 = cluster_id and 2 = content.
 */
public Step2KeyWritable() {
    tag = new IntWritable();
    joinKey = new Text();
}
From source file:cn.ac.ncic.mastiff.io.coding.ORCStringEcnodingUtil.java
License:Apache License
public String readEachValue(Text previous) throws IOException { Text result = null;/* w w w.j a va2 s.c o m*/ int entry = (int) reader.next(); if (previous == null) { result = new Text(); } else { result = (Text) previous; } int offset = dictionaryOffsets[entry]; int length; // if it isn't the last entry, subtract the offsets otherwise use // the buffer length. if (entry < dictionaryOffsets.length - 1) { length = dictionaryOffsets[entry + 1] - offset; } else { length = dictionaryBuffer.size() - offset; } // If the column is just empty strings, the size will be zero, // so the buffer will be null, in that case just return result // as it will default to empty if (dictionaryBuffer != null) { dictionaryBuffer.setText(result, offset, length); } else { result.clear(); } return result.toString(); }
From source file:cn.ac.ncic.mastiff.io.coding.RedBlackTreeStringReader.java
License:Apache License
public String readEachValue(Text previous) throws IOException { Text result = null;/* www . j a v a 2 s . c o m*/ int entry = (int) reader.next(); if (previous == null) { result = new Text(); } else { result = (Text) previous; } int offset = dictionaryOffsets[entry]; int length; if (entry < dictionaryOffsets.length - 1) { length = dictionaryOffsets[entry + 1] - offset; } else { length = dictionaryBuffer.size() - offset; } // If the column is just empty strings, the size will be zero, // so the buffer will be null, in that case just return result // as it will default to empty if (dictionaryBuffer != null) { dictionaryBuffer.setText(result, offset, length); } else { result.clear(); } // } return result.toString(); }
From source file:cn.com.warlock.SequenceFilesTest.java
License:Apache License
public static void main(String[] args) throws IOException { String hdfsUri = "hdfs://hlg-2p238-fandongsheng:8020"; String pathStr = "/tmp/example/seq1"; String compressType = "1"; // ??windows? // System.setProperty("hadoop.home.dir", "E:\\tools"); Configuration conf = new Configuration(); conf.set("fs.defaultFS", hdfsUri); Path path = new Path(pathStr); IntWritable key = new IntWritable(); Text value = new Text(); SequenceFile.Writer writer = null; try {//from w ww . ja v a2s.c o m SequenceFile.Writer.Option pathOpt = SequenceFile.Writer.file(path); SequenceFile.Writer.Option keyClassOpt = SequenceFile.Writer.keyClass(key.getClass()); SequenceFile.Writer.Option valueClassOpt = SequenceFile.Writer.valueClass(value.getClass()); SequenceFile.Writer.Option compressionOpt = null; // compress type if (compressType.equals("1")) { System.out.println("compress none"); compressionOpt = SequenceFile.Writer.compression(CompressionType.NONE); } else if (compressType.equals("2")) { System.out.println("compress record"); compressionOpt = SequenceFile.Writer.compression(CompressionType.RECORD); } else if (compressType.equals("3")) { System.out.println("compress block"); compressionOpt = SequenceFile.Writer.compression(CompressionType.BLOCK); } else { System.out.println("Default : compress none"); compressionOpt = SequenceFile.Writer.compression(CompressionType.NONE); } writer = SequenceFile.createWriter(conf, pathOpt, keyClassOpt, valueClassOpt, compressionOpt); for (int i = 0; i < 100; i++) { key.set(100 - i); value.set(DATA[i % DATA.length]); System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value); writer.append(key, value); } } finally { IOUtils.closeStream(writer); } }
From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.RecordGenerator.java
@Override public CrawlDatum next() { Text text = new Text(); CrawlDatum datum = new CrawlDatum(); boolean hasMore; try {// w ww .j a v a 2 s .c om hasMore = reader.next(text, datum); } catch (IOException ex) { ex.printStackTrace(); return null; } if (hasMore) return datum; else return null; }
From source file:cn.lhfei.hadoop.ch04.MapFileWriteDemo.java
License:Apache License
public static void main(String[] args) { String uri = args[0];//from w ww. j a v a 2 s . com Configuration conf = new Configuration(); FileSystem fs = null; IntWritable key = new IntWritable(); Text value = new Text(); MapFile.Writer writer = null; try { fs = FileSystem.get(URI.create(uri), conf); /*writer = new MapFile.Writer(conf, fs, uri, key.getClass(), value.getClass());*/ writer = new MapFile.Writer(conf, new Path(uri), Writer.keyClass(key.getClass()), Writer.valueClass(value.getClass())); for (int i = 0; i < 1024; i++) { key.set(i + 1); value.set(DATA[i % DATA.length]); writer.append(key, value); } } catch (IOException e) { e.printStackTrace(); } finally { IOUtils.closeStream(writer); } }
From source file:cn.lhfei.hadoop.ch04.SequenceFileWriteDemo.java
License:Apache License
public static void main(String[] args) { String uri = args[0];/* www. j a v a 2 s . co m*/ Configuration conf = new Configuration(); FileSystem fs = null; SequenceFile.Writer writer = null; try { fs = FileSystem.get(URI.create(uri), conf); Path path = new Path(uri); IntWritable key = new IntWritable(); Text value = new Text(); //writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass()); writer = SequenceFile.createWriter(conf, Writer.keyClass(key.getClass()), writer.valueClass(value.getClass())); for (int i = 0; i < 100; i++) { key.set(100 - i); value.set(DATA[i % DATA.length]); System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value); writer.append(key, value); } } catch (IOException e) { e.printStackTrace(); } finally { IOUtils.closeStream(writer); } }