List of usage examples for org.apache.hadoop.io.Text.set
public void set(Text other)
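The examples below exercise both overloads of the method: set(String string) and set(Text other). As a minimal, self-contained sketch (not taken from any of the source files below; TextSetDemo is a hypothetical class name), the following shows what set() does: it replaces the contents of an existing Text instance in place, which is why MapReduce code typically allocates one Text and reuses it for every record.

import org.apache.hadoop.io.Text;

public class TextSetDemo {
    public static void main(String[] args) {
        Text reused = new Text("original");

        reused.set("replaced");                // set(String): overwrites the buffer in place
        System.out.println(reused);            // prints "replaced"

        Text other = new Text("copy source");
        reused.set(other);                     // set(Text): copies the other instance's bytes
        System.out.println(reused);            // prints "copy source"
    }
}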
From source file:com.github.seqware.queryengine.plugins.contribs.DonorsToMutationsAndGenesAggregationPlugin.java
License:Open Source License
@Override
public void reduce(Text key, Iterable<Text> values, ReducerInterface<Text, Text> reducerInterface) {
    // key is feature set, value is mutation->gene that can just be cat'd
    Text newVal = new Text();
    StringBuilder newValSB = new StringBuilder();
    newValSB.append(key).append("\t");
    boolean first = true;
    for (Text val : values) {
        if (first) {
            first = false;
        } else {
            newValSB.append(";");
        }
        newValSB.append(val.toString());
    }
    newVal.set(newValSB.toString());
    reducerInterface.write(newVal, null);
}
From source file:com.github.seqware.queryengine.plugins.contribs.GenesToDonorsAggregationPlugin.java
License:Open Source License
@Override
public void reduce(Text key, Iterable<Text> values, ReducerInterface<Text, Text> reducerInterface) {
    // The values for a given gene; each one is a comma-separated list.
    String newFeatStr = "";
    boolean first = true;
    for (Text val : values) {
        String[] fsArr = val.toString().split(",");
        for (String currFS : fsArr) {
            if (first) {
                first = false;
                newFeatStr += currFS;
            } else {
                newFeatStr += "," + currFS;
            }
        }
        // HELP, not sure what's going on here, why are you writing the text?
        //reducerInterface.write(val, text);
    }
    Text newVal = new Text();
    newVal.set(key.toString() + "\t" + newFeatStr);
    reducerInterface.write(newVal, null);
}
From source file:com.github.seqware.queryengine.plugins.contribs.MutationsToDonorsAggregationPlugin.java
License:Open Source License
@Override
public void reduce(Text key, Iterable<Text> values, ReducerInterface<Text, Text> reducerInterface) {
    // Each value is a tab-separated record; rebuild its comma-separated third field,
    // then reuse the incoming Text instance to hold the rewritten record.
    for (Text val : values) {
        String[] valArr = val.toString().split("\t");
        String[] fsArr = valArr[2].split(",");
        String newFeatStr = "";
        boolean first = true;
        for (String currFS : fsArr) {
            if (first) {
                first = false;
                newFeatStr += currFS;
            } else {
                newFeatStr += "," + currFS;
            }
        }
        val.set(valArr[0] + "\t" + valArr[1] + "\t" + newFeatStr);
        reducerInterface.write(val, null);
    }
}
From source file:com.google.mr4c.hadoop.MR4CRecordReader.java
License:Open Source License
public boolean next(Text key, DataKeyList value) {
    if (m_done) {
        return false;
    }
    key.set("" + m_split.getSequenceNumber());
    value.setKeys(m_split.getKeys().getKeys());
    m_done = true;
    return true;
}
From source file:com.hadoop.mapreduce.TestLzoTextInputFormat.java
License:Open Source License
private int fillText(char[] chars, Random r, int charsMax, Text text) {
    StringBuilder sb = new StringBuilder();
    // get a reasonable string length
    int stringLength = r.nextInt(charsMax * 2);
    for (int j = 0; j < stringLength; j++) {
        sb.append(chars[r.nextInt(charsMax)]);
    }
    text.set(sb.toString());
    return stringLength;
}
From source file:com.hazelcast.jet.hadoop.impl.ReadHdfsPTest.java
License:Open Source License
private static void writeToSequenceFile(Configuration conf, Path path) throws IOException {
    IntWritable key = new IntWritable();
    Text value = new Text();
    Option fileOption = Writer.file(path);
    Option keyClassOption = Writer.keyClass(key.getClass());
    Option valueClassOption = Writer.valueClass(value.getClass());
    try (Writer writer = SequenceFile.createWriter(conf, fileOption, keyClassOption, valueClassOption)) {
        for (int i = 0; i < ENTRIES.length; i++) {
            key.set(i);
            value.set(ENTRIES[i]);
            writer.append(key, value);
        }
    }
}
From source file:com.hazelcast.jet.impl.connector.hadoop.ReadHdfsPTest.java
License:Open Source License
private void writeToSequenceFile(Configuration conf, Path path) throws IOException {
    IntWritable key = new IntWritable();
    Text value = new Text();
    Option fileOption = Writer.file(path);
    Option keyClassOption = Writer.keyClass(key.getClass());
    Option valueClassOption = Writer.valueClass(value.getClass());
    try (Writer writer = SequenceFile.createWriter(conf, fileOption, keyClassOption, valueClassOption)) {
        for (int i = 0; i < ENTRIES.length; i++) {
            key.set(i);
            value.set(ENTRIES[i]);
            writer.append(key, value);
        }
    }
}
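The two Hazelcast Jet tests above only write the sequence file. As a complementary sketch (assumptions: a file written as above; ReadBack is a hypothetical class name), reading the entries back with SequenceFile.Reader shows the mirror image of the reuse pattern: next() refills the same key and value instances on every call, just as the writer side calls key.set(i) and value.set(ENTRIES[i]) before each append.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class ReadBack {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // hypothetical: path to a sequence file like the one written above

        IntWritable key = new IntWritable();
        Text value = new Text();

        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            // next() deserializes into the existing key/value objects instead of
            // allocating new ones, mirroring the set()-based reuse on the write side.
            while (reader.next(key, value)) {
                System.out.println(key.get() + " -> " + value);
            }
        }
    }
}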
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
private void cloneOutput() throws IOException {
    List<FileStatus> listStatus = getOutputMappings();

    /*
     * Initialize to empty list, in which case swap() will be a no-op. The reference is then replaced with a real
     * list, which is used in the subsequent iterations.
     */
    List<Path> crushInput = emptyList();

    Text srcFile = new Text();
    Text crushOut = new Text();
    Text prevCrushOut = new Text();

    for (FileStatus partFile : listStatus) {
        Path path = partFile.getPath();

        Reader reader = new Reader(fs, path, fs.getConf());
        try {
            while (reader.next(srcFile, crushOut)) {
                if (!crushOut.equals(prevCrushOut)) {
                    swap(crushInput, prevCrushOut.toString());
                    prevCrushOut.set(crushOut);
                    crushInput = new LinkedList<Path>();
                }
                crushInput.add(new Path(srcFile.toString()));
            }
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.warn("Trapped exception when closing " + path, e);
            }
        }

        swap(crushInput, prevCrushOut.toString());
    }
}
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
void writeDirs() throws IOException {
    print(Verbosity.INFO, "\n\nUsing temporary directory " + tmpDir.toUri().getPath());

    FileStatus status = fs.getFileStatus(srcDir);

    Path tmpIn = new Path(tmpDir, "in");

    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");

    skippedFiles = new HashSet<String>();

    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());

    Text key = new Text();
    Text value = new Text();

    Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class, CompressionType.BLOCK);

    int numPartitions = Integer.parseInt(job.get("mapred.reduce.tasks"));

    Bucketer partitionBucketer = new Bucketer(numPartitions, 0, false);
    partitionBucketer.reset("partition-map");

    jobCounters = new Counters();

    try {
        while (!dirs.isEmpty()) {
            List<Path> nextLevel = new LinkedList<Path>();

            for (Path dir : dirs) {
                jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

                print(Verbosity.INFO, "\n\n" + dir.toUri().getPath());

                FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                        if (ignoredFiles == null) return true;
                        ignoredFiles.reset(testPath.toUri().getPath());
                        return !ignoredFiles.matches();
                    }
                });

                if (contents == null || contents.length == 0) {
                    print(Verbosity.INFO, " is empty");
                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                    Set<String> uncrushedFiles = new HashSet<String>(contents.length);

                    long crushableBytes = 0;

                    /*
                     * Queue sub directories for subsequent inspection and examine the files in this directory.
                     */
                    for (FileStatus content : contents) {
                        Path path = content.getPath();

                        if (content.isDir()) {
                            nextLevel.add(path);
                        } else {
                            boolean changed = uncrushedFiles.add(path.toUri().getPath());
                            assert changed : path.toUri().getPath();

                            long fileLength = content.getLen();

                            if (fileLength <= maxEligibleSize) {
                                crushables.add(content);
                                crushableBytes += fileLength;
                            }
                        }
                    }

                    /*
                     * We found a directory with data in it. Make sure we know how to name the crush output file and
                     * then increment the number of files we found.
                     */
                    if (!uncrushedFiles.isEmpty()) {
                        if (-1 == findMatcher(dir)) {
                            throw new IllegalArgumentException("Could not find matching regex for directory: " + dir);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                    }

                    if (0 == crushableBytes) {
                        print(Verbosity.INFO, " has no crushable files");
                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        /*
                         * We found files to consider for crushing.
                         */
                        long nBlocks = crushableBytes / dfsBlockSize;

                        if (nBlocks * dfsBlockSize != crushableBytes) {
                            nBlocks++;
                        }

                        /*
                         * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
                         */
                        long dirBuckets = nBlocks / maxFileBlocks;

                        if (dirBuckets * maxFileBlocks != nBlocks) {
                            dirBuckets++;
                        }

                        if (dirBuckets > Integer.MAX_VALUE) {
                            throw new AssertionError("Too many buckets: " + dirBuckets);
                        }

                        Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);
                        directoryBucketer.reset(getPathPart(dir));

                        for (FileStatus file : crushables) {
                            directoryBucketer.add(new FileStatusHasSize(file));
                        }

                        List<Bucket> crushFiles = directoryBucketer.createBuckets();

                        if (crushFiles.isEmpty()) {
                            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                        } else {
                            nBuckets += crushFiles.size();

                            jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

                            print(Verbosity.INFO, " => " + crushFiles.size() + " output files");

                            /*
                             * Write out the mapping between a bucket and a file.
                             */
                            for (Bucket crushFile : crushFiles) {
                                String bucketId = crushFile.name();

                                List<String> bucketFiles = crushFile.contents();

                                print(Verbosity.INFO, format("\n Output %s will include %,d input bytes from %,d files",
                                        bucketId, crushFile.size(), bucketFiles.size()));

                                key.set(bucketId);

                                for (String f : bucketFiles) {
                                    boolean changed = uncrushedFiles.remove(f);
                                    assert changed : f;

                                    pathMatcher.reset(f);
                                    pathMatcher.matches();

                                    value.set(pathMatcher.group(5));

                                    writer.append(key, value);

                                    /*
                                     * Print the input file with four leading spaces.
                                     */
                                    print(Verbosity.VERBOSE, "\n    " + f);
                                }

                                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, bucketFiles.size());

                                partitionBucketer.add(crushFile);
                            }
                        }
                    }

                    if (!uncrushedFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n\n Skipped " + uncrushedFiles.size() + " files");

                        for (String uncrushed : uncrushedFiles) {
                            print(Verbosity.VERBOSE, "\n " + uncrushed);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                    }

                    skippedFiles.addAll(uncrushedFiles);
                }
            }

            dirs = nextLevel;
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + bucketFiles, e);
        }
    }

    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();
    assert partitions.size() <= numPartitions;

    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);
    IntWritable partNum = new IntWritable();

    try {
        for (Bucket partition : partitions) {
            String partitionName = partition.name();
            partNum.set(Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1)));

            for (String bucketId : partition.contents()) {
                key.set(bucketId);
                writer.append(key, partNum);
            }
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }

    DataOutputStream countersStream = fs.create(this.counters);

    try {
        jobCounters.write(countersStream);
    } finally {
        try {
            countersStream.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }
}
From source file:com.hdfs.concat.crush.CrushPartitionerTest.java
License:Apache License
@Test
public void partition() throws IOException {
    Writer writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);

    Text key = new Text();
    IntWritable partNum = new IntWritable();

    key.set("bucket-1");
    partNum.set(0);
    writer.append(key, partNum);

    key.set("bucket-2");
    partNum.set(0);
    writer.append(key, partNum);

    key.set("bucket-3");
    partNum.set(1);
    writer.append(key, partNum);

    key.set("bucket-4");
    partNum.set(2);
    writer.append(key, partNum);

    key.set("bucket-5");
    partNum.set(2);
    writer.append(key, partNum);

    key.set("bucket-6");
    partNum.set(2);
    writer.append(key, partNum);

    writer.close();

    job.setNumReduceTasks(3);
    partitioner.configure(job);

    Text fileName = new Text();

    key.set("bucket-1");
    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(0));
    }

    key.set("bucket-2");
    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(0));
    }

    key.set("bucket-3");
    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(1));
    }

    key.set("bucket-4");
    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2));
    }

    key.set("bucket-5");
    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2));
    }

    key.set("bucket-6");
    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2));
    }
}