Example usage for org.apache.hadoop.io.Text.set

Introduction

On this page you can find example usage for org.apache.hadoop.io.Text.set.

Prototype

public void set(Text other) 

Document

Copy a text (replace the contents of this Text with the contents of another Text).
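
Before the full examples, here is a minimal, self-contained sketch of how set(Text other) and the related set(String string) overload are typically used. The class name TextSetSketch is hypothetical and not taken from any of the projects listed under Usage; the sketch only assumes the standard org.apache.hadoop.io.Text API.

import org.apache.hadoop.io.Text;

public class TextSetSketch {
    public static void main(String[] args) {
        Text source = new Text("hello");
        Text target = new Text();

        // set(Text other): copy the bytes of another Text into this one
        target.set(source);

        // set(String string): replace the contents with a UTF-8 encoded string
        target.set("hello world");

        // Text is mutable, so one instance can be reused and repopulated,
        // which is exactly how the reduce() examples below use it.
        System.out.println(target); // prints "hello world"
    }
}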

Usage

From source file: com.github.seqware.queryengine.plugins.contribs.DonorsToMutationsAndGenesAggregationPlugin.java

License: Open Source License

@Override
public void reduce(Text key, Iterable<Text> values, ReducerInterface<Text, Text> reducerInterface) {
    // key is feature set, value is mutation->gene that can just be cat'd
    Text newVal = new Text();
    StringBuilder newValSB = new StringBuilder();
    newValSB.append(key).append("\t");
    boolean first = true;
    for (Text val : values) {
        if (first) {
            first = false;
        } else {
            newValSB.append(";");
        }
        newValSB.append(val.toString());
    }
    newVal.set(newValSB.toString());
    reducerInterface.write(newVal, null);
}

From source file: com.github.seqware.queryengine.plugins.contribs.GenesToDonorsAggregationPlugin.java

License: Open Source License

@Override
public void reduce(Text key, Iterable<Text> values, ReducerInterface<Text, Text> reducerInterface) {
    // val holds the values for a given gene; each one is a comma-separated list
    String newFeatStr = "";
    boolean first = true;
    for (Text val : values) {
        String[] fsArr = val.toString().split(",");
        for (String currFS : fsArr) {
            if (first) {
                first = false;
                newFeatStr += currFS;
            } else {
                newFeatStr += "," + currFS;
            }
        }
        // HELP, not sure what's going on here, why are you writing the text?
        //reducerInterface.write(val, text);
    }
    Text newVal = new Text();
    newVal.set(key.toString() + "\t" + newFeatStr);
    reducerInterface.write(newVal, null);
}

From source file: com.github.seqware.queryengine.plugins.contribs.MutationsToDonorsAggregationPlugin.java

License: Open Source License

@Override
public void reduce(Text key, Iterable<Text> values, ReducerInterface<Text, Text> reducerInterface) {
    // each value is a tab-separated record; its third field is a comma-separated list
    for (Text val : values) {
        String[] valArr = val.toString().split("\t");
        String[] fsArr = valArr[2].split(",");
        String newFeatStr = "";
        boolean first = true;
        for (String currFS : fsArr) {
            if (first) {
                first = false;
                newFeatStr += currFS;
            } else {
                newFeatStr += "," + currFS;
            }
        }

        val.set(valArr[0] + "\t" + valArr[1] + "\t" + newFeatStr);
        reducerInterface.write(val, null);
    }
}

From source file: com.google.mr4c.hadoop.MR4CRecordReader.java

License: Open Source License

public boolean next(Text key, DataKeyList value) {
    if (m_done) {
        return false;
    }
    key.set("" + m_split.getSequenceNumber());
    value.setKeys(m_split.getKeys().getKeys());
    m_done = true;
    return true;
}

From source file: com.hadoop.mapreduce.TestLzoTextInputFormat.java

License: Open Source License

private int fillText(char[] chars, Random r, int charsMax, Text text) {
    StringBuilder sb = new StringBuilder();
    // get a reasonable string length
    int stringLength = r.nextInt(charsMax * 2);
    for (int j = 0; j < stringLength; j++) {
        sb.append(chars[r.nextInt(charsMax)]);
    }
    text.set(sb.toString());
    return stringLength;
}

From source file: com.hazelcast.jet.hadoop.impl.ReadHdfsPTest.java

License: Open Source License

private static void writeToSequenceFile(Configuration conf, Path path) throws IOException {
    IntWritable key = new IntWritable();
    Text value = new Text();
    Option fileOption = Writer.file(path);
    Option keyClassOption = Writer.keyClass(key.getClass());
    Option valueClassOption = Writer.valueClass(value.getClass());
    try (Writer writer = SequenceFile.createWriter(conf, fileOption, keyClassOption, valueClassOption)) {
        for (int i = 0; i < ENTRIES.length; i++) {
            key.set(i);
            value.set(ENTRIES[i]);
            writer.append(key, value);
        }
    }
}

From source file: com.hazelcast.jet.impl.connector.hadoop.ReadHdfsPTest.java

License: Open Source License

private void writeToSequenceFile(Configuration conf, Path path) throws IOException {
    IntWritable key = new IntWritable();
    Text value = new Text();
    Option fileOption = Writer.file(path);
    Option keyClassOption = Writer.keyClass(key.getClass());
    Option valueClassOption = Writer.valueClass(value.getClass());
    try (Writer writer = SequenceFile.createWriter(conf, fileOption, keyClassOption, valueClassOption)) {
        for (int i = 0; i < ENTRIES.length; i++) {
            key.set(i);
            value.set(ENTRIES[i]);
            writer.append(key, value);
        }
    }
}

From source file: com.hdfs.concat.crush.Crush.java

License: Apache License

private void cloneOutput() throws IOException {

    List<FileStatus> listStatus = getOutputMappings();

    /*
     * Initialize to empty list, in which case swap() will be a no-op. The reference is then replaced with a real list, which is
     * used in the subsequent iterations.
     */
    List<Path> crushInput = emptyList();

    Text srcFile = new Text();
    Text crushOut = new Text();
    Text prevCrushOut = new Text();

    for (FileStatus partFile : listStatus) {
        Path path = partFile.getPath();

        Reader reader = new Reader(fs, path, fs.getConf());

        try {
            while (reader.next(srcFile, crushOut)) {
                if (!crushOut.equals(prevCrushOut)) {
                    swap(crushInput, prevCrushOut.toString());

                    prevCrushOut.set(crushOut);
                    crushInput = new LinkedList<Path>();
                }

                crushInput.add(new Path(srcFile.toString()));
            }
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.warn("Trapped exception when closing " + path, e);
            }
        }

        swap(crushInput, prevCrushOut.toString());
    }
}

From source file: com.hdfs.concat.crush.Crush.java

License: Apache License

void writeDirs() throws IOException {

    print(Verbosity.INFO, "\n\nUsing temporary directory " + tmpDir.toUri().getPath());

    FileStatus status = fs.getFileStatus(srcDir);

    Path tmpIn = new Path(tmpDir, "in");

    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");

    skippedFiles = new HashSet<String>();

    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());

    Text key = new Text();
    Text value = new Text();

    Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class,
            CompressionType.BLOCK);

    int numPartitions = Integer.parseInt(job.get("mapred.reduce.tasks"));

    Bucketer partitionBucketer = new Bucketer(numPartitions, 0, false);
    partitionBucketer.reset("partition-map");

    jobCounters = new Counters();

    try {
        while (!dirs.isEmpty()) {
            List<Path> nextLevel = new LinkedList<Path>();

            for (Path dir : dirs) {
                jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

                print(Verbosity.INFO, "\n\n" + dir.toUri().getPath());

                FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                        if (ignoredFiles == null)
                            return true;
                        ignoredFiles.reset(testPath.toUri().getPath());
                        return !ignoredFiles.matches();
                    }

                });

                if (contents == null || contents.length == 0) {
                    print(Verbosity.INFO, " is empty");

                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                    Set<String> uncrushedFiles = new HashSet<String>(contents.length);

                    long crushableBytes = 0;

                    /*
                     * Queue sub directories for subsequent inspection and examine the files in this directory.
                     */
                    for (FileStatus content : contents) {
                        Path path = content.getPath();

                        if (content.isDir()) {
                            nextLevel.add(path);
                        } else {
                            boolean changed = uncrushedFiles.add(path.toUri().getPath());

                            assert changed : path.toUri().getPath();

                            long fileLength = content.getLen();

                            if (fileLength <= maxEligibleSize) {
                                crushables.add(content);
                                crushableBytes += fileLength;
                            }
                        }
                    }

                    /*
                     * We found a directory with data in it. Make sure we know how to name the crush output file and then increment the
                     * number of files we found.
                     */
                    if (!uncrushedFiles.isEmpty()) {
                        if (-1 == findMatcher(dir)) {
                            throw new IllegalArgumentException(
                                    "Could not find matching regex for directory: " + dir);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                    }

                    if (0 == crushableBytes) {
                        print(Verbosity.INFO, " has no crushable files");

                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        /*
                         * We found files to consider for crushing.
                         */
                        long nBlocks = crushableBytes / dfsBlockSize;

                        if (nBlocks * dfsBlockSize != crushableBytes) {
                            nBlocks++;
                        }

                        /*
                         * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
                         */
                        long dirBuckets = nBlocks / maxFileBlocks;

                        if (dirBuckets * maxFileBlocks != nBlocks) {
                            dirBuckets++;
                        }

                        if (dirBuckets > Integer.MAX_VALUE) {
                            throw new AssertionError("Too many buckets: " + dirBuckets);
                        }

                        Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);

                        directoryBucketer.reset(getPathPart(dir));

                        for (FileStatus file : crushables) {
                            directoryBucketer.add(new FileStatusHasSize(file));
                        }

                        List<Bucket> crushFiles = directoryBucketer.createBuckets();

                        if (crushFiles.isEmpty()) {
                            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                        } else {
                            nBuckets += crushFiles.size();

                            jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

                            print(Verbosity.INFO, " => " + crushFiles.size() + " output files");

                            /*
                             * Write out the mapping between a bucket and a file.
                             */
                            for (Bucket crushFile : crushFiles) {
                                String bucketId = crushFile.name();

                                List<String> bucketFiles = crushFile.contents();

                                print(Verbosity.INFO,
                                        format("\n  Output %s will include %,d input bytes from %,d files",
                                                bucketId, crushFile.size(), bucketFiles.size()));

                                key.set(bucketId);

                                for (String f : bucketFiles) {
                                    boolean changed = uncrushedFiles.remove(f);

                                    assert changed : f;

                                    pathMatcher.reset(f);

                                    pathMatcher.matches();

                                    value.set(pathMatcher.group(5));

                                    writer.append(key, value);

                                    /*
                                     * Print the input file with four leading spaces.
                                     */
                                    print(Verbosity.VERBOSE, "\n    " + f);
                                }

                                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, bucketFiles.size());

                                partitionBucketer.add(crushFile);
                            }
                        }
                    }

                    if (!uncrushedFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n\n  Skipped " + uncrushedFiles.size() + " files");

                        for (String uncrushed : uncrushedFiles) {
                            print(Verbosity.VERBOSE, "\n    " + uncrushed);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                    }

                    skippedFiles.addAll(uncrushedFiles);
                }
            }

            dirs = nextLevel;
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + bucketFiles, e);
        }
    }

    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();

    assert partitions.size() <= numPartitions;

    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);

    IntWritable partNum = new IntWritable();

    try {
        for (Bucket partition : partitions) {
            String partitionName = partition.name();

            partNum.set(Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1)));

            for (String bucketId : partition.contents()) {
                key.set(bucketId);

                writer.append(key, partNum);
            }
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }

    DataOutputStream countersStream = fs.create(this.counters);

    try {
        jobCounters.write(countersStream);
    } finally {
        try {
            countersStream.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }
}

From source file: com.hdfs.concat.crush.CrushPartitionerTest.java

License: Apache License

@Test
public void partition() throws IOException {

    Writer writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);

    Text key = new Text();
    IntWritable partNum = new IntWritable();

    key.set("bucket-1");
    partNum.set(0);
    writer.append(key, partNum);

    key.set("bucket-2");
    partNum.set(0);
    writer.append(key, partNum);

    key.set("bucket-3");
    partNum.set(1);
    writer.append(key, partNum);

    key.set("bucket-4");
    partNum.set(2);
    writer.append(key, partNum);

    key.set("bucket-5");
    partNum.set(2);
    writer.append(key, partNum);

    key.set("bucket-6");
    partNum.set(2);
    writer.append(key, partNum);

    writer.close();

    job.setNumReduceTasks(3);

    partitioner.configure(job);

    Text fileName = new Text();

    key.set("bucket-1");

    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(0));
    }

    key.set("bucket-2");

    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(0));
    }

    key.set("bucket-3");

    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(1));
    }

    key.set("bucket-4");

    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2));
    }

    key.set("bucket-5");

    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2));
    }

    key.set("bucket-6");

    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2));
    }
}