Example usage for org.apache.hadoop.io.Text.set

Introduction

On this page you can find example usage for org.apache.hadoop.io.Text.set.

Prototype

public void set(Text other) 

Document

Copy a text (replace the contents of this Text with the contents of another Text).
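
Before the full examples, here is a minimal, self-contained sketch of how set(Text other) and the related set(String string) overload are typically used. The class name TextSetSketch is hypothetical and not taken from any of the projects listed under Usage; the sketch only assumes the standard org.apache.hadoop.io.Text API.

import org.apache.hadoop.io.Text;

public class TextSetSketch {
    public static void main(String[] args) {
        Text source = new Text("hello");
        Text target = new Text();

        // set(Text other): copy the bytes of another Text into this one
        target.set(source);

        // set(String string): replace the contents with a UTF-8 encoded string
        target.set("hello world");

        // Text is mutable, so one instance can be reused and repopulated,
        // which is exactly how the reduce() examples below use it.
        System.out.println(target); // prints "hello world"
    }
}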

Usage

From source file: com.github.seqware.queryengine.plugins.contribs.DonorsToMutationsAndGenesAggregationPlugin.java

License: Open Source License

@Override
public void reduce(Text key, Iterable<Text> values, ReducerInterface<Text, Text> reducerInterface) {
    // key is feature set, value is mutation->gene that can just be cat'd
    Text newVal = new Text();
    StringBuilder newValSB = new StringBuilder();
    newValSB.append(key).append("\t");
    boolean first = true;
    for (Text val : values) {
        if (first) {
            first = false;
        } else {
            newValSB.append(";");
        }
        newValSB.append(val.toString());
    }
    newVal.set(newValSB.toString());
    reducerInterface.write(newVal, null);
}

From source file: com.github.seqware.queryengine.plugins.contribs.GenesToDonorsAggregationPlugin.java

License: Open Source License

@Override
public void reduce(Text key, Iterable<Text> values, ReducerInterface<Text, Text> reducerInterface) {
    // val holds the values for a given gene; each one is a comma-separated list
    String newFeatStr = "";
    boolean first = true;
    for (Text val : values) {
        String[] fsArr = val.toString().split(",");
        for (String currFS : fsArr) {
            if (first) {
                first = false;
                newFeatStr += currFS;
            } else {
                newFeatStr += "," + currFS;
            }
        }
        // HELP, not sure what's going on here, why are you writing the text?
        //reducerInterface.write(val, text);
    }
    Text newVal = new Text();
    newVal.set(key.toString() + "\t" + newFeatStr);
    reducerInterface.write(newVal, null);
}

From source file: com.github.seqware.queryengine.plugins.contribs.MutationsToDonorsAggregationPlugin.java

License: Open Source License

@Override
public void reduce(Text key, Iterable<Text> values, ReducerInterface<Text, Text> reducerInterface) {
    // each value is a tab-separated record; its third field is a comma-separated list
    for (Text val : values) {
        String[] valArr = val.toString().split("\t");
        String[] fsArr = valArr[2].split(",");
        String newFeatStr = "";
        boolean first = true;
        for (String currFS : fsArr) {
            if (first) {
                first = false;
                newFeatStr += currFS;
            } else {
                newFeatStr += "," + currFS;
            }
        }

        val.set(valArr[0] + "\t" + valArr[1] + "\t" + newFeatStr);
        reducerInterface.write(val, null);
    }
}

From source file: com.google.mr4c.hadoop.MR4CRecordReader.java

License: Open Source License

public boolean next(Text key, DataKeyList value) {
    if (m_done) {
        return false;
    }
    key.set("" + m_split.getSequenceNumber());
    value.setKeys(m_split.getKeys().getKeys());
    m_done = true;
    return true;
}

From source file: com.hadoop.mapreduce.TestLzoTextInputFormat.java

License: Open Source License

private int fillText(char[] chars, Random r, int charsMax, Text text) {
    StringBuilder sb = new StringBuilder();
    // get a reasonable string length
    int stringLength = r.nextInt(charsMax * 2);
    for (int j = 0; j < stringLength; j++) {
        sb.append(chars[r.nextInt(charsMax)]);
    }
    text.set(sb.toString());
    return stringLength;
}

From source file: com.hazelcast.jet.hadoop.impl.ReadHdfsPTest.java

License: Open Source License

private static void writeToSequenceFile(Configuration conf, Path path) throws IOException {
    IntWritable key = new IntWritable();
    Text value = new Text();
    Option fileOption = Writer.file(path);
    Option keyClassOption = Writer.keyClass(key.getClass());
    Option valueClassOption = Writer.valueClass(value.getClass());
    try (Writer writer = SequenceFile.createWriter(conf, fileOption, keyClassOption, valueClassOption)) {
        for (int i = 0; i < ENTRIES.length; i++) {
            key.set(i);
            value.set(ENTRIES[i]);
            writer.append(key, value);
        }
    }
}

From source file: com.hazelcast.jet.impl.connector.hadoop.ReadHdfsPTest.java

License: Open Source License

private void writeToSequenceFile(Configuration conf, Path path) throws IOException {
    IntWritable key = new IntWritable();
    Text value = new Text();
    Option fileOption = Writer.file(path);
    Option keyClassOption = Writer.keyClass(key.getClass());
    Option valueClassOption = Writer.valueClass(value.getClass());
    try (Writer writer = SequenceFile.createWriter(conf, fileOption, keyClassOption, valueClassOption)) {
        for (int i = 0; i < ENTRIES.length; i++) {
            key.set(i);
            value.set(ENTRIES[i]);
            writer.append(key, value);
        }
    }
}

From source file: com.hdfs.concat.crush.Crush.java

License: Apache License

private void cloneOutput() throws IOException {

    List<FileStatus> listStatus = getOutputMappings();

    /*
     * Initialize to empty list, in which case swap() will be a no-op. The reference is then replaced with a real list, which is
     * used in the subsequent iterations.
     */
    List<Path> crushInput = emptyList();

    Text srcFile = new Text();
    Text crushOut = new Text();
    Text prevCrushOut = new Text();

    for (FileStatus partFile : listStatus) {
        Path path = partFile.getPath();

        Reader reader = new Reader(fs, path, fs.getConf());

        try {
            while (reader.next(srcFile, crushOut)) {
                if (!crushOut.equals(prevCrushOut)) {
                    swap(crushInput, prevCrushOut.toString());

                    prevCrushOut.set(crushOut);
                    crushInput = new LinkedList<Path>();
                }

                crushInput.add(new Path(srcFile.toString()));
            }
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.warn("Trapped exception when closing " + path, e);
            }
        }

        swap(crushInput, prevCrushOut.toString());
    }
}

From source file: com.hdfs.concat.crush.Crush.java

License: Apache License

void writeDirs() throws IOException {

    print(Verbosity.INFO, "\n\nUsing temporary directory " + tmpDir.toUri().getPath());

    FileStatus status = fs.getFileStatus(srcDir);

    Path tmpIn = new Path(tmpDir, "in");

    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");

    skippedFiles = new HashSet<String>();

    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());

    Text key = new Text();
    Text value = new Text();

    Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class,
            CompressionType.BLOCK);

    int numPartitions = Integer.parseInt(job.get("mapred.reduce.tasks"));

    Bucketer partitionBucketer = new Bucketer(numPartitions, 0, false);
    partitionBucketer.reset("partition-map");

    jobCounters = new Counters();

    try {
        while (!dirs.isEmpty()) {
            List<Path> nextLevel = new LinkedList<Path>();

            for (Path dir : dirs) {
                jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

                print(Verbosity.INFO, "\n\n" + dir.toUri().getPath());

                FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                        if (ignoredFiles == null)
                            return true;
                        ignoredFiles.reset(testPath.toUri().getPath());
                        return !ignoredFiles.matches();
                    }

                });

                if (contents == null || contents.length == 0) {
                    print(Verbosity.INFO, " is empty");

                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                    Set<String> uncrushedFiles = new HashSet<String>(contents.length);

                    long crushableBytes = 0;

                    /*
                     * Queue sub directories for subsequent inspection and examine the files in this directory.
                     */
                    for (FileStatus content : contents) {
                        Path path = content.getPath();

                        if (content.isDir()) {
                            nextLevel.add(path);
                        } else {
                            boolean changed = uncrushedFiles.add(path.toUri().getPath());

                            assert changed : path.toUri().getPath();

                            long fileLength = content.getLen();

                            if (fileLength <= maxEligibleSize) {
                                crushables.add(content);
                                crushableBytes += fileLength;
                            }
                        }
                    }

                    /*
                     * We found a directory with data in it. Make sure we know how to name the crush output file and then increment the
                     * number of files we found.
                     */
                    if (!uncrushedFiles.isEmpty()) {
                        if (-1 == findMatcher(dir)) {
                            throw new IllegalArgumentException(
                                    "Could not find matching regex for directory: " + dir);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                    }

                    if (0 == crushableBytes) {
                        print(Verbosity.INFO, " has no crushable files");

                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        /*
                         * We found files to consider for crushing.
                         */
                        long nBlocks = crushableBytes / dfsBlockSize;

                        if (nBlocks * dfsBlockSize != crushableBytes) {
                            nBlocks++;
                        }

                        /*
                         * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
                         */
                        long dirBuckets = nBlocks / maxFileBlocks;

                        if (dirBuckets * maxFileBlocks != nBlocks) {
                            dirBuckets++;
                        }

                        if (dirBuckets > Integer.MAX_VALUE) {
                            throw new AssertionError("Too many buckets: " + dirBuckets);
                        }

                        Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);

                        directoryBucketer.reset(getPathPart(dir));

                        for (FileStatus file : crushables) {
                            directoryBucketer.add(new FileStatusHasSize(file));
                        }

                        List<Bucket> crushFiles = directoryBucketer.createBuckets();

                        if (crushFiles.isEmpty()) {
                            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                        } else {
                            nBuckets += crushFiles.size();

                            jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

                            print(Verbosity.INFO, " => " + crushFiles.size() + " output files");

                            /*
                             * Write out the mapping between a bucket and a file.
                             */
                            for (Bucket crushFile : crushFiles) {
                                String bucketId = crushFile.name();

                                List<String> bucketFiles = crushFile.contents();

                                print(Verbosity.INFO,
                                        format("\n  Output %s will include %,d input bytes from %,d files",
                                                bucketId, crushFile.size(), bucketFiles.size()));

                                key.set(bucketId);

                                for (String f : bucketFiles) {
                                    boolean changed = uncrushedFiles.remove(f);

                                    assert changed : f;

                                    pathMatcher.reset(f);

                                    pathMatcher.matches();

                                    value.set(pathMatcher.group(5));

                                    writer.append(key, value);

                                    /*
                                     * Print the input file with four leading spaces.
                                     */
                                    print(Verbosity.VERBOSE, "\n    " + f);
                                }

                                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, bucketFiles.size());

                                partitionBucketer.add(crushFile);
                            }
                        }
                    }

                    if (!uncrushedFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n\n  Skipped " + uncrushedFiles.size() + " files");

                        for (String uncrushed : uncrushedFiles) {
                            print(Verbosity.VERBOSE, "\n    " + uncrushed);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                    }

                    skippedFiles.addAll(uncrushedFiles);
                }
            }

            dirs = nextLevel;
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + bucketFiles, e);
        }
    }

    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();

    assert partitions.size() <= numPartitions;

    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);

    IntWritable partNum = new IntWritable();

    try {
        for (Bucket partition : partitions) {
            String partitionName = partition.name();

            partNum.set(Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1)));

            for (String bucketId : partition.contents()) {
                key.set(bucketId);

                writer.append(key, partNum);
            }
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }

    DataOutputStream countersStream = fs.create(this.counters);

    try {
        jobCounters.write(countersStream);
    } finally {
        try {
            countersStream.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }
}

From source file: com.hdfs.concat.crush.CrushPartitionerTest.java

License: Apache License

@Test
public void partition() throws IOException {

    Writer writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);

    Text key = new Text();
    IntWritable partNum = new IntWritable();

    key.set("bucket-1");
    partNum.set(0);
    writer.append(key, partNum);

    key.set("bucket-2");
    partNum.set(0);
    writer.append(key, partNum);

    key.set("bucket-3");
    partNum.set(1);
    writer.append(key, partNum);

    key.set("bucket-4");
    partNum.set(2);
    writer.append(key, partNum);

    key.set("bucket-5");
    partNum.set(2);
    writer.append(key, partNum);

    key.set("bucket-6");
    partNum.set(2);
    writer.append(key, partNum);

    writer.close();

    job.setNumReduceTasks(3);

    partitioner.configure(job);

    Text fileName = new Text();

    key.set("bucket-1");

    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(0));
    }

    key.set("bucket-2");

    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(0));
    }

    key.set("bucket-3");

    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(1));
    }

    key.set("bucket-4");

    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2));
    }

    key.set("bucket-5");

    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2));
    }

    key.set("bucket-6");

    for (int file = 0; file < 4; file++) {
        fileName.set("file" + file);
        assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2));
    }
}