Example usage for org.apache.hadoop.mapred FileInputFormat addInputPath

Introduction

This page lists example usages of org.apache.hadoop.mapred.FileInputFormat.addInputPath, collected from open-source projects.

Prototype

public static void addInputPath(JobConf conf, Path path) 

Document

Add a Path to the list of inputs for the map-reduce job.
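
Before the project examples below, here is a minimal, self-contained sketch of a driver built around addInputPath. The input/output paths and the job name are illustrative placeholders; with no mapper or reducer configured, the old mapred API falls back to the identity classes, so the job simply copies its input records through.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class AddInputPathExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(AddInputPathExample.class);
        conf.setJobName("add-input-path-example");
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // TextInputFormat produces LongWritable offsets and Text lines,
        // which the identity mapper/reducer pass through unchanged.
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        // Each call appends one more path; the job reads the union of all
        // added paths. setInputPaths(conf, ...) would replace the list instead.
        FileInputFormat.addInputPath(conf, new Path("/data/input1"));
        FileInputFormat.addInputPath(conf, new Path("/data/input2"));
        FileOutputFormat.setOutputPath(conf, new Path("/data/output"));

        JobClient.runJob(conf);
    }
}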

Usage

From source file: com.hadoop.secondarysort.SecondarySort_MapRed.java

License: Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysrot <in> <out>");
        System.exit(2);/*from w  ww .j  a  v  a2s .c  o m*/
    }

    JobConf jobConf = new JobConf(conf);
    jobConf.setMapperClass(MapClass.class);
    jobConf.setReducerClass(Reduce.class);

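    // secondary-sort wiring: partition map output by the first int of the
    // IntPair key and group reduce input on it, so one reduce call sees
    // all records sharing that first int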
    jobConf.setPartitionerClass(FirstPartitioner.class);
    jobConf.setOutputValueGroupingComparator(FirstGroupingComparator.class);

    jobConf.setMapOutputKeyClass(IntPair.class);
    jobConf.setMapOutputValueClass(IntWritable.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(IntWritable.class);

    //
    // Job job = new Job(conf, "secondary sort");
    // job.setJarByClass(SecondarySort_MapRed.class);
    // job.setMapperClass(MapClass.class);
    // job.setReducerClass(Reduce.class);
    //
    // // group and partition by the first int in the pair
    // job.setPartitionerClass(FirstPartitioner.class);
    // job.setGroupingComparatorClass(FirstGroupingComparator.class);
    // conf.setClass("mapred.output.key.comparator.class",
    // KeyComparator.class, RawComparator.class);
    // // job.setSortComparatorClass(SecondGroupingComparator.class);
    // // the map output is IntPair, IntWritable
    // job.setMapOutputKeyClass(IntPair.class);
    // job.setMapOutputValueClass(IntWritable.class);
    //
    // // the reduce output is Text, IntWritable
    // job.setOutputKeyClass(Text.class);
    // job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(jobConf, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs[1]));

    // submit the configured job and block until it completes
    JobClient.runJob(jobConf);
}

From source file: com.hazelcast.jet.hadoop.impl.ReadHdfsPTest.java

License: Open Source License

@Before
public void setup() throws IOException {
    instance = createJetMember();
    jobConf = new JobConf();
    jobConf.setInputFormat(inputFormatClass);

    writeToFile();
    for (Path path : paths) {
        FileInputFormat.addInputPath(jobConf, path);
    }
}

From source file: com.hazelcast.jet.hadoop.impl.WriteHdfsPTest.java

License: Open Source License

@Test
public void testWriteFile() throws Exception {
    int messageCount = 320;
    String mapName = randomMapName();
    JetInstance instance = createJetMember();
    createJetMember();

    Map<IntWritable, IntWritable> map = IntStream.range(0, messageCount).boxed()
            .collect(toMap(IntWritable::new, IntWritable::new));
    instance.getMap(mapName).putAll(map);

    Path path = getPath();

    JobConf conf = new JobConf();
    conf.setOutputFormat(outputFormatClass);
    conf.setOutputCommitter(FileOutputCommitter.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);

    if (outputFormatClass.equals(LazyOutputFormat.class)) {
        LazyOutputFormat.setOutputFormatClass(conf, TextOutputFormat.class);
    }

    FileOutputFormat.setOutputPath(conf, path);

    Pipeline p = Pipeline.create();
    p.drawFrom(Sources.map(mapName)).drainTo(HdfsSinks.hdfs(conf))
            // we use higher value to increase the race chance for LazyOutputFormat
            .setLocalParallelism(8);

    Future<Void> future = instance.newJob(p).getFuture();
    assertCompletesEventually(future);

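    // read the output back through the matching input format and verify the record count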
    JobConf readJobConf = new JobConf();
    readJobConf.setInputFormat(inputFormatClass);
    FileInputFormat.addInputPath(readJobConf, path);

    p = Pipeline.create();
    p.drawFrom(HdfsSources.hdfs(readJobConf)).drainTo(Sinks.list("results"));

    future = instance.newJob(p).getFuture();
    assertCompletesEventually(future);

    IList<Object> results = instance.getList("results");
    assertEquals(messageCount, results.size());
}

From source file: com.hazelcast.jet.impl.connector.hadoop.WriteHdfsPTest.java

License: Open Source License

@Test
public void testWriteFile() throws Exception {
    int messageCount = 20;
    String mapName = randomMapName();
    JetInstance instance = createJetMember();
    createJetMember();

    Map<IntWritable, IntWritable> map = IntStream.range(0, messageCount).boxed()
            .collect(toMap(IntWritable::new, IntWritable::new));
    instance.getMap(mapName).putAll(map);

    DAG dag = new DAG();
    Vertex producer = dag.newVertex("producer", readMap(mapName)).localParallelism(1);

    Path path = getPath();

    JobConf conf = new JobConf();
    conf.setOutputFormat(outputFormatClass);
    conf.setOutputCommitter(FileOutputCommitter.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setOutputPath(conf, path);

    Vertex consumer = dag.newVertex("consumer", writeHdfs(conf)).localParallelism(4);

    dag.edge(between(producer, consumer));

    Future<Void> future = instance.newJob(dag).execute();
    assertCompletesEventually(future);

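    // second DAG: read the written files back and collect the records into an IList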
    dag = new DAG();
    JobConf readJobConf = new JobConf();
    readJobConf.setInputFormat(inputFormatClass);
    FileInputFormat.addInputPath(readJobConf, path);
    producer = dag.newVertex("producer", readHdfs(readJobConf)).localParallelism(8);

    consumer = dag.newVertex("consumer", writeList("results")).localParallelism(1);

    dag.edge(between(producer, consumer));
    future = instance.newJob(dag).execute();
    assertCompletesEventually(future);

    IList<Object> results = instance.getList("results");
    assertEquals(messageCount, results.size());
}

From source file: com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerLocal.java

License: Open Source License

/**
 * @param fname
 * @param fnameStaging
 * @param fnameNew
 * @param brlen
 * @param bclen
 * @throws DMLRuntimeException
 */
private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen,
        int brlen, int bclen) throws DMLRuntimeException {
    long row = -1;
    long col = -1;

    try {
        //STEP 1: read matrix from HDFS and write blocks to local staging area         
        //check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);

        LinkedList<Cell> buffer = new LinkedList<Cell>();
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    st.reset(value.toString()); //reset tokenizer
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();
                    Cell tmp = new Cell(row, col, lvalue);

                    buffer.addLast(tmp);
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                    {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }

                //final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }

        //STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging,
                        fnamesPartitions, start, end));
                threads[i].start();
            }

            for (Thread t : threads)
                t.join();
        } else {
            for (String pdir : fnamesPartitions)
                writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        //post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
    }
}

From source file: com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java

License: Open Source License

/**
 * @param fnameNew
 * @param outMo
 * @param inMO
 * @throws DMLRuntimeException
 */
private void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO)
        throws DMLRuntimeException {
    try {
        //delete target file if already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);

        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            return; //we're done
        }

        //actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(fnameNew);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

        String valueStr = null;

        try {
            for (MatrixObject in : inMO) //read/write all inputs
            {
                LOG.trace("ResultMerge (local, file): Merge input " + in.getVarName() + " (fname="
                        + in.getFileName() + ") via stream merge");

                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);

                LongWritable key = new LongWritable();
                Text value = new Text();

                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob,
                            Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        if (reader != null)
                            reader.close();
                    }
                }
            }
        } finally {
            if (out != null)
                out.close();
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}

From source file: com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java

License: Open Source License

/**
 * @param fnameStaging
 * @param mo
 * @param ID
 * @throws IOException
 * @throws DMLRuntimeException
 */

private void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID)
        throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LinkedList<Cell> buffer = new LinkedList<Cell>();
    LongWritable key = new LongWritable();
    Text value = new Text();

    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    //long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    //NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
    // errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
    // It works fine with int row, col but we require long for larger matrices.
    // Since, textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode:binarycell)
    // we just propose the to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)

    FastStringTokenizer st = new FastStringTokenizer(' ');

    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                st.reset(value.toString()); //reset tokenizer
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());

                Cell tmp = new Cell(row, col, lvalue);

                buffer.addLast(tmp);
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }

            //final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            if (reader != null)
                reader.close();
        }
    }
}

From source file: com.ibm.bi.dml.runtime.io.ReaderBinaryBlockParFiles.java

License: Open Source License

/**
 * @param path
 * @param job
 * @param fs 
 * @param dest
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws IOException
 * @throws IllegalAccessException
 * @throws InstantiationException
 * @throws DMLRuntimeException 
 */
private static void readBinaryBlockMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest,
        long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException {
    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    FileInputFormat.addInputPath(job, path);

    ExecutorService pool = Executors.newFixedThreadPool(_numThreads);
    try {
        //create read tasks for all splits
        ArrayList<ReadMatrixPerPartfileTask> tasks = new ArrayList<ReadMatrixPerPartfileTask>();
        for (Path lpath : getSequenceFilePaths(fs, path)) {
            ReadMatrixPerPartfileTask t = new ReadMatrixPerPartfileTask(lpath, job, fs, dest, rlen, clen, brlen,
                    bclen);
            tasks.add(t);
        }

        //wait until all tasks have been executed
        pool.invokeAll(tasks);
        pool.shutdown();

        //early error notify in case not all tasks successful
        for (ReadMatrixPerPartfileTask rt : tasks) {
            if (!rt.getReturnCode()) {
                throw new IOException("Read task for text input failed: " + rt.getErrMsg());
            }
        }

    } catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}

From source file: com.ibm.bi.dml.runtime.io.ReaderBinaryBlockParSplits.java

License: Open Source License

/**
 * @param path
 * @param job
 * @param fs 
 * @param dest
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws IOException
 * @throws IllegalAccessException
 * @throws InstantiationException
 * @throws DMLRuntimeException 
 */
private static void readBinaryBlockMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest,
        long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException {
    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    FileInputFormat.addInputPath(job, path);
    /*      
          BinaryBlockInputFormat informat = new BinaryBlockInputFormat();
          TextInputFormat informat = new TextInputFormat();
          informat.configure(job);
          InputSplit[] splits = informat.getSplits(job, _numThreads);
            
          SequenceFileInputFormat<MatrixIndexes, MatrixBlock> informat = new SequenceFileInputFormat<MatrixIndexes, MatrixBlock>();
          InputSplit[] seqsplits = informat.getSplits(job, _numThreads);
    */
    BinaryBlockInputFormat informat = new BinaryBlockInputFormat();
    InputSplit[] seqsplits = informat.getSplits(job, _numThreads);

    ExecutorService pool = Executors.newFixedThreadPool(_numThreads);
    try {
        //create read tasks for all splits
        ArrayList<ReadMatrixPerSplitTask> tasks = new ArrayList<ReadMatrixPerSplitTask>();

        for (InputSplit split : seqsplits) {
            ReadMatrixPerSplitTask t = new ReadMatrixPerSplitTask(split, informat, job, dest, rlen, clen, brlen,
                    bclen);
            tasks.add(t);
        }

        //wait until all tasks have been executed
        pool.invokeAll(tasks);
        pool.shutdown();

        //early error notify in case not all tasks successful
        for (ReadMatrixPerSplitTask rt : tasks) {
            if (!rt.getReturnCode()) {
                throw new IOException("Read task for text input failed: " + rt.getErrMsg());
            }
        }

    } catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}

From source file: com.ibm.bi.dml.runtime.io.ReaderTextCell.java

License: Open Source License

/**
 * @param path
 * @param job
 * @param dest
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws IOException
 * @throws IllegalAccessException
 * @throws InstantiationException
 */
private void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen,
        int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LongWritable key = new LongWritable();
    Text value = new Text();
    int row = -1;
    int col = -1;

    try {
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);

            try {
                if (sparse) //SPARSE<-value
                {
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); //reinit tokenizer
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.appendValue(row, col, lvalue);
                    }

                    dest.sortSparseRows();
                } else //DENSE<-value
                {
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); //reinit tokenizer
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.setValueDenseUnsafe(row, col, lvalue);
                    }
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }
    } catch (Exception ex) {
        //post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen) {
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else {
            throw new IOException("Unable to read matrix in text cell format.", ex);
        }
    }
}