Example usage for org.apache.hadoop.mapred FileInputFormat addInputPath

Introduction

On this page you can find example usages of org.apache.hadoop.mapred FileInputFormat addInputPath.

Prototype

public static void addInputPath(JobConf conf, Path path) 

Document

Add a Path to the list of inputs for the map-reduce job.
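
Before the excerpts below, a minimal sketch of typical usage with the classic org.apache.hadoop.mapred API; the class name, job name, and input paths are placeholders and not taken from the examples that follow.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class AddInputPathExample {
    public static void main(String[] args) throws Exception {
        //configure a classic (org.apache.hadoop.mapred) job
        JobConf job = new JobConf(AddInputPathExample.class);
        job.setJobName("add-input-path-example"); //placeholder job name
        job.setInputFormat(TextInputFormat.class);

        //add a Path to the list of inputs for the map-reduce job;
        //repeated calls append additional input paths
        FileInputFormat.addInputPath(job, new Path("/data/input1")); //placeholder path
        FileInputFormat.addInputPath(job, new Path("/data/input2")); //placeholder path

        //the configured job could then be submitted, e.g. via JobClient.runJob(job)
    }
}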

Usage

From source file:org.apache.sysml.runtime.controlprogram.parfor.DataPartitionerLocal.java

License:Apache License

private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen,
        int brlen, int bclen) throws DMLRuntimeException {
    long row = -1;
    long col = -1;

    try {
        //STEP 1: read matrix from HDFS and write blocks to local staging area
        //check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);

        LinkedList<Cell> buffer = new LinkedList<>();
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    st.reset(value.toString()); //reset tokenizer
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();
                    Cell tmp = new Cell(row, col, lvalue);

                    buffer.addLast(tmp);
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                    {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }

                //final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }

        //STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging,
                        fnamesPartitions, start, end));
                threads[i].start();
            }

            for (Thread t : threads)
                t.join();
        } else {
            for (String pdir : fnamesPartitions)
                writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        //post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
    }
}

From source file:org.apache.sysml.runtime.controlprogram.parfor.ResultMergeLocalFile.java

License:Apache License

private static void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO)
        throws DMLRuntimeException {
    try {
        //delete target file if already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);

        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            return; //we're done
        }

        //actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fnameNew);
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

        String valueStr = null;

        try {
            for (MatrixObject in : inMO) //read/write all inputs
            {
                if (LOG.isTraceEnabled())
                    LOG.trace("ResultMerge (local, file): Merge input " + in.hashCode() + " (fname="
                            + in.getFileName() + ") via stream merge");

                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);

                LongWritable key = new LongWritable();
                Text value = new Text();

                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob,
                            Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        IOUtilFunctions.closeSilently(reader);
                    }
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}

From source file:org.apache.sysml.runtime.controlprogram.parfor.ResultMergeLocalFile.java

License:Apache License

private static void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID)
        throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LinkedList<Cell> buffer = new LinkedList<>();
    LongWritable key = new LongWritable();
    Text value = new Text();

    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    //long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    //NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
    // errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
    // It works fine with int row, col, but we require long for larger matrices.
    // Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell),
    // we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)

    FastStringTokenizer st = new FastStringTokenizer(' ');

    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                st.reset(value.toString()); //reset tokenizer
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());

                Cell tmp = new Cell(row, col, lvalue);

                buffer.addLast(tmp);
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }

            //final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
}

From source file:org.apache.sysml.runtime.io.FrameReaderTextCell.java

License:Apache License

protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest,
        ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    if (fs.isDirectory(path)) {
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        for (InputSplit split : splits)
            readTextCellFrameFromInputSplit(split, informat, job, dest);
    } else {
        readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
    }
}

From source file:org.apache.sysml.runtime.io.FrameReaderTextCellParallel.java

License:Apache License

@Override
protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest,
        ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    int numThreads = OptimizerUtils.getParallelTextReadParallelism();

    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    try {
        //create read tasks for all splits
        ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        InputSplit[] splits = informat.getSplits(job, numThreads);
        ArrayList<ReadTask> tasks = new ArrayList<>();
        for (InputSplit split : splits)
            tasks.add(new ReadTask(split, informat, job, dest));

        //wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();

        //check for exceptions
        for (Future<Object> task : rt)
            task.get();
    } catch (Exception e) {
        throw new IOException("Failed parallel read of text cell input.", e);
    }
}

From source file:org.apache.sysml.runtime.io.FrameReaderTextCSV.java

License:Apache License

@Override
public final FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] names, long rlen,
        long clen) throws IOException, DMLRuntimeException {
    //prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fname);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    FileInputFormat.addInputPath(job, path);

    //check existence and non-empty file
    checkValidInputFile(fs, path);

    //compute size if necessary
    if (rlen <= 0 || clen <= 0) {
        Pair<Integer, Integer> size = computeCSVSize(path, job, fs);
        rlen = size.getKey();
        clen = size.getValue();
    }

    //allocate output frame block
    ValueType[] lschema = createOutputSchema(schema, clen);
    String[] lnames = createOutputNames(names, clen);
    FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen);

    //core read (sequential/parallel) 
    readCSVFrameFromHDFS(path, job, fs, ret, lschema, lnames, rlen, clen);

    return ret;
}

From source file:org.apache.sysml.runtime.io.ReaderTextCell.java

License:Apache License

private static void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen,
        int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LongWritable key = new LongWritable();
    Text value = new Text();
    int row = -1;
    int col = -1;

    try {
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);

            try {
                if (sparse) //SPARSE<-value
                {
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); //reinit tokenizer
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.appendValue(row, col, lvalue);
                    }

                    dest.sortSparseRows();
                } else //DENSE<-value
                {
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); //reinit tokenizer
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.setValueDenseUnsafe(row, col, lvalue);
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
    } catch (Exception ex) {
        //post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen)
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        else
            throw new IOException("Unable to read matrix in text cell format.", ex);
    }
}

From source file:org.apache.sysml.runtime.io.ReaderTextCellParallel.java

License:Apache License

private void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen,
        int brlen, int bclen, boolean matrixMarket) throws IOException {
    int par = _numThreads;

    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    //check for min file size for matrix market (adjust num splits if necessary)
    if (_isMMFile) {
        long len = MapReduceTool.getFilesizeOnHDFS(path);
        par = (len < MIN_FILESIZE_MM) ? 1 : par;
    }

    try {
        //create read tasks for all splits
        ExecutorService pool = Executors.newFixedThreadPool(par);
        InputSplit[] splits = informat.getSplits(job, par);
        ArrayList<ReadTask> tasks = new ArrayList<>();
        for (InputSplit split : splits) {
            ReadTask t = new ReadTask(split, informat, job, dest, rlen, clen, matrixMarket);
            tasks.add(t);
        }

        //wait until all tasks have been executed
        List<Future<Long>> rt = pool.invokeAll(tasks);

        //check for exceptions and aggregate nnz
        long lnnz = 0;
        for (Future<Long> task : rt)
            lnnz += task.get();

        //post-processing
        dest.setNonZeros(lnnz);
        if (dest.isInSparseFormat())
            sortSparseRowsParallel(dest, rlen, _numThreads, pool);

        pool.shutdown();
    } catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}

From source file:org.apache.sysml.runtime.io.ReaderTextCSVParallel.java

License:Apache License

@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
        throws IOException, DMLRuntimeException {
    // prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fname);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);

    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    InputSplit[] splits = informat.getSplits(job, _numThreads);
    splits = IOUtilFunctions.sortInputSplits(splits);

    // check existence and non-empty file
    checkValidInputFile(fs, path);

    // allocate output matrix block
    // First Read Pass (count rows/cols, determine offsets, allocate matrix block)
    MatrixBlock ret = computeCSVSizeAndCreateOutputMatrixBlock(splits, path, job, _props.hasHeader(),
            _props.getDelim(), estnnz);
    rlen = ret.getNumRows();
    clen = ret.getNumColumns();

    // Second Read Pass (read, parse strings, append to matrix block)
    readCSVMatrixFromHDFS(splits, path, job, ret, rlen, clen, brlen, bclen, _props.hasHeader(),
            _props.getDelim(), _props.isFill(), _props.getFillValue());

    //post-processing (representation-specific, change of sparse/dense block representation)
    // - no sorting required for CSV because it is read in sorted order per row
    // - nnz explicitly maintained in parallel for the individual splits
    ret.examSparsity();

    // sanity check for parallel row count (since determined internally)
    if (rlen > 0 && rlen != ret.getNumRows())
        throw new DMLRuntimeException("Read matrix inconsistent with given meta data: " + "expected nrow="
                + rlen + ", real nrow=" + ret.getNumRows());

    return ret;
}

From source file:org.apache.sysml.runtime.io.ReaderTextCSVParallel.java

License:Apache License

private void readCSVMatrixFromHDFS(InputSplit[] splits, Path path, JobConf job, MatrixBlock dest, long rlen,
        long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue)
        throws IOException {
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    ExecutorService pool = Executors.newFixedThreadPool(_numThreads);

    try {
        // create read tasks for all splits
        ArrayList<CSVReadTask> tasks = new ArrayList<>();
        int splitCount = 0;
        for (InputSplit split : splits) {
            tasks.add(new CSVReadTask(split, _offsets, informat, job, dest, rlen, clen, hasHeader, delim, fill,
                    fillValue, splitCount++));
        }
        pool.invokeAll(tasks);
        pool.shutdown();

        // check return codes and aggregate nnz
        long lnnz = 0;
        for (CSVReadTask rt : tasks) {
            lnnz += rt.getPartialNnz();
            if (!rt.getReturnCode()) {
                Exception err = rt.getException();
                throw new IOException("Read task for csv input failed: " + err.toString(), err);
            }
        }
        dest.setNonZeros(lnnz);
    } catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}