List of usage examples for org.apache.hadoop.mapred.FileInputFormat#addInputPath
public static void addInputPath(JobConf conf, Path path)
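All examples below follow the same basic pattern: register the input path on a JobConf via addInputPath, configure a TextInputFormat against that configuration, then obtain splits and record readers. The following is a minimal, self-contained sketch of that pattern (not taken from any of the source files below); the class name and the input path /data/input.txt are hypothetical placeholders.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class AddInputPathExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();
        Path path = new Path("/data/input.txt"); //hypothetical input path

        //register the input path on the job configuration
        FileInputFormat.addInputPath(job, path);

        //configure the input format and compute splits
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);

        //read all records of all splits, line by line
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value))
                    System.out.println(value.toString());
            } finally {
                reader.close();
            }
        }
    }
}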
From source file:org.apache.sysml.runtime.controlprogram.parfor.DataPartitionerLocal.java
License:Apache License
private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen,
        int brlen, int bclen) throws DMLRuntimeException {
    long row = -1;
    long col = -1;
    try {
        //STEP 1: read matrix from HDFS and write blocks to local staging area
        //check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);

        LinkedList<Cell> buffer = new LinkedList<>();
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    st.reset(value.toString()); //reset tokenizer
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();
                    Cell tmp = new Cell(row, col, lvalue);
                    buffer.addLast(tmp);
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                    {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }
                //final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }

        //STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging,
                        fnamesPartitions, start, end));
                threads[i].start();
            }
            for (Thread t : threads)
                t.join();
        } else {
            for (String pdir : fnamesPartitions)
                writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        //post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
    }
}
From source file:org.apache.sysml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Apache License
private static void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO)
        throws DMLRuntimeException {
    try {
        //delete target file if already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);

        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            return; //we're done
        }

        //actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fnameNew);
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

        String valueStr = null;

        try {
            for (MatrixObject in : inMO) //read/write all inputs
            {
                if (LOG.isTraceEnabled())
                    LOG.trace("ResultMerge (local, file): Merge input " + in.hashCode() + " (fname="
                            + in.getFileName() + ") via stream merge");

                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);

                LongWritable key = new LongWritable();
                Text value = new Text();

                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob,
                            Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        IOUtilFunctions.closeSilently(reader);
                    }
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}
From source file:org.apache.sysml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Apache License
private static void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID)
        throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LinkedList<Cell> buffer = new LinkedList<>();
    LongWritable key = new LongWritable();
    Text value = new Text();

    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    //long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    //NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
    //  errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
    //  It works fine with int row, col but we require long for larger matrices.
    //  Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell)
    //  we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
    FastStringTokenizer st = new FastStringTokenizer(' ');

    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                st.reset(value.toString()); //reset tokenizer
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());

                Cell tmp = new Cell(row, col, lvalue);

                buffer.addLast(tmp);
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }

            //final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
}
From source file:org.apache.sysml.runtime.io.FrameReaderTextCell.java
License:Apache License
protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest,
        ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    if (fs.isDirectory(path)) {
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        for (InputSplit split : splits)
            readTextCellFrameFromInputSplit(split, informat, job, dest);
    } else {
        readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
    }
}
From source file:org.apache.sysml.runtime.io.FrameReaderTextCellParallel.java
License:Apache License
@Override
protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest,
        ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
    int numThreads = OptimizerUtils.getParallelTextReadParallelism();

    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    try {
        //create read tasks for all splits
        ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        InputSplit[] splits = informat.getSplits(job, numThreads);
        ArrayList<ReadTask> tasks = new ArrayList<>();
        for (InputSplit split : splits)
            tasks.add(new ReadTask(split, informat, job, dest));

        //wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();

        //check for exceptions
        for (Future<Object> task : rt)
            task.get();
    } catch (Exception e) {
        throw new IOException("Failed parallel read of text cell input.", e);
    }
}
From source file:org.apache.sysml.runtime.io.FrameReaderTextCSV.java
License:Apache License
@Override
public final FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] names, long rlen,
        long clen) throws IOException, DMLRuntimeException {
    //prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fname);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    FileInputFormat.addInputPath(job, path);

    //check existence and non-empty file
    checkValidInputFile(fs, path);

    //compute size if necessary
    if (rlen <= 0 || clen <= 0) {
        Pair<Integer, Integer> size = computeCSVSize(path, job, fs);
        rlen = size.getKey();
        clen = size.getValue();
    }

    //allocate output frame block
    ValueType[] lschema = createOutputSchema(schema, clen);
    String[] lnames = createOutputNames(names, clen);
    FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen);

    //core read (sequential/parallel)
    readCSVFrameFromHDFS(path, job, fs, ret, lschema, lnames, rlen, clen);

    return ret;
}
From source file:org.apache.sysml.runtime.io.ReaderTextCell.java
License:Apache License
private static void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen,
        int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();

    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LongWritable key = new LongWritable();
    Text value = new Text();
    int row = -1;
    int col = -1;

    try {
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                if (sparse) //SPARSE<-value
                {
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); //reinit tokenizer
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.appendValue(row, col, lvalue);
                    }
                    dest.sortSparseRows();
                } else //DENSE<-value
                {
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); //reinit tokenizer
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.setValueDenseUnsafe(row, col, lvalue);
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
    } catch (Exception ex) {
        //post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen)
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        else
            throw new IOException("Unable to read matrix in text cell format.", ex);
    }
}
From source file:org.apache.sysml.runtime.io.ReaderTextCellParallel.java
License:Apache License
private void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen,
        int brlen, int bclen, boolean matrixMarket) throws IOException {
    int par = _numThreads;

    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    //check for min file size for matrix market (adjust num splits if necessary)
    if (_isMMFile) {
        long len = MapReduceTool.getFilesizeOnHDFS(path);
        par = (len < MIN_FILESIZE_MM) ? 1 : par;
    }

    try {
        //create read tasks for all splits
        ExecutorService pool = Executors.newFixedThreadPool(par);
        InputSplit[] splits = informat.getSplits(job, par);
        ArrayList<ReadTask> tasks = new ArrayList<>();
        for (InputSplit split : splits) {
            ReadTask t = new ReadTask(split, informat, job, dest, rlen, clen, matrixMarket);
            tasks.add(t);
        }

        //wait until all tasks have been executed
        List<Future<Long>> rt = pool.invokeAll(tasks);

        //check for exceptions and aggregate nnz
        long lnnz = 0;
        for (Future<Long> task : rt)
            lnnz += task.get();

        //post-processing
        dest.setNonZeros(lnnz);
        if (dest.isInSparseFormat())
            sortSparseRowsParallel(dest, rlen, _numThreads, pool);

        pool.shutdown();
    } catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}
From source file:org.apache.sysml.runtime.io.ReaderTextCSVParallel.java
License:Apache License
@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
        throws IOException, DMLRuntimeException {
    //prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fname);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);

    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    InputSplit[] splits = informat.getSplits(job, _numThreads);
    splits = IOUtilFunctions.sortInputSplits(splits);

    //check existence and non-empty file
    checkValidInputFile(fs, path);

    //allocate output matrix block
    //First Read Pass (count rows/cols, determine offsets, allocate matrix block)
    MatrixBlock ret = computeCSVSizeAndCreateOutputMatrixBlock(splits, path, job, _props.hasHeader(),
            _props.getDelim(), estnnz);
    rlen = ret.getNumRows();
    clen = ret.getNumColumns();

    //Second Read Pass (read, parse strings, append to matrix block)
    readCSVMatrixFromHDFS(splits, path, job, ret, rlen, clen, brlen, bclen, _props.hasHeader(),
            _props.getDelim(), _props.isFill(), _props.getFillValue());

    //post-processing (representation-specific, change of sparse/dense block representation)
    // - no sorting required for CSV because it is read in sorted order per row
    // - nnz explicitly maintained in parallel for the individual splits
    ret.examSparsity();

    //sanity check for parallel row count (since determined internally)
    if (rlen > 0 && rlen != ret.getNumRows())
        throw new DMLRuntimeException("Read matrix inconsistent with given meta data: "
                + "expected nrow=" + rlen + ", real nrow=" + ret.getNumRows());

    return ret;
}
From source file:org.apache.sysml.runtime.io.ReaderTextCSVParallel.java
License:Apache License
private void readCSVMatrixFromHDFS(InputSplit[] splits, Path path, JobConf job, MatrixBlock dest, long rlen,
        long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue)
        throws IOException {
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    ExecutorService pool = Executors.newFixedThreadPool(_numThreads);

    try {
        //create read tasks for all splits
        ArrayList<CSVReadTask> tasks = new ArrayList<>();
        int splitCount = 0;
        for (InputSplit split : splits) {
            tasks.add(new CSVReadTask(split, _offsets, informat, job, dest, rlen, clen, hasHeader, delim,
                    fill, fillValue, splitCount++));
        }
        pool.invokeAll(tasks);
        pool.shutdown();

        //check return codes and aggregate nnz
        long lnnz = 0;
        for (CSVReadTask rt : tasks) {
            lnnz += rt.getPartialNnz();
            if (!rt.getReturnCode()) {
                Exception err = rt.getException();
                throw new IOException("Read task for csv input failed: " + err.toString(), err);
            }
        }
        dest.setNonZeros(lnnz);
    } catch (Exception e) {
        throw new IOException("Threadpool issue, while parallel read.", e);
    }
}