List of usage examples for org.apache.hadoop.fs FileSystem isDirectory
@Deprecated public boolean isDirectory(Path f) throws IOException
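FileSystem.isDirectory(Path) is deprecated in current Hadoop releases in favor of checking FileStatus.isDirectory(). A minimal sketch of both the deprecated call and the usual replacement (the path "/tmp/data" is only a hypothetical example):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path("/tmp/data"); // hypothetical path, adjust as needed

        // deprecated call: returns false if the path does not exist or is a regular file
        boolean isDir = fs.isDirectory(p);

        // common replacement: getFileStatus() throws FileNotFoundException for missing paths,
        // so guard with exists() (or catch the exception) before asking for the status
        boolean isDirViaStatus = fs.exists(p) && fs.getFileStatus(p).isDirectory();

        System.out.println(isDir + " " + isDirViaStatus);
    }
}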
From source file:com.ibm.bi.dml.runtime.io.MatrixReader.java
License:Open Source License
/**
 * @param fs   file system to read from
 * @param file file or directory path
 * @return     paths of all sequence (part) files, skipping internal files that start with "_"
 * @throws IOException
 */
public static Path[] getSequenceFilePaths(FileSystem fs, Path file) throws IOException {
    Path[] ret = null;

    if (fs.isDirectory(file)) {
        LinkedList<Path> tmp = new LinkedList<Path>();
        FileStatus[] dStatus = fs.listStatus(file);
        for (FileStatus fdStatus : dStatus)
            if (!fdStatus.getPath().getName().startsWith("_")) //skip internal files
                tmp.add(fdStatus.getPath());
        ret = tmp.toArray(new Path[0]);
    } else {
        ret = new Path[] { file };
    }

    return ret;
}
From source file:com.ibm.bi.dml.runtime.io.ReaderTextCell.java
License:Open Source License
@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
        throws IOException, DMLRuntimeException
{
    //allocate output matrix block
    MatrixBlock ret = createOutputMatrixBlock(rlen, clen, estnnz, true, false);

    //prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);

    //check existence and non-empty file
    checkValidInputFile(fs, path);

    //core read
    if (fs.isDirectory(path))
        readTextCellMatrixFromHDFS(path, job, ret, rlen, clen, brlen, bclen);
    else
        readRawTextCellMatrixFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen, _isMMFile);

    //finally check if change of sparse/dense block representation required
    if (!ret.isInSparseFormat())
        ret.recomputeNonZeros();
    ret.examSparsity();

    return ret;
}
From source file:com.ibm.bi.dml.runtime.io.ReaderTextCSV.java
License:Open Source License
/**
 * @param path
 * @param job
 * @param fs
 * @param dest
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @param hasHeader
 * @param delim
 * @param fill
 * @param fillValue
 * @return
 * @throws IOException
 */
@SuppressWarnings("unchecked")
private MatrixBlock readCSVMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen,
        long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue)
        throws IOException
{
    ArrayList<Path> files = new ArrayList<Path>();
    if (fs.isDirectory(path)) {
        for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);
    } else
        files.add(path);

    if (dest == null) {
        dest = computeCSVSize(files, job, fs, hasHeader, delim, fill, fillValue);
        clen = dest.getNumColumns();
    }

    boolean sparse = dest.isInSparseFormat();

    /////////////////////////////////////////
    String value = null;
    int row = 0;
    int col = -1;
    double cellValue = 0;
    long lnnz = 0;

    for (int fileNo = 0; fileNo < files.size(); fileNo++) {
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
        if (fileNo == 0 && hasHeader)
            br.readLine(); //ignore header

        // Read the data
        boolean emptyValuesFound = false;
        try {
            if (sparse) //SPARSE<-value
            {
                while ((value = br.readLine()) != null) //foreach line
                {
                    String cellStr = value.toString().trim();
                    emptyValuesFound = false;
                    String[] parts = IOUtilFunctions.split(cellStr, delim);
                    col = 0;

                    for (String part : parts) //foreach cell
                    {
                        part = part.trim();
                        if (part.isEmpty()) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        } else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }
                        if (cellValue != 0) {
                            dest.appendValue(row, col, cellValue);
                            lnnz++;
                        }
                        col++;
                    }

                    //sanity checks for empty values and number of columns
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen);
                    row++;
                }
            } else //DENSE<-value
            {
                while ((value = br.readLine()) != null) //foreach line
                {
                    String cellStr = value.toString().trim();
                    emptyValuesFound = false;
                    String[] parts = IOUtilFunctions.split(cellStr, delim);
                    col = 0;

                    for (String part : parts) //foreach cell
                    {
                        part = part.trim();
                        if (part.isEmpty()) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        } else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }
                        if (cellValue != 0) {
                            dest.setValueDenseUnsafe(row, col, cellValue);
                            lnnz++;
                        }
                        col++;
                    }

                    //sanity checks for empty values and number of columns
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen);
                    row++;
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(br);
        }
    }

    //post processing
    dest.setNonZeros(lnnz);

    return dest;
}
From source file:com.ibm.bi.dml.runtime.io.WriterTextCSV.java
License:Open Source License
/**
 * Method to merge multiple CSV part files on HDFS into a single CSV file on HDFS.
 * The part files are created by a CSV_WRITE MR job.
 *
 * This method is invoked from a CP-write instruction.
 *
 * @param srcFileName
 * @param destFileName
 * @param csvprop
 * @param rlen
 * @param clen
 * @throws IOException
 */
public void mergeCSVPartFiles(String srcFileName, String destFileName, CSVFileFormatProperties csvprop,
        long rlen, long clen) throws IOException
{
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path mergedFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (hdfs.exists(mergedFilePath)) {
        hdfs.delete(mergedFilePath, true);
    }
    OutputStream out = hdfs.create(mergedFilePath, true);

    // write out the header, if needed
    if (csvprop.hasHeader()) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < clen; i++) {
            sb.append("C" + (i + 1));
            if (i < clen - 1)
                sb.append(csvprop.getDelim());
        }
        sb.append('\n');
        out.write(sb.toString().getBytes());
        sb.setLength(0);
    }

    // if the source is a directory
    if (hdfs.isDirectory(srcFilePath)) {
        try {
            FileStatus[] contents = hdfs.listStatus(srcFilePath);
            Path[] partPaths = new Path[contents.length];
            int numPartFiles = 0;
            for (int i = 0; i < contents.length; i++) {
                if (!contents[i].isDirectory()) {
                    // collect regular (part) files only, packed at the front of the array
                    partPaths[numPartFiles] = contents[i].getPath();
                    numPartFiles++;
                }
            }
            // sort only the filled prefix to avoid null entries when subdirectories are present
            Arrays.sort(partPaths, 0, numPartFiles);

            for (int i = 0; i < numPartFiles; i++) {
                InputStream in = hdfs.open(partPaths[i]);
                try {
                    IOUtils.copyBytes(in, out, conf, false);
                    if (i < numPartFiles - 1)
                        out.write('\n');
                } finally {
                    IOUtilFunctions.closeSilently(in);
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } else if (hdfs.isFile(srcFilePath)) {
        InputStream in = null;
        try {
            in = hdfs.open(srcFilePath);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
}
From source file:com.ibm.bi.dml.runtime.io.WriterTextCSV.java
License:Open Source License
/**
 * @param srcFileName
 * @param destFileName
 * @param rlen
 * @param clen
 * @throws IOException
 */
@SuppressWarnings("unchecked")
public void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen) throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path destFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (!_props.hasHeader()) {
        // simply move srcFile to destFile
        /*
         * TODO: Remove this roundabout way!
         * For example: destFilePath = /user/biadmin/csv/temp/out/file.csv
         * & the only path that exists already on HDFS is /user/biadmin/csv/.
         * In this case: the directory structure /user/biadmin/csv/temp/out must be created.
         * Simple hdfs.rename() does not seem to create this directory structure.
         */

        // delete the destination file, if it exists already
        //boolean ret1 = hdfs.delete(destFilePath, true);
        // Create /user/biadmin/csv/temp/out/file.csv so that ..../temp/out/ is created.
        //boolean ret2 = hdfs.createNewFile(destFilePath);
        // delete the file "file.csv" but preserve the directory structure /user/biadmin/csv/temp/out/
        //boolean ret3 = hdfs.delete(destFilePath, true);
        // finally, move the data to destFilePath = /user/biadmin/csv/temp/out/file.csv
        //boolean ret4 = hdfs.rename(srcFilePath, destFilePath);
        //System.out.println("Return values = del:" + ret1 + ", createNew:" + ret2 + ", del:" + ret3 + ", rename:" + ret4);

        return;
    }

    // construct the header line
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
        sb.append("C" + (i + 1));
        if (i < clen - 1)
            sb.append(_props.getDelim());
    }
    sb.append('\n');

    if (hdfs.isDirectory(srcFilePath)) {

        // compute sorted order among part files
        ArrayList<Path> files = new ArrayList<Path>();
        for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);

        // first part file path
        Path firstpart = files.get(0);

        // create a temp file, and add header and contents of first part
        Path tmp = new Path(firstpart.toString() + ".tmp");
        OutputStream out = hdfs.create(tmp, true);
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy rest of the data from firstpart
        InputStream in = null;
        try {
            in = hdfs.open(firstpart);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }

        // rename tmp to firstpart
        hdfs.delete(firstpart, true);
        hdfs.rename(tmp, firstpart);

        // rename srcfile to destFile
        hdfs.delete(destFilePath, true);
        hdfs.createNewFile(destFilePath);       // force the creation of directory structure
        hdfs.delete(destFilePath, true);        // delete the file, but preserve the directory structure
        hdfs.rename(srcFilePath, destFilePath); // move the data
    } else if (hdfs.isFile(srcFilePath)) {
        // create destination file
        OutputStream out = hdfs.create(destFilePath, true);

        // write header
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy the data from srcFile
        InputStream in = null;
        try {
            in = hdfs.open(srcFilePath);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
}
From source file:com.ibm.bi.dml.runtime.matrix.CSVReblockMR.java
License:Open Source License
/**
 * Method to find the lexicographically smallest (part)file path among all (part)files in <code>inputPath</code>.
 *
 * @param job
 * @param inputPath
 * @return
 * @throws IOException
 * @throws FileNotFoundException
 */
public static String findSmallestFile(JobConf job, String inputPath) throws FileNotFoundException, IOException {

    String smallestFile = null;

    Path p = new Path(inputPath);
    FileSystem fs = p.getFileSystem(job);
    if (!fs.isDirectory(p))
        smallestFile = p.makeQualified(fs).toString();
    else {
        FileStatus[] stats = fs.listStatus(p, hiddenFileFilter);
        if (stats.length == 0)
            smallestFile = "";
        else {
            smallestFile = stats[0].getPath().toString();
            for (int j = 1; j < stats.length; j++) {
                String f = stats[j].getPath().toString();
                if (f.compareTo(smallestFile) < 0)
                    smallestFile = f;
            }
        }
    }
    return smallestFile;
}
From source file:com.ibm.bi.dml.runtime.matrix.CSVReblockMR.java
License:Open Source License
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, String reblockInstructions, String otherInstructionsInReducer,
        int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos)
        throws Exception
{
    String[] smallestFiles = new String[inputs.length];
    JobConf job = new JobConf();
    for (int i = 0; i < inputs.length; i++) {
        smallestFiles[i] = findSmallestFile(job, inputs[i]);
    }

    for (int i = 0; i < inputs.length; i++) {
        Path p = new Path(inputs[i]);
        FileSystem fs = p.getFileSystem(job);
        if (!fs.isDirectory(p))
            smallestFiles[i] = p.makeQualified(fs).toString();
        else {
            FileStatus[] stats = fs.listStatus(p, hiddenFileFilter);
            if (stats.length == 0)
                smallestFiles[i] = "";
            else {
                smallestFiles[i] = stats[0].getPath().toString();
                for (int j = 1; j < stats.length; j++) {
                    String f = stats[j].getPath().toString();
                    if (f.compareTo(smallestFiles[i]) < 0)
                        smallestFiles[i] = f;
                }
            }
        }
    }

    AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(inputs, inputInfos, brlens, bclens,
            reblockInstructions, replication, smallestFiles);
    for (int i = 0; i < rlens.length; i++)
        if ((rlens[i] > 0 && rlens[i] != ret1.rlens[i]) || (clens[i] > 0 && clens[i] != ret1.clens[i]))
            throw new RuntimeException("Dimensions do not match for input matrix " + i + ", expected (" + rlens[i]
                    + ", " + clens[i] + ") but found (" + ret1.rlens[i] + ", " + ret1.clens[i] + ")");

    JobReturn ret = CSVReblockMR.runCSVReblockJob(null, inputs, inputInfos, ret1.rlens, ret1.clens, brlens,
            bclens, reblockInstructions, otherInstructionsInReducer, numReducers, replication, resultIndexes,
            outputs, outputInfos, ret1.counterFile, smallestFiles);

    return ret;
}
From source file:com.ibm.bi.dml.runtime.transform.BinAgent.java
License:Open Source License
/**
 * Method to load transform metadata for all attributes
 *
 * @param job
 * @throws IOException
 */
@Override
public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
    if (_binList == null)
        return;

    if (fs.isDirectory(txMtdDir)) {
        for (int i = 0; i < _binList.length; i++) {
            int colID = _binList[i];

            Path path = new Path(txMtdDir + "/Bin/" + agents.getName(colID) + BIN_FILE_SUFFIX);
            TfUtils.checkValidInputFile(fs, path, true);

            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
            // format: colID,min,max,binwidth,nbins
            String[] fields = br.readLine().split(TXMTD_SEP);
            double min = UtilFunctions.parseToDouble(fields[1]);
            //double max = UtilFunctions.parseToDouble(fields[2]);
            double binwidth = UtilFunctions.parseToDouble(fields[3]);
            int nbins = UtilFunctions.parseToInt(fields[4]);

            _numBins[i] = nbins;
            _min[i] = min;
            _binWidths[i] = binwidth; // (max-min)/nbins;

            br.close();
        }
    } else {
        fs.close();
        throw new RuntimeException("Path to recode maps must be a directory: " + txMtdDir);
    }
}
From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java
License:Open Source License
/**
 * Helper function to fetch and sort the list of part files under the given
 * input directory.
 *
 * @param input
 * @param fs
 * @return
 * @throws FileNotFoundException
 * @throws IOException
 */
@SuppressWarnings("unchecked")
private static ArrayList<Path> collectInputFiles(String input, FileSystem fs)
        throws FileNotFoundException, IOException {
    Path path = new Path(input);
    ArrayList<Path> files = new ArrayList<Path>();
    if (fs.isDirectory(path)) {
        for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);
    } else
        files.add(path);

    return files;
}
From source file:com.ibm.bi.dml.runtime.transform.MVImputeAgent.java
License:Open Source License
/**
 * Method to load transform metadata for all attributes
 *
 * @param job
 * @throws IOException
 */
@Override
public void loadTxMtd(JobConf job, FileSystem fs, Path tfMtdDir, TfUtils agents) throws IOException {
    if (fs.isDirectory(tfMtdDir)) {

        // Load information about missing value imputation
        if (_mvList != null)
            for (int i = 0; i < _mvList.length; i++) {
                int colID = _mvList[i];

                if (_mvMethodList[i] == 1 || _mvMethodList[i] == 2)
                    // global_mean or global_mode
                    _replacementList[i] = readReplacement(colID, fs, tfMtdDir, agents);
                else if (_mvMethodList[i] == 3) {
                    // constant: replace a missing value by a given constant
                    // nothing to do. The constant values are loaded already during configure
                } else
                    throw new RuntimeException("Invalid Missing Value Imputation methods: " + _mvMethodList[i]);
            }

        // Load scaling information
        if (_mvList != null)
            for (int i = 0; i < _mvList.length; i++)
                if (_isMVScaled.get(i))
                    processScalingFile(i, _mvList, _meanList, _varList, fs, tfMtdDir, agents);

        if (_scnomvList != null)
            for (int i = 0; i < _scnomvList.length; i++)
                processScalingFile(i, _scnomvList, _scnomvMeanList, _scnomvVarList, fs, tfMtdDir, agents);
    } else {
        fs.close();
        throw new RuntimeException("Path to recode maps must be a directory: " + tfMtdDir);
    }
}