Example usage for org.apache.hadoop.fs FileSystem isDirectory

List of usage examples for org.apache.hadoop.fs FileSystem isDirectory

Introduction

This page presents usage examples for org.apache.hadoop.fs.FileSystem#isDirectory, collected from open source projects.

Prototype

@Deprecated
public boolean isDirectory(Path f) throws IOException 

Document

True iff the named path is a directory.
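
Note that the method is marked @Deprecated: the Hadoop javadoc recommends using FileSystem#getFileStatus(Path) together with FileStatus#isDirectory() instead. Below is a minimal sketch of both styles, assuming a hypothetical path on the default file system; the deprecated convenience method returns false for a path that does not exist, which the getFileStatus variant must handle explicitly.

import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryExample {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        Path p = new Path("/tmp/data"); // hypothetical path

        // deprecated convenience method: returns false if the path does not exist
        boolean isDir = fs.isDirectory(p);

        // non-deprecated equivalent via getFileStatus()
        boolean isDirToo;
        try {
            isDirToo = fs.getFileStatus(p).isDirectory();
        } catch (FileNotFoundException e) {
            isDirToo = false; // path does not exist
        }

        System.out.println(isDir + " " + isDirToo);
    }
}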

Usage

From source file:com.ibm.bi.dml.runtime.io.MatrixReader.java

License:Open Source License

/**
 * Collects the paths of all sequence files under the given path,
 * skipping internal files whose names start with "_".
 * 
 * @param fs file system handle
 * @param file file or directory to enumerate
 * @return paths of the sequence files found
 * @throws IOException if the file system cannot be accessed
 */
public static Path[] getSequenceFilePaths(FileSystem fs, Path file) throws IOException {
    Path[] ret = null;

    if (fs.isDirectory(file)) {
        LinkedList<Path> tmp = new LinkedList<Path>();
        FileStatus[] dStatus = fs.listStatus(file);
        for (FileStatus fdStatus : dStatus)
            if (!fdStatus.getPath().getName().startsWith("_")) //skip internal files
                tmp.add(fdStatus.getPath());
        ret = tmp.toArray(new Path[0]);
    } else {
        ret = new Path[] { file };
    }

    return ret;
}
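
As a usage sketch (the input path is hypothetical), the helper above enumerates either the single input file or the non-internal part files of a directory:

JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
FileSystem fs = FileSystem.get(job);
Path input = new Path("/user/hypothetical/matrix"); // hypothetical input
for (Path p : MatrixReader.getSequenceFilePaths(fs, input))
    System.out.println(p);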

From source file:com.ibm.bi.dml.runtime.io.ReaderTextCell.java

License:Open Source License

@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
        throws IOException, DMLRuntimeException {
    //allocate output matrix block
    MatrixBlock ret = createOutputMatrixBlock(rlen, clen, estnnz, true, false);

    //prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);

    //check existence and non-empty file
    checkValidInputFile(fs, path);

    //core read 
    if (fs.isDirectory(path))
        readTextCellMatrixFromHDFS(path, job, ret, rlen, clen, brlen, bclen);
    else
        readRawTextCellMatrixFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen, _isMMFile);

    //finally check if change of sparse/dense block representation required
    if (!ret.isInSparseFormat())
        ret.recomputeNonZeros();
    ret.examSparsity();

    return ret;
}

From source file:com.ibm.bi.dml.runtime.io.ReaderTextCSV.java

License:Open Source License

/**
 * Reads a CSV matrix from HDFS into the given destination block.
 * 
 * @param path file or directory of CSV part files
 * @param job job configuration
 * @param fs file system handle
 * @param dest destination matrix block (computed from the input if null)
 * @param rlen number of rows
 * @param clen number of columns
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @param hasHeader whether the first line is a header
 * @param delim column delimiter
 * @param fill whether to fill empty fields
 * @param fillValue value used for empty fields when fill is enabled
 * @return the populated matrix block
 * @throws IOException if reading fails
 */
@SuppressWarnings("unchecked")
private MatrixBlock readCSVMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen,
        long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue)
        throws IOException {
    ArrayList<Path> files = new ArrayList<Path>();
    if (fs.isDirectory(path)) {
        for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);
    } else
        files.add(path);

    if (dest == null) {
        dest = computeCSVSize(files, job, fs, hasHeader, delim, fill, fillValue);
        clen = dest.getNumColumns();
    }

    boolean sparse = dest.isInSparseFormat();

    /////////////////////////////////////////
    String value = null;
    int row = 0;
    int col = -1;
    double cellValue = 0;
    long lnnz = 0;

    for (int fileNo = 0; fileNo < files.size(); fileNo++) {
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
        if (fileNo == 0 && hasHeader)
            br.readLine(); //ignore header

        // Read the data
        boolean emptyValuesFound = false;
        try {
            if (sparse) //SPARSE<-value
            {
                while ((value = br.readLine()) != null) //foreach line
                {
                    String cellStr = value.trim();
                    emptyValuesFound = false;
                    String[] parts = IOUtilFunctions.split(cellStr, delim);
                    col = 0;

                    for (String part : parts) //foreach cell
                    {
                        part = part.trim();
                        if (part.isEmpty()) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        } else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }
                        if (cellValue != 0) {
                            dest.appendValue(row, col, cellValue);
                            lnnz++;
                        }
                        col++;
                    }

                    //sanity checks for empty values and number of columns
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen);
                    row++;
                }
            } else //DENSE<-value
            {
                while ((value = br.readLine()) != null) //foreach line
                {
                    String cellStr = value.trim();
                    emptyValuesFound = false;
                    String[] parts = IOUtilFunctions.split(cellStr, delim);
                    col = 0;

                    for (String part : parts) //foreach cell
                    {
                        part = part.trim();
                        if (part.isEmpty()) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        } else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }
                        if (cellValue != 0) {
                            dest.setValueDenseUnsafe(row, col, cellValue);
                            lnnz++;
                        }
                        col++;
                    }

                    //sanity checks for empty values and number of columns
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen);
                    row++;
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(br);
        }
    }

    //post processing
    dest.setNonZeros(lnnz);

    return dest;
}

From source file:com.ibm.bi.dml.runtime.io.WriterTextCSV.java

License:Open Source License

/**
 * Method to merge multiple CSV part files on HDFS into a single CSV file on HDFS. 
 * The part files are created by the CSV_WRITE MR job. 
 * 
 * This method is invoked from the CP-write instruction.
 * 
 * @param srcFileName source file or directory of part files
 * @param destFileName destination of the merged CSV file
 * @param csvprop CSV format properties (header, delimiter)
 * @param rlen number of rows
 * @param clen number of columns
 * @throws IOException if merging fails
 */
public void mergeCSVPartFiles(String srcFileName, String destFileName, CSVFileFormatProperties csvprop,
        long rlen, long clen) throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path mergedFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (hdfs.exists(mergedFilePath)) {
        hdfs.delete(mergedFilePath, true);
    }
    OutputStream out = hdfs.create(mergedFilePath, true);

    // write out the header, if needed
    if (csvprop.hasHeader()) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < clen; i++) {
            sb.append("C" + (i + 1));
            if (i < clen - 1)
                sb.append(csvprop.getDelim());
        }
        sb.append('\n');
        out.write(sb.toString().getBytes());
        sb.setLength(0);
    }

    // if the source is a directory
    if (hdfs.isDirectory(srcFilePath)) {
        try {
            FileStatus[] contents = hdfs.listStatus(srcFilePath);
            // collect part files into a list; the original fixed-size array
            // left null slots for subdirectories, which would break Arrays.sort
            ArrayList<Path> partPaths = new ArrayList<Path>();
            for (int i = 0; i < contents.length; i++) {
                if (!contents[i].isDirectory())
                    partPaths.add(contents[i].getPath());
            }
            Collections.sort(partPaths);
            int numPartFiles = partPaths.size();

            for (int i = 0; i < numPartFiles; i++) {
                InputStream in = hdfs.open(partPaths.get(i));
                try {
                    IOUtils.copyBytes(in, out, conf, false);
                    if (i < numPartFiles - 1)
                        out.write('\n');
                } finally {
                    IOUtilFunctions.closeSilently(in);
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } else if (hdfs.isFile(srcFilePath)) {
        InputStream in = null;
        try {
            in = hdfs.open(srcFilePath);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
}

From source file:com.ibm.bi.dml.runtime.io.WriterTextCSV.java

License:Open Source License

/**
 * Method to prepend a header line to a CSV file (or directory of part files) on HDFS.
 * 
 * @param srcFileName source file or directory
 * @param destFileName destination file
 * @param rlen number of rows
 * @param clen number of columns
 * @throws IOException if the header cannot be added
 */
@SuppressWarnings("unchecked")
public void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen) throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path destFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (!_props.hasHeader()) {
        // simply move srcFile to destFile

        /*
         * TODO: Remove this roundabout way! 
         * For example: destFilePath = /user/biadmin/csv/temp/out/file.csv 
         *              & the only path that exists already on HDFS is /user/biadmin/csv/.
         * In this case: the directory structure /user/biadmin/csv/temp/out must be created. 
         * Simple hdfs.rename() does not seem to create this directory structure.
         */

        // delete the destination file, if it already exists
        hdfs.delete(destFilePath, true);

        // create /user/biadmin/csv/temp/out/file.csv so that .../temp/out/ is created
        hdfs.createNewFile(destFilePath);

        // delete the file "file.csv" but preserve the directory structure /user/biadmin/csv/temp/out/
        hdfs.delete(destFilePath, true);

        // finally, move the data to destFilePath = /user/biadmin/csv/temp/out/file.csv
        hdfs.rename(srcFilePath, destFilePath);

        return;
    }

    // construct the header line
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
        sb.append("C" + (i + 1));
        if (i < clen - 1)
            sb.append(_props.getDelim());
    }
    sb.append('\n');

    if (hdfs.isDirectory(srcFilePath)) {

        // compute sorted order among part files
        ArrayList<Path> files = new ArrayList<Path>();
        for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);

        // first part file path
        Path firstpart = files.get(0);

        // create a temp file, and add header and contents of first part
        Path tmp = new Path(firstpart.toString() + ".tmp");
        OutputStream out = hdfs.create(tmp, true);
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy rest of the data from firstpart
        InputStream in = null;
        try {
            in = hdfs.open(firstpart);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }

        // rename tmp to firstpart
        hdfs.delete(firstpart, true);
        hdfs.rename(tmp, firstpart);

        // rename srcfile to destFile
        hdfs.delete(destFilePath, true);
        hdfs.createNewFile(destFilePath); // force the creation of directory structure
        hdfs.delete(destFilePath, true); // delete the file, but preserve the directory structure
        hdfs.rename(srcFilePath, destFilePath); // move the data 

    } else if (hdfs.isFile(srcFilePath)) {
        // create destination file
        OutputStream out = hdfs.create(destFilePath, true);

        // write header
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy the data from srcFile
        InputStream in = null;
        try {
            in = hdfs.open(srcFilePath);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
}

From source file:com.ibm.bi.dml.runtime.matrix.CSVReblockMR.java

License:Open Source License

/**
 * Method to find the lexicographically smallest (part)file among all (part)files under <code>inputPath</code>.
 * 
 * @param job job configuration
 * @param inputPath file or directory to search
 * @return path of the smallest file (empty string if the directory has no visible files)
 * @throws IOException 
 * @throws FileNotFoundException 
 */
public static String findSmallestFile(JobConf job, String inputPath) throws FileNotFoundException, IOException {

    String smallestFile = null;

    Path p = new Path(inputPath);
    FileSystem fs = p.getFileSystem(job);
    if (!fs.isDirectory(p))
        smallestFile = p.makeQualified(fs).toString();
    else {
        FileStatus[] stats = fs.listStatus(p, hiddenFileFilter);
        if (stats.length == 0)
            smallestFile = "";
        else {
            smallestFile = stats[0].getPath().toString();
            for (int j = 1; j < stats.length; j++) {
                String f = stats[j].getPath().toString();
                if (f.compareTo(smallestFile) < 0)
                    smallestFile = f;
            }
        }
    }
    return smallestFile;
}

From source file:com.ibm.bi.dml.runtime.matrix.CSVReblockMR.java

License:Open Source License

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, String reblockInstructions, String otherInstructionsInReducer,
        int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos)
        throws Exception {
    String[] smallestFiles = new String[inputs.length];
    JobConf job = new JobConf();

    // determine the smallest (part)file per input (same logic as findSmallestFile above)
    for (int i = 0; i < inputs.length; i++) {
        Path p = new Path(inputs[i]);
        FileSystem fs = p.getFileSystem(job);
        if (!fs.isDirectory(p))
            smallestFiles[i] = p.makeQualified(fs).toString();
        else {
            FileStatus[] stats = fs.listStatus(p, hiddenFileFilter);
            if (stats.length == 0)
                smallestFiles[i] = "";
            else {
                smallestFiles[i] = stats[0].getPath().toString();
                for (int j = 1; j < stats.length; j++) {
                    String f = stats[j].getPath().toString();
                    if (f.compareTo(smallestFiles[i]) < 0)
                        smallestFiles[i] = f;
                }
            }
        }
    }

    AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(inputs, inputInfos, brlens, bclens,
            reblockInstructions, replication, smallestFiles);
    for (int i = 0; i < rlens.length; i++)
        if ((rlens[i] > 0 && rlens[i] != ret1.rlens[i]) || (clens[i] > 0 && clens[i] != ret1.clens[i]))
            throw new RuntimeException("Dimensions don't match for input matrix " + i + ", expected ("
                    + rlens[i] + ", " + clens[i] + ") but found (" + ret1.rlens[i] + ", " + ret1.clens[i] + ")");
    JobReturn ret = CSVReblockMR.runCSVReblockJob(null, inputs, inputInfos, ret1.rlens, ret1.clens, brlens,
            bclens, reblockInstructions, otherInstructionsInReducer, numReducers, replication, resultIndexes,
            outputs, outputInfos, ret1.counterFile, smallestFiles);
    return ret;
}

From source file:com.ibm.bi.dml.runtime.transform.BinAgent.java

License:Open Source License

/**
 * Method to load transform metadata for all binned attributes.
 * 
 * @param job job configuration
 * @param fs file system handle
 * @param txMtdDir directory containing the transform metadata
 * @param agents transform utilities
 * @throws IOException if the metadata cannot be read
 */
@Override
public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
    if (_binList == null)
        return;

    if (fs.isDirectory(txMtdDir)) {
        for (int i = 0; i < _binList.length; i++) {
            int colID = _binList[i];

            Path path = new Path(txMtdDir + "/Bin/" + agents.getName(colID) + BIN_FILE_SUFFIX);
            TfUtils.checkValidInputFile(fs, path, true);

            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
            // format: colID,min,max,binwidth,nbins
            String[] fields = br.readLine().split(TXMTD_SEP);
            double min = UtilFunctions.parseToDouble(fields[1]);
            //double max = UtilFunctions.parseToDouble(fields[2]);
            double binwidth = UtilFunctions.parseToDouble(fields[3]);
            int nbins = UtilFunctions.parseToInt(fields[4]);

            _numBins[i] = nbins;
            _min[i] = min;
            _binWidths[i] = binwidth; // (max-min)/nbins;

            br.close();
        }
    } else {
        fs.close();
        throw new RuntimeException("Path to recode maps must be a directory: " + txMtdDir);
    }
}
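
As an illustration of the field indexing above (assuming TXMTD_SEP is a comma and using made-up values), a bin metadata line lays out as colID, min, max, binwidth, nbins:

String line = "7,0.0,100.0,10.0,10";              // hypothetical metadata line
String[] fields = line.split(",");
double min      = Double.parseDouble(fields[1]);  // 0.0
// fields[2] holds max (100.0) and is skipped by the loader
double binwidth = Double.parseDouble(fields[3]);  // 10.0
int nbins       = Integer.parseInt(fields[4]);    // 10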

From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java

License:Open Source License

/**
 * Helper function to fetch and sort the list of part files under the given
 * input directory.
 * 
 * @param input file or directory path
 * @param fs file system handle
 * @return sorted list of input file paths
 * @throws FileNotFoundException
 * @throws IOException
 */
@SuppressWarnings("unchecked")
private static ArrayList<Path> collectInputFiles(String input, FileSystem fs)
        throws FileNotFoundException, IOException {
    Path path = new Path(input);
    ArrayList<Path> files = new ArrayList<Path>();
    if (fs.isDirectory(path)) {
        for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);
    } else
        files.add(path);

    return files;
}

From source file:com.ibm.bi.dml.runtime.transform.MVImputeAgent.java

License:Open Source License

/**
 * Method to load transform metadata for all imputed and scaled attributes.
 * 
 * @param job job configuration
 * @param fs file system handle
 * @param tfMtdDir directory containing the transform metadata
 * @param agents transform utilities
 * @throws IOException if the metadata cannot be read
 */
@Override
public void loadTxMtd(JobConf job, FileSystem fs, Path tfMtdDir, TfUtils agents) throws IOException {

    if (fs.isDirectory(tfMtdDir)) {

        // Load information about missing value imputation
        if (_mvList != null)
            for (int i = 0; i < _mvList.length; i++) {
                int colID = _mvList[i];

                if (_mvMethodList[i] == 1 || _mvMethodList[i] == 2)
                    // global_mean or global_mode
                    _replacementList[i] = readReplacement(colID, fs, tfMtdDir, agents);
                else if (_mvMethodList[i] == 3) {
                    // constant: replace a missing value by a given constant
                    // nothing to do. The constant values are loaded already during configure 
                } else
                    throw new RuntimeException("Invalid Missing Value Imputation methods: " + _mvMethodList[i]);
            }

        // Load scaling information
        if (_mvList != null)
            for (int i = 0; i < _mvList.length; i++)
                if (_isMVScaled.get(i))
                    processScalingFile(i, _mvList, _meanList, _varList, fs, tfMtdDir, agents);

        if (_scnomvList != null)
            for (int i = 0; i < _scnomvList.length; i++)
                processScalingFile(i, _scnomvList, _scnomvMeanList, _scnomvVarList, fs, tfMtdDir, agents);
    } else {
        fs.close();
        throw new RuntimeException("Path to transform metadata must be a directory: " + tfMtdDir);
    }
}