Usage examples for org.apache.hadoop.fs.FileSystem.listStatus
public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException
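Before the project-specific examples below, here is a minimal self-contained sketch of calling this overload directly. It is illustrative only: the directory paths, the filter rule, and the class name are placeholder assumptions, not taken from any of the listed projects.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListStatusSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical input directories; replace with real locations.
        Path[] inputs = { new Path("/data/in1"), new Path("/data/in2") };

        // Example filter: skip hidden entries (names starting with '.' or '_'),
        // mirroring the hidden-file filters used in the examples below.
        PathFilter visibleOnly = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                String name = path.getName();
                return !name.startsWith(".") && !name.startsWith("_");
            }
        };

        // listStatus(Path[], PathFilter) lists every given path and returns the
        // concatenated, filtered FileStatus entries.
        FileStatus[] statuses = fs.listStatus(inputs, visibleOnly);
        for (FileStatus status : statuses) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}

The single-path variant listStatus(Path, PathFilter), which most of the examples below use, behaves the same way for one directory.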
From source file:com.facebook.presto.hive.BackgroundHiveSplitLoader.java
License:Apache License
private static List<Path> getTargetPathsFromSymlink(FileSystem fileSystem, Path symlinkDir)
{
    try {
        FileStatus[] symlinks = fileSystem.listStatus(symlinkDir, HIDDEN_FILES_PATH_FILTER);
        List<Path> targets = new ArrayList<>();
        for (FileStatus symlink : symlinks) {
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(fileSystem.open(symlink.getPath()), StandardCharsets.UTF_8))) {
                CharStreams.readLines(reader).stream().map(Path::new).forEach(targets::add);
            }
        }
        return targets;
    }
    catch (IOException e) {
        throw new PrestoException(HIVE_BAD_DATA, "Error parsing symlinks from: " + symlinkDir, e);
    }
}
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.HdfsSortedOplogOrganizer.java
License:Apache License
private void initOldTmpFiles() throws IOException {
    FileSystem fs = store.getFileSystem();
    if (!fs.exists(bucketPath)) {
        return;
    }
    oldTmpFiles = new LinkedList<FileStatus>(
            Arrays.asList(fs.listStatus(bucketPath, new TmpFilePathFilter())));
}
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHFileOutputFormat.java
License:Apache License
@Override
public RecordWriter getHiveRecordWriter(final JobConf jc, final Path finalOutPath,
        Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties,
        final Progressable progressable) throws IOException {

    // Read configuration for the target path, first from jobconf, then from table properties
    String hfilePath = getFamilyPath(jc, tableProperties);
    if (hfilePath == null) {
        throw new RuntimeException("Please set " + HFILE_FAMILY_PATH + " to target location for HFiles");
    }

    // Target path's last component is also the column family name.
    final Path columnFamilyPath = new Path(hfilePath);
    final String columnFamilyName = columnFamilyPath.getName();
    final byte[] columnFamilyNameBytes = Bytes.toBytes(columnFamilyName);
    final Job job = new Job(jc);
    setCompressOutput(job, isCompressed);
    setOutputPath(job, finalOutPath);

    // Create the HFile writer
    final org.apache.hadoop.mapreduce.TaskAttemptContext tac = ShimLoader.getHadoopShims()
            .newTaskAttemptContext(job.getConfiguration(), progressable);
    final Path outputdir = FileOutputFormat.getOutputPath(tac);
    final org.apache.hadoop.mapreduce.RecordWriter<ImmutableBytesWritable, KeyValue> fileWriter =
            getFileWriter(tac);

    // Individual columns are going to be pivoted to HBase cells,
    // and for each row, they need to be written out in order
    // of column name, so sort the column names now, creating a
    // mapping to their column position. However, the first
    // column is interpreted as the row key.
    String columnList = tableProperties.getProperty("columns");
    String[] columnArray = columnList.split(",");
    final SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
    int i = 0;
    for (String columnName : columnArray) {
        if (i != 0) {
            columnMap.put(Bytes.toBytes(columnName), i);
        }
        ++i;
    }

    return new RecordWriter() {
        @Override
        public void close(boolean abort) throws IOException {
            try {
                fileWriter.close(null);
                if (abort) {
                    return;
                }
                // Move the hfiles file(s) from the task output directory to the
                // location specified by the user.
                FileSystem fs = outputdir.getFileSystem(jc);
                fs.mkdirs(columnFamilyPath);
                Path srcDir = outputdir;
                for (;;) {
                    FileStatus[] files = fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER);
                    if ((files == null) || (files.length == 0)) {
                        throw new IOException("No family directories found in " + srcDir);
                    }
                    if (files.length != 1) {
                        throw new IOException("Multiple family directories found in " + srcDir);
                    }
                    srcDir = files[0].getPath();
                    if (srcDir.getName().equals(columnFamilyName)) {
                        break;
                    }
                }
                for (FileStatus regionFile : fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER)) {
                    fs.rename(regionFile.getPath(),
                            new Path(columnFamilyPath, regionFile.getPath().getName()));
                }
                // Hive actually wants a file as task output (not a directory), so
                // replace the empty directory with an empty file to keep it happy.
                fs.delete(outputdir, true);
                fs.createNewFile(outputdir);
            } catch (InterruptedException ex) {
                throw new IOException(ex);
            }
        }

        private void writeText(Text text) throws IOException {
            // Decompose the incoming text row into fields.
            String s = text.toString();
            String[] fields = s.split("\u0001");
            assert (fields.length <= (columnMap.size() + 1));
            // First field is the row key.
            byte[] rowKeyBytes = Bytes.toBytes(fields[0]);
            // Remaining fields are cells addressed by column name within row.
            for (Map.Entry<byte[], Integer> entry : columnMap.entrySet()) {
                byte[] columnNameBytes = entry.getKey();
                int iColumn = entry.getValue();
                String val;
                if (iColumn >= fields.length) {
                    // trailing blank field
                    val = "";
                } else {
                    val = fields[iColumn];
                    if ("\\N".equals(val)) {
                        // omit nulls
                        continue;
                    }
                }
                byte[] valBytes = Bytes.toBytes(val);
                KeyValue kv = new KeyValue(rowKeyBytes, columnFamilyNameBytes, columnNameBytes, valBytes);
                try {
                    fileWriter.write(null, kv);
                } catch (IOException e) {
                    LOG.error("Failed while writing row: " + s);
                    throw e;
                } catch (InterruptedException ex) {
                    throw new IOException(ex);
                }
            }
        }

        private void writePut(PutWritable put) throws IOException {
            ImmutableBytesWritable row = new ImmutableBytesWritable(put.getPut().getRow());
            SortedMap<byte[], List<Cell>> cells = put.getPut().getFamilyCellMap();
            for (Map.Entry<byte[], List<Cell>> entry : cells.entrySet()) {
                Collections.sort(entry.getValue(), new CellComparator());
                for (Cell c : entry.getValue()) {
                    try {
                        fileWriter.write(row, KeyValueUtil.copyToNewKeyValue(c));
                    } catch (InterruptedException e) {
                        throw (InterruptedIOException) new InterruptedIOException().initCause(e);
                    }
                }
            }
        }

        @Override
        public void write(Writable w) throws IOException {
            if (w instanceof Text) {
                writeText((Text) w);
            } else if (w instanceof PutWritable) {
                writePut((PutWritable) w);
            } else {
                throw new IOException("Unexpected writable " + w);
            }
        }
    };
}
From source file:com.github.sadikovi.hadoop.riff.RiffOutputCommitter.java
License:Open Source License
private static void recurListFiles(FileSystem fs, FileStatus fileStatus, List<FileStatus> foundFiles,
        boolean fetchOneFile) throws IOException {
    if (fetchOneFile && !foundFiles.isEmpty())
        return;
    if (fileStatus.isDirectory()) {
        FileStatus[] list = fs.listStatus(fileStatus.getPath(), PartFileFilter.instance);
        for (int i = 0; i < list.length; i++) {
            recurListFiles(fs, list[i], foundFiles, fetchOneFile);
        }
    } else {
        // file status is a file, add to the list
        foundFiles.add(fileStatus);
    }
}
From source file:com.grantingersoll.intell.clustering.KMeansClusteringEngine.java
License:Apache License
private static Map<Integer, List<String>> readPoints(Path pointsPathDir, Configuration conf)
        throws IOException {
    Map<Integer, List<String>> result = new TreeMap<Integer, List<String>>();
    FileSystem fs = pointsPathDir.getFileSystem(conf);
    FileStatus[] children = fs.listStatus(pointsPathDir, new PathFilter() {
        public boolean accept(Path path) {
            String name = path.getName();
            return !(name.endsWith(".crc") || name.startsWith("_"));
        }
    });
    for (FileStatus file : children) {
        Path path = file.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            IntWritable key = reader.getKeyClass().asSubclass(IntWritable.class).newInstance();
            WeightedVectorWritable value = reader.getValueClass().asSubclass(WeightedVectorWritable.class)
                    .newInstance();
            while (reader.next(key, value)) {
                // key is the clusterId, value is a list of points
                //String clusterId = value.toString();
                List<String> pointList = result.get(key.get());
                if (pointList == null) {
                    pointList = new ArrayList<String>();
                    result.put(key.get(), pointList);
                }
                // We know we are dealing with named vectors, b/c we generated from the id field
                String name = ((NamedVector) value.getVector()).getName();
                pointList.add(name);
                //value = reader.getValueClass().asSubclass(WeightedVectorWritable.class).newInstance();
            }
        } catch (InstantiationException e) {
            log.error("Exception", e);
        } catch (IllegalAccessException e) {
            log.error("Exception", e);
        }
    }
    return result;
}
From source file:com.hadoop.mapreduce.FourMcInputFormat.java
License:BSD License
protected void addInputPath(List<FileStatus> results, FileSystem fs, FileStatus pathStat, boolean recursive)
        throws IOException {
    Path path = pathStat.getPath();
    if (pathStat.isDir()) {
        if (recursive) {
            for (FileStatus stat : fs.listStatus(path, hiddenPathFilter)) {
                addInputPath(results, fs, stat, recursive);
            }
        }
    } else if (visible4mcFilter.accept(path)) {
        results.add(pathStat);
    }
}
From source file:com.ibm.bi.dml.runtime.io.ReaderTextCSV.java
License:Open Source License
/**
 * @param path
 * @param job
 * @param fs
 * @param dest
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @param hasHeader
 * @param delim
 * @param fill
 * @param fillValue
 * @return
 * @throws IOException
 */
@SuppressWarnings("unchecked")
private MatrixBlock readCSVMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen,
        long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue)
        throws IOException {
    ArrayList<Path> files = new ArrayList<Path>();
    if (fs.isDirectory(path)) {
        for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);
    } else
        files.add(path);

    if (dest == null) {
        dest = computeCSVSize(files, job, fs, hasHeader, delim, fill, fillValue);
        clen = dest.getNumColumns();
    }

    boolean sparse = dest.isInSparseFormat();

    /////////////////////////////////////////
    String value = null;
    int row = 0;
    int col = -1;
    double cellValue = 0;
    long lnnz = 0;

    for (int fileNo = 0; fileNo < files.size(); fileNo++) {
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
        if (fileNo == 0 && hasHeader)
            br.readLine(); //ignore header

        // Read the data
        boolean emptyValuesFound = false;
        try {
            if (sparse) //SPARSE<-value
            {
                while ((value = br.readLine()) != null) //foreach line
                {
                    String cellStr = value.toString().trim();
                    emptyValuesFound = false;
                    String[] parts = IOUtilFunctions.split(cellStr, delim);
                    col = 0;
                    for (String part : parts) //foreach cell
                    {
                        part = part.trim();
                        if (part.isEmpty()) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        } else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }
                        if (cellValue != 0) {
                            dest.appendValue(row, col, cellValue);
                            lnnz++;
                        }
                        col++;
                    }
                    //sanity checks for empty values and number of columns
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen);
                    row++;
                }
            } else //DENSE<-value
            {
                while ((value = br.readLine()) != null) //foreach line
                {
                    String cellStr = value.toString().trim();
                    emptyValuesFound = false;
                    String[] parts = IOUtilFunctions.split(cellStr, delim);
                    col = 0;
                    for (String part : parts) //foreach cell
                    {
                        part = part.trim();
                        if (part.isEmpty()) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        } else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }
                        if (cellValue != 0) {
                            dest.setValueDenseUnsafe(row, col, cellValue);
                            lnnz++;
                        }
                        col++;
                    }
                    //sanity checks for empty values and number of columns
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen);
                    row++;
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(br);
        }
    }

    //post processing
    dest.setNonZeros(lnnz);
    return dest;
}
From source file:com.ibm.bi.dml.runtime.io.WriterTextCSV.java
License:Open Source License
/**
 * @param srcFileName
 * @param destFileName
 * @param csvprop
 * @param rlen
 * @param clen
 * @throws IOException
 */
@SuppressWarnings("unchecked")
public void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen) throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path destFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (!_props.hasHeader()) {
        // simply move srcFile to destFile
        /*
         * TODO: Remove this roundabout way!
         * For example: destFilePath = /user/biadmin/csv/temp/out/file.csv
         * & the only path that exists already on HDFS is /user/biadmin/csv/.
         * In this case: the directory structure /user/biadmin/csv/temp/out must be created.
         * Simple hdfs.rename() does not seem to create this directory structure.
         */

        // delete the destination file, if exists already
        //boolean ret1 = hdfs.delete(destFilePath, true);

        // Create /user/biadmin/csv/temp/out/file.csv so that ..../temp/out/ is created.
        //boolean ret2 = hdfs.createNewFile(destFilePath);

        // delete the file "file.csv" but preserve the directory structure /user/biadmin/csv/temp/out/
        //boolean ret3 = hdfs.delete(destFilePath, true);

        // finally, move the data to destFilePath = /user/biadmin/csv/temp/out/file.csv
        //boolean ret4 = hdfs.rename(srcFilePath, destFilePath);

        //System.out.println("Return values = del:" + ret1 + ", createNew:" + ret2 + ", del:" + ret3 + ", rename:" + ret4);

        return;
    }

    // construct the header line
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
        sb.append("C" + (i + 1));
        if (i < clen - 1)
            sb.append(_props.getDelim());
    }
    sb.append('\n');

    if (hdfs.isDirectory(srcFilePath)) {
        // compute sorted order among part files
        ArrayList<Path> files = new ArrayList<Path>();
        for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);

        // first part file path
        Path firstpart = files.get(0);

        // create a temp file, and add header and contents of first part
        Path tmp = new Path(firstpart.toString() + ".tmp");
        OutputStream out = hdfs.create(tmp, true);
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy rest of the data from firstpart
        InputStream in = null;
        try {
            in = hdfs.open(firstpart);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }

        // rename tmp to firstpart
        hdfs.delete(firstpart, true);
        hdfs.rename(tmp, firstpart);

        // rename srcfile to destFile
        hdfs.delete(destFilePath, true);
        hdfs.createNewFile(destFilePath); // force the creation of directory structure
        hdfs.delete(destFilePath, true); // delete the file, but preserve the directory structure
        hdfs.rename(srcFilePath, destFilePath); // move the data
    } else if (hdfs.isFile(srcFilePath)) {
        // create destination file
        OutputStream out = hdfs.create(destFilePath, true);

        // write header
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy the data from srcFile
        InputStream in = null;
        try {
            in = hdfs.open(srcFilePath);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
}
From source file:com.ibm.bi.dml.runtime.matrix.CSVReblockMR.java
License:Open Source License
/**
 * Method to find the smallest (lexicographically first) (part)file among all (part)files
 * returned by <code>fs.listStatus()</code> in <code>inputPath</code>.
 *
 * @param job
 * @param inputPath
 * @return
 * @throws IOException
 * @throws FileNotFoundException
 */
public static String findSmallestFile(JobConf job, String inputPath) throws FileNotFoundException, IOException {
    String smallestFile = null;

    Path p = new Path(inputPath);
    FileSystem fs = p.getFileSystem(job);
    if (!fs.isDirectory(p))
        smallestFile = p.makeQualified(fs).toString();
    else {
        FileStatus[] stats = fs.listStatus(p, hiddenFileFilter);
        if (stats.length == 0)
            smallestFile = "";
        else {
            smallestFile = stats[0].getPath().toString();
            for (int j = 1; j < stats.length; j++) {
                String f = stats[j].getPath().toString();
                if (f.compareTo(smallestFile) < 0)
                    smallestFile = f;
            }
        }
    }
    return smallestFile;
}
From source file:com.ibm.bi.dml.runtime.matrix.CSVReblockMR.java
License:Open Source License
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, String reblockInstructions,
        String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes,
        String[] outputs, OutputInfo[] outputInfos) throws Exception {
    String[] smallestFiles = new String[inputs.length];
    JobConf job = new JobConf();
    for (int i = 0; i < inputs.length; i++) {
        smallestFiles[i] = findSmallestFile(job, inputs[i]);
    }

    for (int i = 0; i < inputs.length; i++) {
        Path p = new Path(inputs[i]);
        FileSystem fs = p.getFileSystem(job);
        if (!fs.isDirectory(p))
            smallestFiles[i] = p.makeQualified(fs).toString();
        else {
            FileStatus[] stats = fs.listStatus(p, hiddenFileFilter);
            if (stats.length == 0)
                smallestFiles[i] = "";
            else {
                smallestFiles[i] = stats[0].getPath().toString();
                for (int j = 1; j < stats.length; j++) {
                    String f = stats[j].getPath().toString();
                    if (f.compareTo(smallestFiles[i]) < 0)
                        smallestFiles[i] = f;
                }
            }
        }
    }

    AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(inputs, inputInfos, brlens, bclens,
            reblockInstructions, replication, smallestFiles);
    for (int i = 0; i < rlens.length; i++)
        if ((rlens[i] > 0 && rlens[i] != ret1.rlens[i]) || (clens[i] > 0 && clens[i] != ret1.clens[i]))
            throw new RuntimeException("Dimension doesn't match for input matrix " + i + ", expected ("
                    + rlens[i] + ", " + clens[i] + ") but real (" + ret1.rlens[i] + ", " + ret1.clens[i] + ")");

    JobReturn ret = CSVReblockMR.runCSVReblockJob(null, inputs, inputInfos, ret1.rlens, ret1.clens, brlens,
            bclens, reblockInstructions, otherInstructionsInReducer, numReducers, replication, resultIndexes,
            outputs, outputInfos, ret1.counterFile, smallestFiles);
    return ret;
}