List of usage examples for org.apache.hadoop.fs FileSystem listStatus
public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException
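The examples below all follow the same basic pattern: obtain a FileSystem for a Path, then call listStatus with a PathFilter to restrict which entries come back. A minimal sketch of that pattern is shown here for orientation; the directory path and the filter logic are placeholders, not taken from any of the source files listed below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path dir = new Path(args.length > 0 ? args[0] : "/tmp"); // placeholder directory
        FileSystem fs = dir.getFileSystem(conf);

        // Skip hidden entries (names starting with "." or "_"),
        // the same convention several examples below implement.
        PathFilter visibleOnly = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                String name = path.getName();
                return !name.startsWith(".") && !name.startsWith("_");
            }
        };

        // listStatus(Path, PathFilter): one directory, filtered.
        for (FileStatus status : fs.listStatus(dir, visibleOnly)) {
            System.out.println(status.getPath() + "\t" + status.getLen());
        }

        // listStatus(Path[], PathFilter): several paths in one call,
        // matching the signature shown above.
        FileStatus[] combined = fs.listStatus(new Path[] { dir }, visibleOnly);
        System.out.println("entries across all paths: " + combined.length);
    }
}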
From source file:com.cloudera.training.metrics.JobHistoryHelper.java
License:Apache License
public static JobHistory.JobInfo getJobInfoFromHdfsOutputDir(String outputDir, Configuration conf)
        throws IOException {
    Path output = new Path(outputDir);
    Path historyLogDir = new Path(output, "_logs/history");
    FileSystem fs = output.getFileSystem(conf);
    if (!fs.exists(output)) {
        throw new IOException("History directory " + historyLogDir.toString() + " does not exist");
    }
    Path[] jobFiles = FileUtil.stat2Paths(fs.listStatus(historyLogDir, jobLogFileFilter));
    if (jobFiles.length == 0) {
        throw new IOException("Not a valid history directory " + historyLogDir.toString());
    }
    String[] jobDetails = JobHistory.JobInfo.decodeJobHistoryFileName(jobFiles[0].getName()).split("_");
    String jobId = jobDetails[2] + "_" + jobDetails[3] + "_" + jobDetails[4];
    JobHistory.JobInfo job = new JobHistory.JobInfo(jobId);
    DefaultJobHistoryParser.parseJobTasks(jobFiles[0].toString(), job, fs);
    return job;
}
From source file:com.dasasian.chok.operation.master.IndexDeployOperation.java
License:Apache License
protected static List<Shard> readShardsFromFs(final String indexName, final String indexPathString)
        throws IndexDeployException {
    // get shard folders from source
    URI uri;
    try {
        uri = new URI(indexPathString);
    } catch (final URISyntaxException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                "unable to parse index path uri '" + indexPathString
                        + "', make sure it starts with file:// or hdfs:// ", e);
    }
    FileSystem fileSystem;
    try {
        fileSystem = HadoopUtil.getFileSystem(new Path(uri.toString()));
    } catch (final IOException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                "unable to retrive file system for index path '" + indexPathString
                        + "', make sure your path starts with hadoop support prefix like file:// or hdfs://", e);
    }
    List<Shard> shards = new ArrayList<>();
    try {
        final Path indexPath = new Path(indexPathString);
        if (!fileSystem.exists(indexPath)) {
            throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                    "index path '" + uri + "' does not exists");
        }
        final FileStatus[] listStatus = fileSystem.listStatus(indexPath, new PathFilter() {
            public boolean accept(final Path aPath) {
                return !aPath.getName().startsWith(".");
            }
        });
        for (final FileStatus fileStatus : listStatus) {
            String shardPath = fileStatus.getPath().toString();
            if (fileStatus.isDir() || shardPath.endsWith(".zip")) {
                shards.add(new Shard(createShardName(indexName, shardPath), shardPath));
            }
        }
    } catch (final IOException e) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE,
                "could not access index path: " + indexPathString, e);
    }
    if (shards.size() == 0) {
        throw new IndexDeployException(ErrorType.INDEX_NOT_ACCESSIBLE, "index does not contain any shard");
    }
    return shards;
}
From source file:com.datasalt.utils.mapred.joiner.MultiJoiner.java
License:Apache License
/**
 * Adds a CHANNELED input specification. A channeled input specification is a channel associated to a Mapper and an
 * input file or glob. The user will implement a {@link MultiJoinChanneledMapper} which will be tied to a single
 * channel.
 * <p>
 * The user must be consistent with the channel numbers it provides, for instance when two or more different
 * files must belong to the same channel.
 *
 * @param channel
 * @param location
 * @param channelClass
 * @param inputFormat
 * @param mapper
 *
 * @throws IOException
 */
public MultiJoiner addChanneledInput(Integer channel, Path location, Class<? extends Object> channelClass,
        Class<? extends InputFormat> inputFormat, Class<? extends MultiJoinChanneledMapper> mapper)
        throws IOException {
    /*
     * Configure the MultiJoiner
     */
    setChannelDatumClass(channel, channelClass);
    FileSystem fS = FileSystem.get(getJob().getConfiguration());
    if (location.toString().contains("*")) { // is a glob
        for (FileStatus fSt : fS.globStatus(location, hiddenFileFilter)) { // expands the glob
            addChanneledInputInner(channel, fSt.getPath(), channelClass, inputFormat, mapper);
        }
    } else if (fS.getFileStatus(location).isDir()) {
        for (FileStatus fSt : fS.listStatus(location, hiddenFileFilter)) { // lists the directory contents
            addChanneledInputInner(channel, fSt.getPath(), channelClass, inputFormat, mapper);
        }
    } else {
        addChanneledInputInner(channel, location, channelClass, inputFormat, mapper);
    }
    return this;
}
From source file:com.digitalpebble.behemoth.io.nutch.NutchSegmentConverterJob.java
License:Apache License
public int run(String[] args) throws Exception {
    String usage = "Usage: SegmentConverter [-dir segdir | segment] output";
    if (args.length < 2) {
        System.err.println(usage);
        System.exit(-1);
    }
    final List<Path> segments = new ArrayList<Path>();
    if (args[0].equals("-dir")) {
        Path dir = new Path(args[1]);
        FileSystem fs = dir.getFileSystem(getConf());
        FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
        Path[] files = HadoopFSUtil.getPaths(fstats);
        for (Path p : files) {
            segments.add(p);
        }
    } else {
        segments.add(new Path(args[0]));
    }
    Path output = new Path(args[args.length - 1]);
    convert(segments, output);
    return 0;
}
From source file:com.digitalpebble.behemoth.util.CorpusGenerator.java
License:Apache License
private static long processFiles(Configuration conf, Path input, boolean recurse, PerformanceFileFilter pff)
        throws IOException {
    FileSystem fs = input.getFileSystem(conf);
    FileStatus[] statuses = fs.listStatus(input, pff);
    for (int i = 0; i < statuses.length; i++) {
        FileStatus status = statuses[i];
        if (recurse == true) {
            processFiles(conf, status.getPath(), recurse, pff);
        }
    }
    return pff.counter;
}
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/** List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
                        result.add(stat);
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}
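The javadoc above notes that subclasses may override listStatus to select only files matching a regular expression. With the stock org.apache.hadoop.mapreduce.lib.input.FileInputFormat, which this class appears to mirror, a similar effect is usually available without subclassing: register a job-level PathFilter, which getInputPathFilter(job) picks up and merges with hiddenFileFilter. The following is a minimal sketch under that assumption; the class name, regex, and input path are illustrative only.

import java.io.IOException;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class RegexInputSetup {

    // Illustrative filter: accept only inputs whose name matches a regex.
    // A filter registered via setInputPathFilter is merged with the hidden-file
    // filter inside listStatus, as in the implementation shown above.
    public static class LogFileFilter implements PathFilter {
        private static final Pattern NAME = Pattern.compile(".*\\.log$"); // placeholder regex

        @Override
        public boolean accept(Path path) {
            return NAME.matcher(path.getName()).matches();
        }
    }

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "filtered-input"); // hypothetical job name
        FileInputFormat.addInputPath(job, new Path(args.length > 0 ? args[0] : "/data/in")); // placeholder path
        FileInputFormat.setInputPathFilter(job, LogFileFilter.class);
        // mapper/reducer/output configuration would follow in a real driver
    }
}

Filtering at listStatus time keeps unwanted files out of split computation entirely, rather than having each mapper skip them later.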
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output);
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setMapperClass(CVB0DocInferenceMapper.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileSystem fs = FileSystem.get(corpus.toUri(), conf);
    if (modelInput != null && fs.exists(modelInput)) {
        FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter());
        URI[] modelUris = new URI[statuses.length];
        for (int i = 0; i < statuses.length; i++) {
            modelUris[i] = statuses[i].getPath().toUri();
        }
        DistributedCache.setCacheFiles(modelUris, conf);
    }
    setModelPaths(job, modelInput); // bug: mahout-1147
    FileInputFormat.addInputPath(job, corpus);
    FileOutputFormat.setOutputPath(job, output);
    job.setJarByClass(CVB0Driver.class);
    job.submit();
    return job;
}
From source file:com.elex.dmp.lda.InMemoryCollapsedVariationalBayes0.java
License:Apache License
private static Matrix loadVectors(String vectorPathString, Configuration conf) throws IOException {
    Path vectorPath = new Path(vectorPathString);
    FileSystem fs = vectorPath.getFileSystem(conf);
    List<Path> subPaths = Lists.newArrayList();
    if (fs.isFile(vectorPath)) {
        subPaths.add(vectorPath);
    } else {
        for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
            subPaths.add(fileStatus.getPath());
        }
    }
    List<Vector> vectorList = Lists.newArrayList();
    for (Path subPath : subPaths) {
        for (Pair<IntWritable, VectorWritable> record : new SequenceFileIterable<IntWritable, VectorWritable>(
                subPath, true, conf)) {
            vectorList.add(record.getSecond().get());
        }
    }
    int numRows = vectorList.size();
    int numCols = vectorList.get(0).size();
    return new SparseRowMatrix(numRows, numCols, vectorList.toArray(new Vector[vectorList.size()]), true,
            vectorList.get(0).isSequentialAccess());
}
From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java
License:Apache License
/**
 * Read the feature frequency List which is built at the end of the Word Count Job and assign ids to them.
 * This will use constant memory and will run at the speed of your disk read
 */
private static List<Path> createDictionaryChunks(Path dictPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);
    FileSystem fs = FileSystem.get(dictPath.toUri(), conf);
    FileStatus[] dictFiles = fs.listStatus(dictPath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            return name.startsWith("dictionary.") && !name.endsWith(".crc");
        }
    });
    for (int i = 0; i < dictFiles.length; i++) {
        chunkPaths.add(dictFiles[i].getPath());
    }
    return chunkPaths;
}
From source file:com.ery.dimport.daemon.TaskManager.java
License:Apache License
public void runTask(final TaskInfo task) {
    List<LogHostRunInfoPO> allFiles = new ArrayList<LogHostRunInfoPO>();
    try {
        task.START_TIME = new Date(System.currentTimeMillis());
        boolean needUpdate = false;
        TaskInfo exists = allTask.get(task.TASK_ID);
        if (exists == null) {
            needUpdate = true;
        } else {
            task.hosts = exists.hosts;
        }
        if (task.hosts == null || task.hosts.size() == 0) {
            task.hosts = new ArrayList<String>(master.getServerManager().getOnlineServers().keySet());
            needUpdate = true;
        }
        if (ZKUtil.checkExists(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID) == -1) {
            needUpdate = true;
        }
        if (needUpdate) {
            try {
                task.HOST_SIZE = task.hosts.size();
                master.logWriter.writeLog(task);
                ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID,
                        DImportConstant.Serialize(task));
            } catch (Throwable e) {
            }
        }
        Thread thread = Thread.currentThread();
        ProcessInfo procInfo = null;
        synchronized (taskInProgress) {
            procInfo = taskInProgress.get(task.getRunTaskId());
        }
        procInfo.thread = thread;
        procInfo.startTime = System.currentTimeMillis();
        procInfo.startTime = System.currentTimeMillis();
        String filePath = task.FILE_PATH;
        boolean isInHdfs = false;
        final Map<String, Long> files = new HashMap<String, Long>();
        String tmpPath = conf.get(DImportConstant.DIMPORT_PROCESS_TMPDATA_DIR, System.getProperty("user.home"));
        if (tmpPath.endsWith("/")) {
            tmpPath = tmpPath.substring(0, tmpPath.length() - 1);
        }
        if (filePath == null || filePath.equals("")) {
            files.put("", 0l);
        } else {
            if (task.fileNamePattern != null || (task.FILE_FILTER != null && !task.FILE_FILTER.equals(""))) {
                task.FILE_FILTER = DImportConstant.macroProcess(task.FILE_FILTER);
                task.FILE_FILTER = task.FILE_FILTER.replaceAll("\\{host\\}", this.master.hostName);
                task.fileNamePattern = Pattern.compile(task.FILE_FILTER);
            }
            Matcher m = hdfsUrlPattern.matcher(filePath);
            if (m.matches()) {
                isInHdfs = true;
                filePath = m.group(2);
                // for (String string : conf.getValByRegex(".*").keySet()) {
                //     System.out.println(string + "=" + conf.get(string));
                // }
                Path dirPath = new Path(filePath);
                FileSystem fs = FileSystem.get(HadoopConf.getConf(conf));
                if (!fs.exists(dirPath) || !fs.isDirectory(dirPath)) {
                    throw new IOException("HDFS path " + filePath + " does not exist or is not a directory");
                }
                FileStatus[] hFiles = fs.listStatus(dirPath, new PathFilter() {
                    @Override
                    public boolean accept(Path name) {
                        if (task.fileNamePattern != null) {
                            System.out.println("hdfs listStatus:" + name.getParent() + "/" + name.getName());
                            return task.fileNamePattern.matcher(name.getName()).matches();
                        } else {
                            return true;
                        }
                    }
                });
                for (int i = 0; i < hFiles.length; i++) {
                    files.put(hFiles[i].getPath().toString(), hFiles[i].getLen());
                }
            } else {
                java.io.File f = new File(filePath);
                if (!f.exists() || !f.isDirectory()) {
                    throw new IOException("path " + filePath + " does not exist or is not a directory");
                }
                File[] lFiles = f.listFiles(new FilenameFilter() {
                    public boolean accept(File dir, String name) {
                        if (task.fileNamePattern != null) {
                            System.out.println("local fs listStatus:" + dir + "/" + name);
                            return task.fileNamePattern.matcher(name).matches();
                        } else {
                            return true;
                        }
                    }
                });
                for (int i = 0; i < lFiles.length; i++) {
                    files.put(lFiles[i].getAbsolutePath(), lFiles[i].length());
                }
            }
        }
        for (String fileName : files.keySet()) {
            LogHostRunInfoPO runInfo = new LogHostRunInfoPO(task);
            runInfo.RUN_LOG_ID = DImportConstant.shdf.format(task.SUBMIT_TIME) + "_" + allFiles.size() + "_"
                    + fileName.hashCode();
            runInfo.FILE_NAME = fileName;
            runInfo.RETURN_CODE = 255;
            runInfo.IS_RUN_SUCCESS = -1;
            runInfo.FILE_SIZE = files.get(fileName);
            runInfo.HOST_NAME = master.hostName;
            String localFile = fileName;
            if (isInHdfs) {
                localFile = tmpPath + "/" + fileName.substring(fileName.lastIndexOf("/") + 1);
            }
            String[] cmds = procInfo.task.getCommand();
            for (int j = 0; j < cmds.length; j++) {
                cmds[j] = DImportConstant.macroProcess(cmds[j]);
                cmds[j] = cmds[j].replaceAll("\\{file\\}", localFile);
                cmds[j] = cmds[j].replaceAll("\\{host\\}", master.hostName);
            }
            runInfo.RUN_COMMAND = StringUtils.join(" ", cmds);
            master.logWriter.writeLog(runInfo);
            LOG.info("??" + runInfo);
            allFiles.add(runInfo);
        }
        ZKUtil.createSetData(watcher, watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                DImportConstant.Serialize(allFiles));
        for (LogHostRunInfoPO runInfo : allFiles) {
            if (procInfo.stoped)
                break;
            String fileName = runInfo.FILE_NAME;
            LOG.info("?:" + fileName);
            procInfo.RUN_LOG_ID = runInfo.RUN_LOG_ID;
            runInfo.START_TIME = new Date(System.currentTimeMillis());
            procInfo.processFile = fileName;
            String localFile = fileName;
            try {
                if (isInHdfs) {
                    localFile = tmpPath + "/" + fileName.substring(fileName.lastIndexOf("/") + 1);
                }
                procInfo.task.TASK_COMMAND = runInfo.RUN_COMMAND;
                if (isInHdfs) {
                    File lf = new File(localFile);
                    if (lf.exists())
                        lf.delete();
                    FileSystem fs = FileSystem.get(HadoopConf.getConf(conf));
                    LOG.info("HDFS:" + fileName + "===>" + localFile);
                    long btime = System.currentTimeMillis();
                    fs.copyToLocalFile(new Path(fileName), new Path(localFile));
                    LOG.info("HDFS?:" + fileName + "===>" + localFile);
                    runInfo.downTime = System.currentTimeMillis() - btime;
                    fileName = localFile;
                }
                updateHostInfoLog(runInfo, allFiles);
                LOG.info(procInfo.task.TASK_NAME + " commandline: " + procInfo.task.TASK_COMMAND);
                procInfo.proc = execResult(runInfo.RUN_COMMAND);
                runInfo.IS_RUN_SUCCESS = 1;
                runInfo.RETURN_CODE = writeProcessLog(procInfo);
                LOG.info(procInfo.task.TASK_NAME + " return value: " + runInfo.RETURN_CODE);
                // runInfo.RETURN_CODE = procInfo.proc.exitValue();
            } catch (Throwable e) {
                runInfo.ERROR_MSG = e.getMessage();
                if (procInfo.proc != null) {
                    try {
                        procInfo.proc.destroy();
                    } catch (Exception ex) {
                    }
                }
                procInfo.proc = null;
                LOG.error("", e);
            } finally {
                runInfo.END_TIME = new Date(System.currentTimeMillis());
                master.logWriter.updateLog(runInfo);
                updateHostInfoLog(runInfo, allFiles);
                ZKUtil.createSetData(watcher,
                        watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                        DImportConstant.Serialize(allFiles));
                if (isInHdfs) {
                    File lf = new File(localFile);
                    if (lf.exists())
                        lf.delete();
                }
            }
        }
    } catch (Throwable e) {
        LOG.error("" + task, e);
        try {
            if (allFiles.size() > 0) {
                for (LogHostRunInfoPO logHostRunInfoPO : allFiles) {
                    if (logHostRunInfoPO.END_TIME.getTime() < 10000) {
                        logHostRunInfoPO.END_TIME = new Date(System.currentTimeMillis());
                        logHostRunInfoPO.IS_RUN_SUCCESS = 1;
                        logHostRunInfoPO.RETURN_CODE = 2;
                    }
                }
                ZKUtil.createSetData(watcher,
                        watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/" + master.hostName,
                        DImportConstant.Serialize(allFiles));
            }
        } catch (KeeperException e1) {
            LOG.error("update task run info on host :" + watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/"
                    + master.hostName, e);
        } catch (IOException e1) {
            LOG.error("update task run info on host " + watcher.dimportRunTaskNode + "/" + task.TASK_ID + "/"
                    + master.hostName, e);
        }
    } finally {
        synchronized (taskInProgress) {
            taskInProgress.remove(task.getRunTaskId());
        }
    }
}