Example usage for org.apache.hadoop.fs FileSystem listStatus

List of usage examples for org.apache.hadoop.fs FileSystem listStatus

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem listStatus.

Prototype

public FileStatus[] listStatus(Path[] files) throws FileNotFoundException, IOException 

Document

Filter files/directories in the given list of paths using default path filter.
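
As a quick orientation before the per-project examples, here is a minimal sketch of a typical call against the prototype above. It is only a sketch: the input paths are placeholders and the filesystem is whatever the active Hadoop configuration resolves to; a path that does not exist makes listStatus throw FileNotFoundException, as declared in the prototype.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListStatusSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // listStatus(Path[]) returns the combined FileStatus entries for all given paths,
        // filtered by the default path filter.
        Path[] inputs = new Path[] { new Path("/tmp/input-a"), new Path("/tmp/input-b") };
        FileStatus[] statuses = fs.listStatus(inputs);

        for (FileStatus status : statuses) {
            System.out.println(status.getPath() + (status.isDirectory() ? " (dir)" : " (file)"));
        }
    }
}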

Usage

From source file:com.skp.experiment.common.mapreduce.MapFileOutputFormat.java

License:Apache License

/** Open the output generated by this format. */
public static MapFile.Reader[] getReaders(Path dir, Configuration conf) throws IOException {
    FileSystem fs = dir.getFileSystem(conf);
    Path[] names = FileUtil.stat2Paths(fs.listStatus(dir));

    // sort names, so that hash partitioning works
    Arrays.sort(names);

    MapFile.Reader[] parts = new MapFile.Reader[names.length];
    for (int i = 0; i < names.length; i++) {
        parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
    }
    return parts;
}

From source file:com.splicemachine.derby.impl.io.HdfsDirFile.java

License:Apache License

@Override
public String[] list() {
    try {
        FileSystem fs = getFileSystem();
        FileStatus[] fileStatuses = fs.listStatus(new Path(path));
        String[] list = new String[fileStatuses.length];
        for (int i = 0; i < fileStatuses.length; i++) {
            list[i] = fileStatuses[i].getPath().getName();
        }
        return list;
    } catch (IOException e) {
        LOG.error(String.format(
                "An exception occurred while listing the files and directories in the path '%s'.", path), e);
        return null;
    }
}

From source file:com.splout.db.common.SploutHadoopConfiguration.java

License:Apache License

/**
 * Adds the SQLite native libraries to the DistributedCache so that they will be present in the java.library.path
 * of the child's Hadoop task.
 * <p/>
 * Usually you don't need to do this, as the task will already try to load the libraries from the job's uncompressed
 * JAR. However, not all Hadoop versions are guaranteed to uncompress the JAR, so in that case it is safer to use this method.
 */
public static void addSQLite4JavaNativeLibsToDC(Configuration conf, File nativeLibsLocalPath)
        throws IOException, URISyntaxException {
    Path nativeLibHdfs = new Path("splout-native");
    FileSystem fS = FileSystem.get(conf);
    if (fS.exists(nativeLibHdfs)) {
        fS.delete(nativeLibHdfs, true);
    }
    fS.mkdirs(nativeLibHdfs);
    // Copy native libs to HDFS
    File[] natives = nativeLibsLocalPath.listFiles();
    if (natives == null) {
        throw new RuntimeException(
                "natives lib folder not present in local working directory! Are you in SPLOUT_HOME?");
    }
    for (File nativeLib : natives) {
        FileUtil.copy(nativeLib, fS, nativeLibHdfs, false, conf);
    }
    for (FileStatus nativeLibInHdfs : fS.listStatus(nativeLibHdfs)) {
        // http://hadoop.apache.org/docs/r0.20.2/native_libraries.html#Loading+native+libraries+through+DistributedCache
        DistributedCache.createSymlink(conf);
        URI uriToAdd = new URI(
                nativeLibInHdfs.getPath().makeQualified(fS) + "#" + nativeLibInHdfs.getPath().getName());
        DistributedCache.addCacheFile(uriToAdd, conf);
        log.info("Adding to distributed cache: " + uriToAdd);
    }
}

From source file:com.splout.db.examples.PageCountsExample.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    // Validate params etc
    JCommander jComm = new JCommander(this);
    jComm.setProgramName("Splout Page Counts example");
    try {
        jComm.parse(args);
    } catch (ParameterException e) {
        System.err.println(e.getMessage());
        jComm.usage();
        System.exit(-1);
    }

    boolean generate = !noGenerate; // just for clarity

    if (generateTupleFiles && deploy) {
        System.err.println("Can't run a 'dry' TupleFile generation and deploy it.");
        jComm.usage();
        System.exit(-1);
    }

    Path outPath = new Path(outputPath);
    FileSystem outFs = outPath.getFileSystem(getConf());

    if (!FileSystem.getLocal(conf).equals(FileSystem.get(conf))) {
        File nativeLibs = new File("native");
        if (nativeLibs.exists()) {
            SploutHadoopConfiguration.addSQLite4JavaNativeLibsToDC(conf);
        }
    }

    if (generate) {
        Path inputPath = new Path(this.inputPath);
        FileSystem inputFileSystem = inputPath.getFileSystem(conf);

        FileStatus[] fileStatuses = inputFileSystem.listStatus(inputPath);

        // define the schema that the resultant table will have: date, hour, pagename, pageviews
        final Schema tableSchema = new Schema("pagecounts",
                Fields.parse("date:string, hour:string, pagename:string, pageviews:int"));
        // define the schema of the input files: projectcode, pagename, pageviews, bytes
        Schema fileSchema = new Schema("pagecountsfile",
                Fields.parse("projectcode:string, pagename:string, pageviews:int, bytes:long"));

        // instantiate a TableBuilder
        TableBuilder tableBuilder = new TableBuilder(tableSchema);

        // for every input file...
        for (FileStatus fileStatus : fileStatuses) {
            String fileName = fileStatus.getPath().getName().toString();
            // strip the date and the hour from the file name
            String fileDate = fileName.split("-")[1];
            String fileHour = fileName.split("-")[2].substring(0, 2);
            // instantiate a custom RecordProcessor to process the records of this file
            PageCountsRecordProcessor recordProcessor = new PageCountsRecordProcessor(tableSchema, fileDate,
                    fileHour);
            // use the tableBuilder method for adding each of the files to the mix
            tableBuilder.addCSVTextFile(fileStatus.getPath(), ' ', TupleTextInputFormat.NO_QUOTE_CHARACTER,
                    TupleTextInputFormat.NO_ESCAPE_CHARACTER, false, false, TupleTextInputFormat.NO_NULL_STRING,
                    fileSchema, recordProcessor);
        }

        // partition the dataset by pagename - which should give a fair even distribution.
        tableBuilder.partitionBy("pagename");
        // create a compound index on pagename, date so that typical queries for the dataset will be fast
        tableBuilder.createIndex("pagename", "date");

        long nonExactPageSize = memoryForIndexing / 32000; // number of pages
        int pageSize = (int) Math.pow(2, (int) Math.round(Math.log(nonExactPageSize) / Math.log(2)));
        Log.info("Pagesize = " + pageSize + " as memory for indexing was [" + memoryForIndexing
                + "] and there are 32000 pages.");

        tableBuilder.initialSQL("pragma page_size=" + pageSize);
        // insertion order is very important for optimizing query speed because it makes data be co-located in disk
        tableBuilder.insertionSortOrder(OrderBy.parse("pagename:asc, date:asc"));

        // instantiate a TablespaceBuilder
        TablespaceBuilder tablespaceBuilder = new TablespaceBuilder();

        // we will partition this dataset in as many partitions as:
        tablespaceBuilder.setNPartitions(nPartitions);
        tablespaceBuilder.add(tableBuilder.build());
        // we turn a specific SQLite pragma on for making autocomplete queries fast
        tablespaceBuilder.initStatements("pragma case_sensitive_like=true;");

        HadoopUtils.deleteIfExists(outFs, outPath);

        // finally, instantiate a TablespaceGenerator and execute it
        TablespaceGenerator tablespaceViewBuilder;

        if (generateTupleFiles) {
            // we subclass TablespaceGenerator to be able to run the generation without outputting the SQLite stores, for
            // benchmark comparisons.
            // In the future this feature may be useful in general for debugging store creation.
            tablespaceViewBuilder = new TablespaceGenerator(tablespaceBuilder.build(), outPath,
                    this.getClass()) {

                @Override
                public void generateView(Configuration conf, SamplingType samplingType,
                        SamplingOptions samplingOptions) throws Exception {

                    prepareOutput(conf);
                    final int nPartitions = tablespace.getnPartitions();
                    if (nPartitions > 1) {
                        partitionMap = sample(nPartitions, conf, samplingType, samplingOptions);
                    } else {
                        partitionMap = PartitionMap.oneShardOpenedMap();
                    }
                    writeOutputMetadata(conf);

                    TupleMRBuilder builder = createMRBuilder(nPartitions, conf);
                    // Set a TupleOutput here instead of SQLiteOutput
                    builder.setOutput(new Path(outputPath, OUT_STORE), new TupleOutputFormat(tableSchema),
                            ITuple.class, NullWritable.class);
                    executeViewGeneration(builder);
                }
            };
        } else {
            // ... otherwise a standard TablespaceGenerator is used.
            tablespaceViewBuilder = new TablespaceGenerator(tablespaceBuilder.build(), outPath,
                    this.getClass());
        }

        tablespaceViewBuilder.generateView(getConf(), SamplingType.FULL_SCAN,
                new TupleSampler.FullScanSamplingOptions());
    }

    if (deploy) {
        // use StoreDeployerTool for deploying the already generated dataset
        StoreDeployerTool deployer = new StoreDeployerTool(qnode, getConf());
        ArrayList<TablespaceDepSpec> deployments = new ArrayList<TablespaceDepSpec>();
        deployments.add(new TablespaceDepSpec("pagecounts", outPath.toString(), repFactor, null));
        deployer.deploy(deployments);
    }
    return 1;
}

From source file:com.splout.db.hadoop.TupleSampler.java

License:Apache License

@SuppressWarnings("deprecation")
private long fullScanSampling(TablespaceSpec tablespace, final long sampleSize, Configuration hadoopConf,
        Path outputPath, final int nSplits) throws TupleSamplerException {

    MapOnlyJobBuilder builder = new MapOnlyJobBuilder(hadoopConf, "Reservoir Sampling to path " + outputPath);

    for (Table table : tablespace.getPartitionedTables()) {
        final TableSpec tableSpec = table.getTableSpec();
        final String getPartitionByJavaScript = tableSpec.getPartitionByJavaScript();
        for (TableInput inputFile : table.getFiles()) {
            final RecordProcessor processor = inputFile.getRecordProcessor();
            for (Path path : inputFile.getPaths()) {
                builder.addInput(path, inputFile.getFormat(),
                        new MapOnlyMapper<ITuple, NullWritable, Text, NullWritable>() {

                            final int nSamples = (int) (sampleSize / nSplits);
                            final String[] samples = new String[nSamples];

                            CounterInterface counterInterface;
                            long recordCounter = 0;

                            JavascriptEngine jsEngine = null;

                            @Override
                            protected void setup(Context context, MultipleOutputsCollector coll)
                                    throws IOException, InterruptedException {
                                counterInterface = new CounterInterface(context);
                                // Initialize JavaScript engine if needed
                                if (getPartitionByJavaScript != null) {
                                    try {
                                        jsEngine = new JavascriptEngine(getPartitionByJavaScript);
                                    } catch (Throwable e) {
                                        throw new RuntimeException(e);
                                    }
                                }
                            }

                            // Collect Tuples with decreasing probability
                            // (http://en.wikipedia.org/wiki/Reservoir_sampling)
                            protected void map(ITuple key, NullWritable value, Context context)
                                    throws IOException, InterruptedException {
                                ITuple uTuple;
                                try {
                                    uTuple = processor.process(key, key.getSchema().getName(),
                                            counterInterface);
                                } catch (Throwable e) {
                                    throw new RuntimeException(e);
                                }
                                if (uTuple == null) { // user may have filtered the record
                                    return;
                                }

                                long reservoirIndex;
                                if (recordCounter < nSamples) {
                                    reservoirIndex = recordCounter;
                                } else {
                                    reservoirIndex = (long) (Math.random() * recordCounter);
                                }

                                if (reservoirIndex < nSamples) {
                                    String pkey = null;
                                    try {
                                        pkey = TablespaceGenerator.getPartitionByKey(uTuple, tableSpec,
                                                jsEngine);
                                    } catch (Throwable e) {
                                        throw new RuntimeException("Error when determining partition key.", e);
                                    }
                                    samples[(int) reservoirIndex] = pkey;
                                }

                                recordCounter++;
                            }

                            // Write the in-memory sampled Tuples
                            protected void cleanup(Context context, MultipleOutputsCollector coll)
                                    throws IOException, InterruptedException {
                                Text key = new Text();
                                for (String keyStr : samples) {
                                    if (keyStr != null) {
                                        key.set(keyStr);
                                        context.write(key, NullWritable.get());
                                    }
                                }
                            }
                        }, inputFile.getSpecificHadoopInputFormatContext());
            }
        }
    }
    // Set output path
    Path outReservoirPath = new Path(outputPath + "-reservoir");
    builder.setOutput(outReservoirPath, new HadoopOutputFormat(SequenceFileOutputFormat.class), Text.class,
            NullWritable.class);
    builder.setJarByClass(callingClass);

    try {
        Job job = builder.createJob();

        if (!job.waitForCompletion(true)) {
            throw new TupleSamplerException("Reservoir Sampling failed!");
        }
    } catch (Exception e) {
        throw new TupleSamplerException("Error creating or launching the sampling job.", e);
    } finally {
        try {
            builder.cleanUpInstanceFiles();
        } catch (IOException e) {
            throw new TupleSamplerException("Error cleaning up the sampling job.", e);
        }
    }

    long retrievedSamples = 0;
    try {
        FileSystem outFs = outReservoirPath.getFileSystem(hadoopConf);
        if (outFs.listStatus(outReservoirPath) == null) {
            throw new IOException("Output folder not created: the Job failed!");
        }

        retrievedSamples = 0;
        // Instantiate the writer we will write samples to
        SequenceFile.Writer writer = new SequenceFile.Writer(outFs, hadoopConf, outputPath, Text.class,
                NullWritable.class);

        // Aggregate the output into a single file for being consistent with the other sampling methods
        for (FileStatus fileStatus : outFs.listStatus(outReservoirPath)) {
            Path thisPath = fileStatus.getPath();
            if (thisPath.getName().startsWith("part-m-")) {
                SequenceFile.Reader reader = new SequenceFile.Reader(outFs, thisPath, hadoopConf);
                Text key = new Text();
                while (reader.next(key)) {
                    writer.append(key, NullWritable.get());
                    retrievedSamples++;
                }
                reader.close();
            }
        }

        writer.close();
        outFs.delete(outReservoirPath, true);
    } catch (IOException e) {
        throw new TupleSamplerException("Error consolidating the sample job results into one file.", e);
    }

    return retrievedSamples;
}
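
The anonymous mapper above keeps an in-memory reservoir of partition keys (see the Wikipedia link in its comments). For reference, here is a minimal standalone sketch of the same reservoir-sampling idea, independent of Hadoop; the class name and its input are hypothetical.

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

public class ReservoirSketch {

    /** Keep a uniform random sample of up to k items from a stream of unknown length. */
    static <T> List<T> sample(Iterable<T> stream, int k, Random rnd) {
        List<T> reservoir = new ArrayList<T>(k);
        long seen = 0;
        for (T item : stream) {
            if (seen < k) {
                reservoir.add(item);                             // fill the reservoir first
            } else {
                long j = (long) (rnd.nextDouble() * (seen + 1)); // uniform index in [0, seen]
                if (j < k) {
                    reservoir.set((int) j, item);                // replace with decreasing probability
                }
            }
            seen++;
        }
        return reservoir;
    }
}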

From source file:com.streamsets.pipeline.stage.destination.hdfs.TestHDFSTargetWholeFile.java

License:Apache License

@Test
public void testWholeFilePermission() throws Exception {
    java.nio.file.Path filePath1 = Paths.get(getTestDir() + "/source_testWholeFilePermissionFiles1.txt");
    java.nio.file.Path filePath2 = Paths.get(getTestDir() + "/source_testWholeFilePermissionFiles2.txt");
    java.nio.file.Path filePath3 = Paths.get(getTestDir() + "/source_testWholeFilePermissionFiles3.txt");

    Files.write(filePath1, "This is a sample file 1 with some text".getBytes());
    Files.write(filePath2, "This is a sample file 2 with some text".getBytes());
    Files.write(filePath3, "This is a sample file 3 with some text".getBytes());

    HdfsTarget hdfsTarget = HdfsTargetUtil.newBuilder().hdfsUri(uri.toString()).dirPathTemplate(getTestDir())
            .timeDriver("${time:now()}").dataForamt(DataFormat.WHOLE_FILE).fileType(HdfsFileType.WHOLE_FILE)
            .fileNameEL("${record:value('/fileInfo/filename')}").maxRecordsPerFile(1).maxFileSize(0)
            .uniquePrefix("sdc-").idleTimeout("-1").permissionEL("${record:value('/fileInfo/permissions')}")
            .lateRecordsAction(LateRecordsAction.SEND_TO_LATE_RECORDS_FILE).build();

    TargetRunner runner = new TargetRunner.Builder(HdfsDTarget.class, hdfsTarget)
            .setOnRecordError(OnRecordError.STOP_PIPELINE).build();

    runner.runInit();

    try {
        runner.runWrite(Arrays.asList(getFileRefRecordForFile(filePath1, "755"),
                //posix style
                getFileRefRecordForFile(filePath2, "rwxr--r--"),
                //unix style
                getFileRefRecordForFile(filePath3, "-rw-rw----")));

        org.apache.hadoop.fs.Path targetPath1 = new org.apache.hadoop.fs.Path(
                getTestDir() + "/sdc-" + filePath1.getFileName());
        org.apache.hadoop.fs.Path targetPath2 = new org.apache.hadoop.fs.Path(
                getTestDir() + "/sdc-" + filePath2.getFileName());
        org.apache.hadoop.fs.Path targetPath3 = new org.apache.hadoop.fs.Path(
                getTestDir() + "/sdc-" + filePath3.getFileName());

        FileSystem fs = FileSystem.get(uri, new HdfsConfiguration());

        Assert.assertTrue(fs.exists(targetPath1));
        Assert.assertTrue(fs.exists(targetPath2));
        Assert.assertTrue(fs.exists(targetPath3));

        FsPermission actual1 = fs.listStatus(targetPath1)[0].getPermission();
        FsPermission actual2 = fs.listStatus(targetPath2)[0].getPermission();
        FsPermission actual3 = fs.listStatus(targetPath3)[0].getPermission();

        FsPermission expected1 = new FsPermission("755");
        FsPermission expected2 = FsPermission.valueOf("-rwxr--r--");
        FsPermission expected3 = FsPermission.valueOf("-rw-rw----");

        Assert.assertEquals(expected1, actual1);
        Assert.assertEquals(expected2, actual2);
        Assert.assertEquals(expected3, actual3);

    } finally {
        runner.runDestroy();
    }
}

From source file:com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHdfsSource.java

License:Apache License

@Override
public List<ConfigIssue> init() {
    List<ConfigIssue> issues = super.init();
    validateHadoopFS(issues);
    // This determines the number of splits, and thus the number of executors
    hadoopConf.set(FileInputFormat.LIST_STATUS_NUM_THREADS, "5"); // Per Hive-on-Spark
    hadoopConf.set(FileInputFormat.SPLIT_MAXSIZE, String.valueOf(750000000)); // Per Hive-on-Spark
    for (Map.Entry<String, String> config : hdfsConfigs.entrySet()) {
        hadoopConf.set(config.getKey(), config.getValue());
    }
    List<Path> hdfsDirPaths = new ArrayList<>();
    if (hdfsDirLocations == null || hdfsDirLocations.isEmpty()) {
        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                Errors.HADOOPFS_18));
    } else if (issues.isEmpty()) {
        for (String hdfsDirLocation : hdfsDirLocations) {
            try {
                FileSystem fs = getFileSystemForInitDestroy();
                Path ph = fs.makeQualified(new Path(hdfsDirLocation));
                hdfsDirPaths.add(ph);
                if (!fs.exists(ph)) {
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                            Errors.HADOOPFS_10, hdfsDirLocation));
                } else if (!fs.getFileStatus(ph).isDirectory()) {
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                            Errors.HADOOPFS_15, hdfsDirLocation));
                } else {
                    try {
                        FileStatus[] files = fs.listStatus(ph);
                        if (files == null || files.length == 0) {
                            issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
                                    "hdfsDirLocations", Errors.HADOOPFS_16, hdfsDirLocation));
                        } else if (getContext().isPreview() && previewBuffer.size() < PREVIEW_SIZE) {
                            for (FileStatus fileStatus : files) {
                                if (fileStatus.isFile()) {
                                    String path = fileStatus.getPath().toString();
                                    try {
                                        List<Map.Entry> buffer;
                                        if (dataFormat == DataFormat.AVRO) {
                                            buffer = previewAvroBatch(fileStatus, PREVIEW_SIZE);
                                        } else {
                                            buffer = previewTextBatch(fileStatus, PREVIEW_SIZE);
                                        }
                                        for (int i = 0; i < buffer.size()
                                                && previewBuffer.size() < PREVIEW_SIZE; i++) {
                                            Map.Entry entry = buffer.get(i);
                                            previewBuffer.put(String.valueOf(entry.getKey()),
                                                    entry.getValue() == null ? null : entry.getValue());
                                        }
                                    } catch (IOException | InterruptedException ex) {
                                        String msg = "Error opening " + path + ": " + ex;
                                        LOG.info(msg, ex);
                                        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
                                                "hdfsDirLocations", Errors.HADOOPFS_16, fileStatus.getPath()));
                                    }
                                }
                            }
                        }
                    } catch (IOException ex) {
                        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                                Errors.HADOOPFS_09, hdfsDirLocation, ex.toString(), ex));
                    }
                }
            } catch (IOException ioe) {
                LOG.warn("Error connecting to HDFS filesystem: " + ioe, ioe);
                issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                        Errors.HADOOPFS_11, hdfsDirLocation, ioe.toString(), ioe));
            }
        }
    }
    hadoopConf.set(FileInputFormat.INPUT_DIR, StringUtils.join(hdfsDirPaths, ","));
    hadoopConf.set(FileInputFormat.INPUT_DIR_RECURSIVE, Boolean.toString(recursive));
    switch (dataFormat) {
    case JSON:
        if (jsonMaxObjectLen < 1) {
            issues.add(
                    getContext().createConfigIssue(Groups.JSON.name(), "jsonMaxObjectLen", Errors.HADOOPFS_04));
        }
        break;
    case TEXT:
        if (textMaxLineLen < 1) {
            issues.add(
                    getContext().createConfigIssue(Groups.TEXT.name(), "textMaxLineLen", Errors.HADOOPFS_05));
        }
        break;
    case LOG:
        logDataFormatValidator = new LogDataFormatValidator(logMode, logMaxObjectLen, retainOriginalLine,
                customLogFormat, regex, grokPatternDefinition, grokPattern, enableLog4jCustomLogFormat,
                log4jCustomLogFormat, OnParseError.ERROR, 0, Groups.LOG.name(),
                getFieldPathToGroupMap(fieldPathsToGroupName));
        logDataFormatValidator.validateLogFormatConfig(issues, getContext());
        break;
    case DELIMITED:
        if (csvMaxObjectLen < 1) {
            issues.add(getContext().createConfigIssue(Groups.DELIMITED.name(), "csvMaxObjectLen",
                    Errors.HADOOPFS_30));
        }
        break;
    case AVRO:
        if (avroSchema != null && !avroSchema.isEmpty()) {
            hadoopConf.set(AvroJob.INPUT_SCHEMA, avroSchema);
            hadoopConf.set(CONF_INPUT_KEY_SCHEMA, avroSchema);
        }
        break;
    default:
        issues.add(getContext().createConfigIssue(Groups.LOG.name(), "dataFormat", Errors.HADOOPFS_06,
                dataFormat));
    }
    validateParserFactoryConfigs(issues);
    LOG.info("Issues: " + issues);
    return issues;
}

From source file:com.stumbleupon.hbaseadmin.ClusterUtils.java

License:Open Source License

/**
 * Find the next online region on the given server that qualifies for compaction.
 * @param admin The hbase admin
 * @param serverName The server name
 * @param server The HRegion interface
 * @return The next eligible region, or null if no region qualifies
 * @throws IOException
 */
private HRegionInfo getNextEligibleRegion(HBaseAdmin admin, ServerName serverName, HRegionInterface server)
        throws IOException {
    HRegionInfo ret = null;
    List<HRegionInfo> onlineRegions = server.getOnlineRegions();
    String hostport = serverName.getHostAndPort();
    HServerLoad serverLoad = clusterStatus.getLoad(serverName);

    if (serverLoad == null) {
        LOG.warn("Skipping server {} because could not get server load", hostport);
    } else {
        List<String> tableNames = compact.getTableNames();
        boolean excludeFromList = compact.getExcludeTables();
        Map<byte[], RegionLoad> regionLoadMap = serverLoad.getRegionsLoad();
        List<String> reasons = new ArrayList<String>();

        for (HRegionInfo region : onlineRegions) {
            String regionName = region.getRegionNameAsString();
            String tableName = region.getTableNameAsString();
            reasons.clear();

            // Ignore any regions in tables that are marked as excluded
            if (tableNames.size() > 0) {
                if (excludeFromList && tableNames.contains(tableName)) {
                    continue;
                } else if (!excludeFromList && !tableNames.contains(tableName)) {
                    continue;
                } else if (LOG.isDebugEnabled()) {
                    reasons.add(hostport + " [" + regionName + "] qualifies because its table '" + tableName
                            + "' has NOT been excluded");
                }
            }

            // Ignore any regions that we have already visited/compacted
            if (visitedRegions.isRegionVisited(hostport, regionName)) {
                continue;
            } else if (LOG.isDebugEnabled()) {
                reasons.add(hostport + " [" + regionName + "] qualifies because it has NOT been visited");
            }

            // Remove any regions that do not have enough store files to qualify for compaction
            RegionLoad regionLoad = regionLoadMap.get(region.getRegionName());
            boolean isRegionEligible = true;

            if (regionLoad == null) {
                LOG.warn("Could not get region load for '{}'. Skipping region...", regionName);
                continue;
            } else {
                try {
                    int numFamilies = getTableDescriptor(admin, region).getColumnFamilies().length;
                    int numRegionStoreFiles = regionLoad.getStorefiles();
                    int minStoreFilesNeeded = compact.getNumStoreFiles() * numFamilies;

                    if (numRegionStoreFiles >= minStoreFilesNeeded) {
                        isRegionEligible = true;

                        if (LOG.isDebugEnabled()) {
                            reasons.add(hostport + " [" + regionName + "] qualifies because it has a total of "
                                    + numRegionStoreFiles + " store files in " + numFamilies + " families");
                        }
                    } else {
                        if (LOG.isDebugEnabled()) {
                            reasons.add(hostport + " [" + regionName
                                    + "] does not qualify because it has a total of " + numRegionStoreFiles
                                    + " store files in " + numFamilies + " families. Needs at least "
                                    + minStoreFilesNeeded);
                        }

                        isRegionEligible = false;
                    }
                } catch (TableNotFoundException e) {
                    LOG.error("Could not determine if region '{}' is eligible. Skipping region.", regionName,
                            e);
                    continue;
                } catch (IOException e) {
                    LOG.error("Could not determine if region '{}' is eligible. Skipping region.", regionName,
                            e);
                    continue;
                } catch (Exception e) {
                    LOG.error("Could not determine if region '{}' is eligible. Skipping region.", regionName,
                            e);
                    continue;
                }
            }

            // If enabled, force compaction of any regions that contain store files older than maxStoreFileAge 
            if (!isRegionEligible && compact.getMaxStoreFileAge() > 0) {
                List<String> files = server.getStoreFileList(region.getRegionName());
                FileSystem fs = FileSystem.get(admin.getConfiguration());

                if (files != null) {
                    Path[] filePaths = new Path[files.size()];
                    for (int i = 0; i < files.size(); i++) {
                        filePaths[i] = new Path(files.get(i));
                    }

                    long maxStoreFileAge = compact.getMaxStoreFileAge();
                    long now = System.currentTimeMillis();
                    FileStatus[] storeFilesStatus = fs.listStatus(filePaths);

                    for (FileStatus fileStatus : storeFilesStatus) {
                        long storeFileAge = now - fileStatus.getModificationTime();

                        if (storeFileAge > maxStoreFileAge) {
                            isRegionEligible = true;

                            if (LOG.isDebugEnabled()) {
                                reasons.add(hostport + " [" + regionName + "] forced to qualify because "
                                        + "at least one store file is older than the specified maxStoreFileAge");
                            }

                            break;
                        }
                    }
                }
            }

            if (isRegionEligible) {
                if (reasons.size() > 0) {
                    for (String reason : reasons) {
                        LOG.debug(reason);
                    }
                }

                ret = region;
                break;
            }
        }
    }

    return ret;
}

From source file:com.talis.hadoop.rdf.RdfSolrJob.java

License:Apache License

private void writeShardManifest(String manifestLocation, String shardLocation, Configuration configuration)
        throws IOException {
    Path shardsPath = new Path(INTERMEDIATE_SHARDS_URI);
    FileSystem fs = FileSystem.get(shardsPath.toUri(), configuration);
    StringBuffer buf = new StringBuffer();
    for (FileStatus status : fs.listStatus(shardsPath)) {
        LOG.info(status.getPath() + " : " + status.isDir());
        if (status.isDir()) {
            buf.append(status.getPath());
            buf.append("\n");
        }
    }
    FSDataOutputStream out = fs.create(new Path(manifestLocation));
    out.write(buf.toString().getBytes());
    out.flush();
    out.close();
}

From source file:com.taobao.datax.plugins.common.DFSUtils.java

License:Open Source License

/**
 * List the statuses of the files/directories in the given path if the path
 * is a directory.
 * 
 * @param dfs
 *            handle of {@link FileSystem}
 * 
 * @param srcpath
 *            Path in {@link FileSystem}
 * 
 * @param isGlob
 *            need to use file pattern
 * 
 * @return all {@link Path} in srcpath
 * 
 * @throws IOException 
 * 
 * */
public static List<Path> listDir(FileSystem dfs, Path srcpath, boolean isGlob) throws IOException {
    List<Path> list = new ArrayList<Path>();
    FileStatus[] status = null;
    if (isGlob) {
        status = dfs.globStatus(srcpath);
    } else {
        status = dfs.listStatus(srcpath);
    }
    if (status != null) {
        for (FileStatus state : status) {
            list.add(state.getPath());
        }
    }

    return list;
}
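
A short usage sketch contrasting the two branches of listDir above: globStatus expands a file pattern, while listStatus lists a single concrete path with the default path filter. The paths here are hypothetical and the filesystem is taken from the active configuration.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobVsListStatus {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());

        // globStatus expands a pattern into the set of matching paths ...
        FileStatus[] globbed = fs.globStatus(new Path("/data/logs/2016-*"));

        // ... while listStatus lists the children of one concrete directory.
        FileStatus[] listed = fs.listStatus(new Path("/data/logs"));

        System.out.println("glob matches: " + (globbed == null ? 0 : globbed.length));
        System.out.println("directory entries: " + listed.length);
    }
}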