Example usage for org.apache.hadoop.fs FileSystem listStatus

List of usage examples for org.apache.hadoop.fs FileSystem listStatus

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem listStatus.

Prototype

public FileStatus[] listStatus(Path[] files) throws FileNotFoundException, IOException 

Document

Filter files/directories in the given list of paths using default path filter.
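
As a quick orientation before the per-project examples, here is a minimal sketch of a typical call against the prototype above. It is only a sketch: the input paths are placeholders and the filesystem is whatever the active Hadoop configuration resolves to; a path that does not exist makes listStatus throw FileNotFoundException, as declared in the prototype.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListStatusSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // listStatus(Path[]) returns the combined FileStatus entries for all given paths,
        // filtered by the default path filter.
        Path[] inputs = new Path[] { new Path("/tmp/input-a"), new Path("/tmp/input-b") };
        FileStatus[] statuses = fs.listStatus(inputs);

        for (FileStatus status : statuses) {
            System.out.println(status.getPath() + (status.isDirectory() ? " (dir)" : " (file)"));
        }
    }
}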

Usage

From source file:com.skp.experiment.common.mapreduce.MapFileOutputFormat.java

License:Apache License

/** Open the output generated by this format. */
public static MapFile.Reader[] getReaders(Path dir, Configuration conf) throws IOException {
    FileSystem fs = dir.getFileSystem(conf);
    Path[] names = FileUtil.stat2Paths(fs.listStatus(dir));

    // sort names, so that hash partitioning works
    Arrays.sort(names);

    MapFile.Reader[] parts = new MapFile.Reader[names.length];
    for (int i = 0; i < names.length; i++) {
        parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
    }
    return parts;
}

From source file:com.splicemachine.derby.impl.io.HdfsDirFile.java

License:Apache License

@Override
public String[] list() {
    try {
        FileSystem fs = getFileSystem();
        FileStatus[] fileStatuses = fs.listStatus(new Path(path));
        String[] list = new String[fileStatuses.length];
        for (int i = 0; i < fileStatuses.length; i++) {
            list[i] = fileStatuses[i].getPath().getName();
        }
        return list;
    } catch (IOException e) {
        LOG.error(String.format(
                "An exception occurred while listing the files and directories in the path '%s'.", path), e);
        return null;
    }
}

From source file:com.splout.db.common.SploutHadoopConfiguration.java

License:Apache License

/**
 * Adds the SQLite native libraries to the DistributedCache so that they will be present in the java.library.path
 * of the child's Hadoop task.
 * <p/>
 * Usually you don't need to do this, as the task will already try to load the libraries from the job's uncompressed
 * JAR. However, not all Hadoop versions are guaranteed to uncompress the JAR, so in that case it is safer to use this method.
 */
public static void addSQLite4JavaNativeLibsToDC(Configuration conf, File nativeLibsLocalPath)
        throws IOException, URISyntaxException {
    Path nativeLibHdfs = new Path("splout-native");
    FileSystem fS = FileSystem.get(conf);
    if (fS.exists(nativeLibHdfs)) {
        fS.delete(nativeLibHdfs, true);
    }
    fS.mkdirs(nativeLibHdfs);
    // Copy native libs to HDFS
    File[] natives = nativeLibsLocalPath.listFiles();
    if (natives == null) {
        throw new RuntimeException(
                "natives lib folder not present in local working directory! Are you in SPLOUT_HOME?");
    }
    for (File nativeLib : natives) {
        FileUtil.copy(nativeLib, fS, nativeLibHdfs, false, conf);
    }
    for (FileStatus nativeLibInHdfs : fS.listStatus(nativeLibHdfs)) {
        // http://hadoop.apache.org/docs/r0.20.2/native_libraries.html#Loading+native+libraries+through+DistributedCache
        DistributedCache.createSymlink(conf);
        URI uriToAdd = new URI(
                nativeLibInHdfs.getPath().makeQualified(fS) + "#" + nativeLibInHdfs.getPath().getName());
        DistributedCache.addCacheFile(uriToAdd, conf);
        log.info("Adding to distributed cache: " + uriToAdd);
    }
}

From source file:com.splout.db.examples.PageCountsExample.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    // Validate params etc
    JCommander jComm = new JCommander(this);
    jComm.setProgramName("Splout Page Counts example");
    try {
        jComm.parse(args);
    } catch (ParameterException e) {
        System.err.println(e.getMessage());
        jComm.usage();
        System.exit(-1);
    }

    boolean generate = !noGenerate; // just for clarity

    if (generateTupleFiles && deploy) {
        System.err.println("Can't run a 'dry' TupleFile generation and deploy it.");
        jComm.usage();
        System.exit(-1);
    }

    Path outPath = new Path(outputPath);
    FileSystem outFs = outPath.getFileSystem(getConf());

    if (!FileSystem.getLocal(conf).equals(FileSystem.get(conf))) {
        File nativeLibs = new File("native");
        if (nativeLibs.exists()) {
            SploutHadoopConfiguration.addSQLite4JavaNativeLibsToDC(conf);
        }
    }

    if (generate) {
        Path inputPath = new Path(this.inputPath);
        FileSystem inputFileSystem = inputPath.getFileSystem(conf);

        FileStatus[] fileStatuses = inputFileSystem.listStatus(inputPath);

        // define the schema that the resultant table will have: date, hour, pagename, pageviews
        final Schema tableSchema = new Schema("pagecounts",
                Fields.parse("date:string, hour:string, pagename:string, pageviews:int"));
        // define the schema of the input files: projectcode, pagename, pageviews, bytes
        Schema fileSchema = new Schema("pagecountsfile",
                Fields.parse("projectcode:string, pagename:string, pageviews:int, bytes:long"));

        // instantiate a TableBuilder
        TableBuilder tableBuilder = new TableBuilder(tableSchema);

        // for every input file...
        for (FileStatus fileStatus : fileStatuses) {
            String fileName = fileStatus.getPath().getName().toString();
            // strip the date and the hour from the file name
            String fileDate = fileName.split("-")[1];
            String fileHour = fileName.split("-")[2].substring(0, 2);
            // instantiate a custom RecordProcessor to process the records of this file
            PageCountsRecordProcessor recordProcessor = new PageCountsRecordProcessor(tableSchema, fileDate,
                    fileHour);
            // use the tableBuilder method for adding each of the files to the mix
            tableBuilder.addCSVTextFile(fileStatus.getPath(), ' ', TupleTextInputFormat.NO_QUOTE_CHARACTER,
                    TupleTextInputFormat.NO_ESCAPE_CHARACTER, false, false, TupleTextInputFormat.NO_NULL_STRING,
                    fileSchema, recordProcessor);
        }

        // partition the dataset by pagename - which should give a fair even distribution.
        tableBuilder.partitionBy("pagename");
        // create a compound index on pagename, date so that typical queries for the dataset will be fast
        tableBuilder.createIndex("pagename", "date");

        long nonExactPageSize = memoryForIndexing / 32000; // number of pages
        int pageSize = (int) Math.pow(2, (int) Math.round(Math.log(nonExactPageSize) / Math.log(2)));
        Log.info("Pagesize = " + pageSize + " as memory for indexing was [" + memoryForIndexing
                + "] and there are 32000 pages.");

        tableBuilder.initialSQL("pragma page_size=" + pageSize);
        // insertion order is very important for optimizing query speed because it makes data be co-located in disk
        tableBuilder.insertionSortOrder(OrderBy.parse("pagename:asc, date:asc"));

        // instantiate a TablespaceBuilder
        TablespaceBuilder tablespaceBuilder = new TablespaceBuilder();

        // we will partition this dataset in as many partitions as:
        tablespaceBuilder.setNPartitions(nPartitions);
        tablespaceBuilder.add(tableBuilder.build());
        // we turn a specific SQLite pragma on for making autocomplete queries fast
        tablespaceBuilder.initStatements("pragma case_sensitive_like=true;");

        HadoopUtils.deleteIfExists(outFs, outPath);

        // finally, instantiate a TablespaceGenerator and execute it
        TablespaceGenerator tablespaceViewBuilder;

        if (generateTupleFiles) {
            // we subclass TablespaceGenerator to be able to run the generation without outputting the SQLite stores, for
            // benchmark comparisons.
            // In the future this feature may be useful in general for debugging store creation.
            tablespaceViewBuilder = new TablespaceGenerator(tablespaceBuilder.build(), outPath,
                    this.getClass()) {

                @Override
                public void generateView(Configuration conf, SamplingType samplingType,
                        SamplingOptions samplingOptions) throws Exception {

                    prepareOutput(conf);
                    final int nPartitions = tablespace.getnPartitions();
                    if (nPartitions > 1) {
                        partitionMap = sample(nPartitions, conf, samplingType, samplingOptions);
                    } else {
                        partitionMap = PartitionMap.oneShardOpenedMap();
                    }
                    writeOutputMetadata(conf);

                    TupleMRBuilder builder = createMRBuilder(nPartitions, conf);
                    // Set a TupleOutput here instead of SQLiteOutput
                    builder.setOutput(new Path(outputPath, OUT_STORE), new TupleOutputFormat(tableSchema),
                            ITuple.class, NullWritable.class);
                    executeViewGeneration(builder);
                }
            };
        } else {
            // ... otherwise a standard TablespaceGenerator is used.
            tablespaceViewBuilder = new TablespaceGenerator(tablespaceBuilder.build(), outPath,
                    this.getClass());
        }

        tablespaceViewBuilder.generateView(getConf(), SamplingType.FULL_SCAN,
                new TupleSampler.FullScanSamplingOptions());
    }

    if (deploy) {
        // use StoreDeployerTool for deploying the already generated dataset
        StoreDeployerTool deployer = new StoreDeployerTool(qnode, getConf());
        ArrayList<TablespaceDepSpec> deployments = new ArrayList<TablespaceDepSpec>();
        deployments.add(new TablespaceDepSpec("pagecounts", outPath.toString(), repFactor, null));
        deployer.deploy(deployments);
    }
    return 1;
}

From source file:com.splout.db.hadoop.TupleSampler.java

License:Apache License

@SuppressWarnings("deprecation")
private long fullScanSampling(TablespaceSpec tablespace, final long sampleSize, Configuration hadoopConf,
        Path outputPath, final int nSplits) throws TupleSamplerException {

    MapOnlyJobBuilder builder = new MapOnlyJobBuilder(hadoopConf, "Reservoir Sampling to path " + outputPath);

    for (Table table : tablespace.getPartitionedTables()) {
        final TableSpec tableSpec = table.getTableSpec();
        final String getPartitionByJavaScript = tableSpec.getPartitionByJavaScript();
        for (TableInput inputFile : table.getFiles()) {
            final RecordProcessor processor = inputFile.getRecordProcessor();
            for (Path path : inputFile.getPaths()) {
                builder.addInput(path, inputFile.getFormat(),
                        new MapOnlyMapper<ITuple, NullWritable, Text, NullWritable>() {

                            final int nSamples = (int) (sampleSize / nSplits);
                            final String[] samples = new String[nSamples];

                            CounterInterface counterInterface;
                            long recordCounter = 0;

                            JavascriptEngine jsEngine = null;

                            @Override
                            protected void setup(Context context, MultipleOutputsCollector coll)
                                    throws IOException, InterruptedException {
                                counterInterface = new CounterInterface(context);
                                // Initialize JavaScript engine if needed
                                if (getPartitionByJavaScript != null) {
                                    try {
                                        jsEngine = new JavascriptEngine(getPartitionByJavaScript);
                                    } catch (Throwable e) {
                                        throw new RuntimeException(e);
                                    }
                                }
                            }

                            // Collect Tuples with decreasing probability
                            // (http://en.wikipedia.org/wiki/Reservoir_sampling)
                            protected void map(ITuple key, NullWritable value, Context context)
                                    throws IOException, InterruptedException {
                                ITuple uTuple;
                                try {
                                    uTuple = processor.process(key, key.getSchema().getName(),
                                            counterInterface);
                                } catch (Throwable e) {
                                    throw new RuntimeException(e);
                                }
                                if (uTuple == null) { // user may have filtered the record
                                    return;
                                }

                                long reservoirIndex;
                                if (recordCounter < nSamples) {
                                    reservoirIndex = recordCounter;
                                } else {
                                    reservoirIndex = (long) (Math.random() * recordCounter);
                                }

                                if (reservoirIndex < nSamples) {
                                    String pkey = null;
                                    try {
                                        pkey = TablespaceGenerator.getPartitionByKey(uTuple, tableSpec,
                                                jsEngine);
                                    } catch (Throwable e) {
                                        throw new RuntimeException("Error when determining partition key.", e);
                                    }
                                    samples[(int) reservoirIndex] = pkey;
                                }

                                recordCounter++;
                            }

                            // Write the in-memory sampled Tuples
                            protected void cleanup(Context context, MultipleOutputsCollector coll)
                                    throws IOException, InterruptedException {
                                Text key = new Text();
                                for (String keyStr : samples) {
                                    if (keyStr != null) {
                                        key.set(keyStr);
                                        context.write(key, NullWritable.get());
                                    }
                                }
                            }
                        }, inputFile.getSpecificHadoopInputFormatContext());
            }
        }
    }
    // Set output path
    Path outReservoirPath = new Path(outputPath + "-reservoir");
    builder.setOutput(outReservoirPath, new HadoopOutputFormat(SequenceFileOutputFormat.class), Text.class,
            NullWritable.class);
    builder.setJarByClass(callingClass);

    try {
        Job job = builder.createJob();

        if (!job.waitForCompletion(true)) {
            throw new TupleSamplerException("Reservoir Sampling failed!");
        }
    } catch (Exception e) {
        throw new TupleSamplerException("Error creating or launching the sampling job.", e);
    } finally {
        try {
            builder.cleanUpInstanceFiles();
        } catch (IOException e) {
            throw new TupleSamplerException("Error cleaning up the sampling job.", e);
        }
    }

    long retrievedSamples = 0;
    try {
        FileSystem outFs = outReservoirPath.getFileSystem(hadoopConf);
        if (outFs.listStatus(outReservoirPath) == null) {
            throw new IOException("Output folder not created: the Job failed!");
        }

        retrievedSamples = 0;
        // Instantiate the writer we will write samples to
        SequenceFile.Writer writer = new SequenceFile.Writer(outFs, hadoopConf, outputPath, Text.class,
                NullWritable.class);

        // Aggregate the output into a single file for being consistent with the other sampling methods
        for (FileStatus fileStatus : outFs.listStatus(outReservoirPath)) {
            Path thisPath = fileStatus.getPath();
            if (thisPath.getName().startsWith("part-m-")) {
                SequenceFile.Reader reader = new SequenceFile.Reader(outFs, thisPath, hadoopConf);
                Text key = new Text();
                while (reader.next(key)) {
                    writer.append(key, NullWritable.get());
                    retrievedSamples++;
                }
                reader.close();
            }
        }

        writer.close();
        outFs.delete(outReservoirPath, true);
    } catch (IOException e) {
        throw new TupleSamplerException("Error consolidating the sample job results into one file.", e);
    }

    return retrievedSamples;
}
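
The anonymous mapper above keeps an in-memory reservoir of partition keys (see the Wikipedia link in its comments). For reference, here is a minimal standalone sketch of the same reservoir-sampling idea, independent of Hadoop; the class name and its input are hypothetical.

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

public class ReservoirSketch {

    /** Keep a uniform random sample of up to k items from a stream of unknown length. */
    static <T> List<T> sample(Iterable<T> stream, int k, Random rnd) {
        List<T> reservoir = new ArrayList<T>(k);
        long seen = 0;
        for (T item : stream) {
            if (seen < k) {
                reservoir.add(item);                             // fill the reservoir first
            } else {
                long j = (long) (rnd.nextDouble() * (seen + 1)); // uniform index in [0, seen]
                if (j < k) {
                    reservoir.set((int) j, item);                // replace with decreasing probability
                }
            }
            seen++;
        }
        return reservoir;
    }
}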

From source file:com.streamsets.pipeline.stage.destination.hdfs.TestHDFSTargetWholeFile.java

License:Apache License

@Test
public void testWholeFilePermission() throws Exception {
    java.nio.file.Path filePath1 = Paths.get(getTestDir() + "/source_testWholeFilePermissionFiles1.txt");
    java.nio.file.Path filePath2 = Paths.get(getTestDir() + "/source_testWholeFilePermissionFiles2.txt");
    java.nio.file.Path filePath3 = Paths.get(getTestDir() + "/source_testWholeFilePermissionFiles3.txt");

    Files.write(filePath1, "This is a sample file 1 with some text".getBytes());
    Files.write(filePath2, "This is a sample file 2 with some text".getBytes());
    Files.write(filePath3, "This is a sample file 3 with some text".getBytes());

    HdfsTarget hdfsTarget = HdfsTargetUtil.newBuilder().hdfsUri(uri.toString()).dirPathTemplate(getTestDir())
            .timeDriver("${time:now()}").dataForamt(DataFormat.WHOLE_FILE).fileType(HdfsFileType.WHOLE_FILE)
            .fileNameEL("${record:value('/fileInfo/filename')}").maxRecordsPerFile(1).maxFileSize(0)
            .uniquePrefix("sdc-").idleTimeout("-1").permissionEL("${record:value('/fileInfo/permissions')}")
            .lateRecordsAction(LateRecordsAction.SEND_TO_LATE_RECORDS_FILE).build();

    TargetRunner runner = new TargetRunner.Builder(HdfsDTarget.class, hdfsTarget)
            .setOnRecordError(OnRecordError.STOP_PIPELINE).build();

    runner.runInit();

    try {
        runner.runWrite(Arrays.asList(getFileRefRecordForFile(filePath1, "755"),
                //posix style
                getFileRefRecordForFile(filePath2, "rwxr--r--"),
                //unix style
                getFileRefRecordForFile(filePath3, "-rw-rw----")));

        org.apache.hadoop.fs.Path targetPath1 = new org.apache.hadoop.fs.Path(
                getTestDir() + "/sdc-" + filePath1.getFileName());
        org.apache.hadoop.fs.Path targetPath2 = new org.apache.hadoop.fs.Path(
                getTestDir() + "/sdc-" + filePath2.getFileName());
        org.apache.hadoop.fs.Path targetPath3 = new org.apache.hadoop.fs.Path(
                getTestDir() + "/sdc-" + filePath3.getFileName());

        FileSystem fs = FileSystem.get(uri, new HdfsConfiguration());

        Assert.assertTrue(fs.exists(targetPath1));
        Assert.assertTrue(fs.exists(targetPath2));
        Assert.assertTrue(fs.exists(targetPath3));

        FsPermission actual1 = fs.listStatus(targetPath1)[0].getPermission();
        FsPermission actual2 = fs.listStatus(targetPath2)[0].getPermission();
        FsPermission actual3 = fs.listStatus(targetPath3)[0].getPermission();

        FsPermission expected1 = new FsPermission("755");
        FsPermission expected2 = FsPermission.valueOf("-rwxr--r--");
        FsPermission expected3 = FsPermission.valueOf("-rw-rw----");

        Assert.assertEquals(expected1, actual1);
        Assert.assertEquals(expected2, actual2);
        Assert.assertEquals(expected3, actual3);

    } finally {
        runner.runDestroy();
    }
}

From source file:com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHdfsSource.java

License:Apache License

@Override
public List<ConfigIssue> init() {
    List<ConfigIssue> issues = super.init();
    validateHadoopFS(issues);
    // This determines the number of splits, and thus the number of executors
    hadoopConf.set(FileInputFormat.LIST_STATUS_NUM_THREADS, "5"); // Per Hive-on-Spark
    hadoopConf.set(FileInputFormat.SPLIT_MAXSIZE, String.valueOf(750000000)); // Per Hive-on-Spark
    for (Map.Entry<String, String> config : hdfsConfigs.entrySet()) {
        hadoopConf.set(config.getKey(), config.getValue());
    }
    List<Path> hdfsDirPaths = new ArrayList<>();
    if (hdfsDirLocations == null || hdfsDirLocations.isEmpty()) {
        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                Errors.HADOOPFS_18));
    } else if (issues.isEmpty()) {
        for (String hdfsDirLocation : hdfsDirLocations) {
            try {
                FileSystem fs = getFileSystemForInitDestroy();
                Path ph = fs.makeQualified(new Path(hdfsDirLocation));
                hdfsDirPaths.add(ph);
                if (!fs.exists(ph)) {
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                            Errors.HADOOPFS_10, hdfsDirLocation));
                } else if (!fs.getFileStatus(ph).isDirectory()) {
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                            Errors.HADOOPFS_15, hdfsDirLocation));
                } else {
                    try {
                        FileStatus[] files = fs.listStatus(ph);
                        if (files == null || files.length == 0) {
                            issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
                                    "hdfsDirLocations", Errors.HADOOPFS_16, hdfsDirLocation));
                        } else if (getContext().isPreview() && previewBuffer.size() < PREVIEW_SIZE) {
                            for (FileStatus fileStatus : files) {
                                if (fileStatus.isFile()) {
                                    String path = fileStatus.getPath().toString();
                                    try {
                                        List<Map.Entry> buffer;
                                        if (dataFormat == DataFormat.AVRO) {
                                            buffer = previewAvroBatch(fileStatus, PREVIEW_SIZE);
                                        } else {
                                            buffer = previewTextBatch(fileStatus, PREVIEW_SIZE);
                                        }
                                        for (int i = 0; i < buffer.size()
                                                && previewBuffer.size() < PREVIEW_SIZE; i++) {
                                            Map.Entry entry = buffer.get(i);
                                            previewBuffer.put(String.valueOf(entry.getKey()),
                                                    entry.getValue() == null ? null : entry.getValue());
                                        }
                                    } catch (IOException | InterruptedException ex) {
                                        String msg = "Error opening " + path + ": " + ex;
                                        LOG.info(msg, ex);
                                        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
                                                "hdfsDirLocations", Errors.HADOOPFS_16, fileStatus.getPath()));
                                    }
                                }
                            }
                        }
                    } catch (IOException ex) {
                        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                                Errors.HADOOPFS_09, hdfsDirLocation, ex.toString(), ex));
                    }
                }
            } catch (IOException ioe) {
                LOG.warn("Error connecting to HDFS filesystem: " + ioe, ioe);
                issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                        Errors.HADOOPFS_11, hdfsDirLocation, ioe.toString(), ioe));
            }
        }
    }
    hadoopConf.set(FileInputFormat.INPUT_DIR, StringUtils.join(hdfsDirPaths, ","));
    hadoopConf.set(FileInputFormat.INPUT_DIR_RECURSIVE, Boolean.toString(recursive));
    switch (dataFormat) {
    case JSON:
        if (jsonMaxObjectLen < 1) {
            issues.add(
                    getContext().createConfigIssue(Groups.JSON.name(), "jsonMaxObjectLen", Errors.HADOOPFS_04));
        }
        break;
    case TEXT:
        if (textMaxLineLen < 1) {
            issues.add(
                    getContext().createConfigIssue(Groups.TEXT.name(), "textMaxLineLen", Errors.HADOOPFS_05));
        }
        break;
    case LOG:
        logDataFormatValidator = new LogDataFormatValidator(logMode, logMaxObjectLen, retainOriginalLine,
                customLogFormat, regex, grokPatternDefinition, grokPattern, enableLog4jCustomLogFormat,
                log4jCustomLogFormat, OnParseError.ERROR, 0, Groups.LOG.name(),
                getFieldPathToGroupMap(fieldPathsToGroupName));
        logDataFormatValidator.validateLogFormatConfig(issues, getContext());
        break;
    case DELIMITED:
        if (csvMaxObjectLen < 1) {
            issues.add(getContext().createConfigIssue(Groups.DELIMITED.name(), "csvMaxObjectLen",
                    Errors.HADOOPFS_30));
        }
        break;
    case AVRO:
        if (avroSchema != null && !avroSchema.isEmpty()) {
            hadoopConf.set(AvroJob.INPUT_SCHEMA, avroSchema);
            hadoopConf.set(CONF_INPUT_KEY_SCHEMA, avroSchema);
        }
        break;
    default:
        issues.add(getContext().createConfigIssue(Groups.LOG.name(), "dataFormat", Errors.HADOOPFS_06,
                dataFormat));
    }
    validateParserFactoryConfigs(issues);
    LOG.info("Issues: " + issues);
    return issues;
}

From source file:com.stumbleupon.hbaseadmin.ClusterUtils.java

License:Open Source License

/**
 * Find the next online region on the given server that qualifies for compaction.
 * @param admin The hbase admin
 * @param serverName The server name
 * @param server The HRegion interface
 * @return The next eligible region, or null if no region qualifies
 * @throws IOException
 */
private HRegionInfo getNextEligibleRegion(HBaseAdmin admin, ServerName serverName, HRegionInterface server)
        throws IOException {
    HRegionInfo ret = null;
    List<HRegionInfo> onlineRegions = server.getOnlineRegions();
    String hostport = serverName.getHostAndPort();
    HServerLoad serverLoad = clusterStatus.getLoad(serverName);

    if (serverLoad == null) {
        LOG.warn("Skipping server {} because could not get server load", hostport);
    } else {
        List<String> tableNames = compact.getTableNames();
        boolean excludeFromList = compact.getExcludeTables();
        Map<byte[], RegionLoad> regionLoadMap = serverLoad.getRegionsLoad();
        List<String> reasons = new ArrayList<String>();

        for (HRegionInfo region : onlineRegions) {
            String regionName = region.getRegionNameAsString();
            String tableName = region.getTableNameAsString();
            reasons.clear();

            // Ignore any regions in tables that are marked as excluded
            if (tableNames.size() > 0) {
                if (excludeFromList && tableNames.contains(tableName)) {
                    continue;
                } else if (!excludeFromList && !tableNames.contains(tableName)) {
                    continue;
                } else if (LOG.isDebugEnabled()) {
                    reasons.add(hostport + " [" + regionName + "] qualifies because its table '" + tableName
                            + "' has NOT been excluded");
                }
            }

            // Ignore any regions that we have already visited/compacted
            if (visitedRegions.isRegionVisited(hostport, regionName)) {
                continue;
            } else if (LOG.isDebugEnabled()) {
                reasons.add(hostport + " [" + regionName + "] qualifies because it has NOT been visited");
            }

            // Remove any regions that do not have enough store files to qualify for compaction
            RegionLoad regionLoad = regionLoadMap.get(region.getRegionName());
            boolean isRegionEligible = true;

            if (regionLoad == null) {
                LOG.warn("Could not get region load for '{}'. Skipping region...", regionName);
                continue;
            } else {
                try {
                    int numFamilies = getTableDescriptor(admin, region).getColumnFamilies().length;
                    int numRegionStoreFiles = regionLoad.getStorefiles();
                    int minStoreFilesNeeded = compact.getNumStoreFiles() * numFamilies;

                    if (numRegionStoreFiles >= minStoreFilesNeeded) {
                        isRegionEligible = true;

                        if (LOG.isDebugEnabled()) {
                            reasons.add(hostport + " [" + regionName + "] qualifies because it has a total of "
                                    + numRegionStoreFiles + " store files in " + numFamilies + " families");
                        }
                    } else {
                        if (LOG.isDebugEnabled()) {
                            reasons.add(hostport + " [" + regionName
                                    + "] does not qualify because it has a total of " + numRegionStoreFiles
                                    + " store files in " + numFamilies + " families. Needs at least "
                                    + minStoreFilesNeeded);
                        }

                        isRegionEligible = false;
                    }
                } catch (TableNotFoundException e) {
                    LOG.error("Could not determine if region '{}' is eligible. Skipping region.", regionName,
                            e);
                    continue;
                } catch (IOException e) {
                    LOG.error("Could not determine if region '{}' is eligible. Skipping region.", regionName,
                            e);
                    continue;
                } catch (Exception e) {
                    LOG.error("Could not determine if region '{}' is eligible. Skipping region.", regionName,
                            e);
                    continue;
                }
            }

            // If enabled, force compaction of any regions that contain store files older than maxStoreFileAge 
            if (!isRegionEligible && compact.getMaxStoreFileAge() > 0) {
                List<String> files = server.getStoreFileList(region.getRegionName());
                FileSystem fs = FileSystem.get(admin.getConfiguration());

                if (files != null) {
                    Path[] filePaths = new Path[files.size()];
                    for (int i = 0; i < files.size(); i++) {
                        filePaths[i] = new Path(files.get(i));
                    }

                    long maxStoreFileAge = compact.getMaxStoreFileAge();
                    long now = System.currentTimeMillis();
                    FileStatus[] storeFilesStatus = fs.listStatus(filePaths);

                    for (FileStatus fileStatus : storeFilesStatus) {
                        long storeFileAge = now - fileStatus.getModificationTime();

                        if (storeFileAge > maxStoreFileAge) {
                            isRegionEligible = true;

                            if (LOG.isDebugEnabled()) {
                                reasons.add(hostport + " [" + regionName + "] forced to qualify because "
                                        + "at least one store file is older than the specified maxStoreFileAge");
                            }

                            break;
                        }
                    }
                }
            }

            if (isRegionEligible) {
                if (reasons.size() > 0) {
                    for (String reason : reasons) {
                        LOG.debug(reason);
                    }
                }

                ret = region;
                break;
            }
        }
    }

    return ret;
}

From source file:com.talis.hadoop.rdf.RdfSolrJob.java

License:Apache License

private void writeShardManifest(String manifestLocation, String shardLocation, Configuration configuration)
        throws IOException {
    Path shardsPath = new Path(INTERMEDIATE_SHARDS_URI);
    FileSystem fs = FileSystem.get(shardsPath.toUri(), configuration);
    StringBuffer buf = new StringBuffer();
    for (FileStatus status : fs.listStatus(shardsPath)) {
        LOG.info(status.getPath() + " : " + status.isDir());
        if (status.isDir()) {
            buf.append(status.getPath());
            buf.append("\n");
        }
    }
    FSDataOutputStream out = fs.create(new Path(manifestLocation));
    out.write(buf.toString().getBytes());
    out.flush();
    out.close();
}

From source file:com.taobao.datax.plugins.common.DFSUtils.java

License:Open Source License

/**
 * List the statuses of the files/directories in the given path if the path
 * is a directory.
 * 
 * @param dfs
 *            handle of {@link FileSystem}
 * 
 * @param srcpath
 *            Path in {@link FileSystem}
 * 
 * @param isGlob
 *            need to use file pattern
 * 
 * @return all {@link Path} in srcpath
 * 
 * @throws IOException 
 * 
 * */
public static List<Path> listDir(FileSystem dfs, Path srcpath, boolean isGlob) throws IOException {
    List<Path> list = new ArrayList<Path>();
    FileStatus[] status = null;
    if (isGlob) {
        status = dfs.globStatus(srcpath);
    } else {
        status = dfs.listStatus(srcpath);
    }
    if (status != null) {
        for (FileStatus state : status) {
            list.add(state.getPath());
        }
    }

    return list;
}
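
A short usage sketch contrasting the two branches of listDir above: globStatus expands a file pattern, while listStatus lists a single concrete path with the default path filter. The paths here are hypothetical and the filesystem is taken from the active configuration.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobVsListStatus {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());

        // globStatus expands a pattern into the set of matching paths ...
        FileStatus[] globbed = fs.globStatus(new Path("/data/logs/2016-*"));

        // ... while listStatus lists the children of one concrete directory.
        FileStatus[] listed = fs.listStatus(new Path("/data/logs"));

        System.out.println("glob matches: " + (globbed == null ? 0 : globbed.length));
        System.out.println("directory entries: " + listed.length);
    }
}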