Example usage for org.apache.hadoop.fs FileSystem getContentSummary

Introduction

This page shows example usages of org.apache.hadoop.fs.FileSystem.getContentSummary.

Prototype

public ContentSummary getContentSummary(Path f) throws IOException 

Document

Return the ContentSummary of a given Path.
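
For orientation before the examples, here is a minimal sketch of the call; the default Configuration and the "/tmp/data" path are placeholders, not taken from the examples below. For a directory, the returned ContentSummary aggregates recursively over everything beneath it.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path p = new Path("/tmp/data"); // placeholder path

ContentSummary summary = fs.getContentSummary(p);
System.out.println("bytes: " + summary.getLength());         // total length of all files under p
System.out.println("files: " + summary.getFileCount());      // number of files under p
System.out.println("dirs:  " + summary.getDirectoryCount()); // number of directories under p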

Usage

From source file: simsql.runtime.RelOp.java

License: Apache License

public boolean run(RuntimeParameter params, boolean verbose) {

    ExampleRuntimeParameter pp = (ExampleRuntimeParameter) params;

    // build the jar.
    String jarFile = buildJarFile(params);

    // Get the default configuration object
    Configuration conf = new Configuration();

    // set quiet mode on/off
    conf.setQuietMode(!verbose);

    /***
    conf.setBoolean("mapred.task.profile", true);
    conf.set("mapred.task.profile.params", "-agentlib:hprof=cpu=samples," +
        "heap=sites,depth=8,force=n,thread=y,verbose=n,file=%s");
    ***/

    // tell it how to serialize and deserialize records and recordkeys
    conf.set("io.serializations", getSerializations());
    conf.setBoolean("mapred.compress.map.output", true);

    int ioSortMB = conf.getInt("io.sort.mb", 256);
    conf.set("mapred.map.child.java.opts", "-Xmx" + (getMemPerMapper(params) + ioSortMB) + "m -Xms"
            + (getMemPerMapper(params))
            + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.set("mapred.reduce.child.java.opts", "-Xmx" + (getMemPerReducer(params) + ioSortMB) + "m -Xms"
            + (getMemPerMapper(params))
            + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.setInt("simsql.input.numSplits", pp.getNumCPUs());
    conf.setInt("mapred.job.reuse.jvm.num.tasks", 1);
    // conf.setBoolean ("mapred.map.tasks.speculative.execution", false);
    // conf.setBoolean ("mapred.reduce.tasks.speculative.execution", false);

    // tell it to use the jar that we just created
    conf.set("mapred.jar", jarFile);

    // conf.set("tmpjars", "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-core.jar");

    conf.setBoolean("mapred.output.compress", true);
    conf.setStrings("mapred.output.compression.type", new String[] { "RECORD" });

    // use snappy for the intermediate stuff
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());

    // do some additional operator-specific configurations
    setConfigurations(conf, params);

    // collect statistics for final relations always
    conf.setBoolean("simsql.collectStats", isFinal || collectStats);

    // figure out what file to map
    String[] inDirs = myInputNetwork.getPipelinedInputFiles();
    inDirs = excludeAnyWhoWillNotBeMapped(inDirs);
    String inSingleString = inDirs[0];
    conf.set("simsql.fileToMap", inSingleString);
    for (int i = 1; i < inDirs.length; i++) {
        inSingleString += "," + inDirs[i];
    }

    // create and name the job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create a new job!", e);
    }

    job.setJobName(getJobName());

    // set the map-reduce input and output types
    job.setMapOutputKeyClass(getMapOutputKeyClass());
    job.setMapOutputValueClass(getMapOutputValueClass());
    job.setOutputKeyClass(getOutputKeyClass());
    job.setOutputValueClass(getOutputValueClass());

    int numReducers = getNumReducers(params);

    job.setMapperClass(getMapperClass());
    job.setReducerClass(getReducerClass());

    // set the number of reducers
    job.setNumReduceTasks(numReducers);

    // set the input and the output formats... these extend FileInputFormat and FileOutputFormat
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(getOutputFormatClass());

    // set the input and output paths
    try {
        System.out.println("input file: " + inSingleString);
        FileInputFormat.setInputPaths(job, inSingleString);
        FileInputFormat.setInputPathFilter(job, TableFileFilter.class);
        FileOutputFormat.setOutputPath(job, new Path(getOutput()));
    } catch (Exception e) {
        throw new RuntimeException("Unable to set up the input/output path for the job.", e);
    }

    // set the split size
    FileInputFormat.setMinInputSplitSize(job, getSplitSize(params));
    FileInputFormat.setMaxInputSplitSize(job, getSplitSize(params));

    // set the various sorting/grouping/mapping classes
    job.setGroupingComparatorClass(getGroupingComparatorClass());
    job.setPartitionerClass(getPartitionerClass());
    job.setSortComparatorClass(getSortComparatorClass());

    // and now, submit the job and wait for things to finish
    int exitCode;
    try {
        exitCode = job.waitForCompletion(verbose) ? 0 : 1;

        // get the output bytes counter.
        Counters c = job.getCounters();
        Counter mx = c.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);

        // and use them to set the size of the output relation.
        if (myDB != null) {
            myDB.setTableSize(myDB.getTableName(getOutput()), mx.getValue());
            myDB.setNumAtts(myDB.getTableName(getOutput()), getOutputAttNames().length);
        }

    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Unable to run the job", e);
    }

    // now, delete all the empty part files
    try {

        // get a filesystem
        FileSystem dfs = FileSystem.get(conf);
        Path outPath = new Path(getOutput());
        if (dfs.exists(outPath) && dfs.isDirectory(outPath)) {
            FileStatus[] fstatus = dfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (dfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    dfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) { // this isn't disastrous 
    }
    return (exitCode == 0);
}
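
A note on the cleanup step at the end: getContentSummary is used to find effectively-empty part files, since the snappy codec leaves 4-byte files around even for empty output (hence the <= 4 threshold), and the catch block deliberately swallows failures because this cleanup is best-effort.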

From source file: uk.bl.wa.hadoop.mapred.ByteBlockRecordReader.java

License: Open Source License

/**
 * @param inputSplit
 * @param conf
 * @throws IOException
 */
public ByteBlockRecordReader(InputSplit inputSplit, JobConf conf) throws IOException {
    if (inputSplit instanceof FileSplit) {
        FileSplit fs = (FileSplit) inputSplit;
        path = fs.getPath();
        FileSystem fSys = path.getFileSystem(conf);
        file_length = fSys.getContentSummary(path).getLength();
        fsdis = fSys.open(path);

        // Support auto-decompression of compressed files:
        boolean autoDecompress = conf.getBoolean("mapreduce.unsplittableinputfileformat.autodecompress", false);
        if (autoDecompress) {
            log.warn("Enabling auto-decompression of this file.");
            compressionCodecs = new CompressionCodecFactory(conf);
            final CompressionCodec codec = compressionCodecs.getCodec(path);
            if (codec != null) {
                fsdis = codec.createInputStream(fsdis);
            }
        } else {
            log.info("Auto-decompression is not enabled.");
        }
    } else {
        log.error("Only FileSplit supported!");
        throw new IOException("Need FileSplit input...");
    }
}

From source file: uk.bl.wa.hadoop.mapreduce.lib.input.ByteBlockRecordReader.java

License: Open Source License

@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    if (inputSplit instanceof FileSplit) {
        FileSplit fs = (FileSplit) inputSplit;
        path = fs.getPath();
        FileSystem fSys = path.getFileSystem(context.getConfiguration());
        fsdis = fSys.open(path);
        file_length = fSys.getContentSummary(path).getLength();
    } else {
        log.error("Only FileSplit supported!");
        throw new IOException("Need FileSplit input...");
    }

}
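
These two ByteBlockRecordReader variants are the same reader written against the old (org.apache.hadoop.mapred) and new (org.apache.hadoop.mapreduce) MapReduce APIs; both use getContentSummary(path).getLength() to record the full length of the file backing the split.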

From source file: weka.distributed.spark.SparkUtils.java

License: Open Source License

/**
 * Get the size in bytes of a file/directory
 *
 * @param path the path to the file/directory
 * @return the size in bytes
 * @throws IOException if a problem occurs
 */
public static long getSizeInBytesOfPath(String path) throws IOException {
    String[] pathHolder = new String[1];
    Configuration conf = getFSConfigurationForPath(path, pathHolder);
    FileSystem fs = FileSystem.get(conf);
    Path p = new Path(pathHolder[0]);
    return fs.getContentSummary(p).getLength();
}
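
Because ContentSummary aggregates recursively, this helper returns the total bytes beneath a directory as well as the length of a single file. As a sketch of the distinction, reusing fs and p from the helper above:

long recursiveBytes = fs.getContentSummary(p).getLength(); // total length of all files beneath p
long entryBytes = fs.getFileStatus(p).getLen();            // the entry itself; typically 0 for an HDFS directory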

From source file: wherehows.SchemaFetch.java

License: Open Source License

private static void scanPathHelper(Path path, FileSystem scanFs)
        throws IOException, InterruptedException, SQLException {
    String curPath = path.toUri().getPath();
    Path n = path;
    if (path.getName().matches("^(\\.|_|tmp|temp|test|trash|backup|archive|ARCHIVE|storkinternal).*"))
        return;

    logger.info("  -- scanPath(" + curPath + ")\n");
    int x = isTable(path, scanFs);
    if (x > 0) {
        // System.err.println("  traceTable(" + path.toString() + ")");
        traceTableInfo(path, scanFs);
    } else if (x == 0) { // iterate over each table
        // FileStatus[] fslist = scanFs.listStatus(path);
        // System.err.println(" => " + fslist.length + " subdirs");
        for (FileStatus fstat : scanFs.listStatus(path)) {
            n = fstat.getPath();
            curPath = n.toUri().getPath();
            // System.err.println("  traceSubDir(" + curPath + ")");
            if (n == path) {
                continue;
            }
            try {
                if (isTable(n, scanFs) > 0) {
                    traceTableInfo(n, scanFs);
                } else if (scanFs.listStatus(n).length > 0 || scanFs.getContentSummary(n).getLength() > 0) {
                    scanPath(n, scanFs);
                } else {
                    logger.info("* scanPath() size = 0: " + curPath);
                }
            } catch (AccessControlException e) {
                logger.error("* scanPath(e) Permission denied. Cannot access: " + curPath + " owner:"
                        + fstat.getOwner() + " group: " + fstat.getGroup() + "with current user "
                        + UserGroupInformation.getCurrentUser());
                // System.err.println(e);
                continue;
            } // catch
        } // end of for
    } // end else
}

From source file: wherehows.SchemaFetch.java

License: Open Source License

/**
 * Collect one dataset's metadata
 *
 * @param path
 * @throws java.io.IOException
 */
private static void traceTableInfo(Path path, FileSystem tranceFs) throws IOException, SQLException {
    logger.info("trace table : " + path.toUri().getPath());
    // analyze the pattern of the name
    String tbl_name = path.getName();
    if (tbl_name.matches("(_|\\.|tmp|temp|stg|test|\\*).*")) // skip _temporary _schema.avsc
    {
        return;
    }

    FileStatus[] fstat_lst;
    FileStatus fstat = tranceFs.getFileStatus(path);
    String fullPath = path.toUri().getPath();
    String xName = "";
    long data_size = -1;
    long sample_data_size = -1;
    int i, x;
    // String data_source = checkDataSource(fullPath);

    // TODO this part need to rewrite
    try {
        while (fstat.isDirectory()) {

            fstat_lst = tranceFs.listStatus(fstat.getPath()); // list all children
            if (fstat_lst.length == 0) { // empty directory
                logger.info(fstat.getPath().toUri().getPath() + " is empty.");
                return;
            }

            int is_fstat_visible = 0;
            for (i = fstat_lst.length - 1; i >= 0; i--) { // iterate from the last item back to the first
                fstat = fstat_lst[i]; // start from the last file in the list
                xName = fstat.getPath().getName();

                if (xName.matches("\\.pig_schema|.*\\.avsc|\\.dataset")) {
                    is_fstat_visible = 1;
                    break;
                } else if (xName.equals("hourly") && i > 0
                        && fstat_lst[i - 1].getPath().getName().equals("daily")) {
                    continue; // try to traverse "daily" instead of "hourly" when possible
                } else if (xName.matches("(_|\\.|tmp|temp).*")) {
                    continue;
                }

                try { // sub directory may be inaccessible
                    sample_data_size = fstat.isDirectory()
                            ? tranceFs.getContentSummary(fstat.getPath()).getLength()
                            : fstat.getLen();
                } catch (AccessControlException e) {
                    if (tranceFs.listStatus(fstat.getPath()).length > 0) {
                        is_fstat_visible = 1;
                        break;
                    } else {
                        continue;
                    }
                }

                if (!fstat.isDirectory()
                        && !xName.matches("(_|\\.).*|.*\\.(jar|json|txt|csv|tsv|zip|gz|lzo)")) {
                    is_fstat_visible = 1;
                    break;
                }

                // if fstat is a Directory
                if (fstat.isDirectory() && !xName.matches("(_|\\.).*")) {
                    is_fstat_visible = 1;
                    break;
                }
            }
            // logger.info(fstat.getPath() + "is_fstat_visible : " + is_fstat_visible);
            if (is_fstat_visible == 0) {
                return;
            }
        }
    } catch (AccessControlException e) {
        logger.error("* TblInfo() Cannot access " + fstat.getPath().toUri().getPath());
        return;
    }

    // get schema and sample data
    DatasetJsonRecord datasetSchemaRecord = fileAnalyzerFactory.getSchema(fstat.getPath(),
            path.toUri().getPath());
    if (datasetSchemaRecord != null) {
        schemaFileWriter.append(datasetSchemaRecord);
    } else {
        logger.error("* Cannot resolve the schema of " + fullPath);
    }

    SampleDataRecord sampleDataRecord = fileAnalyzerFactory.getSampleData(fstat.getPath(),
            path.toUri().getPath());
    if (sampleDataRecord != null) {
        sampleFileWriter.append(sampleDataRecord);
    } else {
        System.err.println("* Cannot fetch sample data of " + fullPath);
    }
}