List of usage examples for org.apache.hadoop.fs FileSystem getContentSummary
public ContentSummary getContentSummary(Path f) throws IOException
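A minimal usage sketch before the real-world examples below (the class name ContentSummaryExample and the command-line path argument are illustrative, not from any of the source files): ContentSummary aggregates the total length, file count, and directory count over a file or an entire directory tree, so the call can be expensive on large directories.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ContentSummaryExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path p = new Path(args[0]); // file or directory to summarize (illustrative argument)
        FileSystem fs = p.getFileSystem(conf);

        // Walks the whole subtree under p and aggregates the results.
        ContentSummary summary = fs.getContentSummary(p);
        System.out.println("length (bytes): " + summary.getLength());
        System.out.println("files:          " + summary.getFileCount());
        System.out.println("directories:    " + summary.getDirectoryCount());
    }
}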
From source file:simsql.runtime.RelOp.java
License:Apache License
public boolean run(RuntimeParameter params, boolean verbose) {
    ExampleRuntimeParameter pp = (ExampleRuntimeParameter) params;

    // build the jar.
    String jarFile = buildJarFile(params);

    // Get the default configuration object
    Configuration conf = new Configuration();

    // set quiet mode on/off
    conf.setQuietMode(!verbose);

    /***
    conf.setBoolean("mapred.task.profile", true);
    conf.set("mapred.task.profile.params", "-agentlib:hprof=cpu=samples,"
        + "heap=sites,depth=8,force=n,thread=y,verbose=n,file=%s");
    ***/

    // tell it how to serialize and deserialize records and recordkeys
    conf.set("io.serializations", getSerializations());
    conf.setBoolean("mapred.compress.map.output", true);

    int ioSortMB = conf.getInt("io.sort.mb", 256);
    conf.set("mapred.map.child.java.opts",
        "-Xmx" + (getMemPerMapper(params) + ioSortMB) + "m -Xms" + (getMemPerMapper(params))
            + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.set("mapred.reduce.child.java.opts",
        "-Xmx" + (getMemPerReducer(params) + ioSortMB) + "m -Xms" + (getMemPerMapper(params))
            + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.setInt("simsql.input.numSplits", pp.getNumCPUs());
    conf.setInt("mapred.job.reuse.jvm.num.tasks", 1);
    // conf.setBoolean ("mapred.map.tasks.speculative.execution", false);
    // conf.setBoolean ("mapred.reduce.tasks.speculative.execution", false);

    // tell it to use the jar that we just created
    conf.set("mapred.jar", jarFile);
    // conf.set("tmpjars", "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-core.jar");

    conf.setBoolean("mapred.output.compress", true);
    conf.setStrings("mapred.output.compression.type", new String[] { "RECORD" });

    // use snappy for the intermediate stuff
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());

    // do some additional operator-specific configurations
    setConfigurations(conf, params);

    // collect statistics for final relations always
    conf.setBoolean("simsql.collectStats", isFinal || collectStats);

    // figure out what file to map
    String[] inDirs = myInputNetwork.getPipelinedInputFiles();
    inDirs = excludeAnyWhoWillNotBeMapped(inDirs);
    String inSingleString = inDirs[0];
    conf.set("simsql.fileToMap", inSingleString);
    for (int i = 1; i < inDirs.length; i++) {
        inSingleString += "," + inDirs[i];
    }

    // create and name the job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create a new job!", e);
    }

    job.setJobName(getJobName());

    // set the map-reduce input and output types
    job.setMapOutputKeyClass(getMapOutputKeyClass());
    job.setMapOutputValueClass(getMapOutputValueClass());
    job.setOutputKeyClass(getOutputKeyClass());
    job.setOutputValueClass(getOutputValueClass());

    int numReducers = getNumReducers(params);

    job.setMapperClass(getMapperClass());
    job.setReducerClass(getReducerClass());

    // set the number of reducers
    job.setNumReduceTasks(numReducers);

    // set the input and the output formats... these extend FileInputFormat and FileOutputFormat
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(getOutputFormatClass());

    // set the input and output paths
    try {
        System.out.println("input file: " + inSingleString);
        FileInputFormat.setInputPaths(job, inSingleString);
        FileInputFormat.setInputPathFilter(job, TableFileFilter.class);
        FileOutputFormat.setOutputPath(job, new Path(getOutput()));
    } catch (Exception e) {
        throw new RuntimeException("Unable to set up the input/output path for the job.", e);
    }

    // set the split size
    FileInputFormat.setMinInputSplitSize(job, getSplitSize(params));
    FileInputFormat.setMaxInputSplitSize(job, getSplitSize(params));

    // set the various sorting/grouping/mapping classes
    job.setGroupingComparatorClass(getGroupingComparatorClass());
    job.setPartitionerClass(getPartitionerClass());
    job.setSortComparatorClass(getSortComparatorClass());

    // and now, submit the job and wait for things to finish
    int exitCode;
    try {
        exitCode = job.waitForCompletion(verbose) ? 0 : 1;

        // get the output bytes counter.
        Counters c = job.getCounters();
        Counter mx = c.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);

        // and use them to set the size of the output relation.
        if (myDB != null) {
            myDB.setTableSize(myDB.getTableName(getOutput()), mx.getValue());
            myDB.setNumAtts(myDB.getTableName(getOutput()), getOutputAttNames().length);
        }
    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Unable to run the job", e);
    }

    // now, delete all the empty part files
    try {
        // get a filesystem
        FileSystem dfs = FileSystem.get(conf);
        Path outPath = new Path(getOutput());
        if (dfs.exists(outPath) && dfs.isDirectory(outPath)) {
            FileStatus fstatus[] = dfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (dfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    dfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) {
        // this isn't disastrous
    }

    return (exitCode == 0);
}
From source file:uk.bl.wa.hadoop.mapred.ByteBlockRecordReader.java
License:Open Source License
/**
 * @param inputSplit
 * @param conf
 * @throws IOException
 */
public ByteBlockRecordReader(InputSplit inputSplit, JobConf conf) throws IOException {
    if (inputSplit instanceof FileSplit) {
        FileSplit fs = (FileSplit) inputSplit;
        path = fs.getPath();
        FileSystem fSys = path.getFileSystem(conf);
        file_length = fSys.getContentSummary(path).getLength();
        fsdis = fSys.open(path);

        // Support auto-decompression of compressed files:
        boolean autoDecompress = conf.getBoolean("mapreduce.unsplittableinputfileformat.autodecompress", false);
        if (autoDecompress) {
            log.warn("Enabling auto-decompression of this file.");
            compressionCodecs = new CompressionCodecFactory(conf);
            final CompressionCodec codec = compressionCodecs.getCodec(path);
            if (codec != null) {
                fsdis = codec.createInputStream(fsdis);
            }
        } else {
            log.info("Auto-decompression is not enabled.");
        }
    } else {
        log.error("Only FileSplit supported!");
        throw new IOException("Need FileSplit input...");
    }
}
From source file:uk.bl.wa.hadoop.mapreduce.lib.input.ByteBlockRecordReader.java
License:Open Source License
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    if (inputSplit instanceof FileSplit) {
        FileSplit fs = (FileSplit) inputSplit;
        path = fs.getPath();
        FileSystem fSys = path.getFileSystem(context.getConfiguration());
        fsdis = fSys.open(path);
        file_length = fSys.getContentSummary(path).getLength();
    } else {
        log.error("Only FileSplit supported!");
        throw new IOException("Need FileSplit input...");
    }
}
From source file:weka.distributed.spark.SparkUtils.java
License:Open Source License
/**
 * Get the size in bytes of a file/directory
 *
 * @param path the path to the file/directory
 * @return the size in bytes
 * @throws IOException if a problem occurs
 */
public static long getSizeInBytesOfPath(String path) throws IOException {
    String[] pathHolder = new String[1];
    Configuration conf = getFSConfigurationForPath(path, pathHolder);
    FileSystem fs = FileSystem.get(conf);
    Path p = new Path(pathHolder[0]);
    return fs.getContentSummary(p).getLength();
}
From source file:wherehows.SchemaFetch.java
License:Open Source License
private static void scanPathHelper(Path path, FileSystem scanFs)
        throws IOException, InterruptedException, SQLException {
    String curPath = path.toUri().getPath();
    Path n = path;
    if (path.getName().matches("^(\\.|_|tmp|temp|test|trash|backup|archive|ARCHIVE|storkinternal).*"))
        return;

    logger.info(" -- scanPath(" + curPath + ")\n");
    int x = isTable(path, scanFs);
    if (x > 0) {
        // System.err.println("  traceTable(" + path.toString() + ")");
        traceTableInfo(path, scanFs);
    } else if (x == 0) { // iterate over each table
        // FileStatus[] fslist = scanFs.listStatus(path);
        // System.err.println("  => " + fslist.length + " subdirs");
        for (FileStatus fstat : scanFs.listStatus(path)) {
            n = fstat.getPath();
            curPath = n.toUri().getPath();
            // System.err.println("  traceSubDir(" + curPath + ")");
            if (n == path) {
                continue;
            }
            try {
                if (isTable(n, scanFs) > 0) {
                    traceTableInfo(n, scanFs);
                } else if (scanFs.listStatus(n).length > 0 || scanFs.getContentSummary(n).getLength() > 0) {
                    scanPath(n, scanFs);
                } else {
                    logger.info("* scanPath() size = 0: " + curPath);
                }
            } catch (AccessControlException e) {
                logger.error("* scanPath(e) Permission denied. Cannot access: " + curPath
                        + " owner:" + fstat.getOwner() + " group: " + fstat.getGroup()
                        + " with current user " + UserGroupInformation.getCurrentUser());
                // System.err.println(e);
                continue;
            } // catch
        } // end of for
    } // end else
}
From source file:wherehows.SchemaFetch.java
License:Open Source License
/**
 * Collect one dataset's metadata
 *
 * @param path
 * @throws java.io.IOException
 */
private static void traceTableInfo(Path path, FileSystem tranceFs) throws IOException, SQLException {
    logger.info("trace table : " + path.toUri().getPath());

    // analyze the pattern of the name
    String tbl_name = path.getName();
    if (tbl_name.matches("(_|\\.|tmp|temp|stg|test|\\*).*")) { // skip _temporary _schema.avsc
        return;
    }

    FileStatus[] fstat_lst;
    FileStatus fstat = tranceFs.getFileStatus(path);
    String fullPath = path.toUri().getPath();
    String xName = "";
    long data_size = -1;
    long sample_data_size = -1;
    int i, x;
    // String data_source = checkDataSource(fullPath);

    // TODO this part need to rewrite
    try {
        while (fstat.isDirectory()) {
            fstat_lst = tranceFs.listStatus(fstat.getPath()); // list all children
            if (fstat_lst.length == 0) { // empty directory
                logger.info(fstat.getPath().toUri().getPath() + " is empty.");
                return;
            }

            int is_fstat_visible = 0;
            for (i = fstat_lst.length - 1; i >= 0; i--) { // iterate from the last item back to the first
                fstat = fstat_lst[i]; // start from the last file in the list
                xName = fstat.getPath().getName();

                if (xName.matches("\\.pig_schema|.*\\.avsc|\\.dataset")) {
                    is_fstat_visible = 1;
                    break;
                } else if (xName.equals("hourly") && i > 0
                        && fstat_lst[i - 1].getPath().getName().equals("daily")) {
                    continue; // try to traverse "daily" instead of "hourly" when possible
                } else if (xName.matches("(_|\\.|tmp|temp).*")) {
                    continue;
                }

                try { // sub directory may be inaccessible
                    sample_data_size = fstat.isDirectory()
                            ? tranceFs.getContentSummary(fstat.getPath()).getLength()
                            : fstat.getLen();
                } catch (AccessControlException e) {
                    if (tranceFs.listStatus(fstat.getPath()).length > 0) {
                        is_fstat_visible = 1;
                        break;
                    } else {
                        continue;
                    }
                }

                if (fstat.isDirectory() == false
                        && xName.matches("(_|\\.).*|.*\\.(jar|json|txt|csv|tsv|zip|gz|lzo)") == false) {
                    is_fstat_visible = 1;
                    break;
                }

                // if fstat is a Directory
                if (fstat.isDirectory() == true && xName.matches("(_|\\.).*") == false) {
                    is_fstat_visible = 1;
                    break;
                }
            }

            // logger.info(fstat.getPath() + " is_fstat_visible : " + is_fstat_visible);
            if (is_fstat_visible == 0) {
                return;
            }
        }
    } catch (AccessControlException e) {
        logger.error("* TblInfo() Cannot access " + fstat.getPath().toUri().getPath());
        return;
    }

    // get schema and sample data
    DatasetJsonRecord datasetSchemaRecord = fileAnalyzerFactory.getSchema(fstat.getPath(), path.toUri().getPath());
    if (datasetSchemaRecord != null) {
        schemaFileWriter.append(datasetSchemaRecord);
    } else {
        logger.error("* Cannot resolve the schema of " + fullPath);
    }

    SampleDataRecord sampleDataRecord = fileAnalyzerFactory.getSampleData(fstat.getPath(), path.toUri().getPath());
    if (sampleDataRecord != null) {
        sampleFileWriter.append(sampleDataRecord);
    } else {
        System.err.println("* Cannot fetch sample data of " + fullPath);
    }
}