List of usage examples for org.apache.hadoop.fs FileSystem listStatus
public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException
public FileStatus[] listStatus(Path[] files) throws FileNotFoundException, IOException
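Before the project-specific examples below, here is a minimal, self-contained sketch of how listStatus is typically called with the single-Path overload used by most of the sources on this page. The path /tmp/data and the printed output are illustrative assumptions only, not taken from any of the listed projects.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListStatusExample {

    public static void main(String[] args) throws IOException {
        // Hypothetical directory used only for illustration.
        Path dir = new Path("/tmp/data");

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // listStatus returns one FileStatus per entry directly under the path;
        // it does not recurse into subdirectories.
        FileStatus[] entries = fs.listStatus(dir);
        for (FileStatus status : entries) {
            System.out.println((status.isDirectory() ? "dir  " : "file ")
                    + status.getPath() + " len=" + status.getLen());
        }
    }
}

Note that listStatus throws FileNotFoundException if the path does not exist; callers that may race with deletions usually catch it rather than pre-checking with exists().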
From source file:com.TCG.Nutch_DNS.Generator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {

    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file:com.tdunning.plume.local.lazy.MapRedExecutor.java
License:Apache License
/**
 * This method can be called to execute a {@link PlumeWorkflow} by using the Hadoop Map-Reduce implementation.
 * It will build the execution tree, optimize it and convert each MSCR step into a MapRed job.
 * It will launch MSCR jobs in parallel when it is allowable to do so by using a ThreadPool. If one MSCR fails,
 * all the work flow is canceled. Because it stores the result in a temporary folder, it will only flush the final
 * result to the API parameter if the work flow has been executed successfully.
 *
 * @param workFlow The {@link PlumeWorkflow} to execute
 * @param outputTo Output folder where the result of the work flow will be stored if executed successfully
 *
 * @throws IOException If the work flow had to be canceled
 * @throws InterruptedException
 */
public void execute(PlumeWorkflow workFlow, String outputTo) throws IOException, InterruptedException {
    Optimizer optimizer = new Optimizer();
    ExecutionStep step = optimizer.optimize(workFlow);
    int nStep = 0;
    final String workFlowId = workFlow.getClass().getName() + "-" + System.currentTimeMillis();
    do {
        nStep++;
        log.info("Begin execution step " + nStep + " for workflow " + workFlow.getClass().getName());
        // Create a latch to mark the end of a concurrent step where all MSCRs can be executed in parallel
        final CountDownLatch latch = new CountDownLatch(step.mscrSteps.size());
        // Create a signal that can be flagged if one of the MSCRs fail, to abort all the workFlow
        // - I have chosen an AtomicBoolean in case this flag can be re-set to false under some circumstance -
        final AtomicBoolean abort = new AtomicBoolean(false);
        // For each MSCR that can be executed concurrently...
        for (final MSCR mscr : step.mscrSteps) {
            final String workFlowOutputPath = tmpOutputFolder + "/" + workFlowId;
            final String jobId = workFlowId + "/" + mscr.getId();
            final String jobOutputPath = tmpOutputFolder + "/" + jobId;
            log.info("Triggering execution of jobId " + jobId + ". Its output will be saved to "
                    + jobOutputPath);
            // ... Get its MapRed Job
            final Job job = getMapRed(mscr, workFlow, workFlowOutputPath, jobOutputPath);
            final FileSystem fS = FileSystem.getLocal(job.getConfiguration());
            // ... Submit it to the ThreadPool
            executor.submit(new Runnable() {
                @Override
                public void run() {
                    try {
                        job.waitForCompletion(true);
                        // job completed successfully - materialize outputs
                        log.info("jobId " + jobId + " completed successfully, now materializing outputs.");
                        for (Map.Entry<PCollection<?>, Integer> entry : mscr.getNumberedChannels().entrySet()) {
                            LazyCollection<?> oCol = (LazyCollection<?>) mscr.getOutputChannels()
                                    .get(entry.getKey()).output;
                            // Move this output to somewhere recognizable - this executor's tmp folder + this PCollection's Plume Id
                            // This way, mappers that read unmaterialized collections will know where to find intermediate states.
                            FileStatus[] files = fS.listStatus(new Path(jobOutputPath));
                            Path materializedPath = new Path(workFlowOutputPath + "/" + oCol.getPlumeId());
                            fS.mkdirs(materializedPath);
                            for (FileStatus file : files) {
                                if (file.getPath().getName().startsWith(entry.getValue() + "-r-")) {
                                    FileUtil.copy(fS, file.getPath(), fS, materializedPath, false,
                                            job.getConfiguration());
                                    oCol.setFile(materializedPath.toString());
                                }
                            }
                            log.info("Materialized plume output " + oCol.getPlumeId() + " to " + oCol.getFile());
                        }
                    } catch (IOException e) {
                        log.warn("One Job failed: " + jobId + ", current Workflow will be aborted ", e);
                        abort.set(true); // Flag the premature end of this workflow
                    } catch (InterruptedException e) {
                        log.warn("One Job failed: " + jobId + ", current Workflow will be aborted ", e);
                        abort.set(true); // Flag the premature end of this workflow
                    } catch (ClassNotFoundException e) {
                        log.warn("One Job failed: " + jobId + ", current Workflow will be aborted ", e);
                        abort.set(true); // Flag the premature end of this workflow
                    } finally {
                        latch.countDown(); // Count down under any circumstance
                    }
                }
            });
        }
        latch.await(); // wait until all MSCRs from this step are completed
        if (abort.get()) {
            throw new IOException("Current Workflow was aborted");
        }
        step = step.nextStep;
    } while (step != null);
    log.info("Workflow ended correctly.");
    // Move temporary result to where the API user wants it. WARN: Local-specific implementation
    Files.move(new File(tmpOutputFolder + "/" + workFlowId), new File(outputTo));
}
From source file:com.teradata.compaction.mapreduce.MergeParquetFilesMR.java
License:Apache License
private static Schema getBaseSchema(final Path pathToParqetFiles, Configuration conf) throws IOException {
    fileSchema = null;
    FileSystem fsystem = pathToParqetFiles.getFileSystem(conf);
    FileStatus fstatus = fsystem.getFileStatus(pathToParqetFiles);

    if (fstatus.isDir()) {
        FileStatus[] files = fsystem.listStatus(fstatus.getPath());
        for (FileStatus file : files) {
            if (!file.isDir()) {
                if (file.getPath().toString().toLowerCase().endsWith(".parquet")) {
                    ParquetReader<GenericRecord> reader_schema = new AvroParquetReader<GenericRecord>(
                            file.getPath());
                    GenericRecord tmp_schema = reader_schema.read();
                    fileSchema = tmp_schema.getSchema();
                    reader_schema.close();
                    break;
                }
            }
        }
    }

    // Print the Schema of one of the parquet files, which will be used as
    // schema for the final file!
    // System.out.println(fileSchema.toString());

    return fileSchema;
}
From source file:com.thinkbiganalytics.datalake.authorization.hdfs.HDFSUtil.java
License:Apache License
private void listAllDirAndFlushPolicy(FileSystem fileSystem, Path path) throws FileNotFoundException, IOException {
    FileStatus[] fileStatus = fileSystem.listStatus(path);

    for (FileStatus status : fileStatus) {
        // Apply ACL recursively on each file/directory.
        if (status.isDirectory()) {
            // Flush ACL before creating new one.
            flushAcl(fileSystem, status.getPath());
            listAllDirAndFlushPolicy(fileSystem, status.getPath());
        } else {
            // Flush ACL before creating new one.
            flushAcl(fileSystem, status.getPath());
        }
    }
}
From source file:com.thinkbiganalytics.datalake.authorization.hdfs.HDFSUtil.java
License:Apache License
/**
 * @param fileSystem : HDFS fileSystem object
 * @param path : Path on which ACL needs to be created
 * @param groups : List of groups to which permission needs to be granted.
 */
public void listAllDirAndApplyPolicy(FileSystem fileSystem, Path path, String groups, String hdfsPermission)
        throws FileNotFoundException, IOException {
    FsAction fsActionObject = getFinalPermission(hdfsPermission);
    FileStatus[] fileStatus = fileSystem.listStatus(path);

    for (FileStatus status : fileStatus) {
        // Flush ACL before creating new one.
        flushAcl(fileSystem, status.getPath());

        // Apply ACL recursively on each file/directory.
        if (status.isDirectory()) {
            String[] groupListForPermission = groups.split(",");
            for (int groupCounter = 0; groupCounter < groupListForPermission.length; groupCounter++) {
                // Create an HDFS ACL entry for each group for each Path on HDFS
                AclEntry aclEntryOwner = new AclEntry.Builder().setName(groupListForPermission[groupCounter])
                        .setPermission(fsActionObject).setScope(AclEntryScope.ACCESS)
                        .setType(AclEntryType.GROUP).build();

                AclEntry aclEntryOther = new AclEntry.Builder().setPermission(FsAction.NONE)
                        .setScope(AclEntryScope.ACCESS).setType(AclEntryType.OTHER).build();

                // Apply ACL on Path
                applyAcl(fileSystem, status.getPath(), aclEntryOwner);
                applyAcl(fileSystem, status.getPath(), aclEntryOther);
            }

            // Recursive call made to apply acl on each sub directory
            listAllDirAndApplyPolicy(fileSystem, status.getPath(), groups, hdfsPermission);
        } else {
            String[] groupListForPermission = groups.split(",");
            for (int groupCounter = 0; groupCounter < groupListForPermission.length; groupCounter++) {
                // Create an HDFS ACL entry for each group for each Path on HDFS
                AclEntry aclEntryOwner = new AclEntry.Builder().setName(groupListForPermission[groupCounter])
                        .setPermission(fsActionObject).setScope(AclEntryScope.ACCESS)
                        .setType(AclEntryType.GROUP).build();

                AclEntry aclEntryOther = new AclEntry.Builder().setPermission(FsAction.NONE)
                        .setScope(AclEntryScope.ACCESS).setType(AclEntryType.OTHER).build();

                // Apply ACL on Path
                applyAcl(fileSystem, status.getPath(), aclEntryOwner);
                applyAcl(fileSystem, status.getPath(), aclEntryOther);
            }
        }
    }
}
From source file:com.thinkbiganalytics.kerberos.TestKerberosKinit.java
License:Apache License
private void searchHDFS(Configuration configuration, final String environment, String hdfsPath, String hdfsUrl)
        throws Exception {
    configuration.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
    configuration.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    FileSystem fs = FileSystem.get(configuration);
    if (environment.equalsIgnoreCase(ENVIRONMENT_CLOUDERA)) {
        FileStatus[] status = fs.listStatus(new Path(hdfsUrl + hdfsPath));
        System.out.println("File Count: " + status.length);
    } else {
        if (environment.equalsIgnoreCase(ENVIRONMENT_HDP)) {
            FileStatus[] status = fs.listStatus(new Path(hdfsUrl + hdfsPath));
            System.out.println("File Count: " + status.length);
        }
    }
}
From source file:com.thinkbiganalytics.kylo.catalog.file.DefaultCatalogFileManager.java
License:Apache License
/**
 * Lists the files at the specified path.
 */
@Nonnull
private List<DataSetFile> listFiles(@Nonnull final FileSystem fs, @Nonnull final Path path) throws IOException {
    return Arrays.stream(fs.listStatus(path)).map(status -> {
        final DataSetFile file = new DataSetFile();
        file.setDirectory(status.isDirectory());
        file.setLength(status.getLen());
        file.setModificationTime(status.getModificationTime());
        file.setName(status.getPath().getName());
        file.setPath(status.getPath().toString());
        return file;
    }).collect(Collectors.toList());
}
From source file:com.trace.hadoop.TestDFSRename.java
License:Apache License
void list(FileSystem fs, String name) throws IOException {
    FileSystem.LOG.info("\n\n" + name);
    for (FileStatus s : fs.listStatus(dir)) {
        FileSystem.LOG.info("" + s.getPath());
    }
}
From source file:com.tripadvisor.hadoop.BackupHdfs.java
License:Apache License
/**
 * Method to go through the HDFS filesystem in a DFS to find all
 * files
 *
 * fs: FileSystem object from HDFS
 * minDate: Oldest date for files to be backed up
 * maxDate: Newest date for files to be backed up
 * p: Path in HDFS to look for files
 * pathList: Will be filled with all files in p
 * hmTimestamps: hashmap of timestamps for later sorting
 **/
public void checkDir(FileSystem fs, long minDate, long maxDate, Path p, ArrayList<Path> pathList,
        HashMap<Path, Long> hmTimestamps) {
    long tmpDate;
    FileStatus[] fStat;

    try {
        String sPath = p.toUri().getPath();

        // If this is a directory
        if (fs.getFileStatus(p).isDir()) {
            // ignore certain directories
            if ("dfstmp".equals(p.getName()) || "tmp".equals(p.getName()) || "jobtracker".equals(p.getName())
                    || sPath.startsWith("/mapred") || "ops".equals(p.getName())
                    || p.getName().startsWith("_distcp_logs")) {
                return;
            }

            // dump the mkdir and chmod commands for this
            // directory -- skip root directory only
            {
                FileStatus stat = fs.getFileStatus(p);

                if (!sPath.equals("/")) {
                    m_wrMkdirs.println("hadoop fs -mkdir " + sPath);
                }

                m_wrChmods.println("hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath);

                Short sh = new Short(stat.getPermission().toShort());
                m_wrChmods.println("hadoop fs -chmod " + Long.toOctalString(sh.longValue()) + " " + sPath);
            }

            fStat = fs.listStatus(p);

            // Do a recursive call to all elements
            for (int i = 0; i < fStat.length; i++) {
                checkDir(fs, minDate, maxDate, fStat[i].getPath(), pathList, hmTimestamps);
            }
        } else {
            // If not a directory then we've found a file

            // ignore crc files
            if (p.getName().endsWith(".crc")) {
                return;
            }

            // ignore other files
            if (sPath.startsWith("/user/oozie/etl/workflows/")) {
                return;
            }

            // try to get the table name from the path. There are
            // various types of tables, from those replicated from
            // another database to regular hive tables to
            // partitioned hive tables. We use table names to
            // both exclude some from the backup, and for the rest
            // to dump out the schema and partition name.
            if (m_ignoreTables != null && m_ignoreTables.doIgnoreFile(sPath)) {
                m_nIgnoredTables++;

                if (m_nIgnoredTables < 5) {
                    System.out.println("Skipping ignore-table file: " + sPath);
                } else if (m_nIgnoredTables == 5) {
                    System.out.println("(...not showing other skipped tables...)");
                }
                return;
            }

            FileStatus stat = fs.getFileStatus(p);

            tmpDate = stat.getModificationTime() / 1000;

            // store the chmods/chowns for all files
            m_wrChmods.println("hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath);

            m_wrChmods.println("hadoop fs -chmod " + stat.getPermission().toShort() + " " + sPath);

            // check dates. is it too young?
            if (tmpDate < minDate) {
                return;
            }

            // is the file too recent?
            if (tmpDate > maxDate) {
                //System.out.println("file too recent: " + sPath);
                return;
            }

            // file timestamp is ok
            pathList.add(p);
            hmTimestamps.put(p, new Long(tmpDate));

            // store info about total bytes needed to backup
            m_nTotalBytes += fs.getContentSummary(p).getLength();
        }
    } catch (IOException e) {
        System.err.println("ERROR: could not open " + p + ": " + e);
        // System.exit(1) ;
    }
}
From source file:com.tripadvisor.hadoop.VerifyHdfsBackup.java
License:Apache License
/**
 * Method to go through the HDFS filesystem in a DFS to find all
 * files
 *
 * fs: FileSystem object from HDFS
 * maxDate: Newest date for files to be backed up
 * p: Path in HDFS to look for files
 **/
public void checkDir(FileSystem fs, Path p, String sLocalPathRoot, long maxDate) {
    FileStatus[] fStat;

    try {
        String sPath = p.toUri().getPath();

        // If this is a directory
        if (fs.getFileStatus(p).isDir()) {
            // ignore certain directories
            if ("dfstmp".equals(p.getName()) || "tmp".equals(p.getName()) || "jobtracker".equals(p.getName())
                    || sPath.startsWith("/mapred") || "ops".equals(p.getName())
                    || p.getName().startsWith("_distcp_logs")) {
                return;
            }

            fStat = fs.listStatus(p);

            // Do a recursive call to all elements
            for (int i = 0; i < fStat.length; i++) {
                checkDir(fs, fStat[i].getPath(), sLocalPathRoot, maxDate);
            }
        } else {
            // If not a directory then we've found a file

            // ignore crc files
            if (p.getName().endsWith(".crc")) {
                return;
            }

            // ignore other files
            if (sPath.startsWith("/user/oozie/etl/workflows/")) {
                return;
            }

            // try to get the table name from the path. There are
            // various types of tables, from those replicated from
            // tripmonster to regular hive tables to partitioned
            // hive tables. We use table names to both exclude
            // some from the backup, and for the rest to dump out
            // the schema and partition name.
            if (m_ignoreTables != null && m_ignoreTables.doIgnoreFile(sPath)) {
                return;
            }

            // check the file
            FileStatus stat = fs.getFileStatus(p);

            // ignore files that are too new
            if ((stat.getModificationTime() / 1000) > maxDate) {
                System.out.println("IGNORING: " + sPath + " too new");
                return;
            }

            // warn about files that have a mis-matching block
            // size. The checksum check will fail for them
            // anyways, so just catch it here.
            if (stat.getBlockSize() != N_BLOCK_SIZE) {
                System.out.println("ERROR: non-default block size (" + (stat.getBlockSize() / (1024 * 1024))
                        + "M) would fail checksum: " + sPath);
                return;
            }

            // get HDFS checksum
            FileChecksum ck = fs.getFileChecksum(p);
            String sCk, sCkShort;
            if (ck == null) {
                sCk = sCkShort = "<null>";
            } else {
                sCk = ck.toString();
                sCkShort = sCk.replaceAll("^.*:", "");
            }

            System.out.println(sPath + " len=" + stat.getLen() + " " + stat.getOwner() + "/" + stat.getGroup()
                    + " checksum=" + sCk);

            // find the local file
            String sFsPath = sLocalPathRoot + p.toUri().getPath();
            File fLocal = new File(sFsPath);
            if (!fLocal.exists()) {
                Calendar cal = Calendar.getInstance();
                cal.setTimeInMillis(stat.getModificationTime());

                System.out.println("ERROR: file does not exist: " + sFsPath + " hdfs-last-mtime="
                        + cal.getTime().toString());
                return;
            }
            if (!fLocal.isFile()) {
                System.out.println("ERROR: path is not a file: " + sFsPath);
                return;
            }
            if (stat.getLen() != fLocal.length()) {
                System.out.println("ERROR: length mismatch: " + sFsPath + " hdfslen=" + stat.getLen()
                        + " fslen=" + fLocal.length());
                return;
            }

            // get local fs checksum
            FileChecksum ckLocal = getLocalFileChecksum(sFsPath);
            if (ckLocal == null) {
                System.out.println("ERROR Failed to get checksum for local file " + sFsPath);
                return;
            }

            // compare checksums as a string, to strip the
            // algorithm name from the beginning
            String sCkLocal = ckLocal.toString();
            String sCkLocalShort = sCkLocal.replaceAll("^.*:", "");

            if (false == sCkShort.equals(sCkLocalShort)) {
                System.out.println(
                        "ERROR: checksum mismatch: " + sFsPath + "\nhdfs = " + sCk + "\nlocal= " + sCkLocal);
                return;
            }
        }
    } catch (IOException e) {
        System.out.println("ERROR: could not open " + p + ": " + e);
        // System.exit(1) ;
    }
}