List of usage examples for org.apache.hadoop.fs FileSystem listStatus
public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException
public FileStatus[] listStatus(Path[] files) throws FileNotFoundException, IOException
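Before the project-specific examples below, a minimal self-contained sketch of the call: it lists the immediate children of a single directory and prints each entry. The path "/tmp" and the wrapper class are illustrative assumptions, not taken from any of the projects listed.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListStatusExample {
    public static void main(String[] args) throws IOException {
        // Obtain the FileSystem for the default (configured) URI.
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // listStatus returns the statuses of the immediate children of the
        // given directory; "/tmp" is only a placeholder path.
        FileStatus[] entries = fs.listStatus(new Path("/tmp"));
        for (FileStatus entry : entries) {
            System.out.println((entry.isDir() ? "dir  " : "file ") + entry.getPath()
                    + " (" + entry.getLen() + " bytes)");
        }
    }
}

Note that listStatus throws FileNotFoundException when the path does not exist; the getLatestDir example from com.inmobi.conduit.AbstractService below handles that case explicitly.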
From source file:com.iflytek.spider.crawl.GeneratorSmart.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long curTime, boolean force)
        throws IOException, InterruptedException, ClassNotFoundException {
    //getConf().set("mapred.temp.dir", "d:/tmp");
    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: starting");

    Job job = AvroJob.getAvroJob(getConf());
    if (numLists == -1) { // for politeness make
        numLists = job.getNumReduceTasks(); // a partition per fetch task
    }
    if ("local".equals(job.getConfiguration().get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    LOG.info("Generator: with " + numLists + " partition.");
    job.getConfiguration().setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorMapper.class);
    job.setReducerClass(SelectorReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    //job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputFormatClass(GeneratorOutputFormat.class);
    job.setOutputKeyClass(Float.class);
    job.setOutputValueClass(SelectorEntry.class);
    // AvroMultipleOutputs.addNamedOutput(job, "seq",
    // AvroPairOutputFormat.class, Float.class, SelectorEntry.class);
    try {
        job.waitForCompletion(true);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            fs.createNewFile(new Path(newSeg, "generatored"));
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = AvroJob.getAvroJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormatClass(AvroPairInputFormat.class);
        job.setMapperClass(CrawlDbUpdateMapper.class);
        // job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormatClass(AvroMapOutputFormat.class);
        job.setOutputKeyClass(String.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            job.waitForCompletion(true);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }
    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: done.");
    }
    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file:com.iflytek.spider.parse.ParseSegment.java
License:Apache License
public int run(String[] args) throws Exception {
    String usage = "Usage: ParseSegment segments";

    if (args.length == 0) {
        System.err.println(usage);
        System.exit(-1);
    }
    FileSystem fs = FileSystem.get(getConf());
    for (FileStatus p : fs.listStatus(new Path(args[0]))) {
        if (fs.exists(new Path(p.getPath(), "crawl_parse")))
            fs.delete(new Path(p.getPath(), "crawl_parse"), true);
        if (fs.exists(new Path(p.getPath(), "parse_data")))
            fs.delete(new Path(p.getPath(), "parse_data"), true);
        parse(p.getPath());
    }
    return 0;
}
From source file:com.inmobi.conduit.AbstractService.java
License:Apache License
private Path getLatestDir(FileSystem fs, Path Dir) throws Exception {
    FileStatus[] fileStatus;
    try {
        fileStatus = fs.listStatus(Dir);
    } catch (FileNotFoundException fe) {
        fileStatus = null;
    }
    if (fileStatus != null && fileStatus.length > 0) {
        FileStatus latestfile = fileStatus[0];
        for (FileStatus currentfile : fileStatus) {
            if (currentfile.getPath().getName().compareTo(latestfile.getPath().getName()) > 0)
                latestfile = currentfile;
        }
        return latestfile.getPath();
    }
    return null;
}
From source file:com.inmobi.conduit.distcp.tools.mapred.lib.TestDynamicInputFormat.java
License:Apache License
private static void createFile(String path) throws Exception {
    FileSystem fileSystem = null;
    DataOutputStream outputStream = null;
    try {
        fileSystem = cluster.getFileSystem();
        outputStream = fileSystem.create(new Path(path), true, 0);
        expectedFilePaths.add(fileSystem.listStatus(new Path(path))[0].getPath().toString());
    } finally {
        IOUtils.cleanup(null, fileSystem, outputStream);
    }
}
From source file:com.inmobi.conduit.distcp.tools.mapred.TestCopyCommitter.java
License:Apache License
@Test
public void testDeleteMissingFlatInterleavedFiles() {
    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
    JobContext jobContext = Mockito.mock(JobContext.class);
    Mockito.when(jobContext.getConfiguration()).thenReturn(config);
    JobID jobID = new JobID();
    Mockito.when(jobContext.getJobID()).thenReturn(jobID);
    Configuration conf = jobContext.getConfiguration();

    String sourceBase;
    String targetBase;
    FileSystem fs = null;
    try {
        OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
        fs = FileSystem.get(conf);
        sourceBase = "/tmp1/" + String.valueOf(rand.nextLong());
        targetBase = "/tmp1/" + String.valueOf(rand.nextLong());
        TestDistCpUtils.createFile(fs, sourceBase + "/1");
        TestDistCpUtils.createFile(fs, sourceBase + "/3");
        TestDistCpUtils.createFile(fs, sourceBase + "/4");
        TestDistCpUtils.createFile(fs, sourceBase + "/5");
        TestDistCpUtils.createFile(fs, sourceBase + "/7");
        TestDistCpUtils.createFile(fs, sourceBase + "/8");
        TestDistCpUtils.createFile(fs, sourceBase + "/9");

        TestDistCpUtils.createFile(fs, targetBase + "/2");
        TestDistCpUtils.createFile(fs, targetBase + "/4");
        TestDistCpUtils.createFile(fs, targetBase + "/5");
        TestDistCpUtils.createFile(fs, targetBase + "/7");
        TestDistCpUtils.createFile(fs, targetBase + "/9");
        TestDistCpUtils.createFile(fs, targetBase + "/A");

        DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
        options.setSyncFolder(true);
        options.setDeleteMissing(true);
        options.appendToConf(conf);

        CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
        Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
        listing.buildListing(listingFile, options);

        conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
        conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);

        committer.commitJob(jobContext);
        if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
            Assert.fail("Source and target folders are not in sync");
        }
        Assert.assertEquals(fs.listStatus(new Path(targetBase)).length, 4);

        //Test for idempotent commit
        committer.commitJob(jobContext);
        if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
            Assert.fail("Source and target folders are not in sync");
        }
        Assert.assertEquals(fs.listStatus(new Path(targetBase)).length, 4);
    } catch (IOException e) {
        LOG.error("Exception encountered while testing for delete missing", e);
        Assert.fail("Delete missing failure");
    } finally {
        TestDistCpUtils.delete(fs, "/tmp1");
    }
}
From source file:com.inmobi.conduit.distcp.tools.mapred.TestCopyCommitter.java
License:Apache License
private boolean checkDirectoryPermissions(FileSystem fs, String targetBase, FsPermission sourcePerm)
        throws IOException {
    Path base = new Path(targetBase);

    Stack<Path> stack = new Stack<Path>();
    stack.push(base);
    while (!stack.isEmpty()) {
        Path file = stack.pop();
        if (!fs.exists(file))
            continue;
        FileStatus[] fStatus = fs.listStatus(file);
        if (fStatus == null || fStatus.length == 0)
            continue;

        for (FileStatus status : fStatus) {
            if (status.isDir()) {
                stack.push(status.getPath());
                Assert.assertEquals(status.getPermission(), sourcePerm);
            }
        }
    }
    return true;
}
From source file:com.inmobi.conduit.distcp.tools.mapred.TestUniformSizeInputFormat.java
License:Apache License
public void testGetSplits(int nMaps) throws Exception {
    DistCpOptions options = getOptions(nMaps);
    Configuration configuration = new Configuration();
    configuration.set("mapred.map.tasks", String.valueOf(options.getMaxMaps()));
    Path listFile = new Path(cluster.getFileSystem().getUri().toString() + "/tmp/testGetSplits_1/fileList.seq");
    CopyListing.getCopyListing(configuration, CREDENTIALS, options).buildListing(listFile, options);

    JobContext jobContext = Mockito.mock(JobContext.class);
    Mockito.when(jobContext.getConfiguration()).thenReturn(configuration);
    Mockito.when(jobContext.getJobID()).thenReturn(new JobID());
    UniformSizeInputFormat uniformSizeInputFormat = new UniformSizeInputFormat();
    List<InputSplit> splits = uniformSizeInputFormat.getSplits(jobContext);

    //Removing the legacy check - Refer HADOOP-9230
    int sizePerMap = totalFileSize / nMaps;

    checkSplits(listFile, splits);

    int doubleCheckedTotalSize = 0;
    int previousSplitSize = -1;
    for (int i = 0; i < splits.size(); ++i) {
        InputSplit split = splits.get(i);
        int currentSplitSize = 0;
        TaskAttemptID taskId = new TaskAttemptID("", 0, true, 0, 0);
        final TaskAttemptContext taskAttemptContext = Mockito.mock(TaskAttemptContext.class);
        Mockito.when(taskAttemptContext.getConfiguration()).thenReturn(configuration);
        Mockito.when(taskAttemptContext.getTaskAttemptID()).thenReturn(taskId);
        RecordReader<Text, FileStatus> recordReader = uniformSizeInputFormat.createRecordReader(split,
                taskAttemptContext);
        recordReader.initialize(split, taskAttemptContext);
        while (recordReader.nextKeyValue()) {
            Path sourcePath = recordReader.getCurrentValue().getPath();
            FileSystem fs = sourcePath.getFileSystem(configuration);
            FileStatus[] fileStatus = fs.listStatus(sourcePath);
            Assert.assertEquals(fileStatus.length, 1);
            currentSplitSize += fileStatus[0].getLen();
        }
        Assert.assertTrue(previousSplitSize == -1
                || Math.abs(currentSplitSize - previousSplitSize) < 0.1 * sizePerMap
                || i == splits.size() - 1);

        doubleCheckedTotalSize += currentSplitSize;
    }

    Assert.assertEquals(totalFileSize, doubleCheckedTotalSize);
}
From source file:com.inmobi.conduit.distcp.tools.SimpleCopyListing.java
License:Apache License
/** {@inheritDoc} */
@Override
public void doBuildListing(Path pathToListingFile, DistCpOptions options) throws IOException {
    SequenceFile.Writer fileListWriter = null;

    try {
        fileListWriter = getWriter(pathToListingFile);

        for (Path path : options.getSourcePaths()) {
            FileSystem sourceFS = path.getFileSystem(getConf());
            path = makeQualified(path);

            FileStatus rootStatus = sourceFS.getFileStatus(path);
            Path sourcePathRoot = computeSourceRootPath(rootStatus, options);
            boolean localFile = (rootStatus.getClass() != FileStatus.class);

            FileStatus[] sourceFiles = sourceFS.listStatus(path);
            if (sourceFiles != null && sourceFiles.length > 0) {
                for (FileStatus sourceStatus : sourceFiles) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Recording source-path: " + sourceStatus.getPath() + " for copy.");
                    }
                    writeToFileListing(fileListWriter, sourceStatus, sourcePathRoot, localFile, options);

                    if (isDirectoryAndNotEmpty(sourceFS, sourceStatus)) {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Traversing non-empty source dir: " + sourceStatus.getPath());
                        }
                        traverseNonEmptyDirectory(fileListWriter, sourceStatus, sourcePathRoot, localFile,
                                options);
                    }
                }
            } else {
                writeToFileListing(fileListWriter, rootStatus, sourcePathRoot, localFile, options);
            }
        }
    } finally {
        try {
            if (fileListWriter != null)
                fileListWriter.close();
        } catch (IOException exception) {
            LOG.error("Could not close output-stream to the file-list: ", exception);
            throw exception;
        }
    }
}
From source file:com.inmobi.conduit.distcp.tools.SimpleCopyListing.java
License:Apache License
private static FileStatus[] getChildren(FileSystem fileSystem, FileStatus parent) throws IOException {
    return fileSystem.listStatus(parent.getPath());
}
From source file:com.inmobi.conduit.distcp.tools.util.TestDistCpUtils.java
License:Apache License
public static boolean checkIfFoldersAreInSync(FileSystem fs, String targetBase, String sourceBase)
        throws IOException {
    Path base = new Path(targetBase);

    Stack<Path> stack = new Stack<Path>();
    stack.push(base);
    while (!stack.isEmpty()) {
        Path file = stack.pop();
        if (!fs.exists(file))
            continue;
        FileStatus[] fStatus = fs.listStatus(file);
        if (fStatus == null || fStatus.length == 0)
            continue;

        for (FileStatus status : fStatus) {
            if (status.isDir()) {
                stack.push(status.getPath());
            }
            Assert.assertTrue(fs.exists(new Path(
                    sourceBase + "/" + DistCpUtils.getRelativePath(new Path(targetBase), status.getPath()))));
        }
    }
    return true;
}