List of usage examples for org.apache.hadoop.fs FileSystem listStatus
public FileStatus[] listStatus(Path f, PathFilter filter) throws FileNotFoundException, IOException
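Before the collected examples, a minimal sketch of how this overload is typically called: list a directory while letting a PathFilter decide which entries are returned. The directory path and the "part-" prefix filter below are illustrative assumptions, not taken from any example on this page.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListStatusSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical directory; replace with a real path on your cluster.
        Path dir = new Path("/tmp/example/output");

        // Only entries accepted by the PathFilter are returned.
        FileStatus[] statuses = fs.listStatus(dir, new PathFilter() {
            public boolean accept(Path p) {
                return p.getName().startsWith("part-"); // e.g. MapReduce part files
            }
        });

        for (FileStatus status : statuses) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}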
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfCSVMR.java
License:Open Source License
private static void deletePartFiles(FileSystem fs, Path path) throws FileNotFoundException, IOException {
    PathFilter filter = new PathFilter() {
        public boolean accept(Path file) {
            return file.getName().startsWith("part-");
        }
    };
    FileStatus[] list = fs.listStatus(path, filter);
    for (FileStatus stat : list) {
        fs.delete(stat.getPath(), false);
    }
}
From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java
License:Open Source License
/**
 * Helper function to fetch and sort the list of part files under the given
 * input directory.
 *
 * @param input
 * @param fs
 * @return
 * @throws FileNotFoundException
 * @throws IOException
 */
@SuppressWarnings("unchecked")
private static ArrayList<Path> collectInputFiles(String input, FileSystem fs)
        throws FileNotFoundException, IOException {
    Path path = new Path(input);
    ArrayList<Path> files = new ArrayList<Path>();
    if (fs.isDirectory(path)) {
        for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);
    } else
        files.add(path);

    return files;
}
From source file:com.ibm.jaql.io.hadoop.FileOutputConfigurator.java
License:Apache License
public void setSequential(JobConf conf) throws Exception {
    registerSerializers(conf);

    // For an expression, the location is the final file name
    Path outPath = new Path(location);
    FileSystem fs = outPath.getFileSystem(conf);
    outPath = outPath.makeQualified(fs);
    if (fs.exists(outPath)) {
        // TODO: Jaql currently has overwrite semantics; add flag to control this
        if (fs.isFile(outPath)) {
            fs.delete(outPath, false);
        } else {
            // Look for a map-reduce output directory
            FileStatus[] nonMR = fs.listStatus(outPath, new PathFilter() {
                boolean onlyOne = true;

                public boolean accept(Path path) {
                    String name = path.getName();
                    if (name.matches("([.][.]?)|([.]part-[0-9]+.crc)|(part-[0-9]+)")) {
                        return false;
                    }
                    if (onlyOne) {
                        onlyOne = false;
                        return true;
                    }
                    return false;
                }
            });
            if (nonMR.length > 0) {
                throw new IOException(
                        "directory exists and is not a map-reduce output directory: " + nonMR[0].getPath());
            }
            fs.delete(outPath, true);
        }
    }
    // In sequential mode, we will write directly to the output file
    // and bypass the _temporary directory and rename of the standard
    // FileOutputCommitter by using our own DirectFileOutputCommitter.
    FileOutputFormat.setOutputPath(conf, outPath.getParent());
    conf.setClass("mapred.output.committer.class", DirectFileOutputCommiter.class, OutputCommitter.class);
}
From source file:com.iflytek.spider.crawl.CrawlDb.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println(
                "Usage: CrawlDb <crawldb> (-dir <segments> | <seg1> <seg2> ...) [-force] [-noAdditions]");
        System.err.println("\tcrawldb\tCrawlDb to update");
        System.err.println("\t-dir segments\tparent directory containing all segments to update from");
        System.err.println("\tseg1 seg2 ...\tlist of segment names to update from");
        System.err.println("\t-force\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
        System.err.println(
                "\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
        return -1;
    }
    boolean force = false;
    final FileSystem fs = FileSystem.get(getConf());
    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
    HashSet<Path> dirs = new HashSet<Path>();
    for (int i = 1; i < args.length; i++) {
        if (args[i].equals("-force")) {
            force = true;
        } else if (args[i].equals("-noAdditions")) {
            additionsAllowed = false;
        } else if (args[i].equals("-dir")) {
            FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
            dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
        } else {
            dirs.add(new Path(args[i]));
        }
    }
    try {
        update(new Path(args[0]), dirs.toArray(new Path[dirs.size()]), additionsAllowed, force);
        return 0;
    } catch (Exception e) {
        LOG.fatal("CrawlDb update: " + StringUtils.stringifyException(e));
        return -1;
    }
}
From source file:com.inmobi.conduit.AbstractService.java
License:Apache License
private List<Path> listPartFiles(Path path, FileSystem fs) {
    List<Path> matches = new LinkedList<Path>();
    try {
        FileStatus[] statuses = fs.listStatus(path, new PathFilter() {
            public boolean accept(Path path) {
                return path.toString().contains("part");
            }
        });
        for (FileStatus status : statuses) {
            matches.add(status.getPath());
        }
    } catch (IOException e) {
        LOG.error(e.getMessage(), e);
    }
    return matches;
}
From source file:com.inmobi.conduit.local.LocalStreamServiceTest.java
License:Apache License
private void testClusterName(String configName, String currentClusterName) throws Exception {
    ConduitConfigParser parser = new ConduitConfigParser(configName);
    ConduitConfig config = parser.getConfig();
    Set<String> streamsToProcess = new HashSet<String>();
    streamsToProcess.addAll(config.getSourceStreams().keySet());
    Set<String> clustersToProcess = new HashSet<String>();
    Set<TestLocalStreamService> services = new HashSet<TestLocalStreamService>();
    Cluster currentCluster = null;
    for (SourceStream sStream : config.getSourceStreams().values()) {
        for (String cluster : sStream.getSourceClusters()) {
            clustersToProcess.add(cluster);
        }
    }
    if (currentClusterName != null) {
        currentCluster = config.getClusters().get(currentClusterName);
    }
    for (String clusterName : clustersToProcess) {
        Cluster cluster = config.getClusters().get(clusterName);
        cluster.getHadoopConf().set("mapred.job.tracker", super.CreateJobConf().get("mapred.job.tracker"));
        TestLocalStreamService service = new TestLocalStreamService(config, cluster, currentCluster,
                new NullCheckPointProvider(), streamsToProcess);
        services.add(service);
    }

    for (TestLocalStreamService service : services) {
        FileSystem fs = service.getFileSystem();
        service.preExecute();
        if (currentClusterName != null)
            Assert.assertEquals(service.getCurrentCluster().getName(), currentClusterName);
        // creating a job with empty input path
        Path tmpJobInputPath = new Path("/tmp/job/input/path");
        Map<FileStatus, String> fileListing = new TreeMap<FileStatus, String>();
        Set<FileStatus> trashSet = new HashSet<FileStatus>();
        // checkpointKey, CheckPointPath
        Table<String, String, String> checkpointPaths = HashBasedTable.create();
        service.createMRInput(tmpJobInputPath, fileListing, trashSet, checkpointPaths);

        Job testJobConf = service.createJob(tmpJobInputPath, 1000);
        testJobConf.waitForCompletion(true);

        int numberOfCountersPerFile = 0;
        long sumOfCounterValues = 0;
        Path outputCounterPath = new Path(new Path(service.getCluster().getTmpPath(), service.getName()),
                "counters");
        FileStatus[] statuses = fs.listStatus(outputCounterPath, new PathFilter() {
            public boolean accept(Path path) {
                return path.toString().contains("part");
            }
        });
        for (FileStatus fileSt : statuses) {
            Scanner scanner = new Scanner(fs.open(fileSt.getPath()));
            while (scanner.hasNext()) {
                String counterNameValue = null;
                try {
                    counterNameValue = scanner.next();
                    String tmp[] = counterNameValue.split(ConduitConstants.AUDIT_COUNTER_NAME_DELIMITER);
                    Assert.assertEquals(4, tmp.length);
                    Long numOfMsgs = Long.parseLong(tmp[3]);
                    numberOfCountersPerFile++;
                    sumOfCounterValues += numOfMsgs;
                } catch (Exception e) {
                    LOG.error("Counters file has malformed line with counter name =" + counterNameValue
                            + "..skipping the line", e);
                }
            }
        }
        // Should have 2 counters for each file
        Assert.assertEquals(NUMBER_OF_FILES * 2, numberOfCountersPerFile);
        // sum of all counter values should be equal to total number of messages
        Assert.assertEquals(NUMBER_OF_FILES * 3, sumOfCounterValues);

        Assert.assertEquals(testJobConf.getConfiguration().get(FS_DEFAULT_NAME_KEY),
                service.getCurrentCluster().getHadoopConf().get(FS_DEFAULT_NAME_KEY));
        Assert.assertEquals(testJobConf.getConfiguration().get(SRC_FS_DEFAULT_NAME_KEY),
                service.getCluster().getHadoopConf().get(FS_DEFAULT_NAME_KEY));
        if (currentCluster == null)
            Assert.assertEquals(testJobConf.getConfiguration().get(FS_DEFAULT_NAME_KEY),
                    testJobConf.getConfiguration().get(SRC_FS_DEFAULT_NAME_KEY));
        service.getFileSystem().delete(new Path(service.getCluster().getRootDir()), true);
    }
}
From source file:com.jkoolcloud.tnt4j.streams.inputs.HdfsFileLineStream.java
License:Apache License
/**
 * Searches for files matching name pattern. Name pattern also may contain path of directory, where file search
 * should be performed, e.g., C:/Tomcat/logs/localhost_access_log.*.txt. If no path is defined (just file name
 * pattern) then files are searched in {@code System.getProperty("user.dir")}. Files array is ordered by file
 * modification timestamp in descending order.
 *
 * @param path
 *            path of file
 * @param fs
 *            file system
 *
 * @return array of found files paths.
 * @throws IOException
 *             if files can't be listed by file system.
 *
 * @see FileSystem#listStatus(Path, PathFilter)
 * @see FilenameUtils#wildcardMatch(String, String, IOCase)
 */
public static Path[] searchFiles(Path path, FileSystem fs) throws IOException {
    FileStatus[] dir = fs.listStatus(path.getParent(), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            return FilenameUtils.wildcardMatch(name, "*", IOCase.INSENSITIVE); // NON-NLS
        }
    });

    Path[] activityFiles = new Path[dir == null ? 0 : dir.length];
    if (dir != null) {
        Arrays.sort(dir, new Comparator<FileStatus>() {
            @Override
            public int compare(FileStatus o1, FileStatus o2) {
                return Long.valueOf(o1.getModificationTime()).compareTo(o2.getModificationTime()) * (-1);
            }
        });

        for (int i = 0; i < dir.length; i++) {
            activityFiles[i] = dir[i].getPath();
        }
    }
    return activityFiles;
}
From source file:com.jkoolcloud.tnt4j.streams.inputs.HdfsFileLineStreamTest.java
License:Apache License
@Test()
public void test() throws Exception {
    FileSystem fs = mock(FileSystem.class);
    HdfsFileLineStream stream = new HdfsFileLineStream();

    TestFileList files = new TestFileList(false);

    final String fileName = ("file:////" + files.get(0).getParentFile() + File.separator + files.getPrefix() // NON-NLS
            + "*.TST").replace("\\", "/"); // NON-NLS
    Map<String, String> props = new HashMap<>(2);
    props.put(StreamProperties.PROP_FILENAME, fileName);
    props.put(StreamProperties.PROP_RESTORE_STATE, "false"); // NON-NLS

    when(fs.open(any(Path.class))).thenReturn(new FSDataInputStream(new TestInputStreamStub()));
    final FileStatus fileStatusMock = mock(FileStatus.class);
    final FileStatus[] array = new FileStatus[10];
    Arrays.fill(array, fileStatusMock);
    when(fs.listStatus(any(Path.class), any(PathFilter.class))).thenReturn(array);
    when(fileStatusMock.getModificationTime()).thenReturn(1L, 2L, 3L);
    when(fileStatusMock.getPath()).thenReturn(mock(Path.class));
    when(fs.getContentSummary(any(Path.class))).thenReturn(mock(ContentSummary.class));

    Method m = FileSystem.class.getDeclaredMethod("addFileSystemForTesting", URI.class, Configuration.class, // NON-NLS
            FileSystem.class);
    m.setAccessible(true);
    m.invoke(FileSystem.class, URI.create(fileName), new Configuration(), fs);

    StreamThread st = mock(StreamThread.class);
    st.setName("HdfsFileLineStreamTestThreadName"); // NON-NLS
    stream.setOwnerThread(st);

    stream.setProperties(props.entrySet());
    stream.startStream();

    verify(fileStatusMock, atLeastOnce()).getModificationTime();
    verify(fileStatusMock, atLeastOnce()).getPath();
    verify(fs, atLeastOnce()).listStatus(any(Path.class), any(PathFilter.class));

    stream.cleanup();
}
From source file:com.kxen.han.projection.giraph.BspCase.java
License:Apache License
/**
 * Read all part- files in the output and count their lines.
 * This works only for textual output!
 *
 * @param conf Configuration
 * @param outputPath Output path
 * @return Number of output lines
 * @throws IOException
 */
public int getNumResults(Configuration conf, Path outputPath) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    int numResults = 0;
    for (FileStatus status : fs.listStatus(outputPath, PARTS_FILTER)) {
        FSDataInputStream in = null;
        BufferedReader reader = null;
        try {
            in = fs.open(status.getPath());
            reader = new BufferedReader(new InputStreamReader(in, Charsets.UTF_8));
            while (reader.readLine() != null) {
                numResults++;
            }
        } finally {
            Closeables.closeQuietly(in);
            Closeables.closeQuietly(reader);
        }
    }
    return numResults;
}
From source file:com.linkedin.cubert.io.rubix.RubixFile.java
License:Open Source License
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException,
        ParseException, InstantiationException, IllegalAccessException {
    final int VERBOSE_NUM_ROWS = 4;

    Options options = new Options();

    options.addOption("h", "help", false, "shows this message");
    options.addOption("v", "verbose", false, "print summary and first few rows of each block");
    options.addOption("m", "metadata", false, "show the metadata");
    options.addOption("d", "dump", false,
            "dump the contents of the rubix file. Use -f for specifying format, and -o for specifying output location");
    options.addOption("f", "format", true, "the data format for dumping data (AVRO or TEXT). Default: TEXT");
    options.addOption("e", "extract", true,
            "Extract one rubix block matching the block id. Use -o for specifying output location");
    options.addOption("o", true, "Store the output at the specified location");

    CommandLineParser parser = new BasicParser();

    // parse the command line arguments
    CommandLine line = parser.parse(options, args);

    // show the help message
    if (line.hasOption("h")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(
                "RubixFile <rubix file or dir> [options]\nIf no options are provided, print a summary of the blocks.",
                options);
        return;
    }

    // validate provided options
    if (line.hasOption("d") && line.hasOption("e")) {
        System.err.println("Cannot dump (-d) and extract (-e) at the same time!");
        return;
    }

    // obtain the list of rubix files
    String[] files = line.getArgs();
    if (files == null || files.length == 0) {
        System.err.println("Rubix file not specified");
        return;
    }

    Configuration conf = new JobConf();
    FileSystem fs = FileSystem.get(conf);
    Path path = new Path(files[0]);

    FileStatus[] allFiles;
    FileStatus status = fs.getFileStatus(path);
    if (status.isDir()) {
        allFiles = fs.listStatus(path, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().contains(RubixConstants.RUBIX_EXTENSION);
            }
        });
    } else {
        allFiles = new FileStatus[] { status };
    }

    // walk over all files and extract the trailer section
    List<RubixFile<Tuple, Object>> rfiles = new ArrayList<RubixFile<Tuple, Object>>();
    for (FileStatus s : allFiles) {
        Path p = s.getPath();
        RubixFile<Tuple, Object> rfile = new RubixFile<Tuple, Object>(conf, p);

        // if printing meta data information.. exit after first file (since all files
        // have the same meta data)
        if (line.hasOption("m")) {
            rfile.getKeyData();
            System.out.println(new ObjectMapper().writer().writeValueAsString(rfile.metadataJson));
            break;
        }
        rfiles.add(rfile);
    }

    // dump the data
    if (line.hasOption("d")) {
        String format = line.getOptionValue("f");
        if (format == null)
            format = "TEXT";
        format = format.trim().toUpperCase();

        if (format.equals("AVRO")) {
            // dumpAvro(rfiles, line.getOptionValue("o"));
            throw new UnsupportedOperationException(
                    "Dumping to avro is not currently supporting. Please write a Cubert (map-only) script to store data in avro format");
        } else if (format.equals("TEXT")) {
            if (line.hasOption("o")) {
                System.err.println("Dumping TEXT format data *into a file* is not currently supported");
                return;
            }
            dumpText(rfiles, line.getOptionValue("o"), Integer.MAX_VALUE);
        } else {
            System.err.println("Invalid format [" + format + "] for dumping. Please use AVRO or TEXT");
            return;
        }
    } else if (line.hasOption("e")) // extract one rubix block
    {
        long blockId = Long.parseLong(line.getOptionValue("e"));
        extract(rfiles, blockId, line.getOptionValue("o"));
    } else // print summary
    {
        dumpText(rfiles, null, line.hasOption("v") ? VERBOSE_NUM_ROWS : 0);
    }
}