/*
 * Copyright (c) 2012, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.analyzer;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.almworks.sqlite4java.SQLiteException;

/***********************************************************
 * FSCrawler crawls a filesystem and stuffs the results into
 * an FSAnalyzer's store.
 *
 * @author "Michael Cafarella" <mjc@cloudera.com>
 ***********************************************************/
public class FSCrawler {
  // A negative depth means "crawl with no depth limit".
  final static int INFINITE_CRAWL_DEPTH = -1;

  private static final Log LOG = LogFactory.getLog(FSCrawler.class);
  static SimpleDateFormat fileDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

  // Maps a crawlid to the thread currently executing that crawl.
  Hashtable<Long, Thread> pendingCrawls = new Hashtable<Long, Thread>();
  // Maps a crawlid to its mutable runtime status, shared with the crawl thread.
  Hashtable<Long, CrawlRuntimeStatus> crawlStatusInfo = new Hashtable<Long, CrawlRuntimeStatus>();
  FSAnalyzer analyzer;
  FileSystem fs;

  /**
   * Needs an analyzer to work.
   */
  public FSCrawler(FSAnalyzer analyzer) {
    this.analyzer = analyzer;
    this.fs = null;
  }

  /**
   * Traverse an entire region of the filesystem, analyzing files.
   * This code should:
   * a) Navigate the directory hierarchy
   * b) Run analysis code to figure out the file details
   * c) Invoke addSingleFile() appropriately.
   */
  protected void recursiveCrawlBuildList(FileSystem fs, Path p, int subdirDepth, long crawlId,
                                         List<Path> todoFileList, List<Path> todoDirList) throws IOException {
    FileStatus fstatus = fs.getFileStatus(p);
    if (!fstatus.isDir()) {
      todoFileList.add(p);
    } else {
      // Descend if depth remains, or if the depth limit is negative (unlimited).
      if (subdirDepth > 0 || subdirDepth < 0) {
        todoDirList.add(p);
        for (FileStatus subfilestatus : fs.listStatus(p)) {
          Path subfile = subfilestatus.getPath();
          try {
            recursiveCrawlBuildList(fs, subfile, subdirDepth - 1, crawlId, todoFileList, todoDirList);
          } catch (IOException iex) {
            iex.printStackTrace();
          }
        }
      }
    }
  }

  /**
   * <code>getStartNonblockingCrawl</code> traverses a given filesystem. It returns immediately
   * and does not wait for the crawl to complete.
   * If the crawl is created or is already ongoing, it returns true.
   * If the crawl is not currently going and cannot start, it returns false.
   */
  public synchronized boolean getStartNonblockingCrawl(final URI fsURI) {
    try {
      final int subdirDepth = INFINITE_CRAWL_DEPTH;
      long fsId = analyzer.getCreateFilesystem(fsURI, true);
      if (fsId < 0) {
        return false;
      }
      LOG.info("Grabbing filesystem: " + fsURI);
      final FileSystem fs = FileSystem.get(fsURI, new Configuration());
      final Path startDir = fs.makeQualified(new Path(fsURI.getPath()));
      final long crawlid = analyzer.getCreatePendingCrawl(fsId, true);
      Thread pendingThread = pendingCrawls.get(crawlid);
      if (pendingThread == null) {
        Thread t = new Thread() {
          public void run() {
            try {
              synchronized (pendingCrawls) {
                pendingCrawls.put(crawlid, this);
              }
              synchronized (crawlStatusInfo) {
                crawlStatusInfo.put(crawlid, new CrawlRuntimeStatus("Initializing crawl"));
              }
              // Build the file and dir-level todo lists
              List<Path> todoFileList = new ArrayList<Path>();
              List<Path> todoDirList = new ArrayList<Path>();
              recursiveCrawlBuildList(fs, startDir, subdirDepth, crawlid, todoFileList, todoDirList);

              // Get the files to process, skipping anything already recorded for this crawl
              TreeSet<String> observedFilenames = new TreeSet<String>();
              for (Path p : analyzer.getFilesForCrawl(crawlid)) {
                observedFilenames.add(p.toString());
              }
              for (Iterator<Path> it = todoFileList.iterator(); it.hasNext();) {
                Path p = it.next();
                if (observedFilenames.contains(p.toString())) {
                  it.remove();
                }
              }

              // Get the dirs to process, likewise skipping already-recorded directories
              TreeSet<String> observedDirnames = new TreeSet<String>();
              for (Path p : analyzer.getDirsForCrawl(crawlid)) {
                observedDirnames.add(p.toString());
              }
              for (Iterator<Path> it = todoDirList.iterator(); it.hasNext();) {
                Path p = it.next();
                if (observedDirnames.contains(p.toString())) {
                  it.remove();
                }
              }

              synchronized (crawlStatusInfo) {
                CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                cstatus.setMessage("Processing files");
                cstatus.setNumToProcess(todoFileList.size());
                cstatus.setNumDone(0);
              }

              int numDone = 0;
              for (Path p : todoDirList) {
                try {
                  analyzer.addSingleFile(fs, p, crawlid);
                } catch (IOException iex) {
                  iex.printStackTrace();
                }
              }
              for (Path p : todoFileList) {
                synchronized (crawlStatusInfo) {
                  CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                  cstatus.setMessage("Processing file " + p.toString());
                }
                try {
                  analyzer.addSingleFile(fs, p, crawlid);
                } catch (Exception iex) {
                  iex.printStackTrace();
                }
                numDone++;
                synchronized (crawlStatusInfo) {
                  CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                  cstatus.setNumDone(numDone);
                  // Stop early if someone asked the crawl to finish (see killOngoingCrawl()).
                  if (cstatus.shouldFinish()) {
                    break;
                  }
                }
              }
            } catch (IOException iex) {
              iex.printStackTrace();
            } finally {
              try {
                synchronized (pendingCrawls) {
                  pendingCrawls.remove(crawlid);
                  analyzer.completeCrawl(crawlid);
                }
              } catch (SQLiteException sle) {
                // Ignore failures while marking the crawl complete.
              }
            }
          }
        };
        t.start();
      }
      return true;
    } catch (Exception iex) {
      iex.printStackTrace();
    }
    return false;
  }

  /**
   * Is there an ongoing (running) crawl for the given filesystem?
   */
  public CrawlRuntimeStatus isCrawlOngoing(URI fsURI) {
    long fsId = analyzer.getCreateFilesystem(fsURI, false);
    if (fsId < 0) {
      return null;
    }
    synchronized (pendingCrawls) {
      final long crawlid = analyzer.getCreatePendingCrawl(fsId, false);
      Thread pendingThread = pendingCrawls.get(crawlid);
      if (pendingThread != null && pendingThread.isAlive()) {
        synchronized (crawlStatusInfo) {
          return crawlStatusInfo.get(crawlid);
        }
      }
      return null;
    }
  }

  /**
   * waitForOngoingCrawl() will block until the given crawl is complete. If there
   * is an ongoing crawl that completes, it will return true.
   * If there was no ongoing crawl, it will return false.
   */
  protected boolean waitForOngoingCrawl(URI fsURI, boolean shouldKill) {
    long fsId = analyzer.getCreateFilesystem(fsURI, false);
    if (fsId < 0) {
      return false;
    }
    synchronized (pendingCrawls) {
      final long crawlid = analyzer.getCreatePendingCrawl(fsId, false);
      if (crawlid < 0) {
        return false;
      }
      if (shouldKill) {
        synchronized (crawlStatusInfo) {
          CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
          cstatus.setShouldFinish(true);
        }
      }
      Thread pendingThread = pendingCrawls.get(crawlid);
      if (pendingThread != null) {
        try {
          pendingThread.join();
        } catch (InterruptedException iex) {
          // Interrupted while waiting; fall through and report the crawl as handled.
        }
      }
      return true;
    }
  }

  public void killOngoingCrawl(URI fsURI) {
    long fsId = analyzer.getCreateFilesystem(fsURI, false);
    if (fsId >= 0) {
      synchronized (pendingCrawls) {
        final long crawlid = analyzer.getCreatePendingCrawl(fsId, false);
        synchronized (crawlStatusInfo) {
          CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
          if (cstatus != null) {
            // Flag the crawl thread to stop; it checks this after each file.
            cstatus.setShouldFinish(true);
          }
        }
      }
    }
  }

  /**
   * Kick off a crawl at the directory and filesystem indicated by fsURI,
   * and block until the crawl completes.
   */
  public boolean blockingCrawl(URI fsURI) throws IOException, SQLiteException {
    boolean crawlStarted = getStartNonblockingCrawl(fsURI);
    if (crawlStarted) {
      waitForOngoingCrawl(fsURI, false);
    }
    return crawlStarted;
  }

  ////////////////////////////////////////
  // Main()
  ////////////////////////////////////////
  public static void main(String argv[]) throws Exception {
    if (argv.length < 3) {
      System.err.println("Usage: FSCrawler <metadataStoreDir> <schemaDbDir> (--crawl <dir> | --test)");
      return;
    }
    int i = 0;
    File metadataStoreDir = new File(argv[i++]).getCanonicalFile();
    File schemadbdir = new File(argv[i++]).getCanonicalFile();
    String op = argv[i++];
    FSAnalyzer fsa = new FSAnalyzer(metadataStoreDir, schemadbdir);
    try {
      if ("--crawl".equals(op)) {
        if (argv.length < 4) {
          System.err.println("--crawl requires a target directory");
          return;
        }
        File crawlTarget = new File(argv[i++]).getCanonicalFile();
        System.err.println("About to crawl " + crawlTarget);
        FSCrawler crawler = new FSCrawler(fsa);
        crawler.blockingCrawl(new URI("file://" + crawlTarget));
      } else if ("--test".equals(op)) {
        List<SchemaSummary> summaryList = fsa.getSchemaSummaries();
        System.err.println("Schema summary list has " + summaryList.size() + " entries");
      }
    } finally {
      fsa.close();
    }
  }
}
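
For callers that want progress reporting rather than a blocking call, the non-blocking API can be driven directly. The following is a minimal sketch, not part of the original class: the store directories and crawl target are hypothetical, and it assumes FSAnalyzer accepts store directories the same way main() does above. It starts a crawl with getStartNonblockingCrawl() and then polls isCrawlOngoing() until the crawl thread has exited.

package com.cloudera.recordbreaker.analyzer;

import java.io.File;
import java.net.URI;

public class FSCrawlerExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical store locations; any writable directories should do.
    File metadataStoreDir = new File("/tmp/fscrawler-metadata").getCanonicalFile();
    File schemaDbDir = new File("/tmp/fscrawler-schemadb").getCanonicalFile();

    FSAnalyzer analyzer = new FSAnalyzer(metadataStoreDir, schemaDbDir);
    try {
      FSCrawler crawler = new FSCrawler(analyzer);
      URI target = new URI("file:///tmp/data-to-crawl");  // hypothetical crawl target

      // Kick off the crawl; this returns immediately.
      if (!crawler.getStartNonblockingCrawl(target)) {
        System.err.println("Could not start crawl of " + target);
        return;
      }

      // Give the worker thread a moment to register itself, then poll
      // isCrawlOngoing() until the crawl thread is no longer alive.
      do {
        Thread.sleep(1000);
      } while (crawler.isCrawlOngoing(target) != null);
      System.err.println("Crawl of " + target + " finished");
    } finally {
      analyzer.close();
    }
  }
}

Note that the worker thread registers itself in pendingCrawls only after it starts running, so a poll issued immediately after getStartNonblockingCrawl() returns may briefly see no ongoing crawl; the initial sleep in the loop above papers over that window. For simple command-line use, blockingCrawl() (as in main()) is the easier entry point.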