com.cloudera.recordbreaker.analyzer.FSCrawler.java Source code

Introduction

Here is the source code for com.cloudera.recordbreaker.analyzer.FSCrawler.java. FSCrawler crawls a filesystem and stores the results in an FSAnalyzer's metadata store; a crawl can run in a background thread or block until it completes.

Source

/*
 * Copyright (c) 2012, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.analyzer;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.List;
import java.util.TreeSet;
import java.util.Iterator;
import java.util.Hashtable;
import java.util.ArrayList;
import java.text.SimpleDateFormat;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.conf.Configuration;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.almworks.sqlite4java.SQLiteException;

/***********************************************************
 * FSCrawler crawls a filesystem and stuffs the results into
 * an FSAnalyzer's store.
 *
 * @author "Michael Cafarella" <mjc@cloudera.com>
 ***********************************************************/
public class FSCrawler {
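    // A negative crawl depth means "descend without limit"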
    final static int INFINITE_CRAWL_DEPTH = -1;
    private static final Log LOG = LogFactory.getLog(FSCrawler.class);

    static SimpleDateFormat fileDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    // Maps a crawl id to the worker thread performing that crawl
    Hashtable<Long, Thread> pendingCrawls = new Hashtable<Long, Thread>();
    // Maps a crawl id to its current progress information
    Hashtable<Long, CrawlRuntimeStatus> crawlStatusInfo = new Hashtable<Long, CrawlRuntimeStatus>();
    FSAnalyzer analyzer;
    FileSystem fs;

    /**
     * Needs an analyzer to work
     */
    public FSCrawler(FSAnalyzer analyzer) {
        this.analyzer = analyzer;
        this.fs = null;
    }

    /**
     * Traverse a region of the filesystem and build the to-do lists for a
     * crawl.  This code:
     * a) Navigates the directory hierarchy to the requested depth
     * b) Adds every file it encounters to todoFileList
     * c) Adds every directory it descends into to todoDirList
     * The caller is responsible for invoking addSingleFile() on the results.
     */
    protected void recursiveCrawlBuildList(FileSystem fs, Path p, int subdirDepth, long crawlId,
            List<Path> todoFileList, List<Path> todoDirList) throws IOException {
        FileStatus fstatus = fs.getFileStatus(p);
        if (!fstatus.isDir()) {
            todoFileList.add(p);
        } else {
            if (subdirDepth != 0) {
                todoDirList.add(p);
                for (FileStatus subfilestatus : fs.listStatus(p)) {
                    Path subfile = subfilestatus.getPath();
                    try {
                        recursiveCrawlBuildList(fs, subfile, subdirDepth - 1, crawlId, todoFileList, todoDirList);
                    } catch (IOException iex) {
                        iex.printStackTrace();
                    }
                }
            }
        }
    }

    /**
     * <code>getStartNonblockingCrawl</code> starts a crawl of the given
     * filesystem in a background thread.  It returns immediately and does
     * not wait for the crawl to complete.
     * It returns true if a crawl was started or is already ongoing, and
     * false if no crawl is running and one could not be started.
     */
    public synchronized boolean getStartNonblockingCrawl(final URI fsURI) {
        try {
            final int subdirDepth = INFINITE_CRAWL_DEPTH;
            long fsId = analyzer.getCreateFilesystem(fsURI, true);
            if (fsId < 0) {
                return false;
            }
            LOG.info("Grabbing filesystem: " + fsURI);
            final FileSystem fs = FileSystem.get(fsURI, new Configuration());
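            // Qualify the start path against the target filesystem so the
            // crawl records fully-specified paths.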
            final Path startDir = fs.makeQualified(new Path(fsURI.getPath()));

            final long crawlid = analyzer.getCreatePendingCrawl(fsId, true);
            Thread pendingThread = pendingCrawls.get(crawlid);
            if (pendingThread == null) {
                Thread t = new Thread() {
                    public void run() {
                        try {
                            synchronized (pendingCrawls) {
                                pendingCrawls.put(crawlid, this);
                            }
                            synchronized (crawlStatusInfo) {
                                crawlStatusInfo.put(crawlid, new CrawlRuntimeStatus("Initializing crawl"));
                            }
                            // Build the file and dir-level todo lists
                            List<Path> todoFileList = new ArrayList<Path>();
                            List<Path> todoDirList = new ArrayList<Path>();
                            recursiveCrawlBuildList(fs, startDir, subdirDepth, crawlid, todoFileList, todoDirList);

                            // Get the files to process
                            TreeSet<String> observedFilenames = new TreeSet<String>();
                            for (Path p : analyzer.getFilesForCrawl(crawlid)) {
                                observedFilenames.add(p.toString());
                            }
                            for (Iterator<Path> it = todoFileList.iterator(); it.hasNext();) {
                                Path p = it.next();
                                if (observedFilenames.contains(p.toString())) {
                                    it.remove();
                                }
                            }

                            // Get the dirs to process
                            TreeSet<String> observedDirnames = new TreeSet<String>();
                            for (Path p : analyzer.getDirsForCrawl(crawlid)) {
                                observedDirnames.add(p.toString());
                            }
                            for (Iterator<Path> it = todoDirList.iterator(); it.hasNext();) {
                                Path p = it.next();
                                if (observedDirnames.contains(p.toString())) {
                                    it.remove();
                                }
                            }

                            synchronized (crawlStatusInfo) {
                                CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                                cstatus.setMessage("Processing files");
                                cstatus.setNumToProcess(todoFileList.size());
                                cstatus.setNumDone(0);
                            }

                            int numDone = 0;
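                            // Record each newly discovered directory in the analyzer's store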
                            for (Path p : todoDirList) {
                                try {
                                    analyzer.addSingleFile(fs, p, crawlid);
                                } catch (IOException iex) {
                                    iex.printStackTrace();
                                }
                            }
                            for (Path p : todoFileList) {
                                synchronized (crawlStatusInfo) {
                                    CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                                    cstatus.setMessage("Processing file " + p.toString());
                                }
                                try {
                                    analyzer.addSingleFile(fs, p, crawlid);
                                } catch (Exception iex) {
                                    iex.printStackTrace();
                                }
                                numDone++;
                                synchronized (crawlStatusInfo) {
                                    CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                                    cstatus.setNumDone(numDone);
                                    if (cstatus.shouldFinish()) {
                                        break;
                                    }
                                }
                            }
                        } catch (IOException iex) {
                            iex.printStackTrace();
                        } finally {
                            try {
                                synchronized (pendingCrawls) {
                                    pendingCrawls.remove(crawlid);
                                    analyzer.completeCrawl(crawlid);
                                }
                            } catch (SQLiteException sle) {
                                LOG.error("Could not mark crawl " + crawlid + " as complete", sle);
                            }
                        }
                    }
                };
                t.start();
            }
            return true;
        } catch (Exception iex) {
            iex.printStackTrace();
        }
        return false;
    }

    /**
     * Returns the runtime status of the ongoing (running) crawl for the
     * given filesystem, or null if no crawl is currently running.
     */
    public CrawlRuntimeStatus isCrawlOngoing(URI fsURI) {
        long fsId = analyzer.getCreateFilesystem(fsURI, false);
        if (fsId < 0) {
            return null;
        }
        synchronized (pendingCrawls) {
            final long crawlid = analyzer.getCreatePendingCrawl(fsId, false);
            Thread pendingThread = pendingCrawls.get(crawlid);
            if (pendingThread != null && pendingThread.isAlive()) {
                synchronized (crawlStatusInfo) {
                    return crawlStatusInfo.get(crawlid);
                }
            }
            return null;
        }
    }

    /**
     * waitForOngoingCrawl() will block until the given crawl is complete.
     * If there is an ongoing crawl, it waits for that crawl to complete
     * and then returns true.  If there was no ongoing crawl, it returns
     * false.
     */
    protected boolean waitForOngoingCrawl(URI fsURI, boolean shouldKill) {
        long fsId = analyzer.getCreateFilesystem(fsURI, false);
        if (fsId < 0) {
            return false;
        }
        synchronized (pendingCrawls) {
            final long crawlid = analyzer.getCreatePendingCrawl(fsId, false);
            if (crawlid < 0) {
                return false;
            }
            if (shouldKill) {
                synchronized (crawlStatusInfo) {
                    CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                    cstatus.setShouldFinish(true);
                }
            }
            Thread pendingThread = pendingCrawls.get(crawlid);
            if (pendingThread != null) {
                try {
                    pendingThread.join();
                } catch (InterruptedException iex) {
                    Thread.currentThread().interrupt();
                }
            }
            return true;
        }
    }

    /**
     * Ask the ongoing crawl of the given filesystem, if any, to finish early.
     */
    public void killOngoingCrawl(URI fsURI) {
        long fsId = analyzer.getCreateFilesystem(fsURI, false);
        if (fsId >= 0) {
            synchronized (pendingCrawls) {
                final long crawlid = analyzer.getCreatePendingCrawl(fsId, false);
                synchronized (crawlStatusInfo) {
                    CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                    if (cstatus != null) {
                        cstatus.setShouldFinish(true);
                    }
                }
            }
        }
    }

    /**
     * Kick off a crawl of the indicated filesystem and block until the
     * crawl completes.
     */
    public boolean blockingCrawl(URI fsURI) throws IOException, SQLiteException {
        boolean crawlStarted = getStartNonblockingCrawl(fsURI);
        if (crawlStarted) {
            waitForOngoingCrawl(fsURI, false);
        }
        return crawlStarted;
    }

    ////////////////////////////////////////
    // Main()
    ////////////////////////////////////////
    public static void main(String argv[]) throws Exception {
        if (argv.length < 3) {
            System.err.println("Usage: FSCrawler <metadataStoreDir> <schemaDbDir> (--crawl <dir>|--test)");
            return;
        }
        int i = 0;
        File metadataStoreDir = new File(argv[i++]).getCanonicalFile();
        File schemadbdir = new File(argv[i++]).getCanonicalFile();
        String op = argv[i++];
        FSAnalyzer fsa = new FSAnalyzer(metadataStoreDir, schemadbdir);

        try {
            if ("--crawl".equals(op)) {
                File crawlTarget = new File(argv[i++]).getCanonicalFile();
                System.err.println("About to crawl " + crawlTarget);
                FSCrawler crawler = new FSCrawler(fsa);
                crawler.blockingCrawl(new URI("file://" + crawlTarget));
            } else if ("--test".equals(op)) {
                List<SchemaSummary> summaryList = fsa.getSchemaSummaries();
                System.err.println("Schema summary list has " + summaryList.size() + " entries");
            }
        } finally {
            fsa.close();
        }
    }
}
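
Example

For context, here is a minimal sketch of driving the crawler programmatically rather than from the command line, mirroring what main() does for the --crawl option. The store directories and crawl target below are hypothetical placeholders; the FSAnalyzer and FSCrawler calls mirror those in the listing above.

package com.cloudera.recordbreaker.analyzer;

import java.io.File;
import java.net.URI;

public class CrawlExample {
    public static void main(String[] argv) throws Exception {
        // Hypothetical directories for the analyzer's metadata and schema stores
        File metadataStoreDir = new File("/tmp/rb-metadata").getCanonicalFile();
        File schemaDbDir = new File("/tmp/rb-schemadb").getCanonicalFile();

        FSAnalyzer analyzer = new FSAnalyzer(metadataStoreDir, schemaDbDir);
        try {
            FSCrawler crawler = new FSCrawler(analyzer);

            // Crawl a hypothetical local directory, blocking until the walk finishes
            File crawlTarget = new File("/tmp/data-to-crawl").getCanonicalFile();
            boolean started = crawler.blockingCrawl(new URI("file://" + crawlTarget));
            System.err.println("Crawl ran: " + started);

            // A non-blocking alternative: start the crawl in a background
            // thread and poll its progress.
            // crawler.getStartNonblockingCrawl(new URI("file://" + crawlTarget));
            // CrawlRuntimeStatus status = crawler.isCrawlOngoing(new URI("file://" + crawlTarget));
        } finally {
            analyzer.close();
        }
    }
}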