org.punksearch.crawler.NetworkCrawler.java Source code


Introduction

Here is the source code for org.punksearch.crawler.NetworkCrawler.java, the crawling process manager of PunkSearch, a search engine for LANs.

Source

/***************************************************************************
 *                                                                         *
 *   PunkSearch - Searching over LAN                                       *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/
package org.punksearch.crawler;

import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.punksearch.common.FileTypes;
import org.punksearch.common.PunksearchFs;
import org.punksearch.ip.IpIterator;
import org.punksearch.ip.IpRange;
import org.punksearch.ip.IpRanges;
import org.punksearch.ip.SynchronizedIpIterator;
import org.punksearch.logic.hosts_resolver.HostnameResolver;
import org.punksearch.stats.HostStats;
import org.punksearch.stats.TotalStats;
import org.punksearch.stats.TotalStatsWriter;

import java.io.File;
import java.io.IOException;
import java.util.*;

import static org.punksearch.common.Settings.*;
import static org.punksearch.crawler.CrawlerKeys.*;

/**
 * The crawling process manager. It starts the crawling threads, cleans the target index, merges the data crawled by
 * the threads into the target index, and cleans up temporary files.
 *
 * @author Yury Soldak (ysoldak@gmail.com)
 * @see HostCrawler
 * @see IndexOperator
 * @see org.punksearch.stats.HostStats
 */
public class NetworkCrawler implements Runnable {
    private static final Log log = LogFactory.getLog(NetworkCrawler.class);

    private static final NetworkCrawler INSTANCE = new NetworkCrawler();

    private static final String THREAD_PREFIX = "HostCrawler";

    private FileTypes fileTypes;
    private String indexDirectory;
    private boolean forceUnlock;
    private int threadCount;
    private float daysToKeep;
    private int maxHours;
    private List<IpRange> ranges;

    private final List<HostCrawler> threadList = Collections.synchronizedList(new ArrayList<HostCrawler>());
    private final Set<Timer> timers = new HashSet<Timer>();

    private NetworkCrawler() {
    }

    public static NetworkCrawler getInstance() {
        return INSTANCE;
    }

    /**
     * Signals all threads to stop crawling.
     */
    public void stop() {
        synchronized (threadList) {
            for (HostCrawler thread : threadList) {
                thread.requestStop();
            }
        }
    }

    /**
     * Returns the list of currently running crawler threads.
     *
     * @return List of crawling threads.
     */
    public List<HostCrawler> getThreads() {
        return threadList;
    }

    /**
     * Runs the crawling process: starts all crawler threads, merges the temporary indexes into the main one, and clears temporary files.
     */
    public synchronized void run() {
        readProperties();

        if (!prepareAllIndexDirs()) {
            log.warn("Can't start crawling. Something wrong with an index directory (check log).");
            return;
        }

        if (ranges.isEmpty()) {
            log.warn("Can't start crawling. The list of IPs to crawl is empty.");
            return;
        }

        long startTime = new Date().getTime();
        log.info("Crawl process started");

        startTimers();

        IpIterator iter = new SynchronizedIpIterator(ranges);
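        // All crawler threads share one synchronized iterator over the IP ranges,
        // so each IP is handed out to exactly one thread.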
        synchronized (threadList) {
            threadList.clear();
            for (int i = 0; i < threadCount; i++) {
                HostCrawler indexerThread = makeThread(i, iter);
                indexerThread.start();
                threadList.add(indexerThread);
            }
        }

        TotalStats totalStats = new TotalStats(System.currentTimeMillis());
        List<HostStats> hosts = new ArrayList<HostStats>();

        boolean cleaned = false;
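        // Join the threads in creation order; each thread's temporary index is
        // merged into the main index as soon as its join returns.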

        for (HostCrawler thread : threadList) {
            try {
                thread.join();
                // we want to clean the target index just once, at the end of the indexing process;
                // we also do not want to clean the index if crawling was interrupted
                if (!cleaned) {
                    cleanTargetIndex();
                    cleaned = true;
                }
                hosts.addAll(thread.getCrawledHosts());
                totalStats.addShares(thread.getShares());
                removeHostsFromIndex(thread.getCrawledHosts());
                mergeIntoIndex(thread.getName());
                cleanTempForThread(thread.getName());
            } catch (InterruptedException e) {
                log.warn("Interrupted: " + thread.getName());
            }
            log.info("Finished: " + thread.getName());
        }

        if (!hosts.isEmpty()) {
            String statsDir = PunksearchFs.resolveStatsDirectory();
            HostStats.dump(statsDir, hosts);
            HostStats.merge(statsDir, PunksearchFs.resolve(statsDir + File.separator + "hosts.csv"));
            totalStats.addHostStats(hosts);
            TotalStatsWriter.dump(totalStats);
        }

        // always optimize, since old items may have been deleted even if no new hosts were crawled
        log.info("Optimizing index...");
        IndexOperator.optimize(indexDirectory);

        long finishTime = new Date().getTime();
        log.info("Crawl process finished in " + ((finishTime - startTime) / 1000) + " sec");

        synchronized (threadList) {
            threadList.clear();
        }
        cancelTimers();
    }

    /**
     * Extracts configuration from system properties.
     * <p/>
     * The system property names are defined by static final fields of the CrawlerKeys class.
     */
    private void readProperties() {
        indexDirectory = PunksearchFs.resolveIndexDirectory();

        forceUnlock = getBool(UNLOCK_PROPERTY, false);
        threadCount = getInt(THREADS_PROPERTY, 5);

        fileTypes = FileTypes.readFromDefaultFile();

        daysToKeep = getFloat(KEEPDAYS_PROPERTY, 7);
        maxHours = getInt(MAXHOURS_PROPERTY, 12);

        ranges = parseRanges(get(RANGE_PROPERTY));
    }

    private void startTimers() {
        Timer processTimer = new Timer();
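        // maxHours is configured in hours; Timer.schedule() takes the delay in milliseconds.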
        processTimer.schedule(new MaxRunWatchDog(), maxHours * 3600 * 1000L);

        Timer statusDumpTimer = new Timer();
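        // The dump period property is read in seconds and converted to milliseconds here.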
        long dumpPeriod = Long.getLong(DUMP_STATUS_PERIOD, 10L) * 1000;
        statusDumpTimer.scheduleAtFixedRate(new ThreadStatusDump(), dumpPeriod, dumpPeriod);

        timers.add(processTimer);
        timers.add(statusDumpTimer);
    }

    private void cancelTimers() {
        for (Timer timer : timers) {
            timer.cancel();
        }
        timers.clear();
    }

    private boolean prepareAllIndexDirs() {
        if (!prepareIndex(indexDirectory)) {
            log.warn("Can't prepare main index directory (check log).");
            return false;
        }
        for (int i = 0; i < threadCount; i++) {
            final String threadDirectory = getThreadDirectory(i);
            if (!prepareIndex(threadDirectory)) {
                log.warn("Can't prepare directory for crawl thread: " + threadDirectory);
                return false;
            }
        }
        return true;
    }

    private void removeHostsFromIndex(Set<HostStats> hosts) {
        log.trace("Start cleaning target index directory from set of indexed hosts");
        for (HostStats host : hosts) {
            final String hostTerm = host.getProtocol() + "_" + host.getIp();
            final String hostName = HostnameResolver.getInstance().resolveByIp(host.getIp().toString());

            log.debug("Cleaning target index directory from indexed host: " + hostTerm.replace("_", "://")
                    + ", hostname: " + hostName);

            IndexOperator.deleteByHost(indexDirectory, hostTerm, hostName);
        }
        log.trace("Finished cleaning target index directory from set of indexed hosts");
    }

    private void cleanTargetIndex() {
        log.trace("Start cleaning target index directory: " + indexDirectory);
        if (daysToKeep == 0) {
            IndexOperator.deleteAll(indexDirectory);
            log.trace("Target index directory wiped out");
        } else {
            log.trace("Start cleaning target index directory from old items");
            IndexOperator.deleteByAge(indexDirectory, daysToKeep);
            log.trace("Finished cleaning target index directory from old items");
        }
        log.trace("Target index directory cleaned up.");
    }

    private void mergeIntoIndex(String threadName) {
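        // Thread names are THREAD_PREFIX + index; recover the index to locate the thread's temporary index directory.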
        int index = Integer.parseInt(threadName.substring(THREAD_PREFIX.length()));
        Set<String> dirs = new HashSet<String>();
        dirs.add(getThreadDirectory(index));
        IndexOperator.merge(indexDirectory, dirs);
    }

    /**
     * Parses the IP ranges string.
     * <p/>
     * The string is either a path to a file with IP ranges or a comma-separated list of IP ranges in string form.
     *
     * @param rangesString Either a path to a file with IP ranges or a comma-separated list of IP range strings.
     * @return list of IP ranges. May be empty, never null.
     */
    private static List<IpRange> parseRanges(String rangesString) {
        List<IpRange> result = IpRanges.parseList(rangesString);
        if (result.isEmpty()) {
            File file = new File(PunksearchFs.resolve(rangesString));
            if (file.exists()) {
                result = loadRangesFromFile(file);
            } else {
                log.warn("Can't find IP ranges file: '" + file.getAbsolutePath() + "'");
            }
        }
        return result;
    }

    /**
     * Reads a file and creates list of IpRanges from it.
     * <p/>
     * The file may have an arbitrary format; the single restriction is that the IP must be in the first column. Each
     * row of the file must either be a comment (starting with "#") or start with an IP or IP range.
     * <p/>
     * Example:
     * <p/>
     * <pre>
     * # this is a comment before single ip
     * 10.20.30.40
     * # another comment before ip range
     * 11.22.33.44-11.22.33.55
     * # comment before long row, the tail after first comma is ignored
     * 22.33.44.55, smth else, foo
     * </pre>
     *
     * @param file file to get IP ranges from
     * @return list of IpRange objects
     */
    @SuppressWarnings("unchecked")
    private static List<IpRange> loadRangesFromFile(File file) {
        Set<IpRange> result = new HashSet<IpRange>();
        try {
            List<String> lines = FileUtils.readLines(file);
            for (String line : lines) {
                line = line.trim();
                if (line.startsWith("#")) {
                    continue;
                }
                String[] parts = line.split(",");
                if (IpRange.isIpRange(parts[0].trim())) {
                    result.add(new IpRange(parts[0].trim()));
                }
            }
        } catch (IOException e) {
            log.warn("Can't load ranges from file: " + file.getAbsolutePath());
        }
        ArrayList<IpRange> list = new ArrayList<IpRange>(result);
        Collections.sort(list);
        return list;
    }

    private static String getThreadDirectory(int index) {
        return getTempDir() + "punksearch_crawler" + index;
    }

    /**
     * @return temp dir ending with File.separator
     */
    private static String getTempDir() {
        String tempDir = System.getProperty(TMP_DIR_PROPERTY);

        if (tempDir == null || tempDir.length() == 0) {
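            // Fall back to the JVM default temp dir when no punksearch-specific temp dir is configured.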
            tempDir = System.getProperty("java.io.tmpdir");
        }

        if (!tempDir.endsWith(File.separator)) {
            tempDir += File.separator;
        }

        return tempDir;
    }

    private HostCrawler makeThread(int index, IpIterator iter) {
        return new HostCrawler(THREAD_PREFIX + index, iter, fileTypes, getThreadDirectory(index));
    }

    private static void cleanTempForThread(String threadName) {
        int index = Integer.parseInt(threadName.substring(THREAD_PREFIX.length()));
        try {
            FileUtils.deleteDirectory(new File(getThreadDirectory(index)));
        } catch (IOException e) {
            log.warn("Temp directory '" + getThreadDirectory(index) + "' was not cleaned up. Check permissions");
        }
    }

    private boolean prepareIndex(String dir) {
        if (!IndexOperator.indexExists(dir)) {
            try {
                IndexOperator.createIndex(dir);
            } catch (IOException e) {
                log.error("Can't create index directory: '" + dir + "'!");
                return false;
            }
        }

        if (IndexOperator.isLocked(dir)) {
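            // A lock usually means another process is writing to the index or a previous crawl crashed.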
            if (forceUnlock) {
                IndexOperator.unlock(dir);
            } else {
                log.warn("Index directory is locked: '" + dir + "' "
                        + "Consider to set \"*.crawler.forceunlock=true\" in punksearch.properties");
                return false;
            }
        }

        return true;
    }

    private class MaxRunWatchDog extends TimerTask {
        public void run() {
            log.info("Stopping crawling due to time limit");
            NetworkCrawler.getInstance().stop();
        }
    }

    private class ThreadStatusDump extends TimerTask {

        public static final String STATUS_FILENAME = "punksearch-crawl.status";

        public void run() {
            List<HostCrawler> threads = NetworkCrawler.getInstance().getThreads();
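            // One line per thread: "<name> : <status> : <number of crawled hosts>"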
            StringBuilder dump = new StringBuilder();
            for (HostCrawler thread : threads) {
                boolean stop = thread.isStopRequested();
                String status;
                if (stop) {
                    if (thread.getIp() != null) {
                        status = "stopping";
                    } else {
                        status = "stopped manually";
                    }
                } else {
                    if (thread.getIp() != null) {
                        status = "crawling " + thread.getIp();
                    } else {
                        status = "finished successfully";
                    }
                }
                dump.append(thread.getName()).append(" : ").append(status)
                        .append(" : ").append(thread.getCrawledHosts().size()).append('\n');
            }
            String path = getTempDir() + STATUS_FILENAME;
            try {
                FileUtils.writeStringToFile(new File(path), dump.toString());
            } catch (IOException e) {
                log.warn("Can't write crawler status to file: " + path);
            }
        }
    }
}
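
Example usage

The crawler is a singleton Runnable and reads its whole configuration from system properties when run() starts. Below is a minimal, hypothetical sketch of starting and stopping a crawl from your own code; it assumes the punksearch jars are on the classpath and that the CrawlerKeys.RANGE_PROPERTY constant (defined in the CrawlerKeys class, not shown here) is visible to your code. The range value may also be a path to a ranges file, as described in the parseRanges javadoc above.

import org.punksearch.crawler.CrawlerKeys;
import org.punksearch.crawler.NetworkCrawler;

public class CrawlExample {
    public static void main(String[] args) throws InterruptedException {
        // Configure the crawl before run() reads the system properties.
        // A comma-separated list of ranges or a path to a ranges file is accepted.
        System.setProperty(CrawlerKeys.RANGE_PROPERTY, "10.20.30.1-10.20.30.254");

        // run() blocks until all crawler threads finish, so start it on its own thread.
        NetworkCrawler crawler = NetworkCrawler.getInstance();
        Thread crawlThread = new Thread(crawler, "NetworkCrawler");
        crawlThread.start();

        // The crawl can be stopped cooperatively at any time:
        // crawler.stop();

        crawlThread.join();
    }
}

While a crawl is running, per-thread progress is dumped periodically to the punksearch-crawl.status file in the temp directory returned by getTempDir().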