org.zaproxy.zap.spider.Spider.java Source code

Java tutorial

Introduction

Here is the source code for org.zaproxy.zap.spider.Spider.java

Source

/*
 * Zed Attack Proxy (ZAP) and its related class files.
 * 
 * ZAP is an HTTP/HTTPS proxy for assessing web application security.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); 
 * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at 
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0 
 *   
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, 
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and 
 * limitations under the License. 
 */
package org.zaproxy.zap.spider;

import java.net.CookieManager;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.apache.log4j.Logger;
import org.parosproxy.paros.model.Model;
import org.parosproxy.paros.network.ConnectionParam;
import org.parosproxy.paros.network.HttpMessage;
import org.parosproxy.paros.network.HttpRequestHeader;
import org.parosproxy.paros.network.HttpSender;
import org.zaproxy.zap.extension.spider.ExtensionSpider;
import org.zaproxy.zap.model.Context;
import org.zaproxy.zap.spider.filters.DefaultFetchFilter;
import org.zaproxy.zap.spider.filters.DefaultParseFilter;
import org.zaproxy.zap.spider.filters.FetchFilter;
import org.zaproxy.zap.spider.filters.FetchFilter.FetchStatus;
import org.zaproxy.zap.spider.filters.ParseFilter;
import org.zaproxy.zap.spider.parser.SpiderParser;
import org.zaproxy.zap.users.User;

/**
 * The Spider coordinates a crawling process: it keeps the scan configuration, the seed list, the
 * worker thread pool and the registered listeners, and exposes lifecycle operations (start, pause,
 * resume, stop). The actual URI bookkeeping is delegated to a {@link SpiderController}.
 *
 * <p>Thread-safety: task accounting methods ({@code submitTask}, {@code postTaskExecution}) and
 * listener notifications are {@code synchronized}; pause/resume use a {@link ReentrantLock} and a
 * {@link Condition} on which worker threads wait.
 */
public class Spider {

    /** The spider parameters. */
    private SpiderParam spiderParam;

    /** The connection parameters. */
    private ConnectionParam connectionParam;

    /** The model. */
    private Model model;

    /** The listeners for Spider related events. */
    private List<SpiderListener> listeners;

    /** If the spider is currently paused. */
    private volatile boolean paused;

    /** If the spider is currently stopped. */
    private volatile boolean stopped;

    /** The pause lock, used for locking access to the "paused" variable. */
    private ReentrantLock pauseLock = new ReentrantLock();

    /** The controller that manages the spidering process. */
    private SpiderController controller;

    /**
     * The condition that is used for the threads in the pool to wait on, when the Spider crawling
     * is paused. When the Spider is resumed, all the waiting threads are awakened.
     */
    private Condition pausedCondition = pauseLock.newCondition();

    /** The thread pool for spider workers. */
    private ExecutorService threadPool;

    /** The default fetch filter. */
    private DefaultFetchFilter defaultFetchFilter;

    /** The seed list. */
    private List<URI> seedList;

    /** The extension. */
    private ExtensionSpider extension;

    /** The Constant log. */
    private static final Logger log = Logger.getLogger(Spider.class);

    /** The HTTP sender used to effectively send the data. */
    private HttpSender httpSender;

    /** The count of the tasks finished. */
    private int tasksDoneCount;

    /** The total count of all the submitted tasks. */
    private int tasksTotalCount;

    /** The cookie manager. */
    private CookieManager cookieManager;

    /** The scan context. If null, the scan is not performed in a context. */
    private Context scanContext;

    /** The scan user. */
    private User scanUser;

    /**
     * The initialized marks if the spidering process is completely started. It solves the problem
     * when the first task is processed and the process is finished before the other seeds are
     * added.
     */
    private boolean initialized;

    /**   we do not want to recurse into an SVN folder, or a subfolder of an SVN folder, if one was created from a previous Spider run */
    private static final Pattern svnUrlPattern = Pattern.compile("\\.svn/"); //case sensitive

    /**   we do not want to recurse into a Git folder, or a subfolder of a Git folder, if one was created from a previous Spider run */
    private static final Pattern gitUrlPattern = Pattern.compile("\\.git/"); //case sensitive

    /**
     * Instantiates a new spider.
     *
     * @param extension the extension
     * @param spiderParam the spider param
     * @param connectionParam the connection param
     * @param model the model
     * @param scanContext if a scan context is set, only URIs within the context are fetched and processed
     */
    public Spider(ExtensionSpider extension, SpiderParam spiderParam, ConnectionParam connectionParam, Model model,
            Context scanContext) {
        super();
        log.info("Spider initializing...");
        this.spiderParam = spiderParam;
        this.connectionParam = connectionParam;
        this.model = model;
        this.controller = new SpiderController(this, extension.getCustomParsers());
        this.listeners = new LinkedList<>();
        this.seedList = new ArrayList<>();
        this.cookieManager = new CookieManager();
        this.scanContext = scanContext;
        this.extension = extension;

        init();
    }

    /**
     * Initialize the spider: resets state counters and installs the default and custom
     * fetch/parse filters.
     */
    private void init() {
        this.paused = false;
        this.stopped = true;
        this.tasksDoneCount = 0;
        this.tasksTotalCount = 0;
        this.initialized = false;

        // Add a default fetch filter and any custom ones
        defaultFetchFilter = new DefaultFetchFilter();
        this.addFetchFilter(defaultFetchFilter);

        for (FetchFilter filter : extension.getCustomFetchFilters()) {
            this.addFetchFilter(filter);
        }

        // Add a default parse filter and any custom ones
        this.addParseFilter(new DefaultParseFilter());
        for (ParseFilter filter : extension.getCustomParseFilters()) {
            this.addParseFilter(filter);
        }

        // Add the scan context, if any
        defaultFetchFilter.setScanContext(this.scanContext);
        defaultFetchFilter.setDomainsAlwaysInScope(spiderParam.getDomainsAlwaysInScopeEnabled());
    }

    /* SPIDER Related */
    /**
     * Adds a new seed for the Spider.
     * 
     * @param msg the message used for seed. The request URI is used from the Request Header
     */
    public void addSeed(HttpMessage msg) {
        URI uri = msg.getRequestHeader().getURI();
        addSeed(uri);
    }

    /**
     * Adds a new seed for the Spider. Besides the URI itself, depending on the configured
     * parameters, well-known companion resources ('robots.txt', 'sitemap.xml', SVN and Git
     * metadata files) are also added as seeds.
     * 
     * @param uri the uri
     */
    public void addSeed(URI uri) {
        // Update the scope of the spidering process
        String host = null;

        try {
            host = uri.getHost();
            defaultFetchFilter.addScopeRegex(host);
        } catch (URIException e) {
            log.error("There was an error while adding seed value: " + uri, e);
            return;
        }
        // Add the seed to the list -- it will be added to the task list only when the spider is
        // started
        this.seedList.add(uri);
        // Add the appropriate 'robots.txt' as a seed
        if (getSpiderParam().isParseRobotsTxt()) {
            addRootFileSeed(uri, host, "robots.txt");
        }
        // Add the appropriate 'sitemap.xml' as a seed
        if (getSpiderParam().isParseSitemapXml()) {
            addRootFileSeed(uri, host, "sitemap.xml");
        }
        // And add '.svn/entries' as a seed, for SVN based spidering
        if (getSpiderParam().isParseSVNEntries()) {
            addSvnSeeds(uri, host);
        }
        // And add '.git/index' as a seed, for Git based spidering
        if (getSpiderParam().isParseGit()) {
            addGitSeeds(uri, host);
        }
    }

    /**
     * Builds the authority ("host" or "host:port") part for a seed URI derived from the given URI.
     * The port is omitted when it is 80 or 443, and also when it is -1, which is what
     * commons-httpclient's {@link URI#getPort()} returns for URIs with no explicit port (the
     * previous code produced broken seeds like "http://host:-1/robots.txt" in that case).
     *
     * @param uri the uri the seed is derived from
     * @param host the already-extracted host of {@code uri}
     * @return the authority string, with the port appended only when it is explicit and non-default
     */
    private static String buildAuthority(URI uri, String host) {
        int port = uri.getPort();
        if (port == -1 || port == 80 || port == 443) {
            return host;
        }
        return host + ":" + port;
    }

    /**
     * Adds a seed for a well-known file located at the root of the site of the given URI
     * (e.g. 'robots.txt', 'sitemap.xml'). Errors are logged and ignored.
     *
     * @param uri the uri whose site is used
     * @param host the already-extracted host of {@code uri}
     * @param fileName the name of the root file to seed
     */
    private void addRootFileSeed(URI uri, String host, String fileName) {
        try {
            URI fileUri = new URI(uri.getScheme() + "://" + buildAuthority(uri, host) + "/" + fileName, true);
            this.seedList.add(fileUri);
        } catch (Exception e) {
            log.warn("Error while creating URI for " + fileName + " file for site " + uri, e);
        }
    }

    /**
     * Gets the path of the given URI with the trailing file name (if any) removed, suitable for
     * appending metadata folder names. An empty path is normalised to "/".
     *
     * @param uri the uri
     * @return the path without the file name, ending in '/'
     * @throws URIException if the path or name cannot be extracted
     */
    private static String getPathWithoutFilename(URI uri) throws URIException {
        String fullpath = uri.getPath();
        String name = uri.getName();
        if (fullpath == null) {
            fullpath = "";
        }
        if (name == null) {
            name = "";
        }

        String pathminusfilename = fullpath.substring(0, fullpath.lastIndexOf(name));
        if (pathminusfilename.equals("")) {
            pathminusfilename = "/";
        }
        return pathminusfilename;
    }

    /**
     * Adds '.svn/entries' and '.svn/wc.db' seeds for the directory of the given URI, for SVN based
     * spidering. SVN metadata can exist in every directory, so the full path is kept. Nothing is
     * added if the URI already points inside an '.svn/' folder (avoids recursing into metadata
     * created by a previous Spider run). Errors are logged and ignored.
     *
     * @param uri the uri used to derive the seeds
     * @param host the already-extracted host of {@code uri}
     */
    private void addSvnSeeds(URI uri, String host) {
        try {
            String pathminusfilename = getPathWithoutFilename(uri);

            // if it's not an svn folder, add the seeds.
            Matcher matcherSvnUrl = svnUrlPattern.matcher(pathminusfilename);
            if (!matcherSvnUrl.find()) {
                String base = uri.getScheme() + "://" + buildAuthority(uri, host) + pathminusfilename;
                this.seedList.add(new URI(base + ".svn/entries", true));
                this.seedList.add(new URI(base + ".svn/wc.db", true));
            }
        } catch (Exception e) {
            log.warn("Error while creating a seed URI for the SVN files for site " + uri, e);
        }
    }

    /**
     * Adds a '.git/index' seed for the directory of the given URI, for Git based spidering.
     * Nothing is added if the URI already points inside a '.git/' folder (avoids recursing into
     * metadata created by a previous Spider run). Errors are logged and ignored.
     *
     * @param uri the uri used to derive the seed
     * @param host the already-extracted host of {@code uri}
     */
    private void addGitSeeds(URI uri, String host) {
        try {
            String pathminusfilename = getPathWithoutFilename(uri);

            // if it's not in a Git folder, add the seed.
            Matcher matcherGitUrl = gitUrlPattern.matcher(pathminusfilename);
            if (!matcherGitUrl.find()) {
                String base = uri.getScheme() + "://" + buildAuthority(uri, host) + pathminusfilename;
                this.seedList.add(new URI(base + ".git/index", true));
            }
        } catch (Exception e) {
            log.warn("Error while creating a seed URI for the Git files for site " + uri, e);
        }
    }

    /**
     * Sets the exclude list which contains a List of strings, defining the uris that should be
     * excluded.
     * 
     * @param excludeList the new exclude list
     */
    public void setExcludeList(List<String> excludeList) {
        log.debug("New Exclude list: " + excludeList);
        defaultFetchFilter.setExcludeRegexes(excludeList);
    }

    /**
     * Adds a new fetch filter to the spider.
     * 
     * @param filter the filter
     */
    public void addFetchFilter(FetchFilter filter) {
        controller.addFetchFilter(filter);
    }

    /**
     * Adds a new parse filter to the spider.
     * 
     * @param filter the filter
     */
    public void addParseFilter(ParseFilter filter) {
        controller.addParseFilter(filter);
    }

    /**
     * Gets the http sender. Can be called from the SpiderTask.
     * 
     * @return the http sender
     */
    protected HttpSender getHttpSender() {
        return httpSender;
    }

    /**
     * Gets the spider parameters. Can be called from the SpiderTask.
     * 
     * @return the spider parameters
     */
    protected SpiderParam getSpiderParam() {
        return spiderParam;
    }

    /**
     * Gets the connection parameters.
     *
     * @return the connection parameters
     */
    protected ConnectionParam getConnectionParam() {
        return connectionParam;
    }

    /**
     * Gets the controller.
     * 
     * @return the controller
     */
    protected SpiderController getController() {
        return controller;
    }

    /**
     * Gets the cookie manager.
     * 
     * @return the cookie manager
     */
    protected CookieManager getCookieManager() {
        return cookieManager;
    }

    /**
     * Gets the model.
     * 
     * @return the model
     */
    protected Model getModel() {
        return this.model;
    }

    /**
     * Submit a new task to the spidering task pool. The task is silently dropped (with a debug
     * log) when the Spider is stopped or terminated, or when the pool rejects it during shutdown.
     * 
     * @param task the task
     */
    protected synchronized void submitTask(SpiderTask task) {
        if (isStopped()) {
            log.debug("Submitting task skipped (" + task + ") as the Spider process is stopped.");
            return;
        }
        if (isTerminated()) {
            log.debug("Submitting task skipped (" + task + ") as the Spider process is terminated.");
            return;
        }
        this.tasksTotalCount++;
        try {
            this.threadPool.execute(task);
        } catch (RejectedExecutionException e) {
            // The pool rejects tasks while shutting down; this is an expected race with stop().
            if (log.isDebugEnabled()) {
                log.debug("Submitted task was rejected (" + task + "), spider state: [stopped=" + isStopped()
                        + ", terminated=" + isTerminated() + "].");
            }
        }
    }

    /* SPIDER PROCESS maintenance - pause, resume, shutdown, etc. */

    /**
     * Starts the Spider crawling: initialises the controller, the worker thread pool and the HTTP
     * sender, then submits all the collected seeds. If no seeds were added, the scan is cancelled
     * and the listeners are notified of an unsuccessful completion.
     */
    public void start() {

        log.info("Starting spider...");

        // Check if seeds are available, otherwise the Spider will start, but will not have any
        // seeds and will not stop.
        if (seedList == null || seedList.isEmpty()) {
            log.warn("No seeds available for the Spider. Cancelling scan...");
            notifyListenersSpiderComplete(false);
            notifyListenersSpiderProgress(100, 0, 0);
            return;
        }

        if (scanUser != null) {
            log.info("Scan will be performed from the point of view of User: " + scanUser.getName());
        }

        this.controller.init();
        this.stopped = false;
        this.paused = false;
        this.initialized = false;

        // Initialize the thread pool
        this.threadPool = Executors.newFixedThreadPool(spiderParam.getThreadCount());

        // Initialize the HTTP sender
        httpSender = new HttpSender(connectionParam, true, HttpSender.SPIDER_INITIATOR);
        // Do not follow redirections because the request is not updated, the redirections will be
        // handled manually.
        httpSender.setFollowRedirect(false);

        // Add the seeds
        for (URI uri : seedList) {
            if (log.isInfoEnabled()) {
                log.info("Adding seed for spider: " + uri);
            }
            controller.addSeed(uri, HttpRequestHeader.GET);
        }
        // Mark the process as completely initialized
        initialized = true;
    }

    /**
     * Stops the Spider crawling. Must not be called from any of the threads in the thread pool,
     * as it awaits (briefly) the termination of that pool.
     */
    public void stop() {
        if (stopped) {
            return;
        }
        this.stopped = true;
        log.info("Stopping spidering process by request.");

        if (this.paused) {
            // Have to resume first or we get a deadlock
            this.resume();
        }

        // Issue the shutdown command
        this.threadPool.shutdownNow();
        try {
            if (!this.threadPool.awaitTermination(2, TimeUnit.SECONDS)) {
                log.warn("Failed to await for all spider threads to stop in the given time (2s)...");
            }
        } catch (InterruptedException ignore) {
            log.warn("Interrupted while awaiting for all spider threads to stop...");
            // Restore the interrupted status so callers up the stack can observe it.
            Thread.currentThread().interrupt();
        }
        if (httpSender != null) {
            this.getHttpSender().shutdown();
            httpSender = null;
        }

        // Notify the controller to clean up memory
        controller.reset();
        this.threadPool = null;

        // Notify the listeners -- in the meanwhile
        notifyListenersSpiderComplete(false);
    }

    /**
     * The Spidering process is complete: shuts down the HTTP sender and, on a separate thread
     * (as the caller is most likely a pool worker), the thread pool, then notifies the listeners
     * of a successful completion.
     */
    private void complete() {
        log.info("Spidering process is complete. Shutting down...");
        this.stopped = true;
        if (httpSender != null) {
            this.getHttpSender().shutdown();
            httpSender = null;
        }

        // Notify the controller to clean up memory
        controller.reset();

        // Issue the shutdown command on a separate thread, as the current thread is most likely one
        // from the pool
        new Thread(new Runnable() {
            @Override
            public void run() {
                if (threadPool != null) {
                    threadPool.shutdownNow();
                }
                // Notify the listeners -- in the meanwhile
                notifyListenersSpiderComplete(true);
                controller.reset();
                threadPool = null;
            }
        }).start();
    }

    /**
     * Pauses the Spider crawling.
     */
    public void pause() {
        pauseLock.lock();
        try {
            paused = true;
        } finally {
            pauseLock.unlock();
        }
    }

    /**
     * Resumes the Spider crawling.
     */
    public void resume() {
        pauseLock.lock();
        try {
            paused = false;
            // Wake up all threads that are currently paused
            pausedCondition.signalAll();
        } finally {
            pauseLock.unlock();
        }
    }

    /**
     * Sets the spider so it will scan from the point of view of a user.
     *
     * @param user the user to be scanned as
     */
    public void setScanAsUser(User user) {
        this.scanUser = user;
    }

    /**
     * Gets the user that will be used in the scanning.
     *
     * @return the scan user
     */
    protected User getScanUser() {
        return this.scanUser;
    }

    /**
     * This method is run by each thread in the Thread Pool before the task execution. Particularly,
     * it checks if the Spidering process is paused and, if it is, it waits on the corresponding
     * condition for the process to be resumed. Called from the SpiderTask.
     */
    protected void preTaskExecution() {
        checkPauseAndWait();
    }

    /**
     * This method is run by Threads in the ThreadPool and checks if the scan is paused and, if it
     * is, waits until it's unpaused.
     */
    protected void checkPauseAndWait() {
        pauseLock.lock();
        try {
            while (paused && !stopped) {
                pausedCondition.await();
            }
        } catch (InterruptedException e) {
            // Re-assert the interrupted status (typically set by shutdownNow()) so the worker
            // can notice it and terminate, instead of silently swallowing the interruption.
            Thread.currentThread().interrupt();
        } finally {
            pauseLock.unlock();
        }
    }

    /**
     * This method is run by each thread in the Thread Pool after the task execution. Particularly,
     * it notifies the listeners of the progress and checks if the scan is complete. Called from the
     * SpiderTask.
     */
    protected synchronized void postTaskExecution() {
        // tasksTotalCount >= tasksDoneCount >= 1 here, so the division below is safe.
        tasksDoneCount++;
        int percentageComplete = tasksDoneCount * 100 / tasksTotalCount;

        // Compute the progress and notify the listeners
        this.notifyListenersSpiderProgress(percentageComplete, tasksDoneCount, tasksTotalCount - tasksDoneCount);

        // Check for ending conditions
        if (tasksDoneCount == tasksTotalCount && initialized) {
            this.complete();
        }
    }

    /**
     * Checks if is paused.
     * 
     * @return true, if is paused
     */
    public boolean isPaused() {
        return this.paused;
    }

    /**
     * Checks if is stopped, i.e. a shutdown was issued or it is not running.
     * 
     * @return true, if is stopped
     */
    public boolean isStopped() {
        return stopped;
    }

    /**
     * Checks if is terminated.
     * 
     * @return true, if is terminated
     */
    public boolean isTerminated() {
        // The pool is null before start() and after cleanup (stop()/complete()); in both cases
        // no worker is running, so report terminated instead of throwing a NullPointerException.
        ExecutorService pool = threadPool;
        return pool == null || pool.isTerminated();
    }

    /* LISTENERS SECTION */

    /**
     * Adds a new spider listener.
     * 
     * @param listener the listener
     */
    public void addSpiderListener(SpiderListener listener) {
        this.listeners.add(listener);
    }

    /**
     * Removes a spider listener.
     * 
     * @param listener the listener
     */
    public void removeSpiderListener(SpiderListener listener) {
        this.listeners.remove(listener);
    }

    /**
     * Notifies all the listeners regarding the spider progress.
     * 
     * @param percentageComplete the percentage complete
     * @param numberCrawled the number of pages crawled
     * @param numberToCrawl the number of pages left to crawl
     */
    protected synchronized void notifyListenersSpiderProgress(int percentageComplete, int numberCrawled,
            int numberToCrawl) {
        for (SpiderListener l : listeners) {
            l.spiderProgress(percentageComplete, numberCrawled, numberToCrawl);
        }
    }

    /**
     * Notifies the listeners regarding a found uri.
     * 
     * @param uri the uri
     * @param method the method used for fetching the resource
     * @param status the {@link FetchStatus} stating if this uri will be processed, and, if not,
     *            stating the reason of the filtering
     */
    protected synchronized void notifyListenersFoundURI(String uri, String method, FetchStatus status) {
        for (SpiderListener l : listeners) {
            l.foundURI(uri, method, status);
        }
    }

    /**
     * Notifies the listeners regarding a read uri.
     * 
     * @param msg the message
     */
    protected synchronized void notifyListenersReadURI(HttpMessage msg) {
        for (SpiderListener l : listeners) {
            l.readURI(msg);
        }
    }

    /**
     * Notifies the listeners that the spider is complete.
     *
     * @param successful whether the spidering process finished normally (not cancelled or stopped)
     */
    protected synchronized void notifyListenersSpiderComplete(boolean successful) {
        for (SpiderListener l : listeners) {
            l.spiderComplete(successful);
        }
    }

    /**
     * Adds a custom spider parser.
     *
     * @param sp the parser
     */
    public void addCustomParser(SpiderParser sp) {
        this.controller.addSpiderParser(sp);
    }

}