org.zaproxy.zap.spider.SpiderTask.java Source code

Java tutorial

Introduction

Here is the source code for org.zaproxy.zap.spider.SpiderTask.java

Source

/*
 * Zed Attack Proxy (ZAP) and its related class files.
 * 
 * ZAP is an HTTP/HTTPS proxy for assessing web application security.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); 
 * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at 
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0 
 *   
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, 
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and 
 * limitations under the License. 
 */
package org.zaproxy.zap.spider;

import java.io.IOException;
import java.net.ConnectException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
import java.util.List;

import net.htmlparser.jericho.Source;

import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.apache.log4j.Logger;
import org.parosproxy.paros.control.Control;
import org.parosproxy.paros.db.DatabaseException;
import org.parosproxy.paros.extension.history.ExtensionHistory;
import org.parosproxy.paros.model.HistoryReference;
import org.parosproxy.paros.network.HttpHeader;
import org.parosproxy.paros.network.HttpMalformedHeaderException;
import org.parosproxy.paros.network.HttpMessage;
import org.parosproxy.paros.network.HttpRequestHeader;
import org.zaproxy.zap.spider.filters.ParseFilter;
import org.zaproxy.zap.spider.parser.SpiderParser;

/**
 * The SpiderTask representing a spidering task performed during the Spidering process: it fetches
 * a single resource, passes the response through the registered {@link ParseFilter}s and
 * {@link SpiderParser}s, and thereby (indirectly) causes new tasks to be created for any URIs
 * found in the response.
 */
public class SpiderTask implements Runnable {

    /** The parent spider controlling the crawling process. */
    private Spider parent;

    /**
     * The history reference to the database record where the request message has been partially
     * filled in.
     * <p>
     * May be {@code null} if building or persisting the request message failed during
     * construction; {@link #run()} checks for that case and skips the task.
     */
    private HistoryReference reference;

    /** The depth of crawling where the uri was found. */
    private int depth;

    /**
     * Lazily initialised handle to the History extension, used to delete the temporary
     * {@link #reference} record once the request message has been rebuilt from it.
     */
    private ExtensionHistory extHistory = null;

    /** The Constant log. */
    private static final Logger log = Logger.getLogger(SpiderTask.class);

    /**
     * Instantiates a new spider task using the target URI. The purpose of this task is to crawl the given
     * uri, using the provided method, find any other uris in the fetched resource and create other tasks.
     * 
     * @param parent the spider controlling the crawling process
     * @param uri the uri that this task should process
     * @param depth the depth where this uri is located in the spidering process
     * @param method the HTTP method that should be used to fetch the resource
     */
    public SpiderTask(Spider parent, URI uri, int depth, String method) {
        this(parent, null, uri, depth, method, null);
    }

    /**
     * Instantiates a new spider task using the target URI. The purpose of this task is to crawl the given
     * uri, using the provided method, find any other uris in the fetched resource and create other tasks.
     * 
     * @param parent the spider controlling the crawling process
     * @param sourceUri the URI where the given {@code uri} was found, used for the Referer header
     *            (may be {@code null})
     * @param uri the uri that this task should process
     * @param depth the depth where this uri is located in the spidering process
     * @param method the HTTP method that should be used to fetch the resource
     * @since 2.4.0
     */
    public SpiderTask(Spider parent, URI sourceUri, URI uri, int depth, String method) {
        this(parent, sourceUri, uri, depth, method, null);
    }

    /**
     * Instantiates a new spider task using the target URI. The purpose of this task is to crawl the given
     * uri, using the provided method, find any other uris in the fetched resource and create other tasks.
     * 
     * <p>
     * The body of the request message is also provided in the {@literal requestBody} parameter and will be
     * used when fetching the resource from the specified uri.
     * </p>
     * 
     * @param parent the spider controlling the crawling process
     * @param uri the uri that this task should process
     * @param depth the depth where this uri is located in the spidering process
     * @param method the HTTP method that should be used to fetch the resource
     * @param requestBody the body of the request (may be {@code null} for no body)
     */
    public SpiderTask(Spider parent, URI uri, int depth, String method, String requestBody) {
        this(parent, null, uri, depth, method, requestBody);
    }

    /**
     * Instantiates a new spider task using the target URI. The purpose of this task is to crawl the given
     * uri, using the provided method, find any other uris in the fetched resource and create other tasks.
     * <p>
     * The body of the request message is also provided in the {@literal requestBody} parameter and will be
     * used when fetching the resource from the specified uri.
     * 
     * @param parent the spider controlling the crawling process
     * @param sourceUri the URI where the given {@code uri} was found, used for the Referer header
     *            (may be {@code null})
     * @param uri the uri that this task should process
     * @param depth the depth where this uri is located in the spidering process
     * @param method the HTTP method that should be used to fetch the resource
     * @param requestBody the body of the request (may be {@code null} for no body)
     * @since 2.4.0
     */
    public SpiderTask(Spider parent, URI sourceUri, URI uri, int depth, String method, String requestBody) {
        super();
        this.parent = parent;
        this.depth = depth;

        // Log the new task
        if (log.isDebugEnabled()) {
            log.debug("New task submitted for uri: " + uri);
        }

        // Create a new HttpMessage that will be used for the request and persist it in the database using
        // HistoryReference. If this fails, reference stays null and run() will skip the task.
        try {
            HttpRequestHeader requestHeader = new HttpRequestHeader(method, uri, HttpHeader.HTTP11,
                    parent.getConnectionParam());
            if (sourceUri != null && parent.getSpiderParam().isSendRefererHeader()) {
                requestHeader.setHeader(HttpRequestHeader.REFERER, sourceUri.toString());
            }
            HttpMessage msg = new HttpMessage(requestHeader);
            if (requestBody != null) {
                msg.setRequestBody(requestBody);
                // Content-Length is an octet count: use the length in bytes of the encoded body,
                // not String.length() (a char count), which differs for non-ASCII bodies.
                msg.getRequestHeader().setContentLength(msg.getRequestBody().length());
            }
            this.reference = new HistoryReference(parent.getModel().getSession(), HistoryReference.TYPE_SPIDER_TASK,
                    msg);
        } catch (HttpMalformedHeaderException e) {
            log.error("Error while building HttpMessage for uri: " + uri, e);
        } catch (DatabaseException e) {
            log.error("Error while persisting HttpMessage for uri: " + uri, e);
        }
    }

    @Override
    public void run() {

        // Log the task start
        if (log.isDebugEnabled()) {
            try {
                log.debug("Spider Task Started. Processing uri at depth " + depth
                        + " using already constructed message:  " + reference.getURI());
            } catch (Exception ignored) {
                // Best-effort logging only; reference may be null or the URI unavailable.
            }
        }

        // Check if the spider should stop
        if (parent.isStopped()) {
            log.debug("Spider process is stopped. Skipping crawling task...");
            parent.postTaskExecution();
            return;
        }
        // The constructor failed to build/persist the request message; nothing to do.
        if (reference == null) {
            log.warn("Null URI. Skipping crawling task: " + this);
            parent.postTaskExecution();
            return;
        }

        // Check if the crawling process is paused and do any "before execution" processing
        parent.preTaskExecution();

        // Fetch the resource. Common network failures are already logged at debug level (with the
        // URL) inside fetchResource(), so they are only accounted for here, not logged again.
        HttpMessage msg = null;
        try {
            msg = fetchResource();
        } catch (ConnectException e) {
            parent.postTaskExecution();
            return;
        } catch (SocketTimeoutException e) {
            parent.postTaskExecution();
            return;
        } catch (SocketException e) {
            parent.postTaskExecution();
            return;
        } catch (UnknownHostException e) {
            parent.postTaskExecution();
            return;
        } catch (Exception e) {
            log.error("An error occured while fetching the resource: " + e.getMessage(), e);
            parent.postTaskExecution();
            return;
        }

        // Check if the spider should stop
        if (parent.isStopped()) {
            log.debug("Spider process is stopped. Skipping crawling task...");
            parent.postTaskExecution();
            return;
        }
        // Check if the crawling process is paused
        parent.checkPauseAndWait();

        // Check the parse filters to see if the resource should be skipped from parsing
        boolean isFiltered = false;
        for (ParseFilter filter : parent.getController().getParseFilters()) {
            if (filter.isFiltered(msg)) {
                if (log.isDebugEnabled()) {
                    log.debug("Resource fetched, but will not be parsed due to a ParseFilter rule: "
                            + msg.getRequestHeader().getURI());
                }
                isFiltered = true;
                break;
            }
        }
        if (!isFiltered) {
            // Notify the SpiderListeners that a resource was read
            parent.notifyListenersReadURI(msg);
        }

        // Check if the spider should stop
        if (parent.isStopped()) {
            log.debug("Spider process is stopped. Skipping crawling task...");
            parent.postTaskExecution();
            return;
        }
        // Check if the crawling process is paused
        parent.checkPauseAndWait();

        // Process resource, if this is not the maximum depth
        if (!isFiltered && depth < parent.getSpiderParam().getMaxDepth()) {
            processResource(msg);
        }

        // Update the progress and check if the spidering process should stop
        parent.postTaskExecution();
        log.debug("Spider Task finished.");
    }

    /**
     * Process a resource, searching for links (uris) to other resources, by running it through
     * every registered {@link SpiderParser} that declares it can handle the resource.
     * 
     * @param message the HTTP message whose response body should be parsed
     */
    private void processResource(HttpMessage message) {
        List<SpiderParser> parsers = parent.getController().getParsers();

        // Prepare the Jericho source
        Source source = new Source(message.getResponseBody().toString());

        // Get the full path of the file
        String path = null;
        try {
            path = message.getRequestHeader().getURI().getPath();
        } catch (URIException e) {
            // Fall through to the empty-path default below, but leave a trace for diagnosis.
            log.warn("Failed to obtain the path of the URI: " + message.getRequestHeader().getURI(), e);
        } finally {
            // Handle null paths.
            if (path == null)
                path = "";
        }

        // Parse the resource. A parser that "consumes" the resource (parseResource returns true)
        // is remembered so later parsers can decide whether to still process it.
        boolean alreadyConsumed = false;
        for (SpiderParser parser : parsers) {
            if (parser.canParseResource(message, path, alreadyConsumed)) {
                if (log.isDebugEnabled())
                    log.debug("Parser " + parser + " can parse resource '" + path + "'");
                if (parser.parseResource(message, source, depth))
                    alreadyConsumed = true;
            } else {
                if (log.isDebugEnabled())
                    log.debug("Parser " + parser + " cannot parse resource '" + path + "'");
            }
        }
    }

    /**
     * Gets the History extension, looking it up on first use.
     * 
     * @return the History extension, or {@code null} if not available
     */
    private ExtensionHistory getExtensionHistory() {
        if (this.extHistory == null) {
            this.extHistory = (ExtensionHistory) Control.getSingleton().getExtensionLoader()
                    .getExtension(ExtensionHistory.NAME);
        }
        return this.extHistory;
    }

    /**
     * Fetches the resource for this task: rebuilds the request message from the database record
     * (deleting the temporary record afterwards), applies spider options (custom user agent,
     * scanning user) and sends the request.
     * 
     * @return the response http message
     * @throws HttpException the http exception
     * @throws IOException Signals that an I/O exception has occurred.
     * @throws DatabaseException if the request message could not be read from the database
     */
    private HttpMessage fetchResource() throws HttpException, IOException, DatabaseException {

        // Build/fetch the request message from the database
        HttpMessage msg;
        try {
            msg = reference.getHttpMessage();
        } finally {
            // Remove the history reference from the database, as it's not used anymore
            if (getExtensionHistory() != null) {
                getExtensionHistory().delete(reference);
            }
        }

        // Clear conditional headers so a full (non-304) response is always fetched.
        msg.getRequestHeader().setHeader(HttpHeader.IF_MODIFIED_SINCE, null);
        msg.getRequestHeader().setHeader(HttpHeader.IF_NONE_MATCH, null);

        // Check if there is a custom user agent
        if (parent.getSpiderParam().getUserAgent() != null) {
            msg.getRequestHeader().setHeader(HttpHeader.USER_AGENT, parent.getSpiderParam().getUserAgent());
        }

        // Check if there's a need to send the message from the point of view of a User
        if (parent.getScanUser() != null) {
            msg.setRequestingUser(parent.getScanUser());
        }

        // Fetch the page. Network failures are logged here at debug level, where the URL is still
        // known, and rethrown for the caller to account for.
        if (parent.getHttpSender() != null) {
            try {
                parent.getHttpSender().sendAndReceive(msg);
            } catch (ConnectException e) {
                log.debug("Failed to connect to: " + msg.getRequestHeader().getURI(), e);
                throw e;
            } catch (SocketTimeoutException e) {
                log.debug("Socket timeout: " + msg.getRequestHeader().getURI(), e);
                throw e;
            } catch (SocketException e) {
                log.debug("Socket exception: " + msg.getRequestHeader().getURI(), e);
                throw e;
            } catch (UnknownHostException e) {
                log.debug("Unknown host: " + msg.getRequestHeader().getURI(), e);
                throw e;
            }
        }

        return msg;

    }

}