Java tutorial
/* * * Paros and its related class files. * * Paros is an HTTP/HTTPS proxy for assessing web application security. * Copyright (C) 2003-2004 Chinotec Technologies Company * * This program is free software; you can redistribute it and/or * modify it under the terms of the Clarified Artistic License * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * Clarified Artistic License for more details. * * You should have received a copy of the Clarified Artistic License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package org.parosproxy.paros.core.spider; import java.io.IOException; import java.util.List; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.URI; import org.apache.commons.httpclient.URIException; import org.parosproxy.paros.network.HttpHeader; import org.parosproxy.paros.network.HttpMalformedHeaderException; import org.parosproxy.paros.network.HttpMessage; import org.parosproxy.paros.network.HttpResponseHeader; import org.parosproxy.paros.network.HttpStatusCode; /** * * To change the template for this generated type comment go to * Window - Preferences - Java - Code Generation - Code and Comments */ public class SpiderThread extends Thread { private static final String[] NEGLECT_SUFFIXES = { "gif", "jpg", "bmp", "mp3", "arj", "doc", "swf", "pdf", "mpg", "wmv", "zip", "exe", "cab", "iso", "avi" }; private Spider parent = null; private boolean stop = false; private List queue = null; private boolean completed = false; private Collector collector = null; private boolean emptyQueue = false; /** * @return Returns the emptyQueue. */ boolean isEmptyQueue() { return emptyQueue; } SpiderThread(Spider parent) { this.parent = parent; queue = parent.getQueue(); collector = new Collector(this); this.setDaemon(true); this.setPriority(Thread.NORM_PRIORITY - 2); } /** * @return Returns the stop. */ boolean isStop() { return stop; } /** * @param stop The stop to set. */ void setStop(boolean stop) { this.stop = stop; } public void run() { QueueItem item = null; // while (!isStop() && !queue.isEmpty()) { // to avoid 1 thread running but other thread exit. while (!isStop() && !parent.isAllThreadEmptyQueue()) { try { synchronized (queue) { // get distinct item from queue do { item = null; if (queue.isEmpty()) { try { setEmptyQueue(true); Thread.sleep(500); } catch (InterruptedException ie) { } } else { item = (QueueItem) queue.remove(0); setEmptyQueue(false); } } while (!stop && item != null && parent.isInVisitedLink(item.getMessage())); } if (item != null) { parent.SpiderProgress(item); crawl(item.getMessage(), item.getDepth()); item.getHistoryReference().delete(); try { Thread.sleep(30); } catch (InterruptedException e1) { } } else if (!stop) { // no item, waiting for all spider queue empty try { Thread.sleep(500); } catch (InterruptedException e1) { } } } catch (Exception e) { e.printStackTrace(); } } if (queue.isEmpty()) { completed = true; } parent.checkIfAllThreadCompleted(); } private void readMsgResponse(HttpMessage msg) throws HttpException, IOException, HttpMalformedHeaderException { msg.getRequestHeader().setHeader(HttpHeader.IF_MODIFIED_SINCE, null); msg.getRequestHeader().setHeader(HttpHeader.IF_NONE_MATCH, null); msg.getRequestHeader().setContentLength(msg.getRequestBody().length()); parent.getHttpSender().sendAndReceive(msg); msg.getResponseHeader().setHeader(HttpHeader.TRANSFER_ENCODING, null); } private void crawl(HttpMessage msg, int depth) { Html html = null; try { if (isNeglectCrawl(msg)) { parent.readURI(msg); return; } readMsgResponse(msg); // if (msg.getResponseHeader().isEmpty() || msg.getResponseHeader().getStatusCode() == HttpStatusCode.NOT_MODIFIED) { // if (!readMsgResponse(msg)) { // return; // } // } if (!HttpStatusCode.isSuccess(msg.getResponseHeader().getStatusCode())) { return; } if (msg.getResponseHeader().getContentLength() > 200000) { msg.setResponseHeader(new HttpResponseHeader()); msg.getResponseBody().setBody(""); } parent.readURI(msg); if (isNeglectResponse(msg.getResponseHeader())) { return; } html = new Html(msg.getRequestHeader().getURI(), msg.getResponseBody().toString()); collector.collect(html, depth); // no more response processing needed. remove from msg to save memory } catch (Exception e) { e.printStackTrace(); } finally { msg.setResponseHeader(new HttpResponseHeader()); msg.getResponseBody().setBody(""); parent.addVisitedLink(msg); } } /** * Build URI given a base HTML. Keep absolute if it is. * @param html * @param link * @return * @throws URIException */ private URI buildURI(URI base, String link) throws URIException { URI uri = null; /* try { uri = new URI(link, true); if (uri.isAbsoluteURI()) { return uri; } } catch (URIException e) {} */ uri = new URI(base, link, true); return uri; } void foundURI(HttpMessage msg, String referer, int currentDepth) throws URIException { msg.getRequestHeader().setHeader(HttpHeader.REFERER, referer); parent.foundURI(msg, currentDepth + 1); } private boolean isNeglectCrawl(HttpMessage msg) { boolean result = false; URI uri = msg.getRequestHeader().getURI(); try { // check if need to skip this URL from config if (parent.getSpiderParam().isSkipURL(uri)) { return true; } // check if suffix relevant if (uri.getPath() != null) { String path = uri.getPath().toLowerCase(); for (int i = 0; i < NEGLECT_SUFFIXES.length; i++) { String suffix = "." + NEGLECT_SUFFIXES[i]; if (path.endsWith(suffix)) { return true; } } } } catch (Exception e) { } return result; } private boolean isNeglectResponse(HttpResponseHeader resHeader) { if (!HttpStatusCode.isSuccess(resHeader.getStatusCode())) { return true; } if (resHeader.isImage()) { return true; } if (resHeader.isText()) { return false; } // do not process - not html file if (resHeader.getContentLength() > 200000) { return true; } return false; } /** * @return Returns the completed. */ public boolean isCompleted() { return completed; } /** * @param emptyQueue The emptyQueue to set. */ private void setEmptyQueue(boolean emptyQueue) { this.emptyQueue = emptyQueue; } Spider getParent() { return parent; } }