List of usage examples for java.net.SocketTimeoutException.getClass()
@HotSpotIntrinsicCandidate public final native Class<?> getClass();
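Before the longer real-world examples below, a minimal self-contained sketch of the basic pattern: getClass() is typically called on a caught SocketTimeoutException to put the exception's runtime class name into a log message or error code. The host, port, and timeout values here are illustrative assumptions, not taken from any of the examples that follow.

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.SocketTimeoutException;

public class SocketTimeoutGetClassExample {
    public static void main(String[] args) {
        // Hypothetical endpoint and timeouts, chosen only for illustration.
        String host = "example.com";
        int port = 80;
        int connectTimeoutMillis = 1000;
        int readTimeoutMillis = 2000;

        try (Socket socket = new Socket()) {
            socket.connect(new InetSocketAddress(host, port), connectTimeoutMillis);
            socket.setSoTimeout(readTimeoutMillis);
            // A read that exceeds the SO_TIMEOUT throws SocketTimeoutException.
            socket.getInputStream().read();
        } catch (SocketTimeoutException ste) {
            // getClass() gives the runtime class; getName()/getSimpleName() are
            // convenient for log messages and error codes.
            System.err.println(ste.getClass().getName() + ": " + ste.getMessage());
        } catch (IOException e) {
            System.err.println(e.getClass().getSimpleName() + ": " + e.getMessage());
        }
    }
}

Against a server that accepts the connection but sends nothing, this typically prints something like "java.net.SocketTimeoutException: Read timed out".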
From source file:org.apache.manifoldcf.crawler.connectors.rss.RSSConnector.java
/** Process a set of documents.
* This is the method that should cause each document to be fetched, processed, and the results either added
* to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
* The document specification allows this class to filter what is done based on the job.
* The connector will be connected before this method can be called.
*@param documentIdentifiers is the set of document identifiers to process.
*@param statuses are the currently-stored document versions for each document in the set of document identifiers
* passed in above.
*@param activities is the interface this method should use to queue up new document references
* and ingest documents.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
*@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
*/
@Override
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
    throws ManifoldCFException, ServiceInterruption {
  getSession();

  // The connection limit is designed to permit this connector to coexist with potentially other connectors, such as the web connector.
  // There is currently no good way to enforce connection limits across all installed connectors - this will require considerably more
  // thought to set up properly.
  int connectionLimit = 200;

  String[] fixedList = new String[2];

  if (Logging.connectors.isDebugEnabled())
    Logging.connectors.debug("RSS: In getDocumentVersions for " + Integer.toString(documentIdentifiers.length) + " documents");

  Filter f = new Filter(spec, false);

  String[] acls = f.getAcls();
  // Sort it,
  java.util.Arrays.sort(acls);

  // NOTE: There are two kinds of documents in here; documents that are RSS feeds (that presumably have a content-type
  // of text/xml), and documents that need to be indexed.
  //
  // For the latter, the metadata etc is part of the version string. For the former, the only thing that is part of the version string is the
  // document's checksum.
  //
  // The need to exclude documents from fetch based on whether they match an expression causes some difficulties, because we really
  // DON'T want this to apply to the feeds themselves. Since the distinguishing characteristic of a feed is that it is in the seed list,
  // and that its content-type is text/xml, we could use either of these characteristics to treat feeds differently from
  // fetchable urls. But the latter approach requires a fetch, which is forbidden. So - the spec will be used to characterize the url.
  // However, the spec might change, and the url might be dropped from the list - and then what??
  //
  // The final solution is to simply not queue what cannot be mapped.

  int feedTimeout = f.getFeedTimeoutValue();

  // The document specification has already been used to trim out documents that are not
  // allowed from appearing in the queue. So, even that has already been done.

  for (String documentIdentifier : documentIdentifiers) {
    // If it is in this list, we presume that it has been vetted against the map etc., so we don't do that again. We just fetch it.
    // And, if the content type is xml, we calculate the version as if it is a feed rather than a document.

    // Get the url
    String urlValue = documentIdentifier;

    if (Logging.connectors.isDebugEnabled())
      Logging.connectors.debug("RSS: Getting version string for '" + urlValue + "'");

    String versionString;
    String ingestURL = null;
    String[] pubDates = null;
    String[] sources = null;
    String[] titles = null;
    String[] authorNames = null;
    String[] authorEmails = null;
    String[] categories = null;
    String[] descriptions = null;

    try {
      // If there's a carrydown "data" value for this url, we use that value rather than actually fetching the document. This also means we don't need to
      // do a robots check, because we aren't actually crawling anything. So, ALWAYS do this first...
      CharacterInput[] dechromedData = activities.retrieveParentDataAsFiles(urlValue, "data");
      try {
        if (dechromedData.length > 0) {
          // Data already available. The fetch cycle can be entirely avoided, as can the robots check.
          ingestURL = f.mapDocumentURL(urlValue);
          if (ingestURL != null) {
            // Open up an input stream corresponding to the carrydown data. The stream will be encoded as utf-8.
            try {
              InputStream is = dechromedData[0].getUtf8Stream();
              try {
                StringBuilder sb = new StringBuilder();
                long checkSum = cache.addData(activities, urlValue, "text/html", is);
                // Grab what we need from the passed-down data for the document. These will all become part
                // of the version string.
                pubDates = activities.retrieveParentData(urlValue, "pubdate");
                sources = activities.retrieveParentData(urlValue, "source");
                titles = activities.retrieveParentData(urlValue, "title");
                authorNames = activities.retrieveParentData(urlValue, "authorname");
                authorEmails = activities.retrieveParentData(urlValue, "authoremail");
                categories = activities.retrieveParentData(urlValue, "category");
                descriptions = activities.retrieveParentData(urlValue, "description");
                java.util.Arrays.sort(pubDates);
                java.util.Arrays.sort(sources);
                java.util.Arrays.sort(titles);
                java.util.Arrays.sort(authorNames);
                java.util.Arrays.sort(authorEmails);
                java.util.Arrays.sort(categories);
                java.util.Arrays.sort(descriptions);

                if (sources.length == 0) {
                  if (Logging.connectors.isDebugEnabled())
                    Logging.connectors.debug("RSS: Warning; URL '" + ingestURL + "' doesn't seem to have any RSS feed source!");
                }

                sb.append('+');
                packList(sb, acls, '+');
                if (acls.length > 0) {
                  sb.append('+');
                  pack(sb, defaultAuthorityDenyToken, '+');
                } else
                  sb.append('-');
                // The ingestion URL
                pack(sb, ingestURL, '+');
                // The pub dates
                packList(sb, pubDates, '+');
                // The titles
                packList(sb, titles, '+');
                // The sources
                packList(sb, sources, '+');
                // The categories
                packList(sb, categories, '+');
                // The descriptions
                packList(sb, descriptions, '+');
                // The author names
                packList(sb, authorNames, '+');
                // The author emails
                packList(sb, authorEmails, '+');

                // Do the checksum part, which does not need to be parseable.
                sb.append(new Long(checkSum).toString());

                versionString = sb.toString();
              } finally {
                is.close();
              }
            } catch (java.net.SocketTimeoutException e) {
              throw new ManifoldCFException("IO exception reading data from string: " + e.getMessage(), e);
            } catch (InterruptedIOException e) {
              throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
            } catch (IOException e) {
              throw new ManifoldCFException("IO exception reading data from string: " + e.getMessage(), e);
            }
          } else {
            // Document a seed or unmappable; just skip
            if (Logging.connectors.isDebugEnabled())
              Logging.connectors.debug("RSS: Skipping carry-down document '" + urlValue + "' because it is unmappable or is a seed.");
          }
        } else {
          // Get the old version string
          String oldVersionString = statuses.getIndexedVersionString(documentIdentifier);

          // Unpack the old version as much as possible.
          // We are interested in what the ETag and Last-Modified headers were last time.
          String lastETagValue = null;
          String lastModifiedValue = null;

          // Note well: Non-continuous jobs cannot use etag because the rss document MUST be fetched each time for such jobs,
          // or the documents it points at would get deleted.
          //
          // NOTE: I disabled this code because we really need the feed's TTL value in order to reschedule properly. I can't get the
          // TTL value without refetching the document - therefore ETag and Last-Modified cannot be used :-(
          if (false && jobMode == JOBMODE_CONTINUOUS && oldVersionString != null && oldVersionString.startsWith("-")) {
            // It's a feed, so the last etag and last-modified fields should be encoded in this version string.
            StringBuilder lastETagBuffer = new StringBuilder();
            int unpackPos = unpack(lastETagBuffer, oldVersionString, 1, '+');
            StringBuilder lastModifiedBuffer = new StringBuilder();
            unpackPos = unpack(lastModifiedBuffer, oldVersionString, unpackPos, '+');
            if (lastETagBuffer.length() > 0)
              lastETagValue = lastETagBuffer.toString();
            if (lastModifiedBuffer.length() > 0)
              lastModifiedValue = lastModifiedBuffer.toString();
          }

          if (Logging.connectors.isDebugEnabled() && (lastETagValue != null || lastModifiedValue != null))
            Logging.connectors.debug("RSS: Document '" + urlValue + "' was found to have a previous ETag value of '"
                + ((lastETagValue == null) ? "null" : lastETagValue) + "' and a previous Last-Modified value of '"
                + ((lastModifiedValue == null) ? "null" : lastModifiedValue) + "'");

          // Robots check. First, we need to separate the url into its components
          URL url;
          try {
            url = new URL(urlValue);
          } catch (MalformedURLException e) {
            Logging.connectors.debug("RSS: URL '" + urlValue + "' is malformed; skipping", e);
            activities.deleteDocument(documentIdentifier);
            continue;
          }

          String protocol = url.getProtocol();
          int port = url.getPort();
          String hostName = url.getHost();
          String pathPart = url.getFile();

          // Check with robots to see if it's allowed
          if (robotsUsage >= ROBOTS_DATA && !robots.isFetchAllowed(currentContext, throttleGroupName, protocol, port, hostName,
              url.getPath(), userAgent, from, proxyHost, proxyPort, proxyAuthDomain, proxyAuthUsername, proxyAuthPassword,
              activities, connectionLimit)) {
            activities.recordActivity(null, ACTIVITY_FETCH, null, urlValue, Integer.toString(-2), "Robots exclusion", null);

            if (Logging.connectors.isDebugEnabled())
              Logging.connectors.debug("RSS: Skipping url '" + urlValue + "' because robots.txt says to");
            activities.deleteDocument(documentIdentifier);
            continue;
          }

          // Now, use the fetcher, and get the file.
          IThrottledConnection connection = fetcher.createConnection(currentContext, throttleGroupName, hostName,
              connectionLimit, feedTimeout, proxyHost, proxyPort, proxyAuthDomain, proxyAuthUsername, proxyAuthPassword,
              activities);
          try {
            // Begin the fetch
            connection.beginFetch("Data");
            try {
              // Execute the request.
              // Use the connect timeout from the document specification!
              int status = connection.executeFetch(protocol, port, pathPart, userAgent, from, lastETagValue, lastModifiedValue);
              switch (status) {
              case IThrottledConnection.STATUS_NOCHANGE:
                versionString = oldVersionString;
                break;
              case IThrottledConnection.STATUS_OK:
                try {
                  if (Logging.connectors.isDebugEnabled())
                    Logging.connectors.debug("RSS: Successfully fetched " + urlValue);
                  // Document successfully fetched!
                  // If its content is xml, presume it's a feed...
                  String contentType = connection.getResponseHeader("Content-Type");
                  // Some sites have multiple content types. We just look at the LAST one in that case.
                  if (contentType != null) {
                    String[] contentTypes = contentType.split(",");
                    if (contentTypes.length > 0)
                      contentType = contentTypes[contentTypes.length - 1].trim();
                    else
                      contentType = null;
                  }
                  String strippedContentType = contentType;
                  if (strippedContentType != null) {
                    int pos = strippedContentType.indexOf(";");
                    if (pos != -1)
                      strippedContentType = strippedContentType.substring(0, pos).trim();
                  }
                  boolean isXML = (strippedContentType != null && xmlContentTypes.contains(strippedContentType));
                  ingestURL = null;
                  if (!isXML) {
                    // If the chromed content mode is set to "skip", and we got here, it means
                    // we should not include the content.
                    if (f.getChromedContentMode() == CHROMED_SKIP) {
                      if (Logging.connectors.isDebugEnabled())
                        Logging.connectors.debug("RSS: Removing url '" + urlValue + "' because it no longer has dechromed content available");
                      versionString = null;
                      break;
                    }

                    // Decide whether to exclude this document based on what we see here.
                    // Basically, we want to get rid of everything that we don't know what
                    // to do with in the ingestion system.
                    if (!activities.checkMimeTypeIndexable(contentType)) {
                      if (Logging.connectors.isDebugEnabled())
                        Logging.connectors.debug("RSS: Removing url '" + urlValue + "' because it had the wrong content type: "
                            + ((contentType == null) ? "null" : "'" + contentType + "'"));
                      versionString = null;
                      break;
                    }

                    ingestURL = f.mapDocumentURL(urlValue);
                  } else {
                    if (Logging.connectors.isDebugEnabled())
                      Logging.connectors.debug("RSS: The url '" + urlValue + "' is a feed");

                    if (!f.isSeed(urlValue)) {
                      // Remove the feed from consideration, since it has left the list of seeds
                      if (Logging.connectors.isDebugEnabled())
                        Logging.connectors.debug("RSS: Removing feed url '" + urlValue + "' because it is not a seed.");
                      versionString = null;
                      break;
                    }
                  }

                  InputStream is = connection.getResponseBodyStream();
                  try {
                    long checkSum = cache.addData(activities, urlValue, contentType, is);
                    StringBuilder sb = new StringBuilder();
                    if (ingestURL != null) {
                      // We think it is ingestable. The version string accordingly starts with a "+".

                      // Grab what we need from the passed-down data for the document. These will all become part
                      // of the version string.
                      pubDates = activities.retrieveParentData(urlValue, "pubdate");
                      sources = activities.retrieveParentData(urlValue, "source");
                      titles = activities.retrieveParentData(urlValue, "title");
                      authorNames = activities.retrieveParentData(urlValue, "authorname");
                      authorEmails = activities.retrieveParentData(urlValue, "authoremail");
                      categories = activities.retrieveParentData(urlValue, "category");
                      descriptions = activities.retrieveParentData(urlValue, "description");
                      java.util.Arrays.sort(pubDates);
                      java.util.Arrays.sort(sources);
                      java.util.Arrays.sort(titles);
                      java.util.Arrays.sort(authorNames);
                      java.util.Arrays.sort(authorEmails);
                      java.util.Arrays.sort(categories);
                      java.util.Arrays.sort(descriptions);

                      if (sources.length == 0) {
                        if (Logging.connectors.isDebugEnabled())
                          Logging.connectors.debug("RSS: Warning; URL '" + ingestURL + "' doesn't seem to have any RSS feed source!");
                      }

                      sb.append('+');
                      packList(sb, acls, '+');
                      if (acls.length > 0) {
                        sb.append('+');
                        pack(sb, defaultAuthorityDenyToken, '+');
                      } else
                        sb.append('-');
                      // The ingestion URL
                      pack(sb, ingestURL, '+');
                      // The pub dates
                      packList(sb, pubDates, '+');
                      // The titles
                      packList(sb, titles, '+');
                      // The sources
                      packList(sb, sources, '+');
                      // The categories
                      packList(sb, categories, '+');
                      // The descriptions
                      packList(sb, descriptions, '+');
                      // The author names
                      packList(sb, authorNames, '+');
                      // The author emails
                      packList(sb, authorEmails, '+');
                    } else {
                      sb.append('-');
                      String etag = connection.getResponseHeader("ETag");
                      if (etag == null)
                        pack(sb, "", '+');
                      else
                        pack(sb, etag, '+');
                      String lastModified = connection.getResponseHeader("Last-Modified");
                      if (lastModified == null)
                        pack(sb, "", '+');
                      else
                        pack(sb, lastModified, '+');
                    }

                    // Do the checksum part, which does not need to be parseable.
                    sb.append(new Long(checkSum).toString());

                    versionString = sb.toString();
                  } finally {
                    is.close();
                  }
                } catch (java.net.SocketTimeoutException e) {
                  Logging.connectors.warn("RSS: Socket timeout exception fetching document contents '" + urlValue
                      + "' - skipping: " + e.getMessage(), e);
                  versionString = null;
                } catch (ConnectTimeoutException e) {
                  Logging.connectors.warn("RSS: Connecto timeout exception fetching document contents '" + urlValue
                      + "' - skipping: " + e.getMessage(), e);
                  versionString = null;
                } catch (InterruptedIOException e) {
                  throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
                } catch (IOException e) {
                  Logging.connectors.warn("RSS: IO exception fetching document contents '" + urlValue
                      + "' - skipping: " + e.getMessage(), e);
                  versionString = null;
                }
                break;
              case IThrottledConnection.STATUS_SITEERROR:
              case IThrottledConnection.STATUS_PAGEERROR:
              default:
                // Record an *empty* version.
                // This signals the processDocuments() method that we really don't want to ingest this document, but we also don't
                // want to blow the document out of the queue, since then we'd wind up perhaps fetching it multiple times.
                versionString = "";
                break;
              }
            } finally {
              connection.doneFetch(activities);
            }
          } finally {
            connection.close();
          }

          if (versionString == null) {
            activities.deleteDocument(documentIdentifier);
            continue;
          }

          if (!(versionString.length() == 0 || activities.checkDocumentNeedsReindexing(documentIdentifier, versionString)))
            continue;

          // Process document!
          if (Logging.connectors.isDebugEnabled())
            Logging.connectors.debug("RSS: Processing '" + urlValue + "'");

          // The only links we extract come from documents that we think are RSS feeds.
          // When we think that's the case, we attempt to parse it as RSS XML.
          if (ingestURL == null) {
            if (Logging.connectors.isDebugEnabled())
              Logging.connectors.debug("RSS: Interpreting document '" + urlValue + "' as a feed");

            // We think it is a feed.
            // If this is a continuous job, AND scanonly is true, it means that the document was either identical to the
            // previous fetch, or was not fetched at all. In that case, it may not even be there, and we *certainly* don't
            // want to attempt to process it in any case.
            //
            // NOTE: I re-enabled the scan permanently because we need the TTL value to be set whatever the cost. If the
            // TTL value is not set, we default to the specified job's feed-rescan time, which is not going to be current enough for some feeds.
            if (true || jobMode != JOBMODE_CONTINUOUS) {
              handleRSSFeedSAX(urlValue, activities, f);
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("RSS: Extraction of feed '" + urlValue + "' complete");

              // Record the feed's version string, so we won't refetch unless needed.
              // This functionality is required for the last ETag and Last-Modified fields to be sent to the rss server, and to
              // keep track of the adaptive parameters.
              activities.recordDocument(documentIdentifier, versionString);
            } else {
              // The problem here is that we really do need to set the rescan time to something reasonable.
              // But we might not even have read the feed! So what to do??
              // One answer is to build a connector-specific table that carries the last value of every feed around.
              // Another answer is to change the version code to always read the feed (and the heck with ETag and Last-Modified).
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("RSS: Feed '" + urlValue
                    + "' does not appear to differ from previous fetch for a continuous job; not extracting!");

              long currentTime = System.currentTimeMillis();

              Long defaultRescanTime = f.getDefaultRescanTime(currentTime);

              if (defaultRescanTime != null) {
                Long minimumTime = f.getMinimumRescanTime(currentTime);
                if (minimumTime != null) {
                  if (defaultRescanTime.longValue() < minimumTime.longValue())
                    defaultRescanTime = minimumTime;
                }
              }

              activities.setDocumentScheduleBounds(urlValue, defaultRescanTime, defaultRescanTime, null, null);
            }
          } else {
            if (Logging.connectors.isDebugEnabled())
              Logging.connectors.debug("RSS: Interpreting '" + urlValue + "' as a document");

            String errorCode = null;
            String errorDesc = null;
            long startTime = System.currentTimeMillis();
            Long fileLengthLong = null;
            try {
              long documentLength = cache.getDataLength(documentIdentifier);
              if (!activities.checkLengthIndexable(documentLength)) {
                activities.noDocument(documentIdentifier, versionString);
                errorCode = activities.EXCLUDED_LENGTH;
                errorDesc = "Document rejected because of length (" + documentLength + ")";
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("RSS: Skipping document '" + urlValue + "' because its length was rejected ("
                      + documentLength + ")");
                continue;
              }

              if (!activities.checkURLIndexable(documentIdentifier)) {
                activities.noDocument(documentIdentifier, versionString);
                errorCode = activities.EXCLUDED_URL;
                errorDesc = "Document rejected because of URL ('" + documentIdentifier + "')";
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("RSS: Skipping document '" + urlValue + "' because its URL was rejected ('"
                      + documentIdentifier + "')");
                continue;
              }

              // Check if it's a recognized content type
              String contentType = cache.getContentType(documentIdentifier);
              // Some sites have multiple content types. We just look at the LAST one in that case.
              if (contentType != null) {
                String[] contentTypes = contentType.split(",");
                if (contentTypes.length > 0)
                  contentType = contentTypes[contentTypes.length - 1].trim();
                else
                  contentType = null;
              }

              if (!activities.checkMimeTypeIndexable(contentType)) {
                activities.noDocument(documentIdentifier, versionString);
                errorCode = activities.EXCLUDED_MIMETYPE;
                errorDesc = "Document rejected because of mime type (" + contentType + ")";
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("RSS: Skipping document '" + urlValue + "' because its mime type was rejected ('"
                      + contentType + "')");
                continue;
              }

              // Treat it as an ingestable document.
              long dataSize = cache.getDataLength(urlValue);
              RepositoryDocument rd = new RepositoryDocument();

              // Set content type
              if (contentType != null)
                rd.setMimeType(contentType);

              // Turn into acls and add into description
              String[] denyAcls;
              if (acls == null)
                denyAcls = null;
              else if (acls.length == 0)
                denyAcls = new String[0];
              else
                denyAcls = new String[] { defaultAuthorityDenyToken };

              if (acls != null && denyAcls != null)
                rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT, acls, denyAcls);

              if (titles != null && titles.length > 0)
                rd.addField("title", titles);
              if (authorNames != null && authorNames.length > 0)
                rd.addField("authorname", authorNames);
              if (authorEmails != null && authorEmails.length > 0)
                rd.addField("authoremail", authorEmails);
              if (descriptions != null && descriptions.length > 0)
                rd.addField("summary", descriptions);
              if (sources != null && sources.length > 0)
                rd.addField("source", sources);
              if (categories != null && categories.length > 0)
                rd.addField("category", categories);

              // The pubdates are a ms since epoch value; we want the minimum one for the origination time.
              Long minimumOrigTime = null;
              if (pubDates != null && pubDates.length > 0) {
                String[] pubDateValuesISO = new String[pubDates.length];
                TimeZone tz = TimeZone.getTimeZone("UTC");
                DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm'Z'");
                df.setTimeZone(tz);
                for (int k = 0; k < pubDates.length; k++) {
                  String pubDate = pubDates[k];
                  try {
                    Long pubDateLong = new Long(pubDate);
                    if (minimumOrigTime == null || pubDateLong.longValue() < minimumOrigTime.longValue())
                      minimumOrigTime = pubDateLong;
                    pubDateValuesISO[k] = df.format(new Date(pubDateLong.longValue()));
                  } catch (NumberFormatException e) {
                    // Do nothing; the version string seems to not mean anything
                    pubDateValuesISO[k] = "";
                  }
                }
                rd.addField("pubdate", pubDates);
                rd.addField("pubdateiso", pubDateValuesISO);
              }

              if (minimumOrigTime != null)
                activities.setDocumentOriginationTime(urlValue, minimumOrigTime);

              InputStream is = cache.getData(urlValue);
              if (is != null) {
                try {
                  rd.setBinary(is, dataSize);
                  try {
                    activities.ingestDocumentWithException(documentIdentifier, versionString, ingestURL, rd);
                    errorCode = "OK";
                    fileLengthLong = new Long(dataSize);
                  } catch (IOException e) {
                    errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                    errorDesc = e.getMessage();
                    handleIOException(e, "reading data");
                  }
                } finally {
                  try {
                    is.close();
                  } catch (IOException e) {
                    errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                    errorDesc = e.getMessage();
                    handleIOException(e, "closing stream");
                  }
                }
              }
            } catch (ManifoldCFException e) {
              if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
                errorCode = null;
              throw e;
            } finally {
              if (errorCode != null)
                activities.recordActivity(new Long(startTime), ACTIVITY_PROCESS, null, urlValue, errorCode, errorDesc, null);
            }
          }
        }
      } finally {
        for (CharacterInput ci : dechromedData) {
          if (ci != null)
            ci.discard();
        }
      }
    } finally {
      // Remove any fetched documents.
      cache.deleteData(documentIdentifier);
    }
  }
}
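The connector above distinguishes read timeouts (skip the document), interruptions (abort the job), and other I/O failures, and elsewhere derives a recorded error code from e.getClass().getSimpleName(). Below is a condensed, hedged sketch of that exception-handling shape using only JDK classes; the class name, method name, and URL in main() are assumptions for illustration, not ManifoldCF APIs, and the ConnectTimeoutException caught above (an Apache HttpClient class) is omitted here.

import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.net.HttpURLConnection;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.util.Locale;

public class FetchErrorCodeSketch {

    // Returns "OK" or an error code derived from the exception's runtime class,
    // mirroring how the connector records e.getClass().getSimpleName().
    static String fetchErrorCode(String urlValue, int connectTimeoutMillis, int readTimeoutMillis)
            throws InterruptedIOException {
        try {
            HttpURLConnection conn = (HttpURLConnection) new URL(urlValue).openConnection();
            conn.setConnectTimeout(connectTimeoutMillis);
            conn.setReadTimeout(readTimeoutMillis);
            try (InputStream is = conn.getInputStream()) {
                is.readAllBytes(); // drain the body; content handling is out of scope here
            }
            return "OK";
        } catch (SocketTimeoutException e) {
            // Read (and, for HttpURLConnection, connect) timeouts: skip, don't abort.
            return e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
        } catch (InterruptedIOException e) {
            // Interruption is not a "skip"; let the caller abort, as the connector does.
            throw e;
        } catch (IOException e) {
            return e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
        }
    }

    public static void main(String[] args) throws InterruptedIOException {
        System.out.println(fetchErrorCode("https://example.com/feed.xml", 1000, 5000));
    }
}

Catching SocketTimeoutException before InterruptedIOException matters because SocketTimeoutException is a subclass of InterruptedIOException; the connector uses the same ordering.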
From source file:org.forgerock.maven.plugins.LinkTester.java
private void checkUrl(String path, String docUrl) {
    if (shouldSkipUrl(docUrl)) {
        debug("Skipping " + docUrl + " since it matches a skipUrlPattern");
        return;
    }
    if (tested.contains(docUrl)) {
        if (failedUrls.containsValue(docUrl)) {
            failedUrls.put(path, docUrl);
        }
        return;
    }
    debug("Checking " + docUrl + " from file: " + path);
    try {
        URL url = new URL(docUrl);
        URLConnection urlConn = url.openConnection();
        if (urlConn instanceof HttpURLConnection) {
            HttpURLConnection conn = (HttpURLConnection) urlConn;
            if (conn instanceof HttpsURLConnection) {
                HttpsURLConnection httpsConn = (HttpsURLConnection) conn;
                httpsConn.setHostnameVerifier(new TrustAllHostnameVerifier());
                httpsConn.setSSLSocketFactory(TRUST_ALL_SOCKET_FACTORY);
            }
            conn.setConnectTimeout(1000);
            //if we don't get anything back within 15 seconds it is safe to assume that something is really wrong
            //with that site..
            conn.setReadTimeout(15000);
            int responseCode = conn.getResponseCode();
            if (responseCode >= 400) {
                warn(docUrl + ": received unexpected response code: " + responseCode);
                failedUrls.put(path, docUrl);
            }
        }
    } catch (SocketTimeoutException ste) {
        warn(docUrl + ": " + ste.getClass().getName() + " " + ste.getMessage());
        timedOutUrls.put(path, docUrl);
    } catch (Exception ex) {
        warn(docUrl + ": " + ex.getClass().getName() + " " + ex.getMessage());
        failedUrls.put(path, docUrl);
    }
    tested.add(docUrl);
}
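The same idea reduced to its essentials: a sketch of a link checker that separates timed-out URLs from otherwise failed ones and, like checkUrl() above, uses getClass().getName() when logging the failure. The timeout values, the plain System.err logging, and the sample URLs are assumptions for illustration.

import java.net.HttpURLConnection;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SimpleLinkChecker {

    private final Map<String, String> failedUrls = new HashMap<>();
    private final Map<String, String> timedOutUrls = new HashMap<>();

    void check(String path, String docUrl) {
        try {
            HttpURLConnection conn = (HttpURLConnection) new URL(docUrl).openConnection();
            conn.setConnectTimeout(1000);   // fail fast if the host is unreachable
            conn.setReadTimeout(15000);     // allow slow but working sites time to respond
            int responseCode = conn.getResponseCode();
            if (responseCode >= 400) {
                System.err.println(docUrl + ": received unexpected response code: " + responseCode);
                failedUrls.put(path, docUrl);
            }
        } catch (SocketTimeoutException ste) {
            // Timeouts are tracked separately from hard failures.
            System.err.println(docUrl + ": " + ste.getClass().getName() + " " + ste.getMessage());
            timedOutUrls.put(path, docUrl);
        } catch (Exception ex) {
            System.err.println(docUrl + ": " + ex.getClass().getName() + " " + ex.getMessage());
            failedUrls.put(path, docUrl);
        }
    }

    public static void main(String[] args) {
        SimpleLinkChecker checker = new SimpleLinkChecker();
        for (String url : List.of("https://example.com/", "https://example.org/missing")) {
            checker.check("docs/index.html", url);
        }
        System.out.println("Timed out: " + checker.timedOutUrls.size()
                + ", failed: " + checker.failedUrls.size());
    }
}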
From source file:org.zaproxy.zap.extension.ascanrules.TestPathTraversal.java
/**
 * scans all GET and POST parameters for Path Traversal vulnerabilities
 *
 * @param msg
 * @param param
 * @param value
 */
@Override
public void scan(HttpMessage msg, String param, String value) {
    try {
        // figure out how aggressively we should test
        int nixCount = 0;
        int winCount = 0;
        int dirCount = 0;
        int localTraversalLength = 0;

        // DEBUG only
        if (log.isDebugEnabled()) {
            log.debug("Attacking at Attack Strength: " + this.getAttackStrength());
        }

        switch (this.getAttackStrength()) {
        case LOW:
            // This works out as a total of 2+2+2+0*4+1 = 7 reqs / param
            nixCount = 2;
            winCount = 2;
            dirCount = 2;
            localTraversalLength = 0;
            break;
        case MEDIUM:
            // This works out as a total of 2+4+4+1*4+1 = 15 reqs / param
            nixCount = 2;
            winCount = 4;
            dirCount = 4;
            localTraversalLength = 1;
            break;
        case HIGH:
            // This works out as a total of 4+8+7+2*4+1 = 28 reqs / param
            nixCount = 4;
            winCount = 8;
            dirCount = 7;
            localTraversalLength = 2;
            break;
        case INSANE:
            // This works out as a total of 6+18+19+4*4+1 = 60 reqs / param
            nixCount = NIX_LOCAL_FILE_TARGETS.length;
            winCount = WIN_LOCAL_FILE_TARGETS.length;
            dirCount = LOCAL_DIR_TARGETS.length;
            localTraversalLength = 4;
            break;
        default:
            // Default to off
        }

        if (log.isDebugEnabled()) {
            log.debug("Checking [" + getBaseMsg().getRequestHeader().getMethod() + "] ["
                    + getBaseMsg().getRequestHeader().getURI() + "], parameter [" + param
                    + "] for Path Traversal to local files");
        }

        // Check 1: Start detection for Windows patterns
        // note that depending on the AttackLevel, the number of prefixes that we will try
        // changes.
        if (inScope(Tech.Windows)) {
            for (int h = 0; h < winCount; h++) {
                // Check if a there was a finding or the scan has been stopped
                // if yes dispose resources and exit
                if (sendAndCheckPayload(param, WIN_LOCAL_FILE_TARGETS[h], WIN_PATTERN) || isStop()) {
                    // Dispose all resources
                    // Exit the plugin
                    return;
                }
            }
        }

        // Check 2: Start detection for *NIX patterns
        // note that depending on the AttackLevel, the number of prefixes that we will try
        // changes.
        if (inScope(Tech.Linux) || inScope(Tech.MacOS)) {
            for (int h = 0; h < nixCount; h++) {
                // Check if a there was a finding or the scan has been stopped
                // if yes dispose resources and exit
                if (sendAndCheckPayload(param, NIX_LOCAL_FILE_TARGETS[h], NIX_PATTERN) || isStop()) {
                    // Dispose all resources
                    // Exit the plugin
                    return;
                }
            }
        }

        // Check 3: Detect if this page is a directory browsing component
        // example: https://www.buggedsite.org/log/index.php?dir=C:\
        // note that depending on the AttackLevel, the number of prefixes that we will try
        // changes.
        for (int h = 0; h < dirCount; h++) {
            // Check if a there was a finding or the scan has been stopped
            // if yes dispose resources and exit
            if (sendAndCheckPayload(param, LOCAL_DIR_TARGETS[h], DIR_PATTERN) || isStop()) {
                // Dispose all resources
                // Exit the plugin
                return;
            }
        }

        // Check 4: Start detection for internal well known files
        // try variants based on increasing ../ ..\ prefixes and the presence of the / and \
        // trailer
        // e.g. WEB-INF/web.xml, /WEB-INF/web.xml, ../WEB-INF/web.xml, /../WEB-INF/web.xml, ecc.
        // Both slashed and backslashed variants are checked
        // -------------------------------
        // Currently we've always checked only for J2EE known files
        // and this remains also for this version
        //
        // Web.config for .NET in the future?
        // -------------------------------
        String sslashPattern = "WEB-INF/web.xml";
        // The backslashed version of the same check
        String bslashPattern = sslashPattern.replace('/', '\\');

        if (inScope(Tech.Tomcat)) {
            for (int idx = 0; idx < localTraversalLength; idx++) {
                // Check if a there was a finding or the scan has been stopped
                // if yes dispose resources and exit
                if (sendAndCheckPayload(param, sslashPattern, WAR_PATTERN)
                        || sendAndCheckPayload(param, bslashPattern, WAR_PATTERN)
                        || sendAndCheckPayload(param, '/' + sslashPattern, WAR_PATTERN)
                        || sendAndCheckPayload(param, '\\' + bslashPattern, WAR_PATTERN)
                        || isStop()) {
                    // Dispose all resources
                    // Exit the plugin
                    return;
                }

                sslashPattern = "../" + sslashPattern;
                bslashPattern = "..\\" + bslashPattern;
            }
        }

        // Check 5: try a local file Path Traversal on the file name of the URL (which obviously
        // will not be in the target list above).
        // first send a query for a random parameter value, and see if we get a 200 back
        // if 200 is returned, abort this check (on the url filename itself), because it would
        // be unreliable.
        // if we know that a random query returns <> 200, then a 200 response likely means
        // something!
        // this logic is all about avoiding false positives, while still attempting to match on
        // actual vulnerabilities
        msg = getNewMsg();
        setParameter(msg, param, NON_EXISTANT_FILENAME);

        // send the modified message (with a hopefully non-existent filename), and see what we
        // get back
        try {
            sendAndReceive(msg);
        } catch (SocketException | IllegalStateException | UnknownHostException | IllegalArgumentException
                | InvalidRedirectLocationException | URIException ex) {
            if (log.isDebugEnabled()) {
                log.debug("Caught " + ex.getClass().getName() + " " + ex.getMessage()
                        + " when accessing: " + msg.getRequestHeader().getURI().toString());
            }
            return; // Something went wrong, no point continuing
        }

        // do some pattern matching on the results.
        Pattern errorPattern = Pattern.compile("Exception|Error");
        Matcher errorMatcher = errorPattern.matcher(msg.getResponseBody().toString());

        String urlfilename = msg.getRequestHeader().getURI().getName();

        // url file name may be empty, i.e. there is no file name for next check
        if (!StringUtils.isEmpty(urlfilename) && (msg.getResponseHeader().getStatusCode() != HttpStatusCode.OK
                || errorMatcher.find())) {
            if (log.isDebugEnabled()) {
                log.debug("It is possible to check for local file Path Traversal on the url filename on ["
                        + msg.getRequestHeader().getMethod() + "] [" + msg.getRequestHeader().getURI()
                        + "], [" + param + "]");
            }

            String prefixedUrlfilename;

            // for the url filename, try each of the prefixes in turn
            for (String prefix : LOCAL_FILE_RELATIVE_PREFIXES) {
                prefixedUrlfilename = prefix + urlfilename;
                msg = getNewMsg();
                setParameter(msg, param, prefixedUrlfilename);

                // send the modified message (with the url filename), and see what we get back
                try {
                    sendAndReceive(msg);
                } catch (SocketException | IllegalStateException | UnknownHostException | IllegalArgumentException
                        | InvalidRedirectLocationException | URIException ex) {
                    if (log.isDebugEnabled()) {
                        log.debug("Caught " + ex.getClass().getName() + " " + ex.getMessage()
                                + " when accessing: " + msg.getRequestHeader().getURI().toString());
                    }
                    continue; // Something went wrong, move to the next prefix in the loop
                }

                // did we get an Exception or an Error?
                errorMatcher = errorPattern.matcher(msg.getResponseBody().toString());
                if ((msg.getResponseHeader().getStatusCode() == HttpStatusCode.OK) && (!errorMatcher.find())) {
                    // if it returns OK, and the random string above did NOT return ok, then
                    // raise an alert
                    // since the filename has likely been picked up and used as a file name from
                    // the parameter
                    bingo(Alert.RISK_HIGH, Alert.CONFIDENCE_MEDIUM, null, param, prefixedUrlfilename, null, msg);

                    // All done. No need to look for vulnerabilities on subsequent parameters
                    // on the same request (to reduce performance impact)
                    return;
                }

                // Check if the scan has been stopped
                // if yes dispose resources and exit
                if (isStop()) {
                    // Dispose all resources
                    // Exit the plugin
                    return;
                }
            }
        }

        // Check 6 for local file names
        // TODO: consider making this check 1, for performance reasons
        // TODO: if the original query was http://www.example.com/a/b/c/d.jsp?param=paramvalue
        // then check if the following gives comparable results to the original query
        // http://www.example.com/a/b/c/d.jsp?param=../c/paramvalue
        // if it does, then we likely have a local file Path Traversal vulnerability
        // this is nice because it means we do not have to guess any file names, and would only
        // require one
        // request to find the vulnerability
        // but it would be foiled by simple input validation on "..", for instance.

    } catch (SocketTimeoutException ste) {
        log.warn("A timeout occurred while checking [" + msg.getRequestHeader().getMethod() + "] ["
                + msg.getRequestHeader().getURI() + "], parameter [" + param + "] for Path Traversal. "
                + "The currently configured timeout is: " + Integer.toString(
                        Model.getSingleton().getOptionsParam().getConnectionParam().getTimeoutInSecs()));

        if (log.isDebugEnabled()) {
            log.debug("Caught " + ste.getClass().getName() + " " + ste.getMessage());
        }
    } catch (IOException e) {
        log.warn("An error occurred while checking [" + msg.getRequestHeader().getMethod() + "] ["
                + msg.getRequestHeader().getURI() + "], parameter [" + param + "] for Path Traversal."
                + "Caught " + e.getClass().getName() + " " + e.getMessage());
    }
}
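In the scanner above, a single SocketTimeoutException handler wraps the whole probing loop and reports the configured timeout together with ste.getClass().getName(). A rough sketch of that shape with plain HttpURLConnection follows; the target URL, parameter name, payloads, and timeout constant are illustrative assumptions, not ZAP APIs.

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.util.List;

public class TraversalProbeSketch {

    private static final int TIMEOUT_MILLIS = 5000;

    // Send one probe request with the payload in the query string and return the HTTP status.
    static int probe(String baseUrl, String param, String payload) throws IOException {
        URL url = new URL(baseUrl + "?" + param + "=" + payload);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setConnectTimeout(TIMEOUT_MILLIS);
        conn.setReadTimeout(TIMEOUT_MILLIS);
        return conn.getResponseCode();
    }

    public static void main(String[] args) {
        String target = "http://localhost:8080/app/view";
        String param = "file";
        try {
            for (String payload : List.of("../../etc/passwd", "..\\..\\windows\\win.ini")) {
                int status = probe(target, param, payload);
                System.out.println(payload + " -> HTTP " + status);
            }
        } catch (SocketTimeoutException ste) {
            // Report which timeout was in force and the exception's runtime class.
            System.err.println("A timeout occurred while checking [" + target + "], parameter [" + param
                    + "]. The currently configured timeout is: " + TIMEOUT_MILLIS + " ms");
            System.err.println("Caught " + ste.getClass().getName() + " " + ste.getMessage());
        } catch (IOException e) {
            System.err.println("Caught " + e.getClass().getName() + " " + e.getMessage());
        }
    }
}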