Example usage for java.net SocketTimeoutException getClass

Introduction

On this page you can find example usages of getClass() as called on java.net.SocketTimeoutException.

Prototype

@HotSpotIntrinsicCandidate
public final native Class<?> getClass();

Document

Returns the runtime class of this Object.
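
Before the project examples, here is a minimal, self-contained sketch (an illustration, not taken from the projects below) that forces a read timeout on a local socket and then inspects the caught exception with getClass():

import java.io.InputStream;
import java.net.ServerSocket;
import java.net.Socket;
import java.net.SocketTimeoutException;

public class GetClassDemo {
    public static void main(String[] args) throws Exception {
        // A local server that accepts the connection but never sends a byte.
        try (ServerSocket server = new ServerSocket(0);
                Socket client = new Socket("localhost", server.getLocalPort());
                Socket accepted = server.accept()) {
            client.setSoTimeout(250); // any read now fails after 250 ms
            InputStream in = client.getInputStream();
            try {
                in.read(); // blocks until the timeout fires
            } catch (SocketTimeoutException e) {
                // getClass() reports the runtime class of the exception object.
                System.out.println(e.getClass().getName());       // java.net.SocketTimeoutException
                System.out.println(e.getClass().getSimpleName()); // SocketTimeoutException
            }
        }
    }
}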

Usage

From source file: org.apache.manifoldcf.crawler.connectors.rss.RSSConnector.java

/** Process a set of documents.
* This is the method that should cause each document to be fetched, processed, and the results either added
* to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
* The document specification allows this class to filter what is done based on the job.
* The connector will be connected before this method can be called.
*@param documentIdentifiers is the set of document identifiers to process.
*@param statuses are the currently-stored document versions for each document in the set of document identifiers
 * passed in above.
*@param activities is the interface this method should use to queue up new document references
* and ingest documents.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
*@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
*/
@Override
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
        IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
        throws ManifoldCFException, ServiceInterruption {
    getSession();

    // The connection limit is designed to permit this connector to coexist with potentially other connectors, such as the web connector.
    // There is currently no good way to enforce connection limits across all installed connectors - this will require considerably more
    // thought to set up properly.
    int connectionLimit = 200;

    String[] fixedList = new String[2];

    if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("RSS: In getDocumentVersions for "
                + Integer.toString(documentIdentifiers.length) + " documents");

    Filter f = new Filter(spec, false);

    String[] acls = f.getAcls();
    // Sort it.
    java.util.Arrays.sort(acls);

    // NOTE: There are two kinds of documents in here; documents that are RSS feeds (that presumably have a content-type
    // of text/xml), and documents that need to be indexed.
    //
    // For the latter, the metadata etc is part of the version string.  For the former, the only thing that is part of the version string is the
    // document's checksum.
    //
    // The need to exclude documents from fetch based on whether they match an expression causes some difficulties, because we really
    // DON'T want this to apply to the feeds themselves.  Since the distinguishing characteristic of a feed is that it is in the seed list,
    // and that its content-type is text/xml, we could use either of these characteristics to treat feeds differently from
    // fetchable urls.  But the latter approach requires a fetch, which is forbidden.  So - the spec will be used to characterize the url.
    // However, the spec might change, and the url might be dropped from the list - and then what??
    //
    // The final solution is to simply not queue what cannot be mapped.

    int feedTimeout = f.getFeedTimeoutValue();

    // The document specification has already been used to trim out documents that are not
    // allowed to appear in the queue.  So that filtering has already been done.
    for (String documentIdentifier : documentIdentifiers) {
        // If it is in this list, we presume that it has been vetted against the map etc., so we don't do that again.  We just fetch it.
        // And, if the content type is xml, we calculate the version as if it is a feed rather than a document.

        // Get the url
        String urlValue = documentIdentifier;

        if (Logging.connectors.isDebugEnabled())
            Logging.connectors.debug("RSS: Getting version string for '" + urlValue + "'");

        String versionString;
        String ingestURL = null;
        String[] pubDates = null;
        String[] sources = null;
        String[] titles = null;
        String[] authorNames = null;
        String[] authorEmails = null;
        String[] categories = null;
        String[] descriptions = null;

        try {
            // If there's a carrydown "data" value for this url, we use that value rather than actually fetching the document.  This also means we don't need to
            // do a robots check, because we aren't actually crawling anything.  So, ALWAYS do this first...
            CharacterInput[] dechromedData = activities.retrieveParentDataAsFiles(urlValue, "data");
            try {
                if (dechromedData.length > 0) {
                    // Data already available.  The fetch cycle can be entirely avoided, as can the robots check.
                    ingestURL = f.mapDocumentURL(urlValue);
                    if (ingestURL != null) {
                        // Open up an input stream corresponding to the carrydown data.  The stream will be encoded as utf-8.
                        try {
                            InputStream is = dechromedData[0].getUtf8Stream();
                            try {
                                StringBuilder sb = new StringBuilder();
                                long checkSum = cache.addData(activities, urlValue, "text/html", is);
                                // Grab what we need from the passed-down data for the document.  These will all become part
                                // of the version string.
                                pubDates = activities.retrieveParentData(urlValue, "pubdate");
                                sources = activities.retrieveParentData(urlValue, "source");
                                titles = activities.retrieveParentData(urlValue, "title");
                                authorNames = activities.retrieveParentData(urlValue, "authorname");
                                authorEmails = activities.retrieveParentData(urlValue, "authoremail");
                                categories = activities.retrieveParentData(urlValue, "category");
                                descriptions = activities.retrieveParentData(urlValue, "description");
                                java.util.Arrays.sort(pubDates);
                                java.util.Arrays.sort(sources);
                                java.util.Arrays.sort(titles);
                                java.util.Arrays.sort(authorNames);
                                java.util.Arrays.sort(authorEmails);
                                java.util.Arrays.sort(categories);
                                java.util.Arrays.sort(descriptions);

                                if (sources.length == 0) {
                                    if (Logging.connectors.isDebugEnabled())
                                        Logging.connectors.debug("RSS: Warning; URL '" + ingestURL
                                                + "' doesn't seem to have any RSS feed source!");
                                }

                                sb.append('+');
                                packList(sb, acls, '+');
                                if (acls.length > 0) {
                                    sb.append('+');
                                    pack(sb, defaultAuthorityDenyToken, '+');
                                } else
                                    sb.append('-');
                                // The ingestion URL
                                pack(sb, ingestURL, '+');
                                // The pub dates
                                packList(sb, pubDates, '+');
                                // The titles
                                packList(sb, titles, '+');
                                // The sources
                                packList(sb, sources, '+');
                                // The categories
                                packList(sb, categories, '+');
                                // The descriptions
                                packList(sb, descriptions, '+');
                                // The author names
                                packList(sb, authorNames, '+');
                                // The author emails
                                packList(sb, authorEmails, '+');

                                // Do the checksum part, which does not need to be parseable.
                                sb.append(new Long(checkSum).toString());

                                versionString = sb.toString();
                            } finally {
                                is.close();
                            }
                        } catch (java.net.SocketTimeoutException e) {
                            throw new ManifoldCFException(
                                    "IO exception reading data from string: " + e.getMessage(), e);
                        } catch (InterruptedIOException e) {
                            throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
                                    ManifoldCFException.INTERRUPTED);
                        } catch (IOException e) {
                            throw new ManifoldCFException(
                                    "IO exception reading data from string: " + e.getMessage(), e);
                        }
                    } else {
                        // Document is a seed or unmappable; just skip
                        if (Logging.connectors.isDebugEnabled())
                            Logging.connectors.debug("RSS: Skipping carry-down document '" + urlValue
                                    + "' because it is unmappable or is a seed.");
                    }
                } else {
                    // Get the old version string
                    String oldVersionString = statuses.getIndexedVersionString(documentIdentifier);

                    // Unpack the old version as much as possible.
                    // We are interested in what the ETag and Last-Modified headers were last time.
                    String lastETagValue = null;
                    String lastModifiedValue = null;
                    // Note well: Non-continuous jobs cannot use etag because the rss document MUST be fetched each time for such jobs,
                    // or the documents it points at would get deleted.
                    //
                    // NOTE: I disabled this code because we really need the feed's TTL value in order to reschedule properly.  I can't get the
                    // TTL value without refetching the document - therefore ETag and Last-Modified cannot be used :-(
                    if (false && jobMode == JOBMODE_CONTINUOUS && oldVersionString != null
                            && oldVersionString.startsWith("-")) {
                        // It's a feed, so the last etag and last-modified fields should be encoded in this version string.
                        StringBuilder lastETagBuffer = new StringBuilder();
                        int unpackPos = unpack(lastETagBuffer, oldVersionString, 1, '+');
                        StringBuilder lastModifiedBuffer = new StringBuilder();
                        unpackPos = unpack(lastModifiedBuffer, oldVersionString, unpackPos, '+');
                        if (lastETagBuffer.length() > 0)
                            lastETagValue = lastETagBuffer.toString();
                        if (lastModifiedBuffer.length() > 0)
                            lastModifiedValue = lastModifiedBuffer.toString();
                    }

                    if (Logging.connectors.isDebugEnabled()
                            && (lastETagValue != null || lastModifiedValue != null))
                        Logging.connectors.debug(
                                "RSS: Document '" + urlValue + "' was found to have a previous ETag value of '"
                                        + ((lastETagValue == null) ? "null" : lastETagValue)
                                        + "' and a previous Last-Modified value of '"
                                        + ((lastModifiedValue == null) ? "null" : lastModifiedValue) + "'");

                    // Robots check.  First, we need to separate the url into its components
                    URL url;
                    try {
                        url = new URL(urlValue);
                    } catch (MalformedURLException e) {
                        Logging.connectors.debug("RSS: URL '" + urlValue + "' is malformed; skipping", e);
                        activities.deleteDocument(documentIdentifier);
                        continue;
                    }

                    String protocol = url.getProtocol();
                    int port = url.getPort();
                    String hostName = url.getHost();
                    String pathPart = url.getFile();

                    // Check with robots to see if it's allowed
                    if (robotsUsage >= ROBOTS_DATA && !robots.isFetchAllowed(currentContext, throttleGroupName,
                            protocol, port, hostName, url.getPath(), userAgent, from, proxyHost, proxyPort,
                            proxyAuthDomain, proxyAuthUsername, proxyAuthPassword, activities,
                            connectionLimit)) {
                        activities.recordActivity(null, ACTIVITY_FETCH, null, urlValue, Integer.toString(-2),
                                "Robots exclusion", null);

                        if (Logging.connectors.isDebugEnabled())
                            Logging.connectors
                                    .debug("RSS: Skipping url '" + urlValue + "' because robots.txt says to");
                        activities.deleteDocument(documentIdentifier);
                        continue;
                    }

                    // Now, use the fetcher, and get the file.
                    IThrottledConnection connection = fetcher.createConnection(currentContext,
                            throttleGroupName, hostName, connectionLimit, feedTimeout, proxyHost, proxyPort,
                            proxyAuthDomain, proxyAuthUsername, proxyAuthPassword, activities);
                    try {
                        // Begin the fetch
                        connection.beginFetch("Data");
                        try {
                            // Execute the request.
                            // Use the connect timeout from the document specification!
                            int status = connection.executeFetch(protocol, port, pathPart, userAgent, from,
                                    lastETagValue, lastModifiedValue);
                            switch (status) {
                            case IThrottledConnection.STATUS_NOCHANGE:
                                versionString = oldVersionString;
                                break;
                            case IThrottledConnection.STATUS_OK:
                                try {
                                    if (Logging.connectors.isDebugEnabled())
                                        Logging.connectors.debug("RSS: Successfully fetched " + urlValue);
                                    // Document successfully fetched!
                                    // If its content is xml, presume it's a feed...
                                    String contentType = connection.getResponseHeader("Content-Type");
                                    // Some sites have multiple content types.  We just look at the LAST one in that case.
                                    if (contentType != null) {
                                        String[] contentTypes = contentType.split(",");
                                        if (contentTypes.length > 0)
                                            contentType = contentTypes[contentTypes.length - 1].trim();
                                        else
                                            contentType = null;
                                    }
                                    String strippedContentType = contentType;
                                    if (strippedContentType != null) {
                                        int pos = strippedContentType.indexOf(";");
                                        if (pos != -1)
                                            strippedContentType = strippedContentType.substring(0, pos).trim();
                                    }
                                    boolean isXML = (strippedContentType != null
                                            && xmlContentTypes.contains(strippedContentType));
                                    ingestURL = null;
                                    if (!isXML) {
                                        // If the chromed content mode is set to "skip", and we got here, it means
                                        // we should not include the content.
                                        if (f.getChromedContentMode() == CHROMED_SKIP) {
                                            if (Logging.connectors.isDebugEnabled())
                                                Logging.connectors.debug("RSS: Removing url '" + urlValue
                                                        + "' because it no longer has dechromed content available");
                                            versionString = null;
                                            break;
                                        }

                                        // Decide whether to exclude this document based on what we see here.
                                        // Basically, we want to get rid of everything that we don't know what
                                        // to do with in the ingestion system.
                                        if (!activities.checkMimeTypeIndexable(contentType)) {
                                            if (Logging.connectors.isDebugEnabled())
                                                Logging.connectors.debug("RSS: Removing url '" + urlValue
                                                        + "' because it had the wrong content type: "
                                                        + ((contentType == null) ? "null"
                                                                : "'" + contentType + "'"));
                                            versionString = null;
                                            break;
                                        }

                                        ingestURL = f.mapDocumentURL(urlValue);
                                    } else {
                                        if (Logging.connectors.isDebugEnabled())
                                            Logging.connectors
                                                    .debug("RSS: The url '" + urlValue + "' is a feed");

                                        if (!f.isSeed(urlValue)) {
                                            // Remove the feed from consideration, since it has left the list of seeds
                                            if (Logging.connectors.isDebugEnabled())
                                                Logging.connectors.debug("RSS: Removing feed url '" + urlValue
                                                        + "' because it is not a seed.");
                                            versionString = null;
                                            break;
                                        }
                                    }

                                    InputStream is = connection.getResponseBodyStream();
                                    try {
                                        long checkSum = cache.addData(activities, urlValue, contentType, is);
                                        StringBuilder sb = new StringBuilder();
                                        if (ingestURL != null) {
                                            // We think it is ingestable.  The version string accordingly starts with a "+".

                                            // Grab what we need from the passed-down data for the document.  These will all become part
                                            // of the version string.
                                            pubDates = activities.retrieveParentData(urlValue, "pubdate");
                                            sources = activities.retrieveParentData(urlValue, "source");
                                            titles = activities.retrieveParentData(urlValue, "title");
                                            authorNames = activities.retrieveParentData(urlValue, "authorname");
                                            authorEmails = activities.retrieveParentData(urlValue,
                                                    "authoremail");
                                            categories = activities.retrieveParentData(urlValue, "category");
                                            descriptions = activities.retrieveParentData(urlValue,
                                                    "description");
                                            java.util.Arrays.sort(pubDates);
                                            java.util.Arrays.sort(sources);
                                            java.util.Arrays.sort(titles);
                                            java.util.Arrays.sort(authorNames);
                                            java.util.Arrays.sort(authorEmails);
                                            java.util.Arrays.sort(categories);
                                            java.util.Arrays.sort(descriptions);

                                            if (sources.length == 0) {
                                                if (Logging.connectors.isDebugEnabled())
                                                    Logging.connectors.debug("RSS: Warning; URL '" + ingestURL
                                                            + "' doesn't seem to have any RSS feed source!");
                                            }

                                            sb.append('+');
                                            packList(sb, acls, '+');
                                            if (acls.length > 0) {
                                                sb.append('+');
                                                pack(sb, defaultAuthorityDenyToken, '+');
                                            } else
                                                sb.append('-');
                                            // The ingestion URL
                                            pack(sb, ingestURL, '+');
                                            // The pub dates
                                            packList(sb, pubDates, '+');
                                            // The titles
                                            packList(sb, titles, '+');
                                            // The sources
                                            packList(sb, sources, '+');
                                            // The categories
                                            packList(sb, categories, '+');
                                            // The descriptions
                                            packList(sb, descriptions, '+');
                                            // The author names
                                            packList(sb, authorNames, '+');
                                            // The author emails
                                            packList(sb, authorEmails, '+');
                                        } else {
                                            sb.append('-');
                                            String etag = connection.getResponseHeader("ETag");
                                            if (etag == null)
                                                pack(sb, "", '+');
                                            else
                                                pack(sb, etag, '+');
                                            String lastModified = connection.getResponseHeader("Last-Modified");
                                            if (lastModified == null)
                                                pack(sb, "", '+');
                                            else
                                                pack(sb, lastModified, '+');

                                        }

                                        // Do the checksum part, which does not need to be parseable.
                                        sb.append(new Long(checkSum).toString());

                                        versionString = sb.toString();
                                    } finally {
                                        is.close();
                                    }
                                } catch (java.net.SocketTimeoutException e) {
                                    Logging.connectors
                                            .warn("RSS: Socket timeout exception fetching document contents '"
                                                    + urlValue + "' - skipping: " + e.getMessage(), e);
                                    versionString = null;
                                } catch (ConnectTimeoutException e) {
                                    Logging.connectors
                                            .warn("RSS: Connecto timeout exception fetching document contents '"
                                                    + urlValue + "' - skipping: " + e.getMessage(), e);
                                    versionString = null;
                                } catch (InterruptedIOException e) {
                                    throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
                                            ManifoldCFException.INTERRUPTED);
                                } catch (IOException e) {
                                    Logging.connectors.warn("RSS: IO exception fetching document contents '"
                                            + urlValue + "' - skipping: " + e.getMessage(), e);
                                    versionString = null;
                                }

                                break;

                            case IThrottledConnection.STATUS_SITEERROR:
                            case IThrottledConnection.STATUS_PAGEERROR:
                            default:
                                // Record an *empty* version.
                                // This signals the processDocuments() method that we really don't want to ingest this document, but we also don't
                                // want to blow the document out of the queue, since then we'd wind up perhaps fetching it multiple times.
                                versionString = "";
                                break;
                            }
                        } finally {
                            connection.doneFetch(activities);
                        }
                    } finally {
                        connection.close();
                    }

                    if (versionString == null) {
                        activities.deleteDocument(documentIdentifier);
                        continue;
                    }

                    if (!(versionString.length() == 0
                            || activities.checkDocumentNeedsReindexing(documentIdentifier, versionString)))
                        continue;

                    // Process document!
                    if (Logging.connectors.isDebugEnabled())
                        Logging.connectors.debug("RSS: Processing '" + urlValue + "'");

                    // The only links we extract come from documents that we think are RSS feeds.
                    // When we think that's the case, we attempt to parse it as RSS XML.
                    if (ingestURL == null) {
                        if (Logging.connectors.isDebugEnabled())
                            Logging.connectors.debug("RSS: Interpreting document '" + urlValue + "' as a feed");

                        // We think it is a feed.
                        // If this is a continuous job, AND scanonly is true, it means that the document was either identical to the
                        // previous fetch, or was not fetched at all.  In that case, it may not even be there, and we *certainly* don't
                        // want to attempt to process it in any case.
                        //

                        // NOTE: I re-enabled the scan permanently because we need the TTL value to be set whatever the cost.  If the
                        // TTL value is not set, we default to the specified job's feed-rescan time, which is not going to be current enough for some feeds.
                        if (true || jobMode != JOBMODE_CONTINUOUS) {
                            handleRSSFeedSAX(urlValue, activities, f);
                            if (Logging.connectors.isDebugEnabled())
                                Logging.connectors.debug("RSS: Extraction of feed '" + urlValue + "' complete");

                            // Record the feed's version string, so we won't refetch unless needed.
                            // This functionality is required for the last ETag and Last-Modified fields to be sent to the rss server, and to
                            // keep track of the adaptive parameters.
                            activities.recordDocument(documentIdentifier, versionString);
                        } else {
                            // The problem here is that we really do need to set the rescan time to something reasonable.
                            // But we might not even have read the feed!  So what to do??
                            // One answer is to build a connector-specific table that carries the last value of every feed around.
                            // Another answer is to change the version code to always read the feed (and the heck with ETag and Last-Modified).
                            if (Logging.connectors.isDebugEnabled())
                                Logging.connectors.debug("RSS: Feed '" + urlValue
                                        + "' does not appear to differ from previous fetch for a continuous job; not extracting!");

                            long currentTime = System.currentTimeMillis();

                            Long defaultRescanTime = f.getDefaultRescanTime(currentTime);

                            if (defaultRescanTime != null) {
                                Long minimumTime = f.getMinimumRescanTime(currentTime);
                                if (minimumTime != null) {
                                    if (defaultRescanTime.longValue() < minimumTime.longValue())
                                        defaultRescanTime = minimumTime;
                                }
                            }

                            activities.setDocumentScheduleBounds(urlValue, defaultRescanTime, defaultRescanTime,
                                    null, null);

                        }
                    } else {
                        if (Logging.connectors.isDebugEnabled())
                            Logging.connectors.debug("RSS: Interpreting '" + urlValue + "' as a document");

                        String errorCode = null;
                        String errorDesc = null;
                        long startTime = System.currentTimeMillis();
                        Long fileLengthLong = null;
                        try {
                            long documentLength = cache.getDataLength(documentIdentifier);
                            if (!activities.checkLengthIndexable(documentLength)) {
                                activities.noDocument(documentIdentifier, versionString);
                                errorCode = activities.EXCLUDED_LENGTH;
                                errorDesc = "Document rejected because of length (" + documentLength + ")";
                                if (Logging.connectors.isDebugEnabled())
                                    Logging.connectors.debug("RSS: Skipping document '" + urlValue
                                            + "' because its length was rejected (" + documentLength + ")");
                                continue;
                            }

                            if (!activities.checkURLIndexable(documentIdentifier)) {
                                activities.noDocument(documentIdentifier, versionString);
                                errorCode = activities.EXCLUDED_URL;
                                errorDesc = "Document rejected because of URL ('" + documentIdentifier + "')";
                                if (Logging.connectors.isDebugEnabled())
                                    Logging.connectors.debug("RSS: Skipping document '" + urlValue
                                            + "' because its URL was rejected ('" + documentIdentifier + "')");
                                continue;
                            }

                            // Check if it's a recognized content type
                            String contentType = cache.getContentType(documentIdentifier);
                            // Some sites have multiple content types.  We just look at the LAST one in that case.
                            if (contentType != null) {
                                String[] contentTypes = contentType.split(",");
                                if (contentTypes.length > 0)
                                    contentType = contentTypes[contentTypes.length - 1].trim();
                                else
                                    contentType = null;
                            }
                            if (!activities.checkMimeTypeIndexable(contentType)) {
                                activities.noDocument(documentIdentifier, versionString);
                                errorCode = activities.EXCLUDED_MIMETYPE;
                                errorDesc = "Document rejected because of mime type (" + contentType + ")";
                                if (Logging.connectors.isDebugEnabled())
                                    Logging.connectors.debug("RSS: Skipping document '" + urlValue
                                            + "' because its mime type was rejected ('" + contentType + "')");
                                continue;
                            }

                            // Treat it as an ingestable document.

                            long dataSize = cache.getDataLength(urlValue);
                            RepositoryDocument rd = new RepositoryDocument();

                            // Set content type
                            if (contentType != null)
                                rd.setMimeType(contentType);

                            // Turn into acls and add into description
                            String[] denyAcls;
                            if (acls == null)
                                denyAcls = null;
                            else if (acls.length == 0)
                                denyAcls = new String[0];
                            else
                                denyAcls = new String[] { defaultAuthorityDenyToken };

                            if (acls != null && denyAcls != null)
                                rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT, acls, denyAcls);

                            if (titles != null && titles.length > 0)
                                rd.addField("title", titles);
                            if (authorNames != null && authorNames.length > 0)
                                rd.addField("authorname", authorNames);
                            if (authorEmails != null && authorEmails.length > 0)
                                rd.addField("authoremail", authorEmails);
                            if (descriptions != null && descriptions.length > 0)
                                rd.addField("summary", descriptions);
                            if (sources != null && sources.length > 0)
                                rd.addField("source", sources);
                            if (categories != null && categories.length > 0)
                                rd.addField("category", categories);

                            // The pubdates are a ms since epoch value; we want the minimum one for the origination time.
                            Long minimumOrigTime = null;
                            if (pubDates != null && pubDates.length > 0) {
                                String[] pubDateValuesISO = new String[pubDates.length];
                                TimeZone tz = TimeZone.getTimeZone("UTC");
                                DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm'Z'");
                                df.setTimeZone(tz);
                                for (int k = 0; k < pubDates.length; k++) {
                                    String pubDate = pubDates[k];
                                    try {
                                        Long pubDateLong = new Long(pubDate);
                                        if (minimumOrigTime == null
                                                || pubDateLong.longValue() < minimumOrigTime.longValue())
                                            minimumOrigTime = pubDateLong;
                                        pubDateValuesISO[k] = df.format(new Date(pubDateLong.longValue()));
                                    } catch (NumberFormatException e) {
                                        // Do nothing; the version string seems to not mean anything
                                        pubDateValuesISO[k] = "";
                                    }
                                }
                                rd.addField("pubdate", pubDates);
                                rd.addField("pubdateiso", pubDateValuesISO);
                            }

                            if (minimumOrigTime != null)
                                activities.setDocumentOriginationTime(urlValue, minimumOrigTime);

                            InputStream is = cache.getData(urlValue);
                            if (is != null) {
                                try {
                                    rd.setBinary(is, dataSize);
                                    try {
                                        activities.ingestDocumentWithException(documentIdentifier,
                                                versionString, ingestURL, rd);
                                        errorCode = "OK";
                                        fileLengthLong = new Long(dataSize);
                                    } catch (IOException e) {
                                        errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                                        errorDesc = e.getMessage();
                                        handleIOException(e, "reading data");
                                    }
                                } finally {
                                    try {
                                        is.close();
                                    } catch (IOException e) {
                                        errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                                        errorDesc = e.getMessage();
                                        handleIOException(e, "closing stream");
                                    }
                                }
                            }
                        } catch (ManifoldCFException e) {
                            if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
                                errorCode = null;
                            throw e;
                        } finally {
                            if (errorCode != null)
                                activities.recordActivity(new Long(startTime), ACTIVITY_PROCESS, null, urlValue,
                                        errorCode, errorDesc, null);
                        }
                    }
                }
            } finally {
                for (CharacterInput ci : dechromedData) {
                    if (ci != null)
                        ci.discard();
                }

            }
        } finally {
            // Remove any fetched documents.
            cache.deleteData(documentIdentifier);
        }
    }
}
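
A detail worth noting in the ManifoldCF handlers above: java.net.SocketTimeoutException extends java.io.InterruptedIOException, which in turn extends java.io.IOException, so the three catch clauses must appear in most-specific-first order (the compiler rejects any other ordering), and each level gets its own recovery policy. A stripped-down, self-contained sketch of that ordering, where readVersion() is a hypothetical stand-in for the connector's fetch-and-checksum logic:

import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.net.SocketTimeoutException;

public class CatchOrder {
    // Hypothetical stand-in for the connector's fetch-and-checksum logic.
    static String readVersion(InputStream is) throws InterruptedIOException {
        try {
            int b = is.read();
            return "checksum:" + b;
        } catch (SocketTimeoutException e) {
            return null; // most specific: a socket deadline expired, skip the document
        } catch (InterruptedIOException e) {
            throw e;     // parent of SocketTimeoutException: propagate as an interrupt
        } catch (IOException e) {
            return null; // broadest: any other I/O failure, also skip
        }
    }
}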

From source file: org.forgerock.maven.plugins.LinkTester.java

private void checkUrl(String path, String docUrl) {
    if (shouldSkipUrl(docUrl)) {
        debug("Skipping " + docUrl + " since it matches a skipUrlPattern");
        return;
    }
    if (tested.contains(docUrl)) {
        if (failedUrls.containsValue(docUrl)) {
            failedUrls.put(path, docUrl);
        }
        return;
    }
    debug("Checking " + docUrl + " from file: " + path);
    try {
        URL url = new URL(docUrl);
        URLConnection urlConn = url.openConnection();
        if (urlConn instanceof HttpURLConnection) {
            HttpURLConnection conn = (HttpURLConnection) urlConn;
            if (conn instanceof HttpsURLConnection) {
                HttpsURLConnection httpsConn = (HttpsURLConnection) conn;
                httpsConn.setHostnameVerifier(new TrustAllHostnameVerifier());
                httpsConn.setSSLSocketFactory(TRUST_ALL_SOCKET_FACTORY);
            }

            conn.setConnectTimeout(1000);
            //if we don't get anything back within 15 seconds it is safe to assume that something is really wrong
            //with that site.
            conn.setReadTimeout(15000);
            int responseCode = conn.getResponseCode();
            if (responseCode >= 400) {
                warn(docUrl + ": received unexpected response code: " + responseCode);
                failedUrls.put(path, docUrl);
            }
        }
    } catch (SocketTimeoutException ste) {
        warn(docUrl + ": " + ste.getClass().getName() + " " + ste.getMessage());
        timedOutUrls.put(path, docUrl);
    } catch (Exception ex) {
        warn(docUrl + ": " + ex.getClass().getName() + " " + ex.getMessage());
        failedUrls.put(path, docUrl);
    }
    tested.add(docUrl);
}
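
The LinkTester check above relies on HttpURLConnection's two timeout knobs: setConnectTimeout() bounds the TCP handshake and setReadTimeout() bounds each read, and either one expiring surfaces as a SocketTimeoutException whose runtime class name getClass().getName() puts into the warning. A minimal sketch of the same probe, assuming a placeholder URL:

import java.net.HttpURLConnection;
import java.net.SocketTimeoutException;
import java.net.URL;

public class UrlProbe {
    public static void main(String[] args) throws Exception {
        URL url = new URL("https://example.com/"); // placeholder target
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setConnectTimeout(1000); // give the handshake one second
        conn.setReadTimeout(15000);   // and each read fifteen seconds
        try {
            int responseCode = conn.getResponseCode();
            System.out.println(url + ": " + responseCode);
        } catch (SocketTimeoutException ste) {
            // Same diagnostic shape as the plugin: runtime class plus message.
            System.out.println(url + ": " + ste.getClass().getName() + " " + ste.getMessage());
        } finally {
            conn.disconnect();
        }
    }
}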

From source file: org.zaproxy.zap.extension.ascanrules.TestPathTraversal.java

/**
 * scans all GET and POST parameters for Path Traversal vulnerabilities
 *
 * @param msg
 * @param param
 * @param value
 */
@Override
public void scan(HttpMessage msg, String param, String value) {

    try {
        // figure out how aggressively we should test
        int nixCount = 0;
        int winCount = 0;
        int dirCount = 0;
        int localTraversalLength = 0;

        // DEBUG only
        if (log.isDebugEnabled()) {
            log.debug("Attacking at Attack Strength: " + this.getAttackStrength());
        }

        switch (this.getAttackStrength()) {
        case LOW:
            // This works out as a total of 2+2+2+0*4+1 = 7 reqs / param
            nixCount = 2;
            winCount = 2;
            dirCount = 2;
            localTraversalLength = 0;
            break;

        case MEDIUM:
            // This works out as a total of 2+4+4+1*4+1 = 15 reqs / param
            nixCount = 2;
            winCount = 4;
            dirCount = 4;
            localTraversalLength = 1;
            break;

        case HIGH:
            // This works out as a total of 4+8+7+2*4+1 = 28 reqs / param
            nixCount = 4;
            winCount = 8;
            dirCount = 7;
            localTraversalLength = 2;
            break;

        case INSANE:
            // This works out as a total of 6+18+19+4*4+1 = 60 reqs / param
            nixCount = NIX_LOCAL_FILE_TARGETS.length;
            winCount = WIN_LOCAL_FILE_TARGETS.length;
            dirCount = LOCAL_DIR_TARGETS.length;
            localTraversalLength = 4;
            break;

        default:
            // Default to off
        }

        if (log.isDebugEnabled()) {
            log.debug("Checking [" + getBaseMsg().getRequestHeader().getMethod() + "] ["
                    + getBaseMsg().getRequestHeader().getURI() + "], parameter [" + param
                    + "] for Path Traversal to local files");
        }

        // Check 1: Start detection for Windows patterns
        // note that depending on the AttackLevel, the number of prefixes that we will try
        // changes.
        if (inScope(Tech.Windows)) {

            for (int h = 0; h < winCount; h++) {

                // Check if there was a finding or the scan has been stopped
                // if yes dispose resources and exit
                if (sendAndCheckPayload(param, WIN_LOCAL_FILE_TARGETS[h], WIN_PATTERN) || isStop()) {
                    // Dispose all resources
                    // Exit the plugin
                    return;
                }
            }
        }

        // Check 2: Start detection for *NIX patterns
        // note that depending on the AttackLevel, the number of prefixes that we will try
        // changes.
        if (inScope(Tech.Linux) || inScope(Tech.MacOS)) {

            for (int h = 0; h < nixCount; h++) {

                // Check if there was a finding or the scan has been stopped
                // if yes dispose resources and exit
                if (sendAndCheckPayload(param, NIX_LOCAL_FILE_TARGETS[h], NIX_PATTERN) || isStop()) {
                    // Dispose all resources
                    // Exit the plugin
                    return;
                }
            }
        }

        // Check 3: Detect if this page is a directory browsing component
        // example: https://www.buggedsite.org/log/index.php?dir=C:\
        // note that depending on the AttackLevel, the number of prefixes that we will try
        // changes.
        for (int h = 0; h < dirCount; h++) {

            // Check if there was a finding or the scan has been stopped
            // if yes dispose resources and exit
            if (sendAndCheckPayload(param, LOCAL_DIR_TARGETS[h], DIR_PATTERN) || isStop()) {
                // Dispose all resources
                // Exit the plugin
                return;
            }
        }

        // Check 4: Start detection for internal well known files
        // try variants based on increasing ../ ..\ prefixes and the presence of the / and \
        // trailer
        // e.g. WEB-INF/web.xml, /WEB-INF/web.xml, ../WEB-INF/web.xml, /../WEB-INF/web.xml, etc.
        // Both slashed and backslashed variants are checked
        // -------------------------------
        // Currently we've always checked only for J2EE known files
        // and this remains also for this version
        //
        // Web.config for .NET in the future?
        // -------------------------------
        String sslashPattern = "WEB-INF/web.xml";
        // The backslashed version of the same check
        String bslashPattern = sslashPattern.replace('/', '\\');

        if (inScope(Tech.Tomcat)) {

            for (int idx = 0; idx < localTraversalLength; idx++) {

                // Check if there was a finding or the scan has been stopped
                // if yes dispose resources and exit
                if (sendAndCheckPayload(param, sslashPattern, WAR_PATTERN)
                        || sendAndCheckPayload(param, bslashPattern, WAR_PATTERN)
                        || sendAndCheckPayload(param, '/' + sslashPattern, WAR_PATTERN)
                        || sendAndCheckPayload(param, '\\' + bslashPattern, WAR_PATTERN) || isStop()) {

                    // Dispose all resources
                    // Exit the plugin
                    return;
                }

                sslashPattern = "../" + sslashPattern;
                bslashPattern = "..\\" + bslashPattern;
            }
        }

        // Check 5: try a local file Path Traversal on the file name of the URL (which obviously
        // will not be in the target list above).
        // first send a query for a random parameter value, and see if we get a 200 back
        // if 200 is returned, abort this check (on the url filename itself), because it would
        // be unreliable.
        // if we know that a random query returns <> 200, then a 200 response likely means
        // something!
        // this logic is all about avoiding false positives, while still attempting to match on
        // actual vulnerabilities
        msg = getNewMsg();
        setParameter(msg, param, NON_EXISTANT_FILENAME);

        // send the modified message (with a hopefully non-existent filename), and see what we
        // get back
        try {
            sendAndReceive(msg);

        } catch (SocketException | IllegalStateException | UnknownHostException | IllegalArgumentException
                | InvalidRedirectLocationException | URIException ex) {
            if (log.isDebugEnabled()) {
                log.debug("Caught " + ex.getClass().getName() + " " + ex.getMessage() + " when accessing: "
                        + msg.getRequestHeader().getURI().toString());
            }

            return; // Something went wrong, no point continuing
        }

        // do some pattern matching on the results.
        Pattern errorPattern = Pattern.compile("Exception|Error");
        Matcher errorMatcher = errorPattern.matcher(msg.getResponseBody().toString());

        String urlfilename = msg.getRequestHeader().getURI().getName();

        // url file name may be empty, i.e. there is no file name for next check
        if (!StringUtils.isEmpty(urlfilename)
                && (msg.getResponseHeader().getStatusCode() != HttpStatusCode.OK || errorMatcher.find())) {

            if (log.isDebugEnabled()) {
                log.debug("It is possible to check for local file Path Traversal on the url filename on ["
                        + msg.getRequestHeader().getMethod() + "] [" + msg.getRequestHeader().getURI() + "], ["
                        + param + "]");
            }

            String prefixedUrlfilename;

            // for the url filename, try each of the prefixes in turn
            for (String prefix : LOCAL_FILE_RELATIVE_PREFIXES) {

                prefixedUrlfilename = prefix + urlfilename;
                msg = getNewMsg();
                setParameter(msg, param, prefixedUrlfilename);

                // send the modified message (with the url filename), and see what we get back
                try {
                    sendAndReceive(msg);

                } catch (SocketException | IllegalStateException | UnknownHostException
                        | IllegalArgumentException | InvalidRedirectLocationException | URIException ex) {
                    if (log.isDebugEnabled()) {
                        log.debug("Caught " + ex.getClass().getName() + " " + ex.getMessage()
                                + " when accessing: " + msg.getRequestHeader().getURI().toString());
                    }

                    continue; // Something went wrong, move to the next prefix in the loop
                }

                // did we get an Exception or an Error?
                errorMatcher = errorPattern.matcher(msg.getResponseBody().toString());
                if ((msg.getResponseHeader().getStatusCode() == HttpStatusCode.OK) && (!errorMatcher.find())) {

                    // if it returns OK, and the random string above did NOT return ok, then
                    // raise an alert
                    // since the filename has likely been picked up and used as a file name from
                    // the parameter
                    bingo(Alert.RISK_HIGH, Alert.CONFIDENCE_MEDIUM, null, param, prefixedUrlfilename, null,
                            msg);

                    // All done. No need to look for vulnerabilities on subsequent parameters
                    // on the same request (to reduce performance impact)
                    return;
                }

                // Check if the scan has been stopped
                // if yes dispose resources and exit
                if (isStop()) {
                    // Dispose all resources
                    // Exit the plugin
                    return;
                }
            }
        }

        // Check 6 for local file names
        // TODO: consider making this check 1, for performance reasons
        // TODO: if the original query was http://www.example.com/a/b/c/d.jsp?param=paramvalue
        // then check if the following gives comparable results to the original query
        // http://www.example.com/a/b/c/d.jsp?param=../c/paramvalue
        // if it does, then we likely have a local file Path Traversal vulnerability
        // this is nice because it means we do not have to guess any file names, and would only
        // require one
        // request to find the vulnerability
        // but it would be foiled by simple input validation on "..", for instance.

    } catch (SocketTimeoutException ste) {
        log.warn("A timeout occurred while checking [" + msg.getRequestHeader().getMethod() + "] ["
                + msg.getRequestHeader().getURI() + "], parameter [" + param + "] for Path Traversal. "
                + "The currently configured timeout is: " + Integer.toString(
                        Model.getSingleton().getOptionsParam().getConnectionParam().getTimeoutInSecs()));

        if (log.isDebugEnabled()) {
            log.debug("Caught " + ste.getClass().getName() + " " + ste.getMessage());
        }

    } catch (IOException e) {
        log.warn("An error occurred while checking [" + msg.getRequestHeader().getMethod() + "] ["
                + msg.getRequestHeader().getURI() + "], parameter [" + param + "] for Path Traversal. "
                + "Caught " + e.getClass().getName() + " " + e.getMessage());
    }
}
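
One pattern worth lifting from the ZAP rule above is pairing the exception's runtime class with the timeout that was in force, so a log reader can judge whether the limit itself was too tight. A hedged sketch of such a helper (illustrative only, not part of the ZAP API):

import java.net.SocketTimeoutException;

final class TimeoutDiagnostics {
    private TimeoutDiagnostics() {
    }

    // Format the diagnostic the scan rule logs: runtime class, message,
    // and the configured limit that the request exceeded.
    static String describe(SocketTimeoutException ste, int timeoutSecs) {
        return "Caught " + ste.getClass().getName() + " (" + ste.getMessage()
                + ") with a configured timeout of " + timeoutSecs + "s";
    }
}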