Example usage for java.net SocketTimeoutException getMessage

List of usage examples for java.net SocketTimeoutException getMessage


On this page you can find example usages of java.net.SocketTimeoutException.getMessage().


public String getMessage() 

Source Link


Returns the detail message string of this throwable.


From source file:org.apache.manifoldcf.crawler.connectors.meridio.MeridioConnector.java

/** Process a set of documents.
* This is the method that should cause each document to be fetched, processed, and the results either added
* to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
* The document specification allows this class to filter what is done based on the job.
* The connector will be connected before this method can be called.
*
* NOTE(review): this listing appears to have lost lines in extraction (several closing braces,
* block-comment delimiters, "else" keywords and the bodies of a few if/else branches are not present).
* Compare against the upstream MeridioConnector source before relying on it.
*
*@param documentIdentifiers is the set of document identifiers to process.
*@param statuses are the currently-stored document versions for each document in the set of document identifiers
* passed in above. */
*@param activities is the interface this method should use to queue up new document references
* and ingest documents.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
*@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
        IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
        throws ManifoldCFException, ServiceInterruption {
    // Get forced acls/security enable/disable
    String[] acls = getAcls(spec);
    // Sort it, in case it is needed.
    if (acls != null)
    // NOTE(review): the statement guarded by this if (the sort mentioned above) is not present in this listing.

    // Look at the metadata attributes.
    // So that the version strings are comparable, we will put them in an array first, and sort them.
    Set<String> holder = new HashSet<String>();

    String pathAttributeName = null;
    MatchMap matchMap = new MatchMap();
    boolean allMetadata = false;

    // Walk every child node of the specification and pick out the node types handled below.
    int i = 0;
    while (i < spec.getChildCount()) {
        SpecificationNode n = spec.getChild(i++);
        if (n.getType().equals("ReturnedMetadata")) {
            String category = n.getAttributeValue("category");
            String attributeName = n.getAttributeValue("property");
            String metadataName;
            // NOTE(review): the "else" between the two assignments below, and any use of metadataName
            // (presumably it is added to holder), are not visible in this listing - confirm.
            if (category == null || category.length() == 0)
                metadataName = attributeName;
                metadataName = category + "." + attributeName;
        } else if (n.getType().equals("AllMetadata")) {
            String value = n.getAttributeValue("value");
            if (value != null && value.equals("true")) {
                allMetadata = true;
        } else if (n.getType().equals("pathnameattribute"))
            pathAttributeName = n.getAttributeValue("value");
        else if (n.getType().equals("pathmap")) {
            // Path mapping info also needs to be looked at, because it affects what is
            // ingested.
            String pathMatch = n.getAttributeValue("match");
            String pathReplace = n.getAttributeValue("replace");
            matchMap.appendMatchPair(pathMatch, pathReplace);

    while (true) {


        // The version string returned must include everything that could affect what is ingested.  In meridio's
        // case, this includes the date stamp, but it also includes the part of the specification that describes
        // the metadata desired.

        // The code here relies heavily on the search method to do its thing.  The search method originally
        // used the document specification to determine what metadata to return, which was problematic because that
        // meant this method had to modify the specification (not good practice), and was also wrong from the point
        // of view that we need to get the metadata specification appended to the version string in some way, and
        // use THAT data in processDocuments().  So I've broken all that up.

        try {
            // Put into an array
            ReturnMetadata[] categoryPropertyValues;
            String[] categoryPropertyStringValues;
            String[] sortArray;
            if (allMetadata) {
                categoryPropertyStringValues = getMeridioDocumentProperties();
            } else {
                categoryPropertyStringValues = new String[holder.size()];
                i = 0;
                for (String value : holder) {
                    categoryPropertyStringValues[i++] = value;
            // Sort!
            categoryPropertyValues = new ReturnMetadata[categoryPropertyStringValues.length];
            i = 0;
            // Split each "category.property" string at its first dot; a value without a dot
            // is treated as a property name with a null category.
            for (String value : categoryPropertyStringValues) {
                int dotIndex = value.indexOf(".");
                String categoryName = null;
                String propertyName;
                if (dotIndex == -1)
                    propertyName = value;
                else {
                    categoryName = value.substring(0, dotIndex);
                    propertyName = value.substring(dotIndex + 1);

                categoryPropertyValues[i++] = new ReturnMetadata(categoryName, propertyName);

            // Prepare the part of the version string that is decodeable
            StringBuilder decodeableString = new StringBuilder();

            // Add the metadata piece first
            packList(decodeableString, categoryPropertyStringValues, '+');

            // Now, put in the forced acls.
            // The version string needs only to contain the forced acls, since the version date captures changes
            // made to the acls that are actually associated with the document.
            if (acls == null)
            else {
                packList(decodeableString, acls, '+');
                pack(decodeableString, defaultAuthorityDenyToken, '+');

            // Calculate the part of the version string that comes from path name and mapping.
            if (pathAttributeName != null) {
                pack(decodeableString, pathAttributeName, '+');
                pack(decodeableString, matchMap.toString(), '+');
            } else

            // Parse each document identifier string into its numeric form for the search call below.
            long[] docIds = new long[documentIdentifiers.length];
            for (i = 0; i < documentIdentifiers.length; i++) {
                docIds[i] = new Long(documentIdentifiers[i]).longValue();

            * Call the search, with the document specification and the list of
            * document ids - the search will never return more than exactly
            * one match per document id
            * We are assuming that the maximum number of hits to return
            * should never be more than the maximum batch size set up for this
            * class
            * We are just making one web service call (to the search API)
            * rather than iteratively calling a web service method for each
            * document passed in as part of the document array
            * Additionally, re-using the same search method as for the
            * "getDocumentIdentifiers" method ensures that we are not
            * duplicating any logic which ensures that the document/records
            * in question match the search criteria or not.
            DMSearchResults searchResults = documentSpecificationSearch(spec, 0, 0, 1,
                    this.getMaxDocumentRequest(), docIds, null);

            if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("Found a total of <" + searchResults.totalHitsCount + "> hit(s) "
                        + "and <" + searchResults.returnedHitsCount + "> were returned by the method call");

            // If we are searching based on document identifier, then it is possible that we will not
            // find a document we are looking for, if it was removed from the system between the time
            // it was put in the queue and when its version is obtained.  Documents where this happens
            // should return a version string of null.

            // Let's go through the search results and build a hash based on the document identifier.
            Map<Long, SEARCHRESULTS_DOCUMENTS> documentMap = new HashMap<Long, SEARCHRESULTS_DOCUMENTS>();
            if (searchResults.dsDM != null) {
                SEARCHRESULTS_DOCUMENTS[] srd = searchResults.dsDM.getSEARCHRESULTS_DOCUMENTS();
                for (i = 0; i < srd.length; i++) {
                    documentMap.put(new Long(srd[i].getDocId()), srd[i]);

            // Now, walk through the individual documents.
            Map<Long, String> versionStrings = new HashMap<Long, String>();
            for (int j = 0; j < docIds.length; j++) {
                String documentIdentifier = documentIdentifiers[j];
                long docId = docIds[j];
                Long docKey = new Long(docId);
                // Look up the record.
                SEARCHRESULTS_DOCUMENTS doc = documentMap.get(docKey);
                if (doc != null) {
                    // Set the version string.  The parseable stuff goes first, so parsing is easy.
                    String version = doc.getStr_value();
                    StringBuilder composedVersion = new StringBuilder();
                    // Added 9/7/2007
                    // NOTE(review): nothing is appended to composedVersion in this listing, so versionString
                    // would be empty as shown; the appends (presumably decodeableString and version) appear
                    // to be missing - confirm.
                    String versionString = composedVersion.toString();
                    if (Logging.connectors.isDebugEnabled())
                                .debug("Meridio: Document " + docKey + " has version " + versionString);
                    // Only documents that need reindexing get an entry in versionStrings; the ingestion
                    // loop further down acts only on documents that have an entry.
                    if (activities.checkDocumentNeedsReindexing(documentIdentifier, versionString))
                        versionStrings.put(docKey, versionString);
                } else {
                    if (Logging.connectors.isDebugEnabled())
                        Logging.connectors.debug("Meridio: Document " + docKey
                                + " is no longer in the search set, or has been deleted - removing.");

            // Now submit search requests for all the documents requiring fetch.

            Map<Long, Map<String, String>> documentPropertyMap = new HashMap<Long, Map<String, String>>();

            // Only look up metadata if we need some!
            if (versionStrings.size() > 0 && categoryPropertyValues.length > 0) {
                long[] fetchIds = new long[versionStrings.size()];
                i = 0;
                for (Long docKey : versionStrings.keySet()) {
                    fetchIds[i++] = docKey;

                * Call the search, with the document specification and the list of
                * document ids - the search will never return more than exactly
                * one match per document id
                * This call will return all the metadata that was specified in the
                * document specification for all the documents and
                * records in one call.
                searchResults = documentSpecificationSearch(spec, 0, 0, 1, fetchIds.length, fetchIds,

                // If we ask for a document and it is no longer there, we should treat this as a deletion.
                // The activity in that case is to delete the document.  A similar thing should happen if
                // any of the other methods (like getting the document's content) also fail to find the
                // document.

                // Let's build a hash which contains all the document metadata returned.  The form of
                // the hash will be: key = the document identifier, value = another hash, which is keyed
                // by the metadata category/property, and which has a value that is the metadata value.

                Map<Long, MutableInteger> counterMap = new HashMap<Long, MutableInteger>();

                if (searchResults.dsDM != null) {
                    SEARCHRESULTS_DOCUMENTS[] searchResultsDocuments = searchResults.dsDM
                    for (SEARCHRESULTS_DOCUMENTS searchResultsDocument : searchResultsDocuments) {
                        long docId = searchResultsDocument.getDocId();
                        Long docKey = new Long(docId);
                        MutableInteger counterMapItem = counterMap.get(docKey);
                        if (counterMapItem == null) {
                            counterMapItem = new MutableInteger();
                            counterMap.put(docKey, counterMapItem);

                        String propertyName = categoryPropertyStringValues[counterMapItem.getValue()];
                        // NOTE(review): this indexes the array with i rather than using the for-each
                        // variable searchResultsDocument; i is not advanced by this loop (it was last
                        // set while filling fetchIds above) - confirm whether this is intended.
                        String propertyValue = searchResultsDocuments[i].getStr_value();
                        Map<String, String> propertyMap = documentPropertyMap.get(docKey);
                        if (propertyMap == null) {
                            propertyMap = new HashMap<String, String>();
                            documentPropertyMap.put(docKey, propertyMap);
                        if (propertyValue != null && propertyValue.length() > 0)
                            propertyMap.put(propertyName, propertyValue);

            // Okay, we are ready now to go through the individual documents and do the ingestion or deletion.
            for (String documentIdentifier : documentIdentifiers) {
                Long docKey = new Long(documentIdentifier);
                long docId = docKey.longValue();
                String docVersion = versionStrings.get(docKey);
                if (docVersion != null) {
                    if (Logging.connectors.isDebugEnabled())
                        Logging.connectors.debug("Processing document identifier '" + documentIdentifier + "' "
                                + "with version string '" + docVersion + "'");

                    // For each document, be sure the job is still allowed to run.

                    RepositoryDocument repositoryDocument = new RepositoryDocument();

                    // Load the metadata items into the ingestion document object
                    Map<String, String> docMetadataMap = documentPropertyMap.get(docKey);
                    if (docMetadataMap != null) {
                        for (String categoryPropertyName : categoryPropertyStringValues) {
                            String propertyValue = docMetadataMap.get(categoryPropertyName);
                            if (propertyValue != null && propertyValue.length() > 0)
                                repositoryDocument.addField(categoryPropertyName, propertyValue);

                    * Construct the URL to the object
                    * HTTP://HOST:PORT/meridio/browse/downloadcontent.aspx?documentId=<docId>&launchMode=1&launchAs=0
                    * I expect we need to add additional parameters to the configuration
                    * specification
                    String fileURL = urlBase + new Long(docId).toString();
                    if (Logging.connectors.isDebugEnabled())
                                "URL for document '" + new Long(docId).toString() + "' is '" + fileURL + "'");

                    * Get the object's ACLs and owner information
                    DMDataSet documentData = null;
                    // docId is narrowed from long to int for the Meridio calls in this method.
                    documentData = meridio_.getDocumentData((int) docId, true, true, false, false,
                            DmVersionInfo.LATEST, false, false, false);

                    // NOTE(review): no continue/return is visible after either noDocument() call below, yet
                    // the following code dereferences documentData; the skip statements appear to be
                    // missing from this listing - confirm.
                    if (null == documentData) {
                        if (Logging.connectors.isDebugEnabled())
                                    .debug("Meridio: Could not retrieve document data for document id '"
                                            + new Long(docId).toString()
                                            + "' in processDocuments method - deleting document.");
                        activities.noDocument(documentIdentifier, docVersion);

                    if (null == documentData.getDOCUMENTS() || documentData.getDOCUMENTS().length != 1) {
                        if (Logging.connectors.isDebugEnabled())
                                    .debug("Meridio: Could not retrieve document owner for document id '"
                                            + new Long(docId).toString()
                                            + "' in processDocuments method. No information or incorrect amount "
                                            + "of information was returned");
                        activities.noDocument(documentIdentifier, docVersion);

                    // Do path metadata
                    if (pathAttributeName != null && pathAttributeName.length() > 0) {
                        if (Logging.connectors.isDebugEnabled())
                            Logging.connectors.debug("Meridio: Path attribute name is " + pathAttributeName);
                        RMDataSet partList;
                        int recordType = documentData.getDOCUMENTS()[0].getPROP_recordType();
                        if (recordType == 0 || recordType == 4 || recordType == 19)
                            partList = meridio_.getRecordPartList((int) docId, false, false);
                            partList = meridio_.getDocumentPartList((int) docId);
                        if (partList != null) {
                            if (Logging.connectors.isDebugEnabled())
                                Logging.connectors.debug("Meridio: Document '" + new Long(docId).toString()
                                        + "' has a part list with "
                                        + Integer.toString(partList.getRm2vPart().length) + " values");

                            for (int k = 0; k < partList.getRm2vPart().length; k++) {
                        } else {
                            if (Logging.connectors.isDebugEnabled())
                                Logging.connectors.debug("Meridio: Document '" + new Long(docId).toString()
                                        + "' has no part list, so no path attribute");

                    // Process acls.  If there are forced acls, use those, otherwise get them from Meridio.
                    String[] allowAcls;
                    String[] denyAcls;

                    // forcedAcls will be null if security is off, or nonzero length if security is on but hard-wired
                    // A zero-length array is the remaining case: ACLs are read from the document data below.
                    if (acls != null && acls.length == 0) {
                        ACCESSCONTROL[] documentAcls = documentData.getACCESSCONTROL();
                        List<String> allowAclsArrayList = new ArrayList<String>();
                        List<String> denyAclsArrayList = new ArrayList<String>();

                        // Allow a broken authority to disable all Meridio documents, even if the document is 'wide open', because
                        // Meridio does not permit viewing of the document if the user does not exist (at least, I don't know of a way).

                        if (documentAcls != null) {
                            for (int j = 0; j < documentAcls.length; j++) {
                                if (Logging.connectors.isDebugEnabled())
                                    Logging.connectors.debug("Object Id '" + documentAcls[j].getObjectId()
                                            + "' " + "Object Type '" + documentAcls[j].getObjectType() + "' "
                                            + "Permission '" + documentAcls[j].getPermission() + "' "
                                            + "User Id '" + documentAcls[j].getUserId() + "' " + "Group Id '"
                                            + documentAcls[j].getGroupId() + "'");

                                // Tokens are prefixed "G" for a group id and "U" for a user id; the group id
                                // is checked first.
                                if (documentAcls[j].getPermission() == 0) // prohibit permission
                                    if (documentAcls[j].getGroupId() > 0) {
                                        denyAclsArrayList.add("G" + documentAcls[j].getGroupId());
                                    } else if (documentAcls[j].getUserId() > 0) {
                                        denyAclsArrayList.add("U" + documentAcls[j].getUserId());
                                } else // read, amend or manage
                                    if (documentAcls[j].getGroupId() > 0) {
                                        allowAclsArrayList.add("G" + documentAcls[j].getGroupId());
                                    } else if (documentAcls[j].getUserId() > 0) {
                                        allowAclsArrayList.add("U" + documentAcls[j].getUserId());

                        DOCUMENTS document = documentData.getDOCUMENTS()[0];

                        if (Logging.connectors.isDebugEnabled())
                            Logging.connectors.debug("Document id '" + new Long(docId).toString()
                                    + "' is owned by owner id '" + document.getPROP_ownerId()
                                    + "' having the owner name '" + document.getPROP_ownerName()
                                    + "' Record Type is '" + document.getPROP_recordType() + "'");

                        // For record types 4 and 19 the owner is taken from the record; otherwise the
                        // document's own owner id is added to the allow list.
                        if (document.getPROP_recordType() == 4 || document.getPROP_recordType() == 19) {
                            RMDataSet rmds = meridio_.getRecord((int) docId, false, false, false);
                            Rm2vRecord record = rmds.getRm2vRecord()[0];

                            if (Logging.connectors.isDebugEnabled())
                                Logging.connectors.debug("Record User Id Owner is '" + record.getOwnerID()
                                        + "' Record Group Owner Id is '" + record.getGroupOwnerID() + "'");

                            * Either a group or a user owns a record, cannot be both and the
                            * group takes priority if it is set
                            if (record.getGroupOwnerID() > 0) {
                                allowAclsArrayList.add("G" + record.getGroupOwnerID());
                            } else if (record.getOwnerID() > 0) {
                                allowAclsArrayList.add("U" + record.getOwnerID());
                        } else {
                            allowAclsArrayList.add("U" + document.getPROP_ownerId());

                        * Set up the string arrays and then set the ACLs in the
                        * repository document
                        allowAcls = new String[allowAclsArrayList.size()];
                        for (int j = 0; j < allowAclsArrayList.size(); j++) {
                            allowAcls[j] = allowAclsArrayList.get(j);
                            if (Logging.connectors.isDebugEnabled())
                                        .debug("Meridio: Adding '" + allowAcls[j] + "' to allow ACLs");

                        denyAcls = new String[denyAclsArrayList.size()];
                        for (int j = 0; j < denyAclsArrayList.size(); j++) {
                            denyAcls[j] = denyAclsArrayList.get(j);
                            if (Logging.connectors.isDebugEnabled())
                                Logging.connectors.debug("Meridio: Adding '" + denyAcls[j] + "' to deny ACLs");
                    } else {
                        allowAcls = acls;
                        if (allowAcls == null)
                            denyAcls = null;
                            denyAcls = new String[] { defaultAuthorityDenyToken };

                    repositoryDocument.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT, allowAcls,

                    * Get the object's content, and ingest the document
                    try {
                        AttachmentPart ap = meridio_.getLatestVersionFile((int) docId);
                        if (null == ap) {
                            if (Logging.connectors.isDebugEnabled())
                                Logging.connectors.debug("Meridio: Failed to get content for document '"
                                        + new Long(docId).toString() + "'");
                            // No document.  Delete what's there
                            activities.noDocument(documentIdentifier, docVersion);
                        try {
                            // Get the file name.
                            String fileName = ap.getDataHandler().getName();
                            // Log what we are about to do.
                            if (Logging.connectors.isDebugEnabled())
                                Logging.connectors.debug("Meridio: File data is supposedly in " + fileName);
                            File theTempFile = new File(fileName);
                            if (theTempFile.isFile()) {
                                long fileSize = theTempFile.length(); // ap.getSize();
                                // Files whose length is not indexable are reported via noDocument() instead.
                                if (activities.checkLengthIndexable(fileSize)) {
                                    InputStream is = new FileInputStream(theTempFile); // ap.getDataHandler().getInputStream();
                                    try {
                                        repositoryDocument.setBinary(is, fileSize);

                                        if (null != activities) {
                                                    docVersion, fileURL, repositoryDocument);
                                    } finally {
                                } else {
                                    activities.noDocument(documentIdentifier, docVersion);
                            } else {
                                if (Logging.connectors.isDebugEnabled())
                                            "Meridio: Expected temporary file was not present - skipping document '"
                                                    + new Long(docId).toString() + "'");
                        } finally {

                    // Timeouts and I/O failures are wrapped in ManifoldCFException with the original
                    // exception as cause; AxisFault and RemoteException are rethrown for the outer handlers.
                    } catch (java.net.SocketTimeoutException ioex) {
                        throw new ManifoldCFException("Socket timeout exception: " + ioex.getMessage(), ioex);
                    } catch (ConnectTimeoutException ioex) {
                        throw new ManifoldCFException("Connect timeout exception: " + ioex.getMessage(), ioex);
                    } catch (InterruptedIOException e) {
                        throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
                    } catch (org.apache.axis.AxisFault e) {
                        throw e;
                    } catch (RemoteException e) {
                        throw e;
                    } catch (SOAPException soapEx) {
                        throw new ManifoldCFException(
                                "SOAP Exception encountered while retrieving document content: "
                                        + soapEx.getMessage(),
                    } catch (IOException ioex) {
                        throw new ManifoldCFException("Input stream failure: " + ioex.getMessage(), ioex);

            Logging.connectors.debug("Meridio: Exiting 'processDocuments' method");
        } catch (org.apache.axis.AxisFault e) {
            long currentTime = System.currentTimeMillis();
            // HTTP-level fault: report the HTTP error code when the fault detail carries one.
            if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://xml.apache.org/axis/", "HTTP"))) {
                org.w3c.dom.Element elem = e.lookupFaultDetail(
                        new javax.xml.namespace.QName("http://xml.apache.org/axis/", "HttpErrorCode"));
                if (elem != null) {
                    String httpErrorCode = elem.getFirstChild().getNodeValue().trim();
                    throw new ManifoldCFException("Unexpected http error code " + httpErrorCode
                            + " accessing Meridio: " + e.getMessage(), e);
                throw new ManifoldCFException(
                        "Unknown http error occurred while getting doc versions: " + e.getMessage(), e);
            if (e.getFaultCode().equals(new javax.xml.namespace.QName(
                    "http://schemas.xmlsoap.org/soap/envelope/", "Server.userException"))) {
                String exceptionName = e.getFaultString();
                if (exceptionName.equals("java.lang.InterruptedException"))
                    throw new ManifoldCFException("Interrupted", ManifoldCFException.INTERRUPTED);
            if (e.getFaultCode().equals(
                    new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/", "Server"))) {
                if (e.getFaultString().indexOf(" 23031#") != -1) {
                    // This means that the session has expired, so reset it and retry
                    meridio_ = null;

            if (Logging.connectors.isDebugEnabled())
                        .debug("Meridio: Got an unknown remote exception getting doc versions - axis fault = "
                                + e.getFaultCode().getLocalPart() + ", detail = " + e.getFaultString()
                                + " - retrying", e);
            // 300000L ms is 5 minutes and 3 * 60 * 60000L ms is 3 hours after currentTime; presumably the
            // retry time and the give-up time - confirm against the ServiceInterruption constructor.
            throw new ServiceInterruption("Remote procedure exception: " + e.getMessage(), e,
                    currentTime + 300000L, currentTime + 3 * 60 * 60000L, -1, false);
        } catch (RemoteException remoteException) {
            throw new ManifoldCFException("Meridio: A remote exception occurred while getting doc versions: "
                    + remoteException.getMessage(), remoteException);
        } catch (MeridioDataSetException meridioDataSetException) {
            throw new ManifoldCFException("Meridio: A problem occurred manipulating the Web " + "Service XML: "
                    + meridioDataSetException.getMessage(), meridioDataSetException);

From source file:org.apache.camel.component.box.internal.LongPollingEventsManager.java

/**
 * Starts a background task that repeatedly polls a Box real-time server (RTS) for events.
 *
 * <p>The calling thread resolves the starting stream position, validates the arguments,
 * assigns a new {@code DefaultHttpClient} to {@code httpClient}, and submits a
 * {@code Runnable} to {@code executorService}; the resulting future is stored in
 * {@code pollFuture}. The submitted task loops until {@code done} is set.
 *
 * <p>NOTE(review): this listing appears to have lost lines (closing braces and several
 * statement continuations are absent). The comments below describe only what is visible;
 * confirm details against the original source.
 *
 * @param streamPosition position to start polling from; when it equals
 *        {@code BoxEventRequestObject.STREAM_POSITION_NOW} it is replaced by the value
 *        returned from {@code getCurrentStreamPosition(...)}
 * @param streamType stream type; checked with {@code ObjectHelper.notEmpty}
 *        (not otherwise referenced in the visible lines)
 * @param limit not referenced in the visible lines -- presumably consumed by the truncated
 *        event request built in the NEW_CHANGE branch; TODO confirm
 * @param callback checked with {@code ObjectHelper.notNull}; its {@code onException} is
 *        invoked from the generic {@code catch (Exception e)} handler of the polling task
 * @throws BoxServerException declared; presumably raised while resolving the stream position
 * @throws AuthFatalFailureException declared; presumably raised while resolving the stream position
 * @throws BoxRestException declared; presumably raised while resolving the stream position
 */
public void poll(long streamPosition, final String streamType, final int limit, final EventCallback callback)
        throws BoxServerException, AuthFatalFailureException, BoxRestException {

    // get BoxClient Event Manager
    final IBoxEventsManager eventsManager = cachedBoxClient.getBoxClient().getEventsManager();

    // get current stream position if requested
    if (BoxEventRequestObject.STREAM_POSITION_NOW == streamPosition) {
        streamPosition = getCurrentStreamPosition(eventsManager, streamPosition);
    }

    // validate parameters
    // NOTE(review): streamPosition is a primitive long, so it is boxed for this call and the
    // null check on it cannot fail.
    ObjectHelper.notNull(streamPosition, "streamPosition");
    ObjectHelper.notEmpty(streamType, "streamType");
    ObjectHelper.notNull(callback, "eventCallback");

    httpClient = new DefaultHttpClient(cachedBoxClient.getClientConnectionManager(), httpParams);

    // start polling thread
    LOG.info("Started event polling thread for " + cachedBoxClient);

    // The anonymous Runnable can only capture final locals, hence the copy of the
    // (reassigned) streamPosition parameter.
    final long startStreamPosition = streamPosition;
    pollFuture = executorService.submit(new Runnable() {
        public void run() {

            final ObjectMapper mapper = new ObjectMapper();

            long currentStreamPosition = startStreamPosition;
            // null means "RTS details must be (re)fetched on the next iteration"
            BoxRealTimeServer realTimeServer = null;

            boolean retry = false;
            int retries = 0;
            int maxRetries = 1;

            while (!done) {
                try {
                    // reset at the start of every iteration; set to true at the end of the
                    // try block if no exceptions were thrown
                    retry = false;

                    if (realTimeServer == null) {

                        // get RTS URL
                        realTimeServer = getBoxRealTimeServer(currentStreamPosition, eventsManager);

                        // update HTTP timeout
                        // NOTE(review): the initializer of requestTimeout is truncated in this
                        // listing. The value is multiplied by 1000 before being used as the
                        // socket timeout, so it is presumably in seconds -- TODO confirm.
                        final int requestTimeout = Integer
                        final HttpParams params = httpClient.getParams();
                        HttpConnectionParams.setSoTimeout(params, requestTimeout * 1000);

                        // update maxRetries
                        maxRetries = Integer.parseInt(realTimeServer.getExtraData(MAX_RETRIES).toString());

                    // create HTTP request for RTS
                    httpGet = getPollRequest(realTimeServer.getUrl(), currentStreamPosition);

                    // execute RTS poll
                    // httpResponse stays null when the request times out, which skips the
                    // response handling below.
                    HttpResponse httpResponse = null;
                    try {
                        httpResponse = httpClient.execute(httpGet, (HttpContext) null);
                    } catch (SocketTimeoutException e) {
                        LOG.debug("Poll timed out, retrying for " + cachedBoxClient);

                    if (httpResponse != null) {

                        // parse response
                        final StatusLine statusLine = httpResponse.getStatusLine();
                        if (statusLine != null && statusLine.getStatusCode() == HttpStatus.SC_OK) {
                            // The response body is deserialized into a String-to-String map and
                            // the entry under MESSAGE selects the action to take.
                            final HttpEntity entity = httpResponse.getEntity();
                            Map<String, String> rtsResponse = mapper.readValue(entity.getContent(), Map.class);

                            final String message = rtsResponse.get(MESSAGE);
                            if (NEW_CHANGE.equals(message)) {

                                // get events
                                // NOTE(review): the construction of requestObject is truncated
                                // in this listing.
                                final BoxEventRequestObject requestObject = BoxEventRequestObject
                                final BoxEventCollection events = eventsManager.getEvents(requestObject);

                                // notify callback
                                // NOTE(review): no callback invocation is visible here in this
                                // listing -- confirm against the original source.

                                // update stream position
                                currentStreamPosition = events.getNextStreamPosition();

                            } else if (RECONNECT.equals(message) || MAX_RETRIES.equals(message)) {
                                // force a fresh RTS lookup on the next iteration
                                LOG.debug("Long poll reconnect for " + cachedBoxClient);
                                realTimeServer = null;
                            } else if (OUT_OF_DATE.equals(message)) {
                                // update currentStreamPosition
                                // NOTE(review): the argument list of the call below is truncated
                                // in this listing.
                                LOG.debug("Long poll out of date for " + cachedBoxClient);
                                currentStreamPosition = getCurrentStreamPosition(eventsManager,
                                realTimeServer = null;
                            } else {
                                throw new RuntimeCamelException("Unknown poll response " + message);
                        } else {
                            // missing status line or non-OK status: fail with a descriptive message
                            // NOTE(review): the String.format argument list is truncated in this
                            // listing.
                            String msg = "Unknown error";
                            if (statusLine != null) {
                                msg = String.format("Error polling events for %s: code=%s, message=%s",
                                        cachedBoxClient, statusLine.getStatusCode(),
                            throw new RuntimeCamelException(msg);

                    // keep polling
                    retry = true;

                } catch (InterruptedException e) {
                    // retry is still false on this path, so the finally block sets done.
                    // NOTE(review): the visible lines do not restore the thread's interrupt
                    // status -- confirm against the original source.
                    LOG.debug("Interrupted event polling thread for {}, exiting...", cachedBoxClient);
                } catch (BoxSDKException e) {
                // NOTE(review): no handler body is visible in this listing for BoxSDKException.
                } catch (RuntimeCamelException e) {
                // NOTE(review): no handler body is visible in this listing for RuntimeCamelException.
                } catch (SocketException e) {
                    // TODO handle connection aborts!!!
                    // Socket failures are retried, with a fresh RTS lookup on the next iteration.
                    LOG.debug("Socket exception while event polling for {}", cachedBoxClient);
                    retry = true;
                    realTimeServer = null;
                } catch (Exception e) {
                    // Anything else is reported to the caller-supplied callback; retry stays
                    // false, so the loop ends.
                    callback.onException(new RuntimeCamelException(
                            "Error while polling for " + cachedBoxClient + ": " + e.getMessage(), e));
                } finally {
                    // are we done yet?
                    if (!retry) {
                        done = true;
                    } else {
                        // Once more than maxRetries polls have been made against the current
                        // RTS, drop it so that the next iteration fetches it again.
                        if (realTimeServer != null && (++retries > maxRetries)) {
                            // make another option call
                            realTimeServer = null;
            LOG.info("Stopped event polling thread for " + cachedBoxClient);

From source file:org.apache.manifoldcf.crawler.connectors.rss.RSSConnector.java

/** Handle an RSS feed document, using SAX to limit the memory impact.
* The document content is read from the local cache (it is not fetched here) and handed to
* the parser; any IO failure is converted into a ManifoldCFException.
* NOTE(review): this listing appears to have lost lines (closing braces, the bodies of the
* finally blocks, and whatever follows the null-stream error log); the comments describe only
* what is visible.
*@param documentIdentifier is the identifier used to look the document up in the cache; it is
* also passed to the parsing context.
*@param activities is passed through to the parsing context.
*@param filter is passed through to the parsing context.
*@throws ManifoldCFException wrapping socket timeout, connect timeout, interruption
* (with type ManifoldCFException.INTERRUPTED) and other IO errors.
*@throws ServiceInterruption is declared; no site that throws it is visible in this listing.
*/
protected void handleRSSFeedSAX(String documentIdentifier, IProcessActivity activities, Filter filter)
        throws ManifoldCFException, ServiceInterruption {
    // The SAX model uses parsing events to control parsing, which allows me to manage memory usage much better.
    // This is essential for when a feed contains dechromed content as well as links.

    // First, catch all flavors of IO exception, and handle them properly
    try {
        // Open the input stream, and set up the parse
        InputStream is = cache.getData(documentIdentifier);
        if (is == null) {
            // NOTE(review): only the error log is visible for the missing-cache-entry case;
            // how the method exits from here is not shown in this listing.
            Logging.connectors.error("RSS: Document '" + documentIdentifier + "' should be in cache but isn't");
        try {
            Parser p = new Parser();
            // Parse the document.  This will cause various things to occur, within the instantiated XMLParsingContext class.
            XMLFuzzyHierarchicalParseState x = new XMLFuzzyHierarchicalParseState();
            // c is not referenced again in the visible lines; it is constructed with the parse
            // state x, which is what gets passed to the parser below.
            OuterContextClass c = new OuterContextClass(x, documentIdentifier, activities, filter);
            try {
                // Believe it or not, there are no parsing errors we can get back now.
                p.parseWithCharsetDetection(null, is, x);
            } finally {
            // NOTE(review): the cleanup statements of both finally blocks are missing from this listing.
        } finally {
    // java.net.SocketTimeoutException is a subclass of InterruptedIOException, so it has to be
    // caught first; otherwise a plain timeout would be reported as an interruption.
    } catch (java.net.SocketTimeoutException e) {
        throw new ManifoldCFException("Socket timeout error: " + e.getMessage(), e);
    } catch (ConnectTimeoutException e) {
        throw new ManifoldCFException("Socket connect timeout error: " + e.getMessage(), e);
    } catch (InterruptedIOException e) {
        // Tagged as INTERRUPTED so callers can tell it apart from ordinary IO failures.
        throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
    } catch (IOException e) {
        throw new ManifoldCFException("IO error: " + e.getMessage(), e);


From source file:org.apache.manifoldcf.crawler.connectors.rss.RSSConnector.java

/** Process a set of documents.
* This is the method that should cause each document to be fetched, processed, and the results either added
* to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
* The document specification allows this class to filter what is done based on the job.
* The connector will be connected before this method can be called.
*@param documentIdentifiers is the set of document identifiers to process.
*@param statuses are the currently-stored document versions for each document in the set of document identifiers
* passed in above.
*@param activities is the interface this method should use to queue up new document references
* and ingest documents.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
*@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
*/
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
        IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
        throws ManifoldCFException, ServiceInterruption {

    // The connection limit is designed to permit this connector to coexist with potentially other connectors, such as the web connector.
    // There is currently no good way to enforce connection limits across all installed connectors - this will require considerably more
    // thought to set up properly.
    int connectionLimit = 200;

    String[] fixedList = new String[2];

    if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("RSS: In getDocumentVersions for "
                + Integer.toString(documentIdentifiers.length) + " documents");

    Filter f = new Filter(spec, false);

    String[] acls = f.getAcls();
    // Sort it,

    // NOTE: There are two kinds of documents in here; documents that are RSS feeds (that presumably have a content-type
    // of text/xml), and documents that need to be indexed.
    // For the latter, the metadata etc is part of the version string.  For the former, the only thing that is part of the version string is the
    // document's checksum.
    // The need to exclude documents from fetch based on whether they match an expression causes some difficulties, because we really
    // DON'T want this to apply to the feeds themselves.  Since the distinguishing characteristic of a feed is that it is in the seed list,
    // and that its content-type is text/xml, we could use either of these characteristics to treat feeds differently from
    // fetchable urls.  But the latter approach requires a fetch, which is forbidden.  So - the spec will be used to characterize the url.
    // However, the spec might change, and the url might be dropped from the list - and then what??
    // The final solution is to simply not queue what cannot be mapped.

    int feedTimeout = f.getFeedTimeoutValue();

    // The document specification has already been used to trim out documents that are not
    // allowed from appearing in the queue.  So, even that has already been done.
    for (String documentIdentifier : documentIdentifiers) {
        // If it is in this list, we presume that it has been vetted against the map etc., so we don't do that again.  We just fetch it.
        // And, if the content type is xml, we calculate the version as if it is a feed rather than a document.

        // Get the url
        String urlValue = documentIdentifier;

        if (Logging.connectors.isDebugEnabled())
            Logging.connectors.debug("RSS: Getting version string for '" + urlValue + "'");

        String versionString;
        String ingestURL = null;
        String[] pubDates = null;
        String[] sources = null;
        String[] titles = null;
        String[] authorNames = null;
        String[] authorEmails = null;
        String[] categories = null;
        String[] descriptions = null;

        try {
            // If there's a carrydown "data" value for this url, we use that value rather than actually fetching the document.  This also means we don't need to
            // do a robots check, because we aren't actually crawling anything.  So, ALWAYS do this first...
            CharacterInput[] dechromedData = activities.retrieveParentDataAsFiles(urlValue, "data");
            try {
                if (dechromedData.length > 0) {
                    // Data already available.  The fetch cycle can be entirely avoided, as can the robots check.
                    ingestURL = f.mapDocumentURL(urlValue);
                    if (ingestURL != null) {
                        // Open up an input stream corresponding to the carrydown data.  The stream will be encoded as utf-8.
                        try {
                            InputStream is = dechromedData[0].getUtf8Stream();
                            try {
                                StringBuilder sb = new StringBuilder();
                                long checkSum = cache.addData(activities, urlValue, "text/html", is);
                                // Grab what we need from the passed-down data for the document.  These will all become part
                                // of the version string.
                                pubDates = activities.retrieveParentData(urlValue, "pubdate");
                                sources = activities.retrieveParentData(urlValue, "source");
                                titles = activities.retrieveParentData(urlValue, "title");
                                authorNames = activities.retrieveParentData(urlValue, "authorname");
                                authorEmails = activities.retrieveParentData(urlValue, "authoremail");
                                categories = activities.retrieveParentData(urlValue, "category");
                                descriptions = activities.retrieveParentData(urlValue, "description");

                                if (sources.length == 0) {
                                    if (Logging.connectors.isDebugEnabled())
                                        Logging.connectors.debug("RSS: Warning; URL '" + ingestURL
                                                + "' doesn't seem to have any RSS feed source!");

                                packList(sb, acls, '+');
                                if (acls.length > 0) {
                                    pack(sb, defaultAuthorityDenyToken, '+');
                                } else
                                // The ingestion URL
                                pack(sb, ingestURL, '+');
                                // The pub dates
                                packList(sb, pubDates, '+');
                                // The titles
                                packList(sb, titles, '+');
                                // The sources
                                packList(sb, sources, '+');
                                // The categories
                                packList(sb, categories, '+');
                                // The descriptions
                                packList(sb, descriptions, '+');
                                // The author names
                                packList(sb, authorNames, '+');
                                // The author emails
                                packList(sb, authorEmails, '+');

                                // Do the checksum part, which does not need to be parseable.
                                sb.append(new Long(checkSum).toString());

                                versionString = sb.toString();
                            } finally {
                        } catch (java.net.SocketTimeoutException e) {
                            throw new ManifoldCFException(
                                    "IO exception reading data from string: " + e.getMessage(), e);
                        } catch (InterruptedIOException e) {
                            throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
                        } catch (IOException e) {
                            throw new ManifoldCFException(
                                    "IO exception reading data from string: " + e.getMessage(), e);
                    } else {
                        // Document a seed or unmappable; just skip
                        if (Logging.connectors.isDebugEnabled())
                            Logging.connectors.debug("RSS: Skipping carry-down document '" + urlValue
                                    + "' because it is unmappable or is a seed.");
                } else {
                    // Get the old version string
                    String oldVersionString = statuses.getIndexedVersionString(documentIdentifier);

                    // Unpack the old version as much as possible.
                    // We are interested in what the ETag and Last-Modified headers were last time.
                    String lastETagValue = null;
                    String lastModifiedValue = null;
                    // Note well: Non-continuous jobs cannot use etag because the rss document MUST be fetched each time for such jobs,
                    // or the documents it points at would get deleted.
                    // NOTE: I disabled this code because we really need the feed's TTL value in order to reschedule properly.  I can't get the
                    // TTL value without refetching the document - therefore ETag and Last-Modified cannot be used :-(
                    if (false && jobMode == JOBMODE_CONTINUOUS && oldVersionString != null
                            && oldVersionString.startsWith("-")) {
                        // It's a feed, so the last etag and last-modified fields should be encoded in this version string.
                        StringBuilder lastETagBuffer = new StringBuilder();
                        int unpackPos = unpack(lastETagBuffer, oldVersionString, 1, '+');
                        StringBuilder lastModifiedBuffer = new StringBuilder();
                        unpackPos = unpack(lastModifiedBuffer, oldVersionString, unpackPos, '+');
                        if (lastETagBuffer.length() > 0)
                            lastETagValue = lastETagBuffer.toString();
                        if (lastModifiedBuffer.length() > 0)
                            lastModifiedValue = lastModifiedBuffer.toString();

                    if (Logging.connectors.isDebugEnabled()
                            && (lastETagValue != null || lastModifiedValue != null))
                                "RSS: Document '" + urlValue + "' was found to have a previous ETag value of '"
                                        + ((lastETagValue == null) ? "null" : lastETagValue)
                                        + "' and a previous Last-Modified value of '"
                                        + ((lastModifiedValue == null) ? "null" : lastModifiedValue) + "'");

                    // Robots check.  First, we need to separate the url into its components
                    URL url;
                    try {
                        url = new URL(urlValue);
                    } catch (MalformedURLException e) {
                        Logging.connectors.debug("RSS: URL '" + urlValue + "' is malformed; skipping", e);

                    String protocol = url.getProtocol();
                    int port = url.getPort();
                    String hostName = url.getHost();
                    String pathPart = url.getFile();

                    // Check with robots to see if it's allowed
                    if (robotsUsage >= ROBOTS_DATA && !robots.isFetchAllowed(currentContext, throttleGroupName,
                            protocol, port, hostName, url.getPath(), userAgent, from, proxyHost, proxyPort,
                            proxyAuthDomain, proxyAuthUsername, proxyAuthPassword, activities,
                            connectionLimit)) {
                        activities.recordActivity(null, ACTIVITY_FETCH, null, urlValue, Integer.toString(-2),
                                "Robots exclusion", null);

                        if (Logging.connectors.isDebugEnabled())
                                    .debug("RSS: Skipping url '" + urlValue + "' because robots.txt says to");

                    // Now, use the fetcher, and get the file.
                    IThrottledConnection connection = fetcher.createConnection(currentContext,
                            throttleGroupName, hostName, connectionLimit, feedTimeout, proxyHost, proxyPort,
                            proxyAuthDomain, proxyAuthUsername, proxyAuthPassword, activities);
                    try {
                        // Begin the fetch
                        try {
                            // Execute the request.
                            // Use the connect timeout from the document specification!
                            int status = connection.executeFetch(protocol, port, pathPart, userAgent, from,
                                    lastETagValue, lastModifiedValue);
                            switch (status) {
                            case IThrottledConnection.STATUS_NOCHANGE:
                                versionString = oldVersionString;
                            case IThrottledConnection.STATUS_OK:
                                try {
                                    if (Logging.connectors.isDebugEnabled())
                                        Logging.connectors.debug("RSS: Successfully fetched " + urlValue);
                                    // Document successfully fetched!
                                    // If its content is xml, presume it's a feed...
                                    String contentType = connection.getResponseHeader("Content-Type");
                                    // Some sites have multiple content types.  We just look at the LAST one in that case.
                                    if (contentType != null) {
                                        String[] contentTypes = contentType.split(",");
                                        if (contentTypes.length > 0)
                                            contentType = contentTypes[contentTypes.length - 1].trim();
                                            contentType = null;
                                    String strippedContentType = contentType;
                                    if (strippedContentType != null) {
                                        int pos = strippedContentType.indexOf(";");
                                        if (pos != -1)
                                            strippedContentType = strippedContentType.substring(0, pos).trim();
                                    boolean isXML = (strippedContentType != null
                                            && xmlContentTypes.contains(strippedContentType));
                                    ingestURL = null;
                                    if (!isXML) {
                                        // If the chromed content mode is set to "skip", and we got here, it means
                                        // we should not include the content.
                                        if (f.getChromedContentMode() == CHROMED_SKIP) {
                                            if (Logging.connectors.isDebugEnabled())
                                                Logging.connectors.debug("RSS: Removing url '" + urlValue
                                                        + "' because it no longer has dechromed content available");
                                            versionString = null;

                                        // Decide whether to exclude this document based on what we see here.
                                        // Basically, we want to get rid of everything that we don't know what
                                        // to do with in the ingestion system.
                                        if (!activities.checkMimeTypeIndexable(contentType)) {
                                            if (Logging.connectors.isDebugEnabled())
                                                Logging.connectors.debug("RSS: Removing url '" + urlValue
                                                        + "' because it had the wrong content type: "
                                                        + ((contentType == null) ? "null"
                                                                : "'" + contentType + "'"));
                                            versionString = null;

                                        ingestURL = f.mapDocumentURL(urlValue);
                                    } else {
                                        if (Logging.connectors.isDebugEnabled())
                                                    .debug("RSS: The url '" + urlValue + "' is a feed");

                                        if (!f.isSeed(urlValue)) {
                                            // Remove the feed from consideration, since it has left the list of seeds
                                            if (Logging.connectors.isDebugEnabled())
                                                Logging.connectors.debug("RSS: Removing feed url '" + urlValue
                                                        + "' because it is not a seed.");
                                            versionString = null;

                                    InputStream is = connection.getResponseBodyStream();
                                    try {
                                        long checkSum = cache.addData(activities, urlValue, contentType, is);
                                        StringBuilder sb = new StringBuilder();
                                        if (ingestURL != null) {
                                            // We think it is ingestable.  The version string accordingly starts with a "+".

                                            // Grab what we need from the passed-down data for the document.  These will all become part
                                            // of the version string.
                                            pubDates = activities.retrieveParentData(urlValue, "pubdate");
                                            sources = activities.retrieveParentData(urlValue, "source");
                                            titles = activities.retrieveParentData(urlValue, "title");
                                            authorNames = activities.retrieveParentData(urlValue, "authorname");
                                            authorEmails = activities.retrieveParentData(urlValue,
                                            categories = activities.retrieveParentData(urlValue, "category");
                                            descriptions = activities.retrieveParentData(urlValue,

                                            if (sources.length == 0) {
                                                if (Logging.connectors.isDebugEnabled())
                                                    Logging.connectors.debug("RSS: Warning; URL '" + ingestURL
                                                            + "' doesn't seem to have any RSS feed source!");

                                            packList(sb, acls, '+');
                                            if (acls.length > 0) {
                                                pack(sb, defaultAuthorityDenyToken, '+');
                                            } else
                                            // The ingestion URL
                                            pack(sb, ingestURL, '+');
                                            // The pub dates
                                            packList(sb, pubDates, '+');
                                            // The titles
                                            packList(sb, titles, '+');
                                            // The sources
                                            packList(sb, sources, '+');
                                            // The categories
                                            packList(sb, categories, '+');
                                            // The descriptions
                                            packList(sb, descriptions, '+');
                                            // The author names
                                            packList(sb, authorNames, '+');
                                            // The author emails
                                            packList(sb, authorEmails, '+');
                                        } else {
                                            String etag = connection.getResponseHeader("ETag");
                                            if (etag == null)
                                                pack(sb, "", '+');
                                                pack(sb, etag, '+');
                                            String lastModified = connection.getResponseHeader("Last-Modified");
                                            if (lastModified == null)
                                                pack(sb, "", '+');
                                                pack(sb, lastModified, '+');


                                        // Do the checksum part, which does not need to be parseable.
                                        sb.append(new Long(checkSum).toString());

                                        versionString = sb.toString();
                                    } finally {
                                } catch (java.net.SocketTimeoutException e) {
                                            .warn("RSS: Socket timeout exception fetching document contents '"
                                                    + urlValue + "' - skipping: " + e.getMessage(), e);
                                    versionString = null;
                                } catch (ConnectTimeoutException e) {
                                            .warn("RSS: Connecto timeout exception fetching document contents '"
                                                    + urlValue + "' - skipping: " + e.getMessage(), e);
                                    versionString = null;
                                } catch (InterruptedIOException e) {
                                    throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
                                } catch (IOException e) {
                                    Logging.connectors.warn("RSS: IO exception fetching document contents '"
                                            + urlValue + "' - skipping: " + e.getMessage(), e);
                                    versionString = null;


                            case IThrottledConnection.STATUS_SITEERROR:
                            case IThrottledConnection.STATUS_PAGEERROR:
                                // Record an *empty* version.
                                // This signals the processDocuments() method that we really don't want to ingest this document, but we also don't
                                // want to blow the document out of the queue, since then we'd wind up perhaps fetching it multiple times.
                                versionString = "";
                        } finally {
                    } finally {

                    if (versionString == null) {

                    if (!(versionString.length() == 0
                            || activities.checkDocumentNeedsReindexing(documentIdentifier, versionString)))

                    // Process document!
                    if (Logging.connectors.isDebugEnabled())
                        Logging.connectors.debug("RSS: Processing '" + urlValue + "'");

                    // The only links we extract come from documents that we think are RSS feeds.
                    // When we think that's the case, we attempt to parse it as RSS XML.
                    if (ingestURL == null) {
                        if (Logging.connectors.isDebugEnabled())
                            Logging.connectors.debug("RSS: Interpreting document '" + urlValue + "' as a feed");

                        // We think it is a feed.
                        // If this is a continuous job, AND scanonly is true, it means that the document was either identical to the
                        // previous fetch, or was not fetched at all.  In that case, it may not even be there, and we *certainly* don't
                        // want to attempt to process it in any case.

                        // NOTE: I re-enabled the scan permanently because we need the TTL value to be set whatever the cost.  If the
                        // TTL value is not set, we default to the specified job's feed-rescan time, which is not going to be current enough for some feeds.
                        if (true || jobMode != JOBMODE_CONTINUOUS) {
                            handleRSSFeedSAX(urlValue, activities, f);
                            if (Logging.connectors.isDebugEnabled())
                                Logging.connectors.debug("RSS: Extraction of feed '" + urlValue + "' complete");

                            // Record the feed's version string, so we won't refetch unless needed.
                            // This functionality is required for the last ETag and Last-Modified fields to be sent to the rss server, and to
                            // keep track of the adaptive parameters.
                            activities.recordDocument(documentIdentifier, versionString);
                        } else {
                            // The problem here is that we really do need to set the rescan time to something reasonable.
                            // But we might not even have read the feed!  So what to do??
                            // One answer is to build a connector-specific table that carries the last value of every feed around.
                            // Another answer is to change the version code to always read the feed (and the heck with ETag and Last-Modified).
                            if (Logging.connectors.isDebugEnabled())
                                Logging.connectors.debug("RSS: Feed '" + urlValue
                                        + "' does not appear to differ from previous fetch for a continuous job; not extracting!");

                            long currentTime = System.currentTimeMillis();

                            Long defaultRescanTime = f.getDefaultRescanTime(currentTime);

                            if (defaultRescanTime != null) {
                                Long minimumTime = f.getMinimumRescanTime(currentTime);
                                if (minimumTime != null) {
                                    if (defaultRescanTime.longValue() < minimumTime.longValue())
                                        defaultRescanTime = minimumTime;

                            activities.setDocumentScheduleBounds(urlValue, defaultRescanTime, defaultRescanTime,
                                    null, null);

                    } else {
                        if (Logging.connectors.isDebugEnabled())
                            Logging.connectors.debug("RSS: Interpreting '" + urlValue + "' as a document");

                        String errorCode = null;
                        String errorDesc = null;
                        long startTime = System.currentTimeMillis();
                        Long fileLengthLong = null;
                        try {
                            long documentLength = cache.getDataLength(documentIdentifier);
                            if (!activities.checkLengthIndexable(documentLength)) {
                                activities.noDocument(documentIdentifier, versionString);
                                errorCode = activities.EXCLUDED_LENGTH;
                                errorDesc = "Document rejected because of length (" + documentLength + ")";
                                if (Logging.connectors.isDebugEnabled())
                                    Logging.connectors.debug("RSS: Skipping document '" + urlValue
                                            + "' because its length was rejected (" + documentLength + ")");

                            if (!activities.checkURLIndexable(documentIdentifier)) {
                                activities.noDocument(documentIdentifier, versionString);
                                errorCode = activities.EXCLUDED_URL;
                                errorDesc = "Document rejected because of URL ('" + documentIdentifier + "')";
                                if (Logging.connectors.isDebugEnabled())
                                    Logging.connectors.debug("RSS: Skipping document '" + urlValue
                                            + "' because its URL was rejected ('" + documentIdentifier + "')");

                            // Check if it's a recognized content type
                            String contentType = cache.getContentType(documentIdentifier);
                            // Some sites have multiple content types.  We just look at the LAST one in that case.
                            if (contentType != null) {
                                String[] contentTypes = contentType.split(",");
                                if (contentTypes.length > 0)
                                    contentType = contentTypes[contentTypes.length - 1].trim();
                                    contentType = null;
                            if (!activities.checkMimeTypeIndexable(contentType)) {
                                activities.noDocument(documentIdentifier, versionString);
                                errorCode = activities.EXCLUDED_MIMETYPE;
                                errorDesc = "Document rejected because of mime type (" + contentType + ")";
                                if (Logging.connectors.isDebugEnabled())
                                    Logging.connectors.debug("RSS: Skipping document '" + urlValue
                                            + "' because its mime type was rejected ('" + contentType + "')");

                            // Treat it as an ingestable document.

                            long dataSize = cache.getDataLength(urlValue);
                            RepositoryDocument rd = new RepositoryDocument();

                            // Set content type
                            if (contentType != null)

                            // Turn into acls and add into description
                            String[] denyAcls;
                            if (acls == null)
                                denyAcls = null;
                            else if (acls.length == 0)
                                denyAcls = new String[0];
                                denyAcls = new String[] { defaultAuthorityDenyToken };

                            if (acls != null && denyAcls != null)
                                rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT, acls, denyAcls);

                            if (titles != null && titles.length > 0)
                                rd.addField("title", titles);
                            if (authorNames != null && authorNames.length > 0)
                                rd.addField("authorname", authorNames);
                            if (authorEmails != null && authorEmails.length > 0)
                                rd.addField("authoremail", authorEmails);
                            if (descriptions != null && descriptions.length > 0)
                                rd.addField("summary", descriptions);
                            if (sources != null && sources.length > 0)
                                rd.addField("source", sources);
                            if (categories != null && categories.length > 0)
                                rd.addField("category", categories);

                            // The pubdates are a ms since epoch value; we want the minimum one for the origination time.
                            Long minimumOrigTime = null;
                            if (pubDates != null && pubDates.length > 0) {
                                String[] pubDateValuesISO = new String[pubDates.length];
                                TimeZone tz = TimeZone.getTimeZone("UTC");
                                DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm'Z'");
                                for (int k = 0; k < pubDates.length; k++) {
                                    String pubDate = pubDates[k];
                                    try {
                                        Long pubDateLong = new Long(pubDate);
                                        if (minimumOrigTime == null
                                                || pubDateLong.longValue() < minimumOrigTime.longValue())
                                            minimumOrigTime = pubDateLong;
                                        pubDateValuesISO[k] = df.format(new Date(pubDateLong.longValue()));
                                    } catch (NumberFormatException e) {
                                        // Do nothing; the version string seems to not mean anything
                                        pubDateValuesISO[k] = "";
                                rd.addField("pubdate", pubDates);
                                rd.addField("pubdateiso", pubDateValuesISO);

                            if (minimumOrigTime != null)
                                activities.setDocumentOriginationTime(urlValue, minimumOrigTime);

                            InputStream is = cache.getData(urlValue);
                            if (is != null) {
                                try {
                                    rd.setBinary(is, dataSize);
                                    try {
                                                versionString, ingestURL, rd);
                                        errorCode = "OK";
                                        fileLengthLong = new Long(dataSize);
                                    } catch (IOException e) {
                                        errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                                        errorDesc = e.getMessage();
                                        handleIOException(e, "reading data");
                                } finally {
                                    try {
                                    } catch (IOException e) {
                                        errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                                        errorDesc = e.getMessage();
                                        handleIOException(e, "closing stream");
                        } catch (ManifoldCFException e) {
                            if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
                                errorCode = null;
                            throw e;
                        } finally {
                            if (errorCode != null)
                                activities.recordActivity(new Long(startTime), ACTIVITY_PROCESS, null, urlValue,
                                        errorCode, errorDesc, null);
            } finally {
                for (CharacterInput ci : dechromedData) {
                    if (ci != null)

        } finally {
            // Remove any fetched documents.

From source file:org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector.java

/** Is the document text, as far as we can tell? */
protected boolean isDocumentText(String documentURI) throws ManifoldCFException {
    try {//from   w  w  w  . j  a  va 2 s  . c o m
        // Look at the first 4K
        byte[] byteBuffer = new byte[4096];
        int amt;

        // Open file for reading.
        InputStream is = cache.getData(documentURI);
        if (is == null)
            return false;
        try {
            amt = 0;
            while (amt < byteBuffer.length) {
                int incr = is.read(byteBuffer, amt, byteBuffer.length - amt);
                if (incr == -1)
                amt += incr;
        } finally {

        if (amt == 0)
            return false;

        return isText(byteBuffer, amt);
    } catch (SocketTimeoutException e) {
        throw new ManifoldCFException("Socket timeout exception accessing cached document: " + e.getMessage(),
    } catch (ConnectTimeoutException e) {
        throw new ManifoldCFException("Socket timeout exception accessing cached document: " + e.getMessage(),
    } catch (InterruptedIOException e) {
        throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
    } catch (IOException e) {
        throw new ManifoldCFException("IO exception accessing cached document: " + e.getMessage(), e);

From source file:org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector.java

/**
 * Handle document references from XML.  Right now we only understand RSS.
 *
 * <p>Looks up the cached response for {@code documentURI}, checks its response code and
 * declared mime type, and then feeds the cached bytes to a {@code Parser} using an
 * {@code XMLFuzzyHierarchicalParseState} and an {@code OuterContextClass} built around
 * {@code handler}.
 *
 * <p>NOTE(review): this excerpt appears to have lost several lines (statements guarded by the
 * bare {@code if}s, the bodies of both {@code finally} blocks, and the closing braces); the
 * inline notes below mark each gap and should be confirmed against the original source.
 *
 * @param documentURI the URI used to look the document up in the cache
 * @param handler passed to the {@code OuterContextClass} constructor; presumably receives the
 *        extracted references - confirm against {@code OuterContextClass}
 * @throws ManifoldCFException wrapping any socket timeout, connect timeout, interruption
 *         (with {@code ManifoldCFException.INTERRUPTED}) or other I/O error
 * @throws ServiceInterruption declared, but no visible line throws it directly; presumably it
 *         propagates from a callee - TODO confirm
 */
protected void handleXML(String documentURI, IXMLHandler handler)
        throws ManifoldCFException, ServiceInterruption {
    try {
        int responseCode = cache.getResponseCode(documentURI);
        // NOTE(review): the statement guarded by this check is missing from the excerpt;
        // presumably an early return for non-200 responses - confirm.
        if (responseCode != 200)

        // We ONLY look for XML if the content type *says* it is XML.
        String contentType = extractContentType(cache.getContentType(documentURI));
        String mimeType = extractMimeType(contentType);
        // Exact, case-sensitive comparison against the mime types treated as XML.
        // NOTE(review): assumes extractMimeType never returns null - verify.
        boolean isXML = mimeType.equals("text/xml") || mimeType.equals("application/rss+xml")
                || mimeType.equals("application/xml") || mimeType.equals("application/atom+xml")
                || mimeType.equals("application/xhtml+xml") || mimeType.equals("text/XML")
                || mimeType.equals("application/rdf+xml") || mimeType.equals("text/application")
                || mimeType.equals("XML");

        // NOTE(review): the guarded statement is missing here as well; presumably an early
        // return when the content is not XML - confirm.
        if (!isXML)

        // OK, it's XML.  Now what?  Well, we get the encoding, and we verify that it is text, then we try to get links
        // from it presuming it is an RSS feed.

        String encoding = extractEncoding(contentType);

        InputStream is = cache.getData(documentURI);
        if (is == null) {
            Logging.connectors.error("WEB: Document '" + documentURI + "' should be in cache but isn't");
        // NOTE(review): the end of the null-check block (likely a return and closing brace) is
        // missing from the excerpt - confirm.
        try {
            // Parse the document.  This will cause various things to occur, within the instantiated XMLParsingContext class.
            Parser p = new Parser();
            XMLFuzzyHierarchicalParseState x = new XMLFuzzyHierarchicalParseState();
            // NOTE(review): c is not referenced again in the visible lines; a statement wiring
            // it into the parse state may have been lost - confirm.
            OuterContextClass c = new OuterContextClass(x, documentURI, handler);
            try {
                p.parseWithCharsetDetection(encoding, is, x);
            // NOTE(review): both finally bodies below are empty in this excerpt; presumably they
            // clean up the parse state and close the input stream - confirm.
            } finally {
        } finally {
    } catch (java.net.SocketTimeoutException e) {
        throw new ManifoldCFException("Socket timeout exception: " + e.getMessage(), e);
    } catch (ConnectTimeoutException e) {
        throw new ManifoldCFException("Socket connect timeout exception: " + e.getMessage(), e);
    } catch (InterruptedIOException e) {
        //Logging.connectors.warn("IO interruption seen",e);

        // Interruptions carry the INTERRUPTED error code, unlike the other I/O failures here.
        throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
    } catch (IOException e) {
        throw new ManifoldCFException("IO error: " + e.getMessage(), e);

From source file:org.zaproxy.zap.extension.ascanrulesAlpha.SQLInjectionSQLite.java

/**
 * Scans for SQL Injection vulnerabilities, using SQLite specific syntax. If it doesn't use
 * specifically SQLite syntax, it does not belong in here, but in TestSQLInjection.
 */
public void scan(HttpMessage originalMessage, String paramName, String originalParamValue) {

    try {
        // the original message passed to us never has the response populated. fix that by
        // re-retrieving it..
        sendAndReceive(originalMessage, false); // do not follow redirects

        // Do time based SQL injection checks..
        // Timing Baseline check: we need to get the time that it took the original query, to
        // know if the time based check is working correctly..
        HttpMessage msgTimeBaseline = getNewMsg();
        long originalTimeStarted = System.currentTimeMillis();
        try {
        } catch (java.net.SocketTimeoutException e) {
            // to be expected occasionally, if the base query was one that contains some
            // parameters exploiting time based SQL injection?
            if (this.debugEnabled)
                log.debug("The Base Time Check timed out on [" + msgTimeBaseline.getRequestHeader().getMethod()
                        + "] URL [" + msgTimeBaseline.getRequestHeader().getURI().getURI() + "]");
        long originalTimeUsed = System.currentTimeMillis() - originalTimeStarted;
        // if the time was very slow (because JSP was being compiled on first call, for
        // instance)
        // then the rest of the time based logic will fail.  Lets double-check for that scenario
        // by requesting the url again.
        // If it comes back in a more reasonable time, we will use that time instead as our
        // baseline.  If it comes back slowly again,
        // we will abort the check on this URL, since we would only spend lots of time sending
        // requests, when we will (very likely) not get positive results.
        if (originalTimeUsed > 5000) {
            long originalTimeStarted2 = System.currentTimeMillis();
            try {
            } catch (java.net.SocketTimeoutException e) {
                // to be expected occasionally, if the base query was one that contains some
                // parameters exploiting time based SQL injection?
                if (this.debugEnabled)
                            "Base Time Check 2 timed out on [" + msgTimeBaseline.getRequestHeader().getMethod()
                                    + "] URL [" + msgTimeBaseline.getRequestHeader().getURI().getURI() + "]");
            long originalTimeUsed2 = System.currentTimeMillis() - originalTimeStarted2;
            if (originalTimeUsed2 > 5000) {
                // no better the second time around.  we need to bale out.
                if (this.debugEnabled)
                    log.debug("Both base time checks 1 and 2 for ["
                            + msgTimeBaseline.getRequestHeader().getMethod() + "] URL ["
                            + msgTimeBaseline.getRequestHeader().getURI().getURI()
                            + "] are way too slow to be usable for the purposes of checking for time based SQL Injection checking.  We are aborting the check on this particular url.");
            } else {
                // phew.  the second time came in within the limits. use the later timing
                // details as the base time for the checks.
                originalTimeUsed = originalTimeUsed2;
                originalTimeStarted = originalTimeStarted2;
        // end of timing baseline check

        int countTimeBasedRequests = 0;
        if (this.debugEnabled)
            log.debug("Scanning URL [" + getBaseMsg().getRequestHeader().getMethod() + "] ["
                    + getBaseMsg().getRequestHeader().getURI() + "], [" + paramName + "] with value ["
                    + originalParamValue + "] for SQL Injection");

        // SQLite specific time-based SQL injection checks
        boolean foundTimeBased = false;
        for (int timeBasedSQLindex = 0; timeBasedSQLindex < SQL_SQLITE_TIME_REPLACEMENTS.length && doTimeBased
                && countTimeBasedRequests < doTimeMaxRequests && !foundTimeBased; timeBasedSQLindex++) {
            // since we have no means to create a deterministic delay in SQLite, we need to take
            // a different approach:
            // in each iteration, increase the number of random blobs for SQLite to create.  If
            // we can detect an increasing delay, we know
            // that the payload has been successfully injected.
            int numberOfSequentialIncreases = 0;
            String detectableDelayParameter = null;
            long detectableDelay = 0;
            String maxDelayParameter = null;
            long maxDelay = 0;
            HttpMessage detectableDelayMessage = null;
            long previousDelay = originalTimeUsed;
            boolean potentialTimeBasedSQLInjection = false;
            boolean timeExceeded = false;

            for (long numBlobsToCreate = minBlobBytes; numBlobsToCreate <= this.maxBlobBytes && !timeExceeded
                    && numberOfSequentialIncreases < incrementalDelayIncreasesForAlert; numBlobsToCreate *= 10) {

                HttpMessage msgDelay = getNewMsg();
                String newTimeBasedInjectionValue = SQL_SQLITE_TIME_REPLACEMENTS[timeBasedSQLindex]
                        .replace("<<<<ORIGINALVALUE>>>>", originalParamValue);
                newTimeBasedInjectionValue = newTimeBasedInjectionValue.replace("<<<<NUMBLOBBYTES>>>>",
                setParameter(msgDelay, paramName, newTimeBasedInjectionValue);

                if (this.debugEnabled)
                    log.debug("\nTrying '" + newTimeBasedInjectionValue
                            + "'. The number of Sequential Increases already is "
                            + numberOfSequentialIncreases);

                // send it.
                long modifiedTimeStarted = System.currentTimeMillis();
                try {
                } catch (java.net.SocketTimeoutException e) {
                    // to be expected occasionally, if the request contains some parameters
                    // exploiting time based SQL injection
                    if (this.debugEnabled)
                        log.debug("The time check query timed out on ["
                                + msgTimeBaseline.getRequestHeader().getMethod() + "] URL ["
                                + msgTimeBaseline.getRequestHeader().getURI().getURI() + "] on field: ["
                                + paramName + "]");
                long modifiedTimeUsed = System.currentTimeMillis() - modifiedTimeStarted;

                // before we do the time based checking, first check for a known error message
                // from the attack, indicating a SQL injection vulnerability
                for (Pattern errorMessagePattern : errorMessagePatterns) {
                    Matcher matcher = errorMessagePattern.matcher(msgDelay.getResponseBody().toString());
                    boolean errorFound = matcher.find();
                    if (errorFound) {
                        // Likely an error based SQL Injection. Raise it
                        String extraInfo = Constant.messages.getString(
                        // raise the alert
                        bingo(Alert.RISK_HIGH, Alert.CONFIDENCE_MEDIUM, getName(), getDescription(),
                                getBaseMsg().getRequestHeader().getURI().getURI(), // url
                                paramName, newTimeBasedInjectionValue, extraInfo, getSolution(),
                                errorMessagePattern.toString(), this.getCweId(), this.getWascId(), msgDelay);

                        if (this.debugEnabled)
                            log.debug("A likely Error Based SQL Injection Vulnerability has been found with ["
                                    + msgDelay.getRequestHeader().getMethod() + "] URL ["
                                    + msgDelay.getRequestHeader().getURI().getURI() + "] on field: ["
                                    + paramName + "], by matching for pattern ["
                                    + errorMessagePattern.toString() + "]");
                        foundTimeBased = true; // yeah, I know. we found an error based, while looking
                        // for a time based. bale out anyways.
                        break; // out of the loop
                // outta the time based loop..
                if (foundTimeBased)

                // no error message detected from the time based attack.. continue looking for
                // time based injection point.

                // cap the time we will delay by to 10 seconds
                if (modifiedTimeUsed > 10000)
                    timeExceeded = true;

                boolean parseTimeEquivalent = false;
                if (modifiedTimeUsed > previousDelay) {
                    if (this.debugEnabled)
                        log.debug("The response time " + modifiedTimeUsed + " is > the previous response time "
                                + previousDelay);
                    // in order to rule out false positives due to the increasing SQL parse time
                    // for longer parameter values
                    // we send a random (alphanumeric only) string value of the same length as
                    // the attack parameter
                    // we expect the response time for the SQLi attack to be greater than or
                    // equal to the response time for
                    // the random alphanumeric string parameter
                    // if this is not the case, then we assume that the attack parameter is not
                    // a potential SQL injection causing payload.
                    HttpMessage msgParseDelay = getNewMsg();
                    String parseDelayCheckParameter = RandomStringUtils
                            .random(newTimeBasedInjectionValue.length(), RANDOM_PARAMETER_CHARS);
                    setParameter(msgParseDelay, paramName, parseDelayCheckParameter);
                    long parseDelayTimeStarted = System.currentTimeMillis();
                    long parseDelayTimeUsed = System.currentTimeMillis() - parseDelayTimeStarted;

                    // figure out if the attack delay and the (non-sql-injection) parse delay
                    // are within X ms of each other..
                    parseTimeEquivalent = (Math
                            .abs(modifiedTimeUsed - parseDelayTimeUsed) < this.parseDelayDifference);
                    if (this.debugEnabled)
                        log.debug("The parse time a random parameter of the same length is "
                                + parseDelayTimeUsed + ", so the attack and random parameter are "
                                + (parseTimeEquivalent ? "" : "NOT ")
                                + "equivalent (given the user defined attack threshold)");

                if (modifiedTimeUsed > previousDelay && !parseTimeEquivalent) {

                    maxDelayParameter = newTimeBasedInjectionValue;
                    maxDelay = modifiedTimeUsed;

                    // potential for SQL injection, detectable with "numBlobsToCreate" random
                    // blobs being created..
                    if (!potentialTimeBasedSQLInjection) {
                        if (log.isDebugEnabled())
                            log.debug("Setting the Detectable Delay parameter to '" + newTimeBasedInjectionValue
                                    + "'");
                        detectableDelayParameter = newTimeBasedInjectionValue;
                        detectableDelay = modifiedTimeUsed;
                        detectableDelayMessage = msgDelay;
                    potentialTimeBasedSQLInjection = true;
                } else {
                    // either no SQL injection, invalid SQL syntax, or timing difference is not
                    // detectable with "numBlobsToCreate" random blobs being created.
                    // keep trying with larger numbers of "numBlobsToCreate", since that's the
                    // thing we can most easily control and verify
                    // note also: if for some reason, an earlier attack with a smaller number of
                    // blobs indicated there might be a vulnerability
                    // then this case will rule that out if it was a fluke...
                    // the timing delay must keep increasing, as the number of blobs is
                    // increased.
                    potentialTimeBasedSQLInjection = false;
                    numberOfSequentialIncreases = 0;
                    detectableDelayParameter = null;
                    detectableDelay = 0;
                    detectableDelayMessage = null;
                    maxDelayParameter = null;
                    maxDelay = 0;
                    // do not break at this point, since we may simply need to keep increasing
                    // numBlobsToCreate to
                    // a point where we can detect the resulting delay
                if (this.debugEnabled)
                    log.debug("Time Based SQL Injection test for " + numBlobsToCreate + " random blob bytes: ["
                            + newTimeBasedInjectionValue + "] on field: [" + paramName + "] with value ["
                            + newTimeBasedInjectionValue + "] took " + modifiedTimeUsed
                            + "ms, where the original took " + originalTimeUsed + "ms");
                previousDelay = modifiedTimeUsed;

                // bale out if we were asked nicely
                if (isStop()) {
                    if (this.debugEnabled)
                        log.debug("Stopping the scan due to a user request");
            } // end of for loop to increase the number of random blob bytes to create

            // the number of times that we could sequentially increase the delay by increasing
            // the "number of random blob bytes to create"
            // is the basis for the threshold of the alert.  In some cases, the user may want to
            // see a solid increase in delay
            // for say 4 or 5 iterations, in order to be confident the vulnerability exists.  In
            // other cases, the user may be happy with just 2 sequential increases...
            if (this.debugEnabled)
                log.debug("Number of sequential increases: " + numberOfSequentialIncreases);
            if (numberOfSequentialIncreases >= this.incrementalDelayIncreasesForAlert) {
                // Likely a SQL Injection. Raise it
                String extraInfo = Constant.messages.getString(
                        "ascanalpha.sqlinjection.sqlite.alert.timebased.extrainfo", detectableDelayParameter,
                        detectableDelay, maxDelayParameter, maxDelay, originalParamValue, originalTimeUsed);

                // raise the alert
                bingo(Alert.RISK_HIGH, Alert.CONFIDENCE_MEDIUM, getName(), getDescription(),
                        getBaseMsg().getRequestHeader().getURI().getURI(), // url
                        paramName, detectableDelayParameter, extraInfo, getSolution(),
                        extraInfo /*as evidence*/, this.getCweId(), this.getWascId(), detectableDelayMessage);

                if (this.debugEnabled)
                    log.debug("A likely Time Based SQL Injection Vulnerability has been found with ["
                            + detectableDelayMessage.getRequestHeader().getMethod() + "] URL ["
                            + detectableDelayMessage.getRequestHeader().getURI().getURI() + "] on field: ["
                            + paramName + "]");

                // outta the time based loop..
                foundTimeBased = true;
            } // the user-define threshold has been exceeded. raise it.

            // outta the time based loop..
            if (foundTimeBased)

            // bale out if we were asked nicely
            if (isStop()) {
                if (this.debugEnabled)
                    log.debug("Stopping the scan due to a user request");
        } // for each time based SQL index
          // end of check for SQLite time based SQL Injection

        // TODO: fix this logic, because it is already broken: it reports versions 2.2 and 4.0
        // (false positives).
        doUnionBased = false;

        // try to get the version of SQLite, using a UNION based SQL injection vulnerability
        // do this regardless of whether we already found a vulnerability using another
        // technique.
        if (doUnionBased) {
            int unionRequests = 0;
            // catch 3.0, 3.0.1,,, etc
            Pattern versionNumberPattern = Pattern.compile(
            String candidateValues[] = { "", originalParamValue };
            // shonky break label. labels the loop to break out of.  I believe I just finished a
            // sentence with a preposition too. Oh My.
            unionLoops: for (String sqliteVersionFunction : SQLITE_VERSION_FUNCTIONS) {
                for (String statementTypeCloser : SYNTACTIC_PREVIOUS_STATEMENT_TYPE_CLOSERS) {
                    for (String statementClauseCloser : SYNTACTIC_PREVIOUS_STATEMENT_CLAUSE_CLOSERS) {
                        for (String unionAdditionalColms : UNION_ADDITIONAL_COLUMNS) {
                            for (String nextStatementCommenter : SYNTACTIC_NEXT_STATEMENT_COMMENTER) {
                                for (String statementUnionStatement : SYNTACTIC_UNION_STATEMENTS) {
                                    for (String value : candidateValues) {
                                        // are we out of lives yet?
                                        // TODO: fix so that the logic does not spin through the
                                        // loop headers to get out of all of the nested loops..
                                        // without using the shonky break to label logic
                                        if (unionRequests > doUnionMaxRequests) {
                                            break unionLoops;

                                        String unionAttack = UNION_ATTACK_TEMPLATE;
                                        unionAttack = unionAttack.replace("<<<<SQLITE_VERSION_FUNCTION>>>>",
                                        unionAttack = unionAttack.replace(
                                        unionAttack = unionAttack.replace(
                                        unionAttack = unionAttack.replace("<<<<UNIONADDITIONALCOLUMNS>>>>",
                                        unionAttack = unionAttack.replace(
                                        unionAttack = unionAttack.replace("<<<<UNIONSTATEMENT>>>>",
                                        unionAttack = unionAttack.replace("<<<<VALUE>>>>", value);

                                        if (log.isDebugEnabled())
                                            log.debug("About to try to determine the SQLite version with ["
                                                    + unionAttack + "]");
                                        HttpMessage unionAttackMessage = getNewMsg();
                                        setParameter(unionAttackMessage, paramName, unionAttack);

                                        // check the response for the version information..
                                        Matcher matcher = versionNumberPattern
                                        while (matcher.find()) {
                                            String versionNumber = matcher.group();
                                            Pattern actualVersionNumberPattern = Pattern
                                                    .compile("\\Q" + versionNumber + "\\E", PATTERN_PARAM);
                                            if (log.isDebugEnabled())
                                                log.debug("Found a candidate SQLite version number '"
                                                        + versionNumber
                                                        + "'. About to look for the absence of '"
                                                        + actualVersionNumberPattern
                                                        + "' in the (re-created) original response body (of length "
                                                        + originalMessage.getResponseBody().toString().length()
                                                        + ") to validate it");

                                            // if the version number was not in the original*
                                            // response, we will call it..
                                            Matcher matcherVersionInOriginal = actualVersionNumberPattern
                                            if (!matcherVersionInOriginal.find()) {
                                                // we have the SQLite version number..
                                                if (log.isDebugEnabled())
                                                            "We found SQLite version [" + versionNumber + "]");

                                                String extraInfo = Constant.messages.getString(
                                                // raise the alert
                                                bingo(Alert.RISK_HIGH, Alert.CONFIDENCE_MEDIUM,
                                                        getName() + " - " + versionNumber, getDescription(),
                                                        getBaseMsg().getRequestHeader().getURI().getURI(), // url
                                                        paramName, unionAttack, extraInfo, getSolution(),
                                                        versionNumber /*as evidence*/, this.getCweId(),
                                                        this.getWascId(), unionAttackMessage);
                                                break unionLoops;
                                        // bale out if we were asked nicely
                                        if (isStop()) {
                                            if (this.debugEnabled)
                                                log.debug("Stopping the scan due to a user request");
        } // end of doUnionBased

    } catch (InvalidRedirectLocationException | UnknownHostException | URIException e) {
        if (log.isDebugEnabled()) {
            log.debug("Failed to send HTTP message, cause: " + e.getMessage());
    } catch (Exception e) {
        // Do not try to internationalise this.. we need an error message in any event..
        // if it's in English, it's still better than not having it at all.
        log.error("An error occurred checking a url for SQLite SQL Injection vulnerabilities", e);