List of usage examples for java.net SocketTimeoutException getMessage
public String getMessage()
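The examples below share one pattern: catch java.net.SocketTimeoutException around a network or cached-stream operation and fold e.getMessage() (the JDK-supplied detail such as "connect timed out" or "Read timed out") into a log entry or a higher-level exception. A minimal, self-contained sketch of that pattern, assuming an illustrative host, port, and timeout only:

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.SocketTimeoutException;

public class SocketTimeoutMessageExample {
    public static void main(String[] args) {
        try (Socket socket = new Socket()) {
            // Illustrative endpoint; a non-routable address typically forces a connect timeout.
            socket.connect(new InetSocketAddress("10.255.255.1", 80), 1000);
            socket.setSoTimeout(1000); // read timeout for subsequent I/O
            socket.getInputStream().read();
        } catch (SocketTimeoutException e) {
            // getMessage() carries the timeout detail, e.g. "connect timed out" or "Read timed out".
            System.err.println("Timed out: " + e.getMessage());
        } catch (IOException e) {
            System.err.println("I/O error: " + e.getMessage());
        }
    }
}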
From source file: org.apache.manifoldcf.crawler.connectors.meridio.MeridioConnector.java
/** Process a set of documents. * This is the method that should cause each document to be fetched, processed, and the results either added * to the queue of documents for the current job, and/or entered into the incremental ingestion manager. * The document specification allows this class to filter what is done based on the job. * The connector will be connected before this method can be called. *@param documentIdentifiers is the set of document identifiers to process. *@param statuses are the currently-stored document versions for each document in the set of document identifiers * passed in above./*ww w . j a v a2s. c o m*/ *@param activities is the interface this method should use to queue up new document references * and ingest documents. *@param jobMode is an integer describing how the job is being run, whether continuous or once-only. *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one. */ @Override public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec, IProcessActivity activities, int jobMode, boolean usesDefaultAuthority) throws ManifoldCFException, ServiceInterruption { // Get forced acls/security enable/disable String[] acls = getAcls(spec); // Sort it, in case it is needed. if (acls != null) java.util.Arrays.sort(acls); // Look at the metadata attributes. // So that the version strings are comparable, we will put them in an array first, and sort them. Set<String> holder = new HashSet<String>(); String pathAttributeName = null; MatchMap matchMap = new MatchMap(); boolean allMetadata = false; int i = 0; while (i < spec.getChildCount()) { SpecificationNode n = spec.getChild(i++); if (n.getType().equals("ReturnedMetadata")) { String category = n.getAttributeValue("category"); String attributeName = n.getAttributeValue("property"); String metadataName; if (category == null || category.length() == 0) metadataName = attributeName; else metadataName = category + "." + attributeName; holder.add(metadataName); } else if (n.getType().equals("AllMetadata")) { String value = n.getAttributeValue("value"); if (value != null && value.equals("true")) { allMetadata = true; } } else if (n.getType().equals("pathnameattribute")) pathAttributeName = n.getAttributeValue("value"); else if (n.getType().equals("pathmap")) { // Path mapping info also needs to be looked at, because it affects what is // ingested. String pathMatch = n.getAttributeValue("match"); String pathReplace = n.getAttributeValue("replace"); matchMap.appendMatchPair(pathMatch, pathReplace); } } while (true) { getSession(); // The version string returned must include everything that could affect what is ingested. In meridio's // case, this includes the date stamp, but it also includes the part of the specification that describes // the metadata desired. // The code here relies heavily on the search method to do it's thing. The search method originally // used the document specification to determine what metadata to return, which was problematic because that // meant this method had to modify the specification (not good practice), and was also wrong from the point // of view that we need to get the metadata specification appended to the version string in some way, and // use THAT data in processDocuments(). So I've broken all that up. 
try { // Put into an array ReturnMetadata[] categoryPropertyValues; String[] categoryPropertyStringValues; String[] sortArray; if (allMetadata) { categoryPropertyStringValues = getMeridioDocumentProperties(); } else { categoryPropertyStringValues = new String[holder.size()]; i = 0; for (String value : holder) { categoryPropertyStringValues[i++] = value; } } // Sort! java.util.Arrays.sort(categoryPropertyStringValues); categoryPropertyValues = new ReturnMetadata[categoryPropertyStringValues.length]; i = 0; for (String value : categoryPropertyStringValues) { int dotIndex = value.indexOf("."); String categoryName = null; String propertyName; if (dotIndex == -1) propertyName = value; else { categoryName = value.substring(0, dotIndex); propertyName = value.substring(dotIndex + 1); } categoryPropertyValues[i++] = new ReturnMetadata(categoryName, propertyName); } // Prepare the part of the version string that is decodeable StringBuilder decodeableString = new StringBuilder(); // Add the metadata piece first packList(decodeableString, categoryPropertyStringValues, '+'); // Now, put in the forced acls. // The version string needs only to contain the forced acls, since the version date captures changes // made to the acls that are actually associated with the document. if (acls == null) decodeableString.append('-'); else { decodeableString.append('+'); packList(decodeableString, acls, '+'); decodeableString.append('+'); pack(decodeableString, defaultAuthorityDenyToken, '+'); } // Calculate the part of the version string that comes from path name and mapping. if (pathAttributeName != null) { decodeableString.append("+"); pack(decodeableString, pathAttributeName, '+'); pack(decodeableString, matchMap.toString(), '+'); } else decodeableString.append("-"); long[] docIds = new long[documentIdentifiers.length]; for (i = 0; i < documentIdentifiers.length; i++) { docIds[i] = new Long(documentIdentifiers[i]).longValue(); } /*================================================================= * Call the search, with the document specification and the list of * document ids - the search will never return more than exactly * one match per document id * * We are assuming that the maximum number of hits to return * should never be more than the maximum batch size set up for this * class * * We are just making one web service call (to the search API) * rather than iteratively calling a web service method for each * document passed in as part of the document array * * Additionally, re-using the same search method as for the * "getDocumentIdentifiers" method ensures that we are not * duplicating any logic which ensures that the document/records * in question match the search criteria or not. *================================================================*/ DMSearchResults searchResults = documentSpecificationSearch(spec, 0, 0, 1, this.getMaxDocumentRequest(), docIds, null); if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Found a total of <" + searchResults.totalHitsCount + "> hit(s) " + "and <" + searchResults.returnedHitsCount + "> were returned by the method call"); // If we are searching based on document identifier, then it is possible that we will not // find a document we are looking for, if it was removed from the system between the time // it was put in the queue and when it's version is obtained. Documents where this happens // should return a version string of null. // Let's go through the search results and build a hash based on the document identifier. 
Map<Long, SEARCHRESULTS_DOCUMENTS> documentMap = new HashMap<Long, SEARCHRESULTS_DOCUMENTS>(); if (searchResults.dsDM != null) { SEARCHRESULTS_DOCUMENTS[] srd = searchResults.dsDM.getSEARCHRESULTS_DOCUMENTS(); for (i = 0; i < srd.length; i++) { documentMap.put(new Long(srd[i].getDocId()), srd[i]); } } // Now, walk through the individual documents. Map<Long, String> versionStrings = new HashMap<Long, String>(); for (int j = 0; j < docIds.length; j++) { String documentIdentifier = documentIdentifiers[j]; long docId = docIds[j]; Long docKey = new Long(docId); // Look up the record. SEARCHRESULTS_DOCUMENTS doc = documentMap.get(docKey); if (doc != null) { // Set the version string. The parseable stuff goes first, so parsing is easy. String version = doc.getStr_value(); StringBuilder composedVersion = new StringBuilder(); composedVersion.append(decodeableString); composedVersion.append(version); // Added 9/7/2007 composedVersion.append("_").append(urlVersionBase); // String versionString = composedVersion.toString(); if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("Meridio: Document " + docKey + " has version " + versionString); if (activities.checkDocumentNeedsReindexing(documentIdentifier, versionString)) versionStrings.put(docKey, versionString); } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Meridio: Document " + docKey + " is no longer in the search set, or has been deleted - removing."); activities.deleteDocument(documentIdentifier); } } // Now submit search requests for all the documents requiring fetch. Map<Long, Map<String, String>> documentPropertyMap = new HashMap<Long, Map<String, String>>(); // Only look up metadata if we need some! if (versionStrings.size() > 0 && categoryPropertyValues.length > 0) { long[] fetchIds = new long[versionStrings.size()]; i = 0; for (Long docKey : versionStrings.keySet()) { fetchIds[i++] = docKey; } /*================================================================= * Call the search, with the document specification and the list of * document ids - the search will never return more than exactly * one match per document id * * This call will return all the metadata that was specified in the * document specification for all the documents and * records in one call. *================================================================*/ searchResults = documentSpecificationSearch(spec, 0, 0, 1, fetchIds.length, fetchIds, categoryPropertyValues); // If we ask for a document and it is no longer there, we should treat this as a deletion. // The activity in that case is to delete the document. A similar thing should happen if // any of the other methods (like getting the document's content) also fail to find the // document. // Let's build a hash which contains all the document metadata returned. The form of // the hash will be: key = the document identifier, value = another hash, which is keyed // by the metadata category/property, and which has a value that is the metadata value. 
Map<Long, MutableInteger> counterMap = new HashMap<Long, MutableInteger>(); if (searchResults.dsDM != null) { SEARCHRESULTS_DOCUMENTS[] searchResultsDocuments = searchResults.dsDM .getSEARCHRESULTS_DOCUMENTS(); for (SEARCHRESULTS_DOCUMENTS searchResultsDocument : searchResultsDocuments) { long docId = searchResultsDocument.getDocId(); Long docKey = new Long(docId); MutableInteger counterMapItem = counterMap.get(docKey); if (counterMapItem == null) { counterMapItem = new MutableInteger(); counterMap.put(docKey, counterMapItem); } String propertyName = categoryPropertyStringValues[counterMapItem.getValue()]; counterMapItem.increment(); String propertyValue = searchResultsDocuments[i].getStr_value(); Map<String, String> propertyMap = documentPropertyMap.get(docKey); if (propertyMap == null) { propertyMap = new HashMap<String, String>(); documentPropertyMap.put(docKey, propertyMap); } if (propertyValue != null && propertyValue.length() > 0) propertyMap.put(propertyName, propertyValue); } } } // Okay, we are ready now to go through the individual documents and do the ingestion or deletion. for (String documentIdentifier : documentIdentifiers) { Long docKey = new Long(documentIdentifier); long docId = docKey.longValue(); String docVersion = versionStrings.get(docKey); if (docVersion != null) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Processing document identifier '" + documentIdentifier + "' " + "with version string '" + docVersion + "'"); // For each document, be sure the job is still allowed to run. activities.checkJobStillActive(); RepositoryDocument repositoryDocument = new RepositoryDocument(); // Load the metadata items into the ingestion document object Map<String, String> docMetadataMap = documentPropertyMap.get(docKey); if (docMetadataMap != null) { for (String categoryPropertyName : categoryPropertyStringValues) { String propertyValue = docMetadataMap.get(categoryPropertyName); if (propertyValue != null && propertyValue.length() > 0) repositoryDocument.addField(categoryPropertyName, propertyValue); } } /*================================================================= * Construct the URL to the object * * HTTP://HOST:PORT/meridio/browse/downloadcontent.aspx?documentId=<docId>&launchMode=1&launchAs=0 * * I expect we need to add additional parameters to the configuration * specification *================================================================*/ String fileURL = urlBase + new Long(docId).toString(); if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug( "URL for document '" + new Long(docId).toString() + "' is '" + fileURL + "'"); /*================================================================= * Get the object's ACLs and owner information *================================================================*/ DMDataSet documentData = null; documentData = meridio_.getDocumentData((int) docId, true, true, false, false, DmVersionInfo.LATEST, false, false, false); if (null == documentData) { if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("Meridio: Could not retrieve document data for document id '" + new Long(docId).toString() + "' in processDocuments method - deleting document."); activities.noDocument(documentIdentifier, docVersion); continue; } if (null == documentData.getDOCUMENTS() || documentData.getDOCUMENTS().length != 1) { if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("Meridio: Could not retrieve document owner for document id '" + new Long(docId).toString() + "' in processDocuments method. 
No information or incorrect amount " + "of information was returned"); activities.noDocument(documentIdentifier, docVersion); continue; } // Do path metadata if (pathAttributeName != null && pathAttributeName.length() > 0) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Meridio: Path attribute name is " + pathAttributeName); RMDataSet partList; int recordType = documentData.getDOCUMENTS()[0].getPROP_recordType(); if (recordType == 0 || recordType == 4 || recordType == 19) partList = meridio_.getRecordPartList((int) docId, false, false); else partList = meridio_.getDocumentPartList((int) docId); if (partList != null) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Meridio: Document '" + new Long(docId).toString() + "' has a part list with " + Integer.toString(partList.getRm2vPart().length) + " values"); for (int k = 0; k < partList.getRm2vPart().length; k++) { repositoryDocument.addField(pathAttributeName, matchMap.translate(partList.getRm2vPart()[k].getParentTitlePath())); } } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Meridio: Document '" + new Long(docId).toString() + "' has no part list, so no path attribute"); } } // Process acls. If there are forced acls, use those, otherwise get them from Meridio. String[] allowAcls; String[] denyAcls; // forcedAcls will be null if security is off, or nonzero length if security is on but hard-wired if (acls != null && acls.length == 0) { ACCESSCONTROL[] documentAcls = documentData.getACCESSCONTROL(); List<String> allowAclsArrayList = new ArrayList<String>(); List<String> denyAclsArrayList = new ArrayList<String>(); // Allow a broken authority to disable all Meridio documents, even if the document is 'wide open', because // Meridio does not permit viewing of the document if the user does not exist (at least, I don't know of a way). 
denyAclsArrayList.add(defaultAuthorityDenyToken); if (documentAcls != null) { for (int j = 0; j < documentAcls.length; j++) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Object Id '" + documentAcls[j].getObjectId() + "' " + "Object Type '" + documentAcls[j].getObjectType() + "' " + "Permission '" + documentAcls[j].getPermission() + "' " + "User Id '" + documentAcls[j].getUserId() + "' " + "Group Id '" + documentAcls[j].getGroupId() + "'"); if (documentAcls[j].getPermission() == 0) // prohibit permission { if (documentAcls[j].getGroupId() > 0) { denyAclsArrayList.add("G" + documentAcls[j].getGroupId()); } else if (documentAcls[j].getUserId() > 0) { denyAclsArrayList.add("U" + documentAcls[j].getUserId()); } } else // read, amend or manage { if (documentAcls[j].getGroupId() > 0) { allowAclsArrayList.add("G" + documentAcls[j].getGroupId()); } else if (documentAcls[j].getUserId() > 0) { allowAclsArrayList.add("U" + documentAcls[j].getUserId()); } } } } DOCUMENTS document = documentData.getDOCUMENTS()[0]; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Document id '" + new Long(docId).toString() + "' is owned by owner id '" + document.getPROP_ownerId() + "' having the owner name '" + document.getPROP_ownerName() + "' Record Type is '" + document.getPROP_recordType() + "'"); if (document.getPROP_recordType() == 4 || document.getPROP_recordType() == 19) { RMDataSet rmds = meridio_.getRecord((int) docId, false, false, false); Rm2vRecord record = rmds.getRm2vRecord()[0]; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Record User Id Owner is '" + record.getOwnerID() + "' Record Group Owner Id is '" + record.getGroupOwnerID() + "'"); /*================================================================= * Either a group or a user owns a record, cannot be both and the * group takes priority if it is set *================================================================*/ if (record.getGroupOwnerID() > 0) { allowAclsArrayList.add("G" + record.getGroupOwnerID()); } else if (record.getOwnerID() > 0) { allowAclsArrayList.add("U" + record.getOwnerID()); } } else { allowAclsArrayList.add("U" + document.getPROP_ownerId()); } /*================================================================= * Set up the string arrays and then set the ACLs in the * repository document *================================================================*/ allowAcls = new String[allowAclsArrayList.size()]; for (int j = 0; j < allowAclsArrayList.size(); j++) { allowAcls[j] = allowAclsArrayList.get(j); if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("Meridio: Adding '" + allowAcls[j] + "' to allow ACLs"); } denyAcls = new String[denyAclsArrayList.size()]; for (int j = 0; j < denyAclsArrayList.size(); j++) { denyAcls[j] = denyAclsArrayList.get(j); if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Meridio: Adding '" + denyAcls[j] + "' to deny ACLs"); } } else { allowAcls = acls; if (allowAcls == null) denyAcls = null; else denyAcls = new String[] { defaultAuthorityDenyToken }; } repositoryDocument.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT, allowAcls, denyAcls); /*================================================================= * Get the object's content, and ingest the document *================================================================*/ try { AttachmentPart ap = meridio_.getLatestVersionFile((int) docId); if (null == ap) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Meridio: Failed to get content 
for document '" + new Long(docId).toString() + "'"); // No document. Delete what's there activities.noDocument(documentIdentifier, docVersion); continue; } try { // Get the file name. String fileName = ap.getDataHandler().getName(); // Log what we are about to do. if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Meridio: File data is supposedly in " + fileName); File theTempFile = new File(fileName); if (theTempFile.isFile()) { long fileSize = theTempFile.length(); // ap.getSize(); if (activities.checkLengthIndexable(fileSize)) { InputStream is = new FileInputStream(theTempFile); // ap.getDataHandler().getInputStream(); try { repositoryDocument.setBinary(is, fileSize); if (null != activities) { activities.ingestDocumentWithException(documentIdentifier, docVersion, fileURL, repositoryDocument); } } finally { is.close(); } } else { activities.noDocument(documentIdentifier, docVersion); continue; } } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug( "Meridio: Expected temporary file was not present - skipping document '" + new Long(docId).toString() + "'"); activities.deleteDocument(documentIdentifier); continue; } } finally { ap.dispose(); } } catch (java.net.SocketTimeoutException ioex) { throw new ManifoldCFException("Socket timeout exception: " + ioex.getMessage(), ioex); } catch (ConnectTimeoutException ioex) { throw new ManifoldCFException("Connect timeout exception: " + ioex.getMessage(), ioex); } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED); } catch (org.apache.axis.AxisFault e) { throw e; } catch (RemoteException e) { throw e; } catch (SOAPException soapEx) { throw new ManifoldCFException( "SOAP Exception encountered while retrieving document content: " + soapEx.getMessage(), soapEx); } catch (IOException ioex) { throw new ManifoldCFException("Input stream failure: " + ioex.getMessage(), ioex); } } } Logging.connectors.debug("Meridio: Exiting 'processDocuments' method"); return; } catch (org.apache.axis.AxisFault e) { long currentTime = System.currentTimeMillis(); if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://xml.apache.org/axis/", "HTTP"))) { org.w3c.dom.Element elem = e.lookupFaultDetail( new javax.xml.namespace.QName("http://xml.apache.org/axis/", "HttpErrorCode")); if (elem != null) { elem.normalize(); String httpErrorCode = elem.getFirstChild().getNodeValue().trim(); throw new ManifoldCFException("Unexpected http error code " + httpErrorCode + " accessing Meridio: " + e.getMessage(), e); } throw new ManifoldCFException( "Unknown http error occurred while getting doc versions: " + e.getMessage(), e); } if (e.getFaultCode().equals(new javax.xml.namespace.QName( "http://schemas.xmlsoap.org/soap/envelope/", "Server.userException"))) { String exceptionName = e.getFaultString(); if (exceptionName.equals("java.lang.InterruptedException")) throw new ManifoldCFException("Interrupted", ManifoldCFException.INTERRUPTED); } if (e.getFaultCode().equals( new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/", "Server"))) { if (e.getFaultString().indexOf(" 23031#") != -1) { // This means that the session has expired, so reset it and retry meridio_ = null; continue; } } if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("Meridio: Got an unknown remote exception getting doc versions - axis fault = " + e.getFaultCode().getLocalPart() + ", detail = " + e.getFaultString() + " - retrying", e); throw new 
ServiceInterruption("Remote procedure exception: " + e.getMessage(), e, currentTime + 300000L, currentTime + 3 * 60 * 60000L, -1, false); } catch (RemoteException remoteException) { throw new ManifoldCFException("Meridio: A remote exception occurred while getting doc versions: " + remoteException.getMessage(), remoteException); } catch (MeridioDataSetException meridioDataSetException) { throw new ManifoldCFException("Meridio: A problem occurred manipulating the Web " + "Service XML: " + meridioDataSetException.getMessage(), meridioDataSetException); } } }
From source file: org.apache.camel.component.box.internal.LongPollingEventsManager.java
@SuppressWarnings("unused") public void poll(long streamPosition, final String streamType, final int limit, final EventCallback callback) throws BoxServerException, AuthFatalFailureException, BoxRestException { // get BoxClient Event Manager final IBoxEventsManager eventsManager = cachedBoxClient.getBoxClient().getEventsManager(); // get current stream position if requested if (BoxEventRequestObject.STREAM_POSITION_NOW == streamPosition) { streamPosition = getCurrentStreamPosition(eventsManager, streamPosition); }/*from w w w . j a va2s. c o m*/ // validate parameters ObjectHelper.notNull(streamPosition, "streamPosition"); ObjectHelper.notEmpty(streamType, "streamType"); ObjectHelper.notNull(callback, "eventCallback"); httpClient = new DefaultHttpClient(cachedBoxClient.getClientConnectionManager(), httpParams); // start polling thread LOG.info("Started event polling thread for " + cachedBoxClient); final long startStreamPosition = streamPosition; pollFuture = executorService.submit(new Runnable() { @Override public void run() { final ObjectMapper mapper = new ObjectMapper(); long currentStreamPosition = startStreamPosition; BoxRealTimeServer realTimeServer = null; boolean retry = false; int retries = 0; int maxRetries = 1; while (!done) { try { // set to true if no exceptions thrown retry = false; if (realTimeServer == null) { // get RTS URL realTimeServer = getBoxRealTimeServer(currentStreamPosition, eventsManager); // update HTTP timeout final int requestTimeout = Integer .parseInt(realTimeServer.getExtraData(RETRY_TIMEOUT).toString()); final HttpParams params = httpClient.getParams(); HttpConnectionParams.setSoTimeout(params, requestTimeout * 1000); // update maxRetries maxRetries = Integer.parseInt(realTimeServer.getExtraData(MAX_RETRIES).toString()); } // create HTTP request for RTS httpGet = getPollRequest(realTimeServer.getUrl(), currentStreamPosition); // execute RTS poll HttpResponse httpResponse = null; try { httpResponse = httpClient.execute(httpGet, (HttpContext) null); } catch (SocketTimeoutException e) { LOG.debug("Poll timed out, retrying for " + cachedBoxClient); } if (httpResponse != null) { // parse response final StatusLine statusLine = httpResponse.getStatusLine(); if (statusLine != null && statusLine.getStatusCode() == HttpStatus.SC_OK) { final HttpEntity entity = httpResponse.getEntity(); @SuppressWarnings("unchecked") Map<String, String> rtsResponse = mapper.readValue(entity.getContent(), Map.class); final String message = rtsResponse.get(MESSAGE); if (NEW_CHANGE.equals(message)) { // get events final BoxEventRequestObject requestObject = BoxEventRequestObject .getEventsRequestObject(currentStreamPosition); requestObject.setStreamType(streamType); requestObject.setLimit(limit); final BoxEventCollection events = eventsManager.getEvents(requestObject); // notify callback callback.onEvent(events); // update stream position currentStreamPosition = events.getNextStreamPosition(); } else if (RECONNECT.equals(message) || MAX_RETRIES.equals(message)) { LOG.debug("Long poll reconnect for " + cachedBoxClient); realTimeServer = null; } else if (OUT_OF_DATE.equals(message)) { // update currentStreamPosition LOG.debug("Long poll out of date for " + cachedBoxClient); currentStreamPosition = getCurrentStreamPosition(eventsManager, BoxEventRequestObject.STREAM_POSITION_NOW); realTimeServer = null; } else { throw new RuntimeCamelException("Unknown poll response " + message); } } else { String msg = "Unknown error"; if (statusLine != null) { msg = String.format("Error polling events 
for %s: code=%s, message=%s", cachedBoxClient, statusLine.getStatusCode(), statusLine.getReasonPhrase()); } throw new RuntimeCamelException(msg); } } // keep polling retry = true; } catch (InterruptedException e) { LOG.debug("Interrupted event polling thread for {}, exiting...", cachedBoxClient); } catch (BoxSDKException e) { callback.onException(e); } catch (RuntimeCamelException e) { callback.onException(e); } catch (SocketException e) { // TODO handle connection aborts!!! LOG.debug("Socket exception while event polling for {}", cachedBoxClient); retry = true; realTimeServer = null; } catch (Exception e) { callback.onException(new RuntimeCamelException( "Error while polling for " + cachedBoxClient + ": " + e.getMessage(), e)); } finally { // are we done yet? if (!retry) { done = true; } else { if (realTimeServer != null && (++retries > maxRetries)) { // make another option call realTimeServer = null; } } } } LOG.info("Stopped event polling thread for " + cachedBoxClient); } }); }
From source file: org.apache.manifoldcf.crawler.connectors.rss.RSSConnector.java
/** Handle an RSS feed document, using SAX to limit the memory impact */
protected void handleRSSFeedSAX(String documentIdentifier, IProcessActivity activities, Filter filter)
        throws ManifoldCFException, ServiceInterruption {
    // The SAX model uses parsing events to control parsing, which allows me to manage memory usage much better.
    // This is essential for when a feed contains dechromed content as well as links.

    // First, catch all flavors of IO exception, and handle them properly
    try {
        // Open the input stream, and set up the parse
        InputStream is = cache.getData(documentIdentifier);
        if (is == null) {
            Logging.connectors.error("RSS: Document '" + documentIdentifier + "' should be in cache but isn't");
            return;
        }
        try {
            Parser p = new Parser();
            // Parse the document. This will cause various things to occur, within the instantiated XMLParsingContext class.
            XMLFuzzyHierarchicalParseState x = new XMLFuzzyHierarchicalParseState();
            OuterContextClass c = new OuterContextClass(x, documentIdentifier, activities, filter);
            x.setContext(c);
            try {
                // Believe it or not, there are no parsing errors we can get back now.
                p.parseWithCharsetDetection(null, is, x);
                c.checkIfValidFeed();
                c.setDefaultRescanTimeIfNeeded();
            } finally {
                x.cleanup();
            }
        } finally {
            is.close();
        }
    } catch (java.net.SocketTimeoutException e) {
        throw new ManifoldCFException("Socket timeout error: " + e.getMessage(), e);
    } catch (ConnectTimeoutException e) {
        throw new ManifoldCFException("Socket connect timeout error: " + e.getMessage(), e);
    } catch (InterruptedIOException e) {
        throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
    } catch (IOException e) {
        throw new ManifoldCFException("IO error: " + e.getMessage(), e);
    }
}
From source file: org.apache.manifoldcf.crawler.connectors.rss.RSSConnector.java
/** Process a set of documents. * This is the method that should cause each document to be fetched, processed, and the results either added * to the queue of documents for the current job, and/or entered into the incremental ingestion manager. * The document specification allows this class to filter what is done based on the job. * The connector will be connected before this method can be called. *@param documentIdentifiers is the set of document identifiers to process. *@param statuses are the currently-stored document versions for each document in the set of document identifiers * passed in above./*from w ww . ja v a2 s . com*/ *@param activities is the interface this method should use to queue up new document references * and ingest documents. *@param jobMode is an integer describing how the job is being run, whether continuous or once-only. *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one. */ @Override public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec, IProcessActivity activities, int jobMode, boolean usesDefaultAuthority) throws ManifoldCFException, ServiceInterruption { getSession(); // The connection limit is designed to permit this connector to coexist with potentially other connectors, such as the web connector. // There is currently no good way to enforce connection limits across all installed connectors - this will require considerably more // thought to set up properly. int connectionLimit = 200; String[] fixedList = new String[2]; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In getDocumentVersions for " + Integer.toString(documentIdentifiers.length) + " documents"); Filter f = new Filter(spec, false); String[] acls = f.getAcls(); // Sort it, java.util.Arrays.sort(acls); // NOTE: There are two kinds of documents in here; documents that are RSS feeds (that presumably have a content-type // of text/xml), and documents that need to be indexed. // // For the latter, the metadata etc is part of the version string. For the former, the only thing that is part of the version string is the // document's checksum. // // The need to exclude documents from fetch based on whether they match an expression causes some difficulties, because we really // DON'T want this to apply to the feeds themselves. Since the distinguishing characteristic of a feed is that it is in the seed list, // and that its content-type is text/xml, we could use either of these characteristics to treat feeds differently from // fetchable urls. But the latter approach requires a fetch, which is forbidden. So - the spec will be used to characterize the url. // However, the spec might change, and the url might be dropped from the list - and then what?? // // The final solution is to simply not queue what cannot be mapped. int feedTimeout = f.getFeedTimeoutValue(); // The document specification has already been used to trim out documents that are not // allowed from appearing in the queue. So, even that has already been done. for (String documentIdentifier : documentIdentifiers) { // If it is in this list, we presume that it has been vetted against the map etc., so we don't do that again. We just fetch it. // And, if the content type is xml, we calculate the version as if it is a feed rather than a document. 
// Get the url String urlValue = documentIdentifier; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Getting version string for '" + urlValue + "'"); String versionString; String ingestURL = null; String[] pubDates = null; String[] sources = null; String[] titles = null; String[] authorNames = null; String[] authorEmails = null; String[] categories = null; String[] descriptions = null; try { // If there's a carrydown "data" value for this url, we use that value rather than actually fetching the document. This also means we don't need to // do a robots check, because we aren't actually crawling anything. So, ALWAYS do this first... CharacterInput[] dechromedData = activities.retrieveParentDataAsFiles(urlValue, "data"); try { if (dechromedData.length > 0) { // Data already available. The fetch cycle can be entirely avoided, as can the robots check. ingestURL = f.mapDocumentURL(urlValue); if (ingestURL != null) { // Open up an input stream corresponding to the carrydown data. The stream will be encoded as utf-8. try { InputStream is = dechromedData[0].getUtf8Stream(); try { StringBuilder sb = new StringBuilder(); long checkSum = cache.addData(activities, urlValue, "text/html", is); // Grab what we need from the passed-down data for the document. These will all become part // of the version string. pubDates = activities.retrieveParentData(urlValue, "pubdate"); sources = activities.retrieveParentData(urlValue, "source"); titles = activities.retrieveParentData(urlValue, "title"); authorNames = activities.retrieveParentData(urlValue, "authorname"); authorEmails = activities.retrieveParentData(urlValue, "authoremail"); categories = activities.retrieveParentData(urlValue, "category"); descriptions = activities.retrieveParentData(urlValue, "description"); java.util.Arrays.sort(pubDates); java.util.Arrays.sort(sources); java.util.Arrays.sort(titles); java.util.Arrays.sort(authorNames); java.util.Arrays.sort(authorEmails); java.util.Arrays.sort(categories); java.util.Arrays.sort(descriptions); if (sources.length == 0) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Warning; URL '" + ingestURL + "' doesn't seem to have any RSS feed source!"); } sb.append('+'); packList(sb, acls, '+'); if (acls.length > 0) { sb.append('+'); pack(sb, defaultAuthorityDenyToken, '+'); } else sb.append('-'); // The ingestion URL pack(sb, ingestURL, '+'); // The pub dates packList(sb, pubDates, '+'); // The titles packList(sb, titles, '+'); // The sources packList(sb, sources, '+'); // The categories packList(sb, categories, '+'); // The descriptions packList(sb, descriptions, '+'); // The author names packList(sb, authorNames, '+'); // The author emails packList(sb, authorEmails, '+'); // Do the checksum part, which does not need to be parseable. 
sb.append(new Long(checkSum).toString()); versionString = sb.toString(); } finally { is.close(); } } catch (java.net.SocketTimeoutException e) { throw new ManifoldCFException( "IO exception reading data from string: " + e.getMessage(), e); } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED); } catch (IOException e) { throw new ManifoldCFException( "IO exception reading data from string: " + e.getMessage(), e); } } else { // Document a seed or unmappable; just skip if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Skipping carry-down document '" + urlValue + "' because it is unmappable or is a seed."); } } else { // Get the old version string String oldVersionString = statuses.getIndexedVersionString(documentIdentifier); // Unpack the old version as much as possible. // We are interested in what the ETag and Last-Modified headers were last time. String lastETagValue = null; String lastModifiedValue = null; // Note well: Non-continuous jobs cannot use etag because the rss document MUST be fetched each time for such jobs, // or the documents it points at would get deleted. // // NOTE: I disabled this code because we really need the feed's TTL value in order to reschedule properly. I can't get the // TTL value without refetching the document - therefore ETag and Last-Modified cannot be used :-( if (false && jobMode == JOBMODE_CONTINUOUS && oldVersionString != null && oldVersionString.startsWith("-")) { // It's a feed, so the last etag and last-modified fields should be encoded in this version string. StringBuilder lastETagBuffer = new StringBuilder(); int unpackPos = unpack(lastETagBuffer, oldVersionString, 1, '+'); StringBuilder lastModifiedBuffer = new StringBuilder(); unpackPos = unpack(lastModifiedBuffer, oldVersionString, unpackPos, '+'); if (lastETagBuffer.length() > 0) lastETagValue = lastETagBuffer.toString(); if (lastModifiedBuffer.length() > 0) lastModifiedValue = lastModifiedBuffer.toString(); } if (Logging.connectors.isDebugEnabled() && (lastETagValue != null || lastModifiedValue != null)) Logging.connectors.debug( "RSS: Document '" + urlValue + "' was found to have a previous ETag value of '" + ((lastETagValue == null) ? "null" : lastETagValue) + "' and a previous Last-Modified value of '" + ((lastModifiedValue == null) ? "null" : lastModifiedValue) + "'"); // Robots check. First, we need to separate the url into its components URL url; try { url = new URL(urlValue); } catch (MalformedURLException e) { Logging.connectors.debug("RSS: URL '" + urlValue + "' is malformed; skipping", e); activities.deleteDocument(documentIdentifier); continue; } String protocol = url.getProtocol(); int port = url.getPort(); String hostName = url.getHost(); String pathPart = url.getFile(); // Check with robots to see if it's allowed if (robotsUsage >= ROBOTS_DATA && !robots.isFetchAllowed(currentContext, throttleGroupName, protocol, port, hostName, url.getPath(), userAgent, from, proxyHost, proxyPort, proxyAuthDomain, proxyAuthUsername, proxyAuthPassword, activities, connectionLimit)) { activities.recordActivity(null, ACTIVITY_FETCH, null, urlValue, Integer.toString(-2), "Robots exclusion", null); if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("RSS: Skipping url '" + urlValue + "' because robots.txt says to"); activities.deleteDocument(documentIdentifier); continue; } // Now, use the fetcher, and get the file. 
IThrottledConnection connection = fetcher.createConnection(currentContext, throttleGroupName, hostName, connectionLimit, feedTimeout, proxyHost, proxyPort, proxyAuthDomain, proxyAuthUsername, proxyAuthPassword, activities); try { // Begin the fetch connection.beginFetch("Data"); try { // Execute the request. // Use the connect timeout from the document specification! int status = connection.executeFetch(protocol, port, pathPart, userAgent, from, lastETagValue, lastModifiedValue); switch (status) { case IThrottledConnection.STATUS_NOCHANGE: versionString = oldVersionString; break; case IThrottledConnection.STATUS_OK: try { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Successfully fetched " + urlValue); // Document successfully fetched! // If its content is xml, presume it's a feed... String contentType = connection.getResponseHeader("Content-Type"); // Some sites have multiple content types. We just look at the LAST one in that case. if (contentType != null) { String[] contentTypes = contentType.split(","); if (contentTypes.length > 0) contentType = contentTypes[contentTypes.length - 1].trim(); else contentType = null; } String strippedContentType = contentType; if (strippedContentType != null) { int pos = strippedContentType.indexOf(";"); if (pos != -1) strippedContentType = strippedContentType.substring(0, pos).trim(); } boolean isXML = (strippedContentType != null && xmlContentTypes.contains(strippedContentType)); ingestURL = null; if (!isXML) { // If the chromed content mode is set to "skip", and we got here, it means // we should not include the content. if (f.getChromedContentMode() == CHROMED_SKIP) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Removing url '" + urlValue + "' because it no longer has dechromed content available"); versionString = null; break; } // Decide whether to exclude this document based on what we see here. // Basically, we want to get rid of everything that we don't know what // to do with in the ingestion system. if (!activities.checkMimeTypeIndexable(contentType)) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Removing url '" + urlValue + "' because it had the wrong content type: " + ((contentType == null) ? "null" : "'" + contentType + "'")); versionString = null; break; } ingestURL = f.mapDocumentURL(urlValue); } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("RSS: The url '" + urlValue + "' is a feed"); if (!f.isSeed(urlValue)) { // Remove the feed from consideration, since it has left the list of seeds if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Removing feed url '" + urlValue + "' because it is not a seed."); versionString = null; break; } } InputStream is = connection.getResponseBodyStream(); try { long checkSum = cache.addData(activities, urlValue, contentType, is); StringBuilder sb = new StringBuilder(); if (ingestURL != null) { // We think it is ingestable. The version string accordingly starts with a "+". // Grab what we need from the passed-down data for the document. These will all become part // of the version string. 
pubDates = activities.retrieveParentData(urlValue, "pubdate"); sources = activities.retrieveParentData(urlValue, "source"); titles = activities.retrieveParentData(urlValue, "title"); authorNames = activities.retrieveParentData(urlValue, "authorname"); authorEmails = activities.retrieveParentData(urlValue, "authoremail"); categories = activities.retrieveParentData(urlValue, "category"); descriptions = activities.retrieveParentData(urlValue, "description"); java.util.Arrays.sort(pubDates); java.util.Arrays.sort(sources); java.util.Arrays.sort(titles); java.util.Arrays.sort(authorNames); java.util.Arrays.sort(authorEmails); java.util.Arrays.sort(categories); java.util.Arrays.sort(descriptions); if (sources.length == 0) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Warning; URL '" + ingestURL + "' doesn't seem to have any RSS feed source!"); } sb.append('+'); packList(sb, acls, '+'); if (acls.length > 0) { sb.append('+'); pack(sb, defaultAuthorityDenyToken, '+'); } else sb.append('-'); // The ingestion URL pack(sb, ingestURL, '+'); // The pub dates packList(sb, pubDates, '+'); // The titles packList(sb, titles, '+'); // The sources packList(sb, sources, '+'); // The categories packList(sb, categories, '+'); // The descriptions packList(sb, descriptions, '+'); // The author names packList(sb, authorNames, '+'); // The author emails packList(sb, authorEmails, '+'); } else { sb.append('-'); String etag = connection.getResponseHeader("ETag"); if (etag == null) pack(sb, "", '+'); else pack(sb, etag, '+'); String lastModified = connection.getResponseHeader("Last-Modified"); if (lastModified == null) pack(sb, "", '+'); else pack(sb, lastModified, '+'); } // Do the checksum part, which does not need to be parseable. sb.append(new Long(checkSum).toString()); versionString = sb.toString(); } finally { is.close(); } } catch (java.net.SocketTimeoutException e) { Logging.connectors .warn("RSS: Socket timeout exception fetching document contents '" + urlValue + "' - skipping: " + e.getMessage(), e); versionString = null; } catch (ConnectTimeoutException e) { Logging.connectors .warn("RSS: Connecto timeout exception fetching document contents '" + urlValue + "' - skipping: " + e.getMessage(), e); versionString = null; } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED); } catch (IOException e) { Logging.connectors.warn("RSS: IO exception fetching document contents '" + urlValue + "' - skipping: " + e.getMessage(), e); versionString = null; } break; case IThrottledConnection.STATUS_SITEERROR: case IThrottledConnection.STATUS_PAGEERROR: default: // Record an *empty* version. // This signals the processDocuments() method that we really don't want to ingest this document, but we also don't // want to blow the document out of the queue, since then we'd wind up perhaps fetching it multiple times. versionString = ""; break; } } finally { connection.doneFetch(activities); } } finally { connection.close(); } if (versionString == null) { activities.deleteDocument(documentIdentifier); continue; } if (!(versionString.length() == 0 || activities.checkDocumentNeedsReindexing(documentIdentifier, versionString))) continue; // Process document! if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Processing '" + urlValue + "'"); // The only links we extract come from documents that we think are RSS feeds. // When we think that's the case, we attempt to parse it as RSS XML. 
if (ingestURL == null) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Interpreting document '" + urlValue + "' as a feed"); // We think it is a feed. // If this is a continuous job, AND scanonly is true, it means that the document was either identical to the // previous fetch, or was not fetched at all. In that case, it may not even be there, and we *certainly* don't // want to attempt to process it in any case. // // NOTE: I re-enabled the scan permanently because we need the TTL value to be set whatever the cost. If the // TTL value is not set, we default to the specified job's feed-rescan time, which is not going to be current enough for some feeds. if (true || jobMode != JOBMODE_CONTINUOUS) { handleRSSFeedSAX(urlValue, activities, f); if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Extraction of feed '" + urlValue + "' complete"); // Record the feed's version string, so we won't refetch unless needed. // This functionality is required for the last ETag and Last-Modified fields to be sent to the rss server, and to // keep track of the adaptive parameters. activities.recordDocument(documentIdentifier, versionString); } else { // The problem here is that we really do need to set the rescan time to something reasonable. // But we might not even have read the feed! So what to do?? // One answer is to build a connector-specific table that carries the last value of every feed around. // Another answer is to change the version code to always read the feed (and the heck with ETag and Last-Modified). if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Feed '" + urlValue + "' does not appear to differ from previous fetch for a continuous job; not extracting!"); long currentTime = System.currentTimeMillis(); Long defaultRescanTime = f.getDefaultRescanTime(currentTime); if (defaultRescanTime != null) { Long minimumTime = f.getMinimumRescanTime(currentTime); if (minimumTime != null) { if (defaultRescanTime.longValue() < minimumTime.longValue()) defaultRescanTime = minimumTime; } } activities.setDocumentScheduleBounds(urlValue, defaultRescanTime, defaultRescanTime, null, null); } } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Interpreting '" + urlValue + "' as a document"); String errorCode = null; String errorDesc = null; long startTime = System.currentTimeMillis(); Long fileLengthLong = null; try { long documentLength = cache.getDataLength(documentIdentifier); if (!activities.checkLengthIndexable(documentLength)) { activities.noDocument(documentIdentifier, versionString); errorCode = activities.EXCLUDED_LENGTH; errorDesc = "Document rejected because of length (" + documentLength + ")"; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Skipping document '" + urlValue + "' because its length was rejected (" + documentLength + ")"); continue; } if (!activities.checkURLIndexable(documentIdentifier)) { activities.noDocument(documentIdentifier, versionString); errorCode = activities.EXCLUDED_URL; errorDesc = "Document rejected because of URL ('" + documentIdentifier + "')"; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Skipping document '" + urlValue + "' because its URL was rejected ('" + documentIdentifier + "')"); continue; } // Check if it's a recognized content type String contentType = cache.getContentType(documentIdentifier); // Some sites have multiple content types. We just look at the LAST one in that case. 
if (contentType != null) { String[] contentTypes = contentType.split(","); if (contentTypes.length > 0) contentType = contentTypes[contentTypes.length - 1].trim(); else contentType = null; } if (!activities.checkMimeTypeIndexable(contentType)) { activities.noDocument(documentIdentifier, versionString); errorCode = activities.EXCLUDED_MIMETYPE; errorDesc = "Document rejected because of mime type (" + contentType + ")"; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Skipping document '" + urlValue + "' because its mime type was rejected ('" + contentType + "')"); continue; } // Treat it as an ingestable document. long dataSize = cache.getDataLength(urlValue); RepositoryDocument rd = new RepositoryDocument(); // Set content type if (contentType != null) rd.setMimeType(contentType); // Turn into acls and add into description String[] denyAcls; if (acls == null) denyAcls = null; else if (acls.length == 0) denyAcls = new String[0]; else denyAcls = new String[] { defaultAuthorityDenyToken }; if (acls != null && denyAcls != null) rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT, acls, denyAcls); if (titles != null && titles.length > 0) rd.addField("title", titles); if (authorNames != null && authorNames.length > 0) rd.addField("authorname", authorNames); if (authorEmails != null && authorEmails.length > 0) rd.addField("authoremail", authorEmails); if (descriptions != null && descriptions.length > 0) rd.addField("summary", descriptions); if (sources != null && sources.length > 0) rd.addField("source", sources); if (categories != null && categories.length > 0) rd.addField("category", categories); // The pubdates are a ms since epoch value; we want the minimum one for the origination time. Long minimumOrigTime = null; if (pubDates != null && pubDates.length > 0) { String[] pubDateValuesISO = new String[pubDates.length]; TimeZone tz = TimeZone.getTimeZone("UTC"); DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm'Z'"); df.setTimeZone(tz); for (int k = 0; k < pubDates.length; k++) { String pubDate = pubDates[k]; try { Long pubDateLong = new Long(pubDate); if (minimumOrigTime == null || pubDateLong.longValue() < minimumOrigTime.longValue()) minimumOrigTime = pubDateLong; pubDateValuesISO[k] = df.format(new Date(pubDateLong.longValue())); } catch (NumberFormatException e) { // Do nothing; the version string seems to not mean anything pubDateValuesISO[k] = ""; } } rd.addField("pubdate", pubDates); rd.addField("pubdateiso", pubDateValuesISO); } if (minimumOrigTime != null) activities.setDocumentOriginationTime(urlValue, minimumOrigTime); InputStream is = cache.getData(urlValue); if (is != null) { try { rd.setBinary(is, dataSize); try { activities.ingestDocumentWithException(documentIdentifier, versionString, ingestURL, rd); errorCode = "OK"; fileLengthLong = new Long(dataSize); } catch (IOException e) { errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT); errorDesc = e.getMessage(); handleIOException(e, "reading data"); } } finally { try { is.close(); } catch (IOException e) { errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT); errorDesc = e.getMessage(); handleIOException(e, "closing stream"); } } } } catch (ManifoldCFException e) { if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) errorCode = null; throw e; } finally { if (errorCode != null) activities.recordActivity(new Long(startTime), ACTIVITY_PROCESS, null, urlValue, errorCode, errorDesc, null); } } } } finally { for (CharacterInput ci : dechromedData) { if (ci != null) 
ci.discard(); } } } finally { // Remove any fetched documents. cache.deleteData(documentIdentifier); } } }
From source file: org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector.java
/** Is the document text, as far as we can tell? */
protected boolean isDocumentText(String documentURI) throws ManifoldCFException {
    try {
        // Look at the first 4K
        byte[] byteBuffer = new byte[4096];
        int amt;

        // Open file for reading.
        InputStream is = cache.getData(documentURI);
        if (is == null)
            return false;
        try {
            amt = 0;
            while (amt < byteBuffer.length) {
                int incr = is.read(byteBuffer, amt, byteBuffer.length - amt);
                if (incr == -1)
                    break;
                amt += incr;
            }
        } finally {
            is.close();
        }

        if (amt == 0)
            return false;

        return isText(byteBuffer, amt);
    } catch (SocketTimeoutException e) {
        throw new ManifoldCFException("Socket timeout exception accessing cached document: " + e.getMessage(), e);
    } catch (ConnectTimeoutException e) {
        throw new ManifoldCFException("Socket timeout exception accessing cached document: " + e.getMessage(), e);
    } catch (InterruptedIOException e) {
        throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
    } catch (IOException e) {
        throw new ManifoldCFException("IO exception accessing cached document: " + e.getMessage(), e);
    }
}
From source file: org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector.java
/** Handle document references from XML. Right now we only understand RSS. */
protected void handleXML(String documentURI, IXMLHandler handler)
        throws ManifoldCFException, ServiceInterruption {
    try {
        int responseCode = cache.getResponseCode(documentURI);
        if (responseCode != 200)
            return;

        // We ONLY look for XML if the content type *says* it is XML.
        String contentType = extractContentType(cache.getContentType(documentURI));
        String mimeType = extractMimeType(contentType);
        boolean isXML = mimeType.equals("text/xml") || mimeType.equals("application/rss+xml")
                || mimeType.equals("application/xml") || mimeType.equals("application/atom+xml")
                || mimeType.equals("application/xhtml+xml") || mimeType.equals("text/XML")
                || mimeType.equals("application/rdf+xml") || mimeType.equals("text/application")
                || mimeType.equals("XML");

        if (!isXML)
            return;

        // OK, it's XML. Now what? Well, we get the encoding, and we verify that it is text, then we try to get links
        // from it presuming it is an RSS feed.
        String encoding = extractEncoding(contentType);

        InputStream is = cache.getData(documentURI);
        if (is == null) {
            Logging.connectors.error("WEB: Document '" + documentURI + "' should be in cache but isn't");
            return;
        }
        try {
            // Parse the document. This will cause various things to occur, within the instantiated XMLParsingContext class.
            Parser p = new Parser();
            XMLFuzzyHierarchicalParseState x = new XMLFuzzyHierarchicalParseState();
            OuterContextClass c = new OuterContextClass(x, documentURI, handler);
            x.setContext(c);
            try {
                p.parseWithCharsetDetection(encoding, is, x);
                c.checkIfValidFeed();
            } finally {
                x.cleanup();
            }
        } finally {
            is.close();
        }
    } catch (java.net.SocketTimeoutException e) {
        throw new ManifoldCFException("Socket timeout exception: " + e.getMessage(), e);
    } catch (ConnectTimeoutException e) {
        throw new ManifoldCFException("Socket connect timeout exception: " + e.getMessage(), e);
    } catch (InterruptedIOException e) {
        //Logging.connectors.warn("IO interruption seen",e);
        throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
    } catch (IOException e) {
        throw new ManifoldCFException("IO error: " + e.getMessage(), e);
    }
}
From source file: org.zaproxy.zap.extension.ascanrulesAlpha.SQLInjectionSQLite.java
/**
 * Scans for SQL Injection vulnerabilities, using SQLite specific syntax. If it doesn't use
 * specifically SQLite syntax, it does not belong in here, but in TestSQLInjection.
 */
@Override
public void scan(HttpMessage originalMessage, String paramName, String originalParamValue) {
    try {
        // The original message passed to us never has the response populated. Fix that by
        // re-retrieving it.
        sendAndReceive(originalMessage, false); // do not follow redirects

        // Do time based SQL injection checks.
        // Timing baseline check: we need to know how long the original query took, so that
        // we can tell whether the time based check is working correctly.
        HttpMessage msgTimeBaseline = getNewMsg();
        long originalTimeStarted = System.currentTimeMillis();
        try {
            sendAndReceive(msgTimeBaseline);
        } catch (java.net.SocketTimeoutException e) {
            // To be expected occasionally, if the base query already contains some
            // parameters exploiting time based SQL injection.
            if (this.debugEnabled)
                log.debug("The Base Time Check timed out on ["
                        + msgTimeBaseline.getRequestHeader().getMethod() + "] URL ["
                        + msgTimeBaseline.getRequestHeader().getURI().getURI() + "]");
        }
        long originalTimeUsed = System.currentTimeMillis() - originalTimeStarted;

        // If the baseline was very slow (because a JSP was being compiled on first call, for
        // instance), the rest of the time based logic will fail. Let's double-check for that
        // scenario by requesting the URL again.
        // If it comes back in a more reasonable time, we will use that time instead as our
        // baseline. If it comes back slowly again, we abort the check on this URL, since we
        // would only spend lots of time on requests that will (very likely) not give
        // positive results.
        if (originalTimeUsed > 5000) {
            long originalTimeStarted2 = System.currentTimeMillis();
            try {
                sendAndReceive(msgTimeBaseline);
            } catch (java.net.SocketTimeoutException e) {
                // To be expected occasionally, if the base query already contains some
                // parameters exploiting time based SQL injection.
                if (this.debugEnabled)
                    log.debug("Base Time Check 2 timed out on ["
                            + msgTimeBaseline.getRequestHeader().getMethod() + "] URL ["
                            + msgTimeBaseline.getRequestHeader().getURI().getURI() + "]");
            }
            long originalTimeUsed2 = System.currentTimeMillis() - originalTimeStarted2;
            if (originalTimeUsed2 > 5000) {
                // No better the second time around: we need to bail out.
                if (this.debugEnabled)
                    log.debug("Both base time checks 1 and 2 for ["
                            + msgTimeBaseline.getRequestHeader().getMethod() + "] URL ["
                            + msgTimeBaseline.getRequestHeader().getURI().getURI()
                            + "] are way too slow to be usable for time based SQL Injection checks."
                            + " We are aborting the check on this particular URL.");
                return;
            } else {
                // Phew. The second time came in within the limits. Use the later timing
                // details as the base time for the checks.
                originalTimeUsed = originalTimeUsed2;
                originalTimeStarted = originalTimeStarted2;
            }
        } // end of timing baseline check

        int countTimeBasedRequests = 0;
        if (this.debugEnabled)
            log.debug("Scanning URL [" + getBaseMsg().getRequestHeader().getMethod() + "] ["
                    + getBaseMsg().getRequestHeader().getURI() + "], [" + paramName + "] with value ["
                    + originalParamValue + "] for SQL Injection");

        // SQLite specific time-based SQL injection checks
        boolean foundTimeBased = false;
        for (int timeBasedSQLindex = 0; timeBasedSQLindex < SQL_SQLITE_TIME_REPLACEMENTS.length && doTimeBased
                && countTimeBasedRequests < doTimeMaxRequests && !foundTimeBased; timeBasedSQLindex++) {
            // Since we have no means to create a deterministic delay in SQLite, we need to take
            // a different approach: in each iteration, increase the number of random blob bytes
            // for SQLite to create. If we can detect an increasing delay, we know that the
            // payload has been successfully injected.
            int numberOfSequentialIncreases = 0;
            String detectableDelayParameter = null;
            long detectableDelay = 0;
            String maxDelayParameter = null;
            long maxDelay = 0;
            HttpMessage detectableDelayMessage = null;
            long previousDelay = originalTimeUsed;
            boolean potentialTimeBasedSQLInjection = false;
            boolean timeExceeded = false;

            for (long numBlobsToCreate = minBlobBytes; numBlobsToCreate <= this.maxBlobBytes && !timeExceeded
                    && numberOfSequentialIncreases < incrementalDelayIncreasesForAlert; numBlobsToCreate *= 10) {
                HttpMessage msgDelay = getNewMsg();
                String newTimeBasedInjectionValue = SQL_SQLITE_TIME_REPLACEMENTS[timeBasedSQLindex]
                        .replace("<<<<ORIGINALVALUE>>>>", originalParamValue);
                newTimeBasedInjectionValue = newTimeBasedInjectionValue.replace("<<<<NUMBLOBBYTES>>>>",
                        Long.toString(numBlobsToCreate));
                setParameter(msgDelay, paramName, newTimeBasedInjectionValue);
                if (this.debugEnabled)
                    log.debug("\nTrying '" + newTimeBasedInjectionValue
                            + "'. The number of Sequential Increases already is " + numberOfSequentialIncreases);

                // Send it.
                long modifiedTimeStarted = System.currentTimeMillis();
                try {
                    sendAndReceive(msgDelay);
                    countTimeBasedRequests++;
                } catch (java.net.SocketTimeoutException e) {
                    // To be expected occasionally, if the request contains some parameters
                    // exploiting time based SQL injection.
                    if (this.debugEnabled)
                        log.debug("The time check query timed out on ["
                                + msgTimeBaseline.getRequestHeader().getMethod() + "] URL ["
                                + msgTimeBaseline.getRequestHeader().getURI().getURI() + "] on field: ["
                                + paramName + "]");
                }
                long modifiedTimeUsed = System.currentTimeMillis() - modifiedTimeStarted;

                // Before we do the time based checking, first check for a known error message
                // from the attack, indicating a SQL injection vulnerability.
                for (Pattern errorMessagePattern : errorMessagePatterns) {
                    Matcher matcher = errorMessagePattern.matcher(msgDelay.getResponseBody().toString());
                    boolean errorFound = matcher.find();
                    if (errorFound) {
                        // Likely an error based SQL Injection. Raise it.
                        String extraInfo = Constant.messages.getString(
                                "ascanalpha.sqlinjection.sqlite.alert.errorbased.extrainfo", errorMessagePattern);
                        // Raise the alert.
                        bingo(Alert.RISK_HIGH, Alert.CONFIDENCE_MEDIUM, getName(), getDescription(),
                                getBaseMsg().getRequestHeader().getURI().getURI(), // url
                                paramName, newTimeBasedInjectionValue, extraInfo, getSolution(),
                                errorMessagePattern.toString(), this.getCweId(), this.getWascId(), msgDelay);
                        if (this.debugEnabled)
                            log.debug("A likely Error Based SQL Injection Vulnerability has been found with ["
                                    + msgDelay.getRequestHeader().getMethod() + "] URL ["
                                    + msgDelay.getRequestHeader().getURI().getURI() + "] on field: [" + paramName
                                    + "], by matching for pattern [" + errorMessagePattern.toString() + "]");
                        // Yes, we found an error based injection while looking for a time based
                        // one. Bail out anyway.
                        foundTimeBased = true;
                        break; // out of the loop
                    }
                }

                // Break out of the time based loop if we found something.
                if (foundTimeBased)
                    break;

                // No error message detected from the time based attack. Continue looking for a
                // time based injection point.

                // Cap the time we will delay by to 10 seconds.
                if (modifiedTimeUsed > 10000)
                    timeExceeded = true;

                boolean parseTimeEquivalent = false;
                if (modifiedTimeUsed > previousDelay) {
                    if (this.debugEnabled)
                        log.debug("The response time " + modifiedTimeUsed + " is > the previous response time "
                                + previousDelay);
                    // In order to rule out false positives due to the increasing SQL parse time
                    // for longer parameter values, we send a random (alphanumeric only) string
                    // value of the same length as the attack parameter. We expect the response
                    // time for the SQLi attack to be greater than or equal to the response time
                    // for the random alphanumeric string parameter; if this is not the case, we
                    // assume that the attack parameter is not a potential SQL injection payload.
                    HttpMessage msgParseDelay = getNewMsg();
                    String parseDelayCheckParameter = RandomStringUtils
                            .random(newTimeBasedInjectionValue.length(), RANDOM_PARAMETER_CHARS);
                    setParameter(msgParseDelay, paramName, parseDelayCheckParameter);
                    long parseDelayTimeStarted = System.currentTimeMillis();
                    sendAndReceive(msgParseDelay);
                    countTimeBasedRequests++;
                    long parseDelayTimeUsed = System.currentTimeMillis() - parseDelayTimeStarted;
                    // Figure out whether the attack delay and the (non-SQL-injection) parse delay
                    // are within X ms of each other.
                    parseTimeEquivalent = (Math
                            .abs(modifiedTimeUsed - parseDelayTimeUsed) < this.parseDelayDifference);
                    if (this.debugEnabled)
                        log.debug("The parse time for a random parameter of the same length is "
                                + parseDelayTimeUsed + ", so the attack and random parameter are "
                                + (parseTimeEquivalent ? "" : "NOT ")
                                + "equivalent (given the user defined attack threshold)");
                }

                if (modifiedTimeUsed > previousDelay && !parseTimeEquivalent) {
                    maxDelayParameter = newTimeBasedInjectionValue;
                    maxDelay = modifiedTimeUsed;
                    // Potential for SQL injection, detectable with "numBlobsToCreate" random
                    // blobs being created.
                    numberOfSequentialIncreases++;
                    if (!potentialTimeBasedSQLInjection) {
                        if (log.isDebugEnabled())
                            log.debug("Setting the Detectable Delay parameter to '"
                                    + newTimeBasedInjectionValue + "'");
                        detectableDelayParameter = newTimeBasedInjectionValue;
                        detectableDelay = modifiedTimeUsed;
                        detectableDelayMessage = msgDelay;
                    }
                    potentialTimeBasedSQLInjection = true;
                } else {
                    // Either no SQL injection, invalid SQL syntax, or the timing difference is
                    // not detectable with "numBlobsToCreate" random blobs being created.
                    // Keep trying with larger numbers of "numBlobsToCreate", since that's the
                    // thing we can most easily control and verify.
                    // Note also: if, for some reason, an earlier attack with a smaller number of
                    // blobs indicated there might be a vulnerability, then this case will rule
                    // that out if it was a fluke. The timing delay must keep increasing as the
                    // number of blobs is increased.
                    potentialTimeBasedSQLInjection = false;
                    numberOfSequentialIncreases = 0;
                    detectableDelayParameter = null;
                    detectableDelay = 0;
                    detectableDelayMessage = null;
                    maxDelayParameter = null;
                    maxDelay = 0;
                    // Do not break at this point, since we may simply need to keep increasing
                    // numBlobsToCreate to a point where we can detect the resulting delay.
                }

                if (this.debugEnabled)
                    log.debug("Time Based SQL Injection test for " + numBlobsToCreate + " random blob bytes: ["
                            + newTimeBasedInjectionValue + "] on field: [" + paramName + "] with value ["
                            + newTimeBasedInjectionValue + "] took " + modifiedTimeUsed
                            + "ms, where the original took " + originalTimeUsed + "ms");

                previousDelay = modifiedTimeUsed;

                // Bail out if we were asked nicely.
                if (isStop()) {
                    if (this.debugEnabled)
                        log.debug("Stopping the scan due to a user request");
                    return;
                }
            } // end of for loop to increase the number of random blob bytes to create

            // The number of times that we could sequentially increase the delay by increasing
            // the "number of random blob bytes to create" is the basis for the threshold of the
            // alert. In some cases, the user may want to see a solid increase in delay for, say,
            // 4 or 5 iterations, in order to be confident the vulnerability exists. In other
            // cases, the user may be happy with just 2 sequential increases.
            if (this.debugEnabled)
                log.debug("Number of sequential increases: " + numberOfSequentialIncreases);

            if (numberOfSequentialIncreases >= this.incrementalDelayIncreasesForAlert) {
                // The user-defined threshold has been exceeded: likely a SQL Injection. Raise it.
                String extraInfo = Constant.messages.getString(
                        "ascanalpha.sqlinjection.sqlite.alert.timebased.extrainfo", detectableDelayParameter,
                        detectableDelay, maxDelayParameter, maxDelay, originalParamValue, originalTimeUsed);
                // Raise the alert.
                bingo(Alert.RISK_HIGH, Alert.CONFIDENCE_MEDIUM, getName(), getDescription(),
                        getBaseMsg().getRequestHeader().getURI().getURI(), // url
                        paramName, detectableDelayParameter, extraInfo, getSolution(),
                        extraInfo /* as evidence */, this.getCweId(), this.getWascId(), detectableDelayMessage);
                if (this.debugEnabled)
                    log.debug("A likely Time Based SQL Injection Vulnerability has been found with ["
                            + detectableDelayMessage.getRequestHeader().getMethod() + "] URL ["
                            + detectableDelayMessage.getRequestHeader().getURI().getURI() + "] on field: ["
                            + paramName + "]");
                // Break out of the time based loop.
                foundTimeBased = true;
                break;
            }

            // Break out of the time based loop if we found something.
            if (foundTimeBased)
                break;

            // Bail out if we were asked nicely.
            if (isStop()) {
                if (this.debugEnabled)
                    log.debug("Stopping the scan due to a user request");
                return;
            }
        } // for each time based SQL index
        // End of check for SQLite time based SQL Injection.

        // TODO: fix this logic, because it's broken already. It reports versions 2.2 and 4.0
        // (false positives ahoy).
        doUnionBased = false;

        // Try to get the version of SQLite, using a UNION based SQL injection vulnerability.
        // Do this regardless of whether we already found a vulnerability using another
        // technique.
        if (doUnionBased) {
            int unionRequests = 0;
            // Catch 3.0, 3.0.1, 3.0.1.1, 3.7.16.2, etc.
            Pattern versionNumberPattern = Pattern.compile(
                    "[0-9]{1}\\.[0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{1,2}|[0-9]{1}\\.[0-9]{1,2}\\.[0-9]{1,2}|[0-9]{1}\\.[0-9]{1,2}",
                    PATTERN_PARAM);
            String[] candidateValues = { "", originalParamValue };
            // Shonky break label: labels the loop to break out of. I believe I just finished a
            // sentence with a preposition too. Oh my.
            unionLoops: for (String sqliteVersionFunction : SQLITE_VERSION_FUNCTIONS) {
                for (String statementTypeCloser : SYNTACTIC_PREVIOUS_STATEMENT_TYPE_CLOSERS) {
                    for (String statementClauseCloser : SYNTACTIC_PREVIOUS_STATEMENT_CLAUSE_CLOSERS) {
                        for (String unionAdditionalColms : UNION_ADDITIONAL_COLUMNS) {
                            for (String nextStatementCommenter : SYNTACTIC_NEXT_STATEMENT_COMMENTER) {
                                for (String statementUnionStatement : SYNTACTIC_UNION_STATEMENTS) {
                                    for (String value : candidateValues) {
                                        // Are we out of lives yet?
                                        // TODO: fix so that the logic does not spin through the
                                        // loop headers to get out of all of the nested loops,
                                        // without using the shonky break-to-label logic.
                                        if (unionRequests > doUnionMaxRequests) {
                                            break unionLoops;
                                        }

                                        String unionAttack = UNION_ATTACK_TEMPLATE;
                                        unionAttack = unionAttack.replace("<<<<SQLITE_VERSION_FUNCTION>>>>",
                                                sqliteVersionFunction);
                                        unionAttack = unionAttack.replace(
                                                "<<<<SYNTACTIC_PREVIOUS_STATEMENT_TYPE_CLOSER>>>>",
                                                statementTypeCloser);
                                        unionAttack = unionAttack.replace(
                                                "<<<<SYNTACTIC_PREVIOUS_STATEMENT_CLAUSE_CLOSER>>>>",
                                                statementClauseCloser);
                                        unionAttack = unionAttack.replace("<<<<UNIONADDITIONALCOLUMNS>>>>",
                                                unionAdditionalColms);
                                        unionAttack = unionAttack.replace(
                                                "<<<<SYNTACTIC_NEXT_STATEMENT_COMMENTER>>>>",
                                                nextStatementCommenter);
                                        unionAttack = unionAttack.replace("<<<<UNIONSTATEMENT>>>>",
                                                statementUnionStatement);
                                        unionAttack = unionAttack.replace("<<<<VALUE>>>>", value);

                                        if (log.isDebugEnabled())
                                            log.debug("About to try to determine the SQLite version with ["
                                                    + unionAttack + "]");

                                        HttpMessage unionAttackMessage = getNewMsg();
                                        setParameter(unionAttackMessage, paramName, unionAttack);
                                        sendAndReceive(unionAttackMessage);
                                        unionRequests++;

                                        // Check the response for the version information.
                                        Matcher matcher = versionNumberPattern
                                                .matcher(unionAttackMessage.getResponseBody().toString());
                                        while (matcher.find()) {
                                            String versionNumber = matcher.group();
                                            Pattern actualVersionNumberPattern = Pattern
                                                    .compile("\\Q" + versionNumber + "\\E", PATTERN_PARAM);
                                            if (log.isDebugEnabled())
                                                log.debug("Found a candidate SQLite version number '"
                                                        + versionNumber
                                                        + "'. About to look for the absence of '"
                                                        + actualVersionNumberPattern
                                                        + "' in the (re-created) original response body (of length "
                                                        + originalMessage.getResponseBody().toString().length()
                                                        + ") to validate it");
                                            // If the version number was not in the original
                                            // response, we will call it.
                                            Matcher matcherVersionInOriginal = actualVersionNumberPattern
                                                    .matcher(originalMessage.getResponseBody().toString());
                                            if (!matcherVersionInOriginal.find()) {
                                                // We have the SQLite version number.
                                                if (log.isDebugEnabled())
                                                    log.debug("We found SQLite version [" + versionNumber + "]");
                                                String extraInfo = Constant.messages.getString(
                                                        "ascanalpha.sqlinjection.sqlite.alert.versionnumber.extrainfo",
                                                        versionNumber);
                                                // Raise the alert.
                                                bingo(Alert.RISK_HIGH, Alert.CONFIDENCE_MEDIUM,
                                                        getName() + " - " + versionNumber, getDescription(),
                                                        getBaseMsg().getRequestHeader().getURI().getURI(), // url
                                                        paramName, unionAttack, extraInfo, getSolution(),
                                                        versionNumber /* as evidence */, this.getCweId(),
                                                        this.getWascId(), unionAttackMessage);
                                                break unionLoops;
                                            }
                                        }

                                        // Bail out if we were asked nicely.
                                        if (isStop()) {
                                            if (this.debugEnabled)
                                                log.debug("Stopping the scan due to a user request");
                                            return;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        } // end of doUnionBased
    } catch (InvalidRedirectLocationException | UnknownHostException | URIException e) {
        if (log.isDebugEnabled()) {
            log.debug("Failed to send HTTP message, cause: " + e.getMessage());
        }
    } catch (Exception e) {
        // Do not try to internationalise this: we need an error message in any event, and
        // if it's in English, it's still better than not having it at all.
        log.error("An error occurred checking a url for SQLite SQL Injection vulnerabilities", e);
    }
}
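The timing-baseline logic in scan() (measure the original request once, re-measure if the first result is suspiciously slow, give up if the retry is also slow, and treat a SocketTimeoutException as an expected outcome whose getMessage() is merely logged) is a reusable pattern. The sketch below reduces it to a generic request callback; the Request interface, the timeRequest and baselineUsable methods, and the threshold parameter are all invented for illustration and are not part of the ZAP API.

import java.io.IOException;
import java.net.SocketTimeoutException;

/** Illustrative only: the timing-baseline pattern used by the scan rule, reduced to its core. */
public class TimingBaseline {

    /** Hypothetical stand-in for the scan rule's sendAndReceive(); any HTTP client would do. */
    interface Request {
        void send() throws IOException;
    }

    /**
     * Times one request, treating a socket timeout as an expected (and merely loggable)
     * outcome rather than a failure, just as the scan rule does for its baseline checks.
     */
    public static long timeRequest(Request request) throws IOException {
        long started = System.currentTimeMillis();
        try {
            request.send();
        } catch (SocketTimeoutException e) {
            // Expected occasionally for time-based injection probes; record the detail only.
            System.out.println("Baseline request timed out: " + e.getMessage());
        }
        return System.currentTimeMillis() - started;
    }

    /**
     * Mirrors the scan rule: retry once if the first measurement is suspiciously slow
     * (for example, first-hit compilation), and give up if the retry is slow as well.
     */
    public static boolean baselineUsable(Request request, long thresholdMillis) throws IOException {
        long first = timeRequest(request);
        if (first <= thresholdMillis)
            return true;
        long second = timeRequest(request);
        return second <= thresholdMillis;
    }

    public static void main(String[] args) throws IOException {
        // Stand-in request that just sleeps briefly, so the example runs without a network.
        Request fake = () -> {
            try {
                Thread.sleep(100);
            } catch (InterruptedException ie) {
                Thread.currentThread().interrupt();
            }
        };
        System.out.println("Baseline usable: " + baselineUsable(fake, 5000));
    }
}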