Java tutorial
/* $Id: RSSConnector.java 994959 2010-09-08 10:04:42Z kwright $ */ /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.manifoldcf.crawler.connectors.rss; import org.apache.manifoldcf.core.interfaces.*; import org.apache.manifoldcf.agents.interfaces.*; import org.apache.manifoldcf.crawler.interfaces.*; import org.apache.manifoldcf.crawler.system.Logging; import org.apache.manifoldcf.crawler.system.ManifoldCF; import org.apache.manifoldcf.connectorcommon.interfaces.*; import org.apache.manifoldcf.connectorcommon.fuzzyml.*; import org.apache.manifoldcf.core.common.DateParser; import org.apache.http.conn.ConnectTimeoutException; import org.apache.http.client.RedirectException; import org.apache.http.client.CircularRedirectException; import org.apache.http.NoHttpResponseException; import org.apache.http.HttpException; import java.io.*; import java.util.*; import java.net.*; import java.text.*; import java.util.regex.*; /** This is the RSS implementation of the IRepositoryConnector interface. * This connector basically looks at an RSS document in order to seed the * document queue. The document is always fetched from the same URL (it's * specified in the configuration parameters). The documents subsequently * crawled are not scraped for additional links; only the primary document is * ingested. On the other hand, redirections ARE honored, so that various * sites that use this trick can be supported (e.g. the BBC) * */ public class RSSConnector extends org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector { public static final String _rcsid = "@(#)$Id: RSSConnector.java 994959 2010-09-08 10:04:42Z kwright $"; protected final static String rssThrottleGroupType = "_RSS_"; // Usage flag values protected static final int ROBOTS_NONE = 0; protected static final int ROBOTS_DATA = 1; protected static final int ROBOTS_ALL = 2; /** Dechromed content mode - none */ public static final int DECHROMED_NONE = 0; /** Dechromed content mode - description field */ public static final int DECHROMED_DESCRIPTION = 1; /** Dechromed content mode - content field */ public static final int DECHROMED_CONTENT = 2; /** Chromed suppression mode - use chromed content if dechromed content not available */ public static final int CHROMED_USE = 0; /** Chromed suppression mode - skip documents if dechromed content not available */ public static final int CHROMED_SKIP = 1; /** Chromed suppression mode - index metadata only if dechromed content not available */ public static final int CHROMED_METADATA_ONLY = 2; /** Robots usage flag */ protected int robotsUsage = ROBOTS_ALL; /** The user-agent for this connector instance */ protected String userAgent = null; /** The email address for this connector instance */ protected String from = null; /** The minimum milliseconds between fetches */ protected long minimumMillisecondsPerFetchPerServer = -1L; /** The maximum open connections */ protected int maxOpenConnectionsPerServer = 0; /** The minimum milliseconds between bytes */ protected double minimumMillisecondsPerBytePerServer = 0.0; /** The throttle group name */ protected String throttleGroupName = null; /** The proxy host */ protected String proxyHost = null; /** The proxy port */ protected int proxyPort = -1; /** Proxy auth domain */ protected String proxyAuthDomain = null; /** Proxy auth username */ protected String proxyAuthUsername = null; /** Proxy auth password */ protected String proxyAuthPassword = null; /** The throttled fetcher used by this instance */ protected ThrottledFetcher fetcher = null; /** The robots object used by this instance */ protected Robots robots = null; /** Storage for fetcher objects */ protected static Map<String, ThrottledFetcher> fetcherMap = new HashMap<String, ThrottledFetcher>(); /** Storage for robots objects */ protected static Map robotsMap = new HashMap(); /** Flag indicating whether session data is initialized */ protected boolean isInitialized = false; // A couple of very important points. // The canonical document identifier is simply a URL. // Versions of the document are calculated using a checksum technique protected static DataCache cache = new DataCache(); protected static final Map understoodProtocols = new HashMap(); static { understoodProtocols.put("http", "http"); understoodProtocols.put("https", "https"); } // Activity types public final static String ACTIVITY_FETCH = "fetch"; public final static String ACTIVITY_ROBOTSPARSE = "robots parse"; public final static String ACTIVITY_PROCESS = "process"; /** Deny access token for default authority */ private final static String defaultAuthorityDenyToken = "DEAD_AUTHORITY"; /** Constructor. */ public RSSConnector() { } /** Establish a session */ protected void getSession() throws ManifoldCFException { if (!isInitialized) { String x; String emailAddress = params.getParameter(RSSConfig.PARAMETER_EMAIL); if (emailAddress == null) throw new ManifoldCFException("Missing email address"); userAgent = "Mozilla/5.0 (ApacheManifoldCFRSSFeedReader; " + ((emailAddress == null) ? "" : emailAddress) + ")"; from = emailAddress; String robotsUsageString = params.getParameter(RSSConfig.PARAMETER_ROBOTSUSAGE); robotsUsage = ROBOTS_ALL; if (robotsUsageString == null || robotsUsageString.length() == 0 || robotsUsageString.equals(RSSConfig.VALUE_ALL)) robotsUsage = ROBOTS_ALL; else if (robotsUsageString.equals(RSSConfig.VALUE_NONE)) robotsUsage = ROBOTS_NONE; else if (robotsUsageString.equals(RSSConfig.VALUE_DATA)) robotsUsage = ROBOTS_DATA; proxyHost = params.getParameter(RSSConfig.PARAMETER_PROXYHOST); String proxyPortString = params.getParameter(RSSConfig.PARAMETER_PROXYPORT); proxyAuthDomain = params.getParameter(RSSConfig.PARAMETER_PROXYAUTHDOMAIN); proxyAuthUsername = params.getParameter(RSSConfig.PARAMETER_PROXYAUTHUSERNAME); proxyAuthPassword = params.getObfuscatedParameter(RSSConfig.PARAMETER_PROXYAUTHPASSWORD); proxyPort = -1; if (proxyPortString != null && proxyPortString.length() > 0) { try { proxyPort = Integer.parseInt(proxyPortString); } catch (NumberFormatException e) { throw new ManifoldCFException(e.getMessage(), e); } } // Read throttling configuration parameters minimumMillisecondsPerBytePerServer = 0.0; maxOpenConnectionsPerServer = 10; minimumMillisecondsPerFetchPerServer = 0L; x = params.getParameter(RSSConfig.PARAMETER_BANDWIDTH); if (x != null && x.length() > 0) { try { int maxKBytesPerSecondPerServer = Integer.parseInt(x); if (maxKBytesPerSecondPerServer > 0) minimumMillisecondsPerBytePerServer = 1.0 / (double) maxKBytesPerSecondPerServer; } catch (NumberFormatException e) { throw new ManifoldCFException("Bad number: " + e.getMessage(), e); } } x = params.getParameter(RSSConfig.PARAMETER_MAXOPEN); if (x != null && x.length() > 0) { try { maxOpenConnectionsPerServer = Integer.parseInt(x); } catch (NumberFormatException e) { throw new ManifoldCFException("Bad number: " + e.getMessage(), e); } } x = params.getParameter(RSSConfig.PARAMETER_MAXFETCHES); if (x != null && x.length() > 0) { try { int maxFetches = Integer.parseInt(x); if (maxFetches == 0) maxFetches = 1; minimumMillisecondsPerFetchPerServer = 60000L / ((long) maxFetches); } catch (NumberFormatException e) { throw new ManifoldCFException("Bad number: " + e.getMessage(), e); } } IThrottleGroups tg = ThrottleGroupsFactory.make(currentContext); // Create the throttle group tg.createOrUpdateThrottleGroup(rssThrottleGroupType, throttleGroupName, new ThrottleSpec(maxOpenConnectionsPerServer, minimumMillisecondsPerFetchPerServer, minimumMillisecondsPerBytePerServer)); isInitialized = true; } } /** Return the list of activities that this connector supports (i.e. writes into the log). *@return the list. */ @Override public String[] getActivitiesList() { return new String[] { ACTIVITY_FETCH, ACTIVITY_ROBOTSPARSE, ACTIVITY_PROCESS }; } /** Tell the world what model this connector uses for getDocumentIdentifiers(). * This must return a model value as specified above. *@return the model type value. */ @Override public int getConnectorModel() { // This connector is currently structured that the RSS feeds are the seeds. return MODEL_ALL; } // All methods below this line will ONLY be called if a connect() call succeeded // on this instance! /** Connect. The configuration parameters are included. *@param configParams are the configuration parameters for this connection. * Note well: There are no exceptions allowed from this call, since it is expected to mainly establish connection parameters. */ @Override public void connect(ConfigParams configParams) { super.connect(configParams); // Do the necessary bookkeeping around connection counting throttleGroupName = params.getParameter(RSSConfig.PARAMETER_THROTTLEGROUP); if (throttleGroupName == null) throttleGroupName = ""; fetcher = getFetcher(); robots = getRobots(fetcher); // Let the system know we have a connection. fetcher.noteConnectionEstablished(); robots.noteConnectionEstablished(); } /** This method is periodically called for all connectors that are connected but not * in active use. */ @Override public void poll() throws ManifoldCFException { fetcher.poll(); robots.poll(); } /** Check status of connection. */ @Override public String check() throws ManifoldCFException { getSession(); return super.check(); } /** Close the connection. Call this before discarding the repository connector. */ @Override public void disconnect() throws ManifoldCFException { isInitialized = false; // Let the system know we are freeing the connection robots.noteConnectionReleased(); fetcher.noteConnectionReleased(); userAgent = null; from = null; minimumMillisecondsPerFetchPerServer = -1L; maxOpenConnectionsPerServer = 0; minimumMillisecondsPerBytePerServer = 0.0; throttleGroupName = null; proxyHost = null; proxyPort = -1; proxyAuthDomain = null; proxyAuthUsername = null; proxyAuthPassword = null; super.disconnect(); } /** Get the bin name string for a document identifier. The bin name describes the queue to which the * document will be assigned for throttling purposes. Throttling controls the rate at which items in a * given queue are fetched; it does not say anything about the overall fetch rate, which may operate on * multiple queues or bins. * For example, if you implement a web crawler, a good choice of bin name would be the server name, since * that is likely to correspond to a real resource that will need real throttle protection. *@param documentIdentifier is the document identifier. *@return the bin name. */ @Override public String[] getBinNames(String documentIdentifier) { try { WebURL uri = new WebURL(documentIdentifier); return new String[] { uri.getHost() }; } catch (URISyntaxException e) { return new String[] { "" }; } } /** Queue "seed" documents. Seed documents are the starting places for crawling activity. Documents * are seeded when this method calls appropriate methods in the passed in ISeedingActivity object. * * This method can choose to find repository changes that happen only during the specified time interval. * The seeds recorded by this method will be viewed by the framework based on what the * getConnectorModel() method returns. * * It is not a big problem if the connector chooses to create more seeds than are * strictly necessary; it is merely a question of overall work required. * * The end time and seeding version string passed to this method may be interpreted for greatest efficiency. * For continuous crawling jobs, this method will * be called once, when the job starts, and at various periodic intervals as the job executes. * * When a job's specification is changed, the framework automatically resets the seeding version string to null. The * seeding version string may also be set to null on each job run, depending on the connector model returned by * getConnectorModel(). * * Note that it is always ok to send MORE documents rather than less to this method. * The connector will be connected before this method can be called. *@param activities is the interface this method should use to perform whatever framework actions are desired. *@param spec is a document specification (that comes from the job). *@param seedTime is the end of the time range of documents to consider, exclusive. *@param lastSeedVersionString is the last seeding version string for this job, or null if the job has no previous seeding version string. *@param jobMode is an integer describing how the job is being run, whether continuous or once-only. *@return an updated seeding version string, to be stored with the job. */ @Override public String addSeedDocuments(ISeedingActivity activities, Specification spec, String lastSeedVersion, long seedTime, int jobMode) throws ManifoldCFException, ServiceInterruption { getSession(); Filter f = new Filter(spec, true); // Go through all the seeds. Iterator<String> iter = f.getSeeds(); while (iter.hasNext()) { String canonicalURL = iter.next(); activities.addSeedDocument(canonicalURL); } return ""; } /** Convert an absolute or relative URL to a document identifier. This may involve several steps at some point, * but right now it does NOT involve converting the host name to a canonical host name. * (Doing so would destroy the ability of virtually hosted sites to do the right thing, * since the original host name would be lost.) Thus, we do the conversion to IP address * right before we actually fetch the document. *@param policies are the canonicalization policies in effect. *@param parentIdentifier the identifier of the document in which the raw url was found, or null if none. *@param rawURL is the raw, un-normalized and un-canonicalized url. *@return the canonical URL (the document identifier), or null if the url was illegal. */ protected static String makeDocumentIdentifier(CanonicalizationPolicies policies, String parentIdentifier, String rawURL) throws ManifoldCFException { try { // First, find the matching canonicalization policy, if any CanonicalizationPolicy p = policies.findMatch(rawURL); // Filter out control characters StringBuilder sb = new StringBuilder(); int i = 0; while (i < rawURL.length()) { char x = rawURL.charAt(i++); // Only 7-bit ascii is allowed in URLs - and that has limits too (no control characters) if (x >= ' ' && x < 128) sb.append(x); } rawURL = sb.toString(); WebURL url; if (parentIdentifier != null) { WebURL parentURL = new WebURL(parentIdentifier); url = parentURL.resolve(rawURL); } else url = new WebURL(rawURL); String protocol = url.getScheme(); String host = url.getHost(); // The new URL better darn well have a host and a protocol, and we only know how to deal with // http and https. if (protocol == null || host == null) { if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("RSS: Can't use url '" + rawURL + "' because it has no protocol or host"); return null; } if (understoodProtocols.get(protocol) == null) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Can't use url '" + rawURL + "' because it has an unsupported protocol '" + protocol + "'"); return null; } // Canonicalization procedure. // The query part of the URL may contain bad parameters (session id's, for instance), or may be ordered in such a // way as to prevent an effectively identical URL from being matched. The anchor part of the URL should also be stripped. // This code performs both of these activities in a simple way; rewrites of various pieces may get more involved if we add // the ability to perform mappings using criteria specified in the UI. Right now we don't. String id = doCanonicalization(p, url); if (id == null) { if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("RSS: Can't use url '" + rawURL + "' because it could not be canonicalized"); return null; } // As a last basic legality check, go through looking for illegal characters. i = 0; while (i < id.length()) { char x = id.charAt(i++); // Only 7-bit ascii is allowed in URLs - and that has limits too (no control characters) if (x < ' ' || x > 127) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug( "RSS: Can't use url '" + rawURL + "' because it has illegal characters in it"); return null; } } return id; } catch (java.net.URISyntaxException e) { if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("RSS: Can't use url '" + rawURL + "' because it is badly formed: " + e.getMessage()); return null; } catch (java.lang.IllegalArgumentException e) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Can't use url '" + rawURL + "' because there was an argument error: " + e.getMessage(), e); return null; } catch (java.lang.NullPointerException e) { // This gets tossed by url.toAsciiString() for reasons I don't understand, but which have to do with a malformed URL. if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug( "RSS: Can't use url '" + rawURL + "' because it is missing fields: " + e.getMessage(), e); return null; } } /** Code to canonicalize a URL. If URL cannot be canonicalized (and is illegal) return null. */ protected static String doCanonicalization(CanonicalizationPolicy p, WebURL url) throws ManifoldCFException, java.net.URISyntaxException { // Note well: The java.net.URI class mistreats the query part of the URI, near as I can tell, in the following ways: // (1) It decodes the whole thing without regards to the argument interpretation, so the escaped ampersands etc in the arguments are converted // to non-escaped ones (ugh). This is why I changed the code below to parse the RAW query string and decode it myself. // (2) On reassembly of the query string, the class does not properly escape ":", "/", or a bunch of other characters the class description *says* // it will escape. This means it creates URI's that are illegal according to RFC 2396 - although it is true that RFC 2396 also contains // apparent errors. // // I've therefore opted to deal with this problem by doing much of the query string processing myself - including its final reassembly into the // URI at the end of the processing. // // To make the url be canonical, we need to strip off everything after the #. We also need to order the arguments in a canonical // way, and remove session identifiers of the types we know about. String queryString = url.getRawQuery(); if (queryString != null) { // Rewrite the query string. To do this, we first parse it (by looking for ampersands and equal signs), and then // we ditch any keys that we really don't want (session identifiers particularly). Finally, we go through the // keys in sorted order and reassemble the query, making sure that any arguments that have the same name // appear in the same order. // I don't use the 'split' operation because I think it's a lot more oomph (and performance loss) than is needed // for this simple parsing task. // When reordering a url, the following is done: // (1) The individual order of all arguments with the same name is preserved // (2) The arguments themselves appear in sorted order, minus any arguments that should be removed because they // are interpreted to be session arguments. // // When a url is NOT reordered, the following is done: // (1) Each argument is examined IN TURN. // (2) If the argument is a session argument and should be excluded, it is simply skipped. // Canonicalization note: Broadvision // // The format of Broadvision's urls is as follows: // http://blah/path/path?arg|arg|arg|BVSession@@@@=xxxx&more stuff // The session identifier is the BVSession@@@@. In theory I could strip this away, but I've found that // most Broadvision sites require session even for basic navigation! if (p == null || p.canReorder()) { // Reorder the arguments. HashMap argumentMap = new HashMap(); int index = 0; while (index < queryString.length()) { int newIndex = queryString.indexOf("&", index); if (newIndex == -1) newIndex = queryString.length(); String argument = queryString.substring(index, newIndex); int valueIndex = argument.indexOf("="); String key; if (valueIndex == -1) key = argument; else key = argument.substring(0, valueIndex); // If this is a disallowed argument, simply don't include it in the final map. boolean includeArgument = true; if ((p == null || p.canRemovePhpSession()) && key.equals("PHPSESSID")) includeArgument = false; if ((p == null || p.canRemoveBvSession()) && key.indexOf("BVSession@@@@") != -1) includeArgument = false; if (includeArgument) { ArrayList list = (ArrayList) argumentMap.get(key); if (list == null) { list = new ArrayList(); argumentMap.put(key, list); } list.add(argument); } if (newIndex < queryString.length()) index = newIndex + 1; else index = newIndex; } // Reassemble query string in sorted order String[] sortArray = new String[argumentMap.size()]; int i = 0; Iterator iter = argumentMap.keySet().iterator(); while (iter.hasNext()) { sortArray[i++] = (String) iter.next(); } java.util.Arrays.sort(sortArray); StringBuilder newString = new StringBuilder(); boolean isFirst = true; i = 0; while (i < sortArray.length) { String key = sortArray[i++]; ArrayList list = (ArrayList) argumentMap.get(key); int j = 0; while (j < list.size()) { if (isFirst == false) { newString.append("&"); } else isFirst = false; newString.append((String) list.get(j++)); } } queryString = newString.toString(); } else { // Do not reorder! StringBuilder newString = new StringBuilder(); int index = 0; boolean isFirst = true; while (index < queryString.length()) { int newIndex = queryString.indexOf("&", index); if (newIndex == -1) newIndex = queryString.length(); String argument = queryString.substring(index, newIndex); int valueIndex = argument.indexOf("="); String key; if (valueIndex == -1) key = argument; else key = argument.substring(0, valueIndex); // If this is a disallowed argument, simply don't include it in the final query. boolean includeArgument = true; if ((p == null || p.canRemovePhpSession()) && key.equals("PHPSESSID")) includeArgument = false; if ((p == null || p.canRemoveBvSession()) && key.indexOf("BVSession@@@@") != -1) includeArgument = false; if (includeArgument) { if (!isFirst) newString.append("&"); else isFirst = false; newString.append(argument); } if (newIndex < queryString.length()) index = newIndex + 1; else index = newIndex; } queryString = newString.toString(); } } // Now, rewrite path to get rid of jsessionid etc. String pathString = url.getPath(); if (pathString != null) { int index = pathString.indexOf(";jsessionid="); if ((p == null || p.canRemoveJavaSession()) && index != -1) { // There's a ";jsessionid=" // Strip the java session id pathString = pathString.substring(0, index); } if ((p == null || p.canRemoveAspSession()) && pathString.startsWith("/s(")) { // It's asp.net index = pathString.indexOf(")"); if (index != -1) pathString = pathString.substring(index + 1); } } // Put it back into the URL without the ref, and with the modified query and path parts. url = new WebURL(url.getScheme(), url.getHost(), url.getPort(), pathString, queryString); String rval = url.toASCIIString(); return rval; } protected static Set<String> xmlContentTypes; static { xmlContentTypes = new HashSet<String>(); xmlContentTypes.add("text/xml"); xmlContentTypes.add("application/rss+xml"); xmlContentTypes.add("application/xml"); xmlContentTypes.add("application/atom+xml"); xmlContentTypes.add("application/xhtml+xml"); xmlContentTypes.add("text/XML"); xmlContentTypes.add("application/rdf+xml"); xmlContentTypes.add("text/application"); xmlContentTypes.add("XML"); } /** Process a set of documents. * This is the method that should cause each document to be fetched, processed, and the results either added * to the queue of documents for the current job, and/or entered into the incremental ingestion manager. * The document specification allows this class to filter what is done based on the job. * The connector will be connected before this method can be called. *@param documentIdentifiers is the set of document identifiers to process. *@param statuses are the currently-stored document versions for each document in the set of document identifiers * passed in above. *@param activities is the interface this method should use to queue up new document references * and ingest documents. *@param jobMode is an integer describing how the job is being run, whether continuous or once-only. *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one. */ @Override public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec, IProcessActivity activities, int jobMode, boolean usesDefaultAuthority) throws ManifoldCFException, ServiceInterruption { getSession(); // The connection limit is designed to permit this connector to coexist with potentially other connectors, such as the web connector. // There is currently no good way to enforce connection limits across all installed connectors - this will require considerably more // thought to set up properly. int connectionLimit = 200; String[] fixedList = new String[2]; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In getDocumentVersions for " + Integer.toString(documentIdentifiers.length) + " documents"); Filter f = new Filter(spec, false); String[] acls = f.getAcls(); // Sort it, java.util.Arrays.sort(acls); // NOTE: There are two kinds of documents in here; documents that are RSS feeds (that presumably have a content-type // of text/xml), and documents that need to be indexed. // // For the latter, the metadata etc is part of the version string. For the former, the only thing that is part of the version string is the // document's checksum. // // The need to exclude documents from fetch based on whether they match an expression causes some difficulties, because we really // DON'T want this to apply to the feeds themselves. Since the distinguishing characteristic of a feed is that it is in the seed list, // and that its content-type is text/xml, we could use either of these characteristics to treat feeds differently from // fetchable urls. But the latter approach requires a fetch, which is forbidden. So - the spec will be used to characterize the url. // However, the spec might change, and the url might be dropped from the list - and then what?? // // The final solution is to simply not queue what cannot be mapped. int feedTimeout = f.getFeedTimeoutValue(); // The document specification has already been used to trim out documents that are not // allowed from appearing in the queue. So, even that has already been done. for (String documentIdentifier : documentIdentifiers) { // If it is in this list, we presume that it has been vetted against the map etc., so we don't do that again. We just fetch it. // And, if the content type is xml, we calculate the version as if it is a feed rather than a document. // Get the url String urlValue = documentIdentifier; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Getting version string for '" + urlValue + "'"); String versionString; String ingestURL = null; String[] pubDates = null; String[] sources = null; String[] titles = null; String[] authorNames = null; String[] authorEmails = null; String[] categories = null; String[] descriptions = null; try { // If there's a carrydown "data" value for this url, we use that value rather than actually fetching the document. This also means we don't need to // do a robots check, because we aren't actually crawling anything. So, ALWAYS do this first... CharacterInput[] dechromedData = activities.retrieveParentDataAsFiles(urlValue, "data"); try { if (dechromedData.length > 0) { // Data already available. The fetch cycle can be entirely avoided, as can the robots check. ingestURL = f.mapDocumentURL(urlValue); if (ingestURL != null) { // Open up an input stream corresponding to the carrydown data. The stream will be encoded as utf-8. try { InputStream is = dechromedData[0].getUtf8Stream(); try { StringBuilder sb = new StringBuilder(); long checkSum = cache.addData(activities, urlValue, "text/html", is); // Grab what we need from the passed-down data for the document. These will all become part // of the version string. pubDates = activities.retrieveParentData(urlValue, "pubdate"); sources = activities.retrieveParentData(urlValue, "source"); titles = activities.retrieveParentData(urlValue, "title"); authorNames = activities.retrieveParentData(urlValue, "authorname"); authorEmails = activities.retrieveParentData(urlValue, "authoremail"); categories = activities.retrieveParentData(urlValue, "category"); descriptions = activities.retrieveParentData(urlValue, "description"); java.util.Arrays.sort(pubDates); java.util.Arrays.sort(sources); java.util.Arrays.sort(titles); java.util.Arrays.sort(authorNames); java.util.Arrays.sort(authorEmails); java.util.Arrays.sort(categories); java.util.Arrays.sort(descriptions); if (sources.length == 0) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Warning; URL '" + ingestURL + "' doesn't seem to have any RSS feed source!"); } sb.append('+'); packList(sb, acls, '+'); if (acls.length > 0) { sb.append('+'); pack(sb, defaultAuthorityDenyToken, '+'); } else sb.append('-'); // The ingestion URL pack(sb, ingestURL, '+'); // The pub dates packList(sb, pubDates, '+'); // The titles packList(sb, titles, '+'); // The sources packList(sb, sources, '+'); // The categories packList(sb, categories, '+'); // The descriptions packList(sb, descriptions, '+'); // The author names packList(sb, authorNames, '+'); // The author emails packList(sb, authorEmails, '+'); // Do the checksum part, which does not need to be parseable. sb.append(new Long(checkSum).toString()); versionString = sb.toString(); } finally { is.close(); } } catch (java.net.SocketTimeoutException e) { throw new ManifoldCFException( "IO exception reading data from string: " + e.getMessage(), e); } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED); } catch (IOException e) { throw new ManifoldCFException( "IO exception reading data from string: " + e.getMessage(), e); } } else { // Document a seed or unmappable; just skip if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Skipping carry-down document '" + urlValue + "' because it is unmappable or is a seed."); } } else { // Get the old version string String oldVersionString = statuses.getIndexedVersionString(documentIdentifier); // Unpack the old version as much as possible. // We are interested in what the ETag and Last-Modified headers were last time. String lastETagValue = null; String lastModifiedValue = null; // Note well: Non-continuous jobs cannot use etag because the rss document MUST be fetched each time for such jobs, // or the documents it points at would get deleted. // // NOTE: I disabled this code because we really need the feed's TTL value in order to reschedule properly. I can't get the // TTL value without refetching the document - therefore ETag and Last-Modified cannot be used :-( if (false && jobMode == JOBMODE_CONTINUOUS && oldVersionString != null && oldVersionString.startsWith("-")) { // It's a feed, so the last etag and last-modified fields should be encoded in this version string. StringBuilder lastETagBuffer = new StringBuilder(); int unpackPos = unpack(lastETagBuffer, oldVersionString, 1, '+'); StringBuilder lastModifiedBuffer = new StringBuilder(); unpackPos = unpack(lastModifiedBuffer, oldVersionString, unpackPos, '+'); if (lastETagBuffer.length() > 0) lastETagValue = lastETagBuffer.toString(); if (lastModifiedBuffer.length() > 0) lastModifiedValue = lastModifiedBuffer.toString(); } if (Logging.connectors.isDebugEnabled() && (lastETagValue != null || lastModifiedValue != null)) Logging.connectors.debug( "RSS: Document '" + urlValue + "' was found to have a previous ETag value of '" + ((lastETagValue == null) ? "null" : lastETagValue) + "' and a previous Last-Modified value of '" + ((lastModifiedValue == null) ? "null" : lastModifiedValue) + "'"); // Robots check. First, we need to separate the url into its components URL url; try { url = new URL(urlValue); } catch (MalformedURLException e) { Logging.connectors.debug("RSS: URL '" + urlValue + "' is malformed; skipping", e); activities.deleteDocument(documentIdentifier); continue; } String protocol = url.getProtocol(); int port = url.getPort(); String hostName = url.getHost(); String pathPart = url.getFile(); // Check with robots to see if it's allowed if (robotsUsage >= ROBOTS_DATA && !robots.isFetchAllowed(currentContext, throttleGroupName, protocol, port, hostName, url.getPath(), userAgent, from, proxyHost, proxyPort, proxyAuthDomain, proxyAuthUsername, proxyAuthPassword, activities, connectionLimit)) { activities.recordActivity(null, ACTIVITY_FETCH, null, urlValue, Integer.toString(-2), "Robots exclusion", null); if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("RSS: Skipping url '" + urlValue + "' because robots.txt says to"); activities.deleteDocument(documentIdentifier); continue; } // Now, use the fetcher, and get the file. IThrottledConnection connection = fetcher.createConnection(currentContext, throttleGroupName, hostName, connectionLimit, feedTimeout, proxyHost, proxyPort, proxyAuthDomain, proxyAuthUsername, proxyAuthPassword, activities); try { // Begin the fetch connection.beginFetch("Data"); try { // Execute the request. // Use the connect timeout from the document specification! int status = connection.executeFetch(protocol, port, pathPart, userAgent, from, lastETagValue, lastModifiedValue); switch (status) { case IThrottledConnection.STATUS_NOCHANGE: versionString = oldVersionString; break; case IThrottledConnection.STATUS_OK: try { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Successfully fetched " + urlValue); // Document successfully fetched! // If its content is xml, presume it's a feed... String contentType = connection.getResponseHeader("Content-Type"); // Some sites have multiple content types. We just look at the LAST one in that case. if (contentType != null) { String[] contentTypes = contentType.split(","); if (contentTypes.length > 0) contentType = contentTypes[contentTypes.length - 1].trim(); else contentType = null; } String strippedContentType = contentType; if (strippedContentType != null) { int pos = strippedContentType.indexOf(";"); if (pos != -1) strippedContentType = strippedContentType.substring(0, pos).trim(); } boolean isXML = (strippedContentType != null && xmlContentTypes.contains(strippedContentType)); ingestURL = null; if (!isXML) { // If the chromed content mode is set to "skip", and we got here, it means // we should not include the content. if (f.getChromedContentMode() == CHROMED_SKIP) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Removing url '" + urlValue + "' because it no longer has dechromed content available"); versionString = null; break; } // Decide whether to exclude this document based on what we see here. // Basically, we want to get rid of everything that we don't know what // to do with in the ingestion system. if (!activities.checkMimeTypeIndexable(contentType)) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Removing url '" + urlValue + "' because it had the wrong content type: " + ((contentType == null) ? "null" : "'" + contentType + "'")); versionString = null; break; } ingestURL = f.mapDocumentURL(urlValue); } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("RSS: The url '" + urlValue + "' is a feed"); if (!f.isSeed(urlValue)) { // Remove the feed from consideration, since it has left the list of seeds if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Removing feed url '" + urlValue + "' because it is not a seed."); versionString = null; break; } } InputStream is = connection.getResponseBodyStream(); try { long checkSum = cache.addData(activities, urlValue, contentType, is); StringBuilder sb = new StringBuilder(); if (ingestURL != null) { // We think it is ingestable. The version string accordingly starts with a "+". // Grab what we need from the passed-down data for the document. These will all become part // of the version string. pubDates = activities.retrieveParentData(urlValue, "pubdate"); sources = activities.retrieveParentData(urlValue, "source"); titles = activities.retrieveParentData(urlValue, "title"); authorNames = activities.retrieveParentData(urlValue, "authorname"); authorEmails = activities.retrieveParentData(urlValue, "authoremail"); categories = activities.retrieveParentData(urlValue, "category"); descriptions = activities.retrieveParentData(urlValue, "description"); java.util.Arrays.sort(pubDates); java.util.Arrays.sort(sources); java.util.Arrays.sort(titles); java.util.Arrays.sort(authorNames); java.util.Arrays.sort(authorEmails); java.util.Arrays.sort(categories); java.util.Arrays.sort(descriptions); if (sources.length == 0) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Warning; URL '" + ingestURL + "' doesn't seem to have any RSS feed source!"); } sb.append('+'); packList(sb, acls, '+'); if (acls.length > 0) { sb.append('+'); pack(sb, defaultAuthorityDenyToken, '+'); } else sb.append('-'); // The ingestion URL pack(sb, ingestURL, '+'); // The pub dates packList(sb, pubDates, '+'); // The titles packList(sb, titles, '+'); // The sources packList(sb, sources, '+'); // The categories packList(sb, categories, '+'); // The descriptions packList(sb, descriptions, '+'); // The author names packList(sb, authorNames, '+'); // The author emails packList(sb, authorEmails, '+'); } else { sb.append('-'); String etag = connection.getResponseHeader("ETag"); if (etag == null) pack(sb, "", '+'); else pack(sb, etag, '+'); String lastModified = connection.getResponseHeader("Last-Modified"); if (lastModified == null) pack(sb, "", '+'); else pack(sb, lastModified, '+'); } // Do the checksum part, which does not need to be parseable. sb.append(new Long(checkSum).toString()); versionString = sb.toString(); } finally { is.close(); } } catch (java.net.SocketTimeoutException e) { Logging.connectors .warn("RSS: Socket timeout exception fetching document contents '" + urlValue + "' - skipping: " + e.getMessage(), e); versionString = null; } catch (ConnectTimeoutException e) { Logging.connectors .warn("RSS: Connecto timeout exception fetching document contents '" + urlValue + "' - skipping: " + e.getMessage(), e); versionString = null; } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED); } catch (IOException e) { Logging.connectors.warn("RSS: IO exception fetching document contents '" + urlValue + "' - skipping: " + e.getMessage(), e); versionString = null; } break; case IThrottledConnection.STATUS_SITEERROR: case IThrottledConnection.STATUS_PAGEERROR: default: // Record an *empty* version. // This signals the processDocuments() method that we really don't want to ingest this document, but we also don't // want to blow the document out of the queue, since then we'd wind up perhaps fetching it multiple times. versionString = ""; break; } } finally { connection.doneFetch(activities); } } finally { connection.close(); } if (versionString == null) { activities.deleteDocument(documentIdentifier); continue; } if (!(versionString.length() == 0 || activities.checkDocumentNeedsReindexing(documentIdentifier, versionString))) continue; // Process document! if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Processing '" + urlValue + "'"); // The only links we extract come from documents that we think are RSS feeds. // When we think that's the case, we attempt to parse it as RSS XML. if (ingestURL == null) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Interpreting document '" + urlValue + "' as a feed"); // We think it is a feed. // If this is a continuous job, AND scanonly is true, it means that the document was either identical to the // previous fetch, or was not fetched at all. In that case, it may not even be there, and we *certainly* don't // want to attempt to process it in any case. // // NOTE: I re-enabled the scan permanently because we need the TTL value to be set whatever the cost. If the // TTL value is not set, we default to the specified job's feed-rescan time, which is not going to be current enough for some feeds. if (true || jobMode != JOBMODE_CONTINUOUS) { handleRSSFeedSAX(urlValue, activities, f); if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Extraction of feed '" + urlValue + "' complete"); // Record the feed's version string, so we won't refetch unless needed. // This functionality is required for the last ETag and Last-Modified fields to be sent to the rss server, and to // keep track of the adaptive parameters. activities.recordDocument(documentIdentifier, versionString); } else { // The problem here is that we really do need to set the rescan time to something reasonable. // But we might not even have read the feed! So what to do?? // One answer is to build a connector-specific table that carries the last value of every feed around. // Another answer is to change the version code to always read the feed (and the heck with ETag and Last-Modified). if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Feed '" + urlValue + "' does not appear to differ from previous fetch for a continuous job; not extracting!"); long currentTime = System.currentTimeMillis(); Long defaultRescanTime = f.getDefaultRescanTime(currentTime); if (defaultRescanTime != null) { Long minimumTime = f.getMinimumRescanTime(currentTime); if (minimumTime != null) { if (defaultRescanTime.longValue() < minimumTime.longValue()) defaultRescanTime = minimumTime; } } activities.setDocumentScheduleBounds(urlValue, defaultRescanTime, defaultRescanTime, null, null); } } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Interpreting '" + urlValue + "' as a document"); String errorCode = null; String errorDesc = null; long startTime = System.currentTimeMillis(); Long fileLengthLong = null; try { long documentLength = cache.getDataLength(documentIdentifier); if (!activities.checkLengthIndexable(documentLength)) { activities.noDocument(documentIdentifier, versionString); errorCode = activities.EXCLUDED_LENGTH; errorDesc = "Document rejected because of length (" + documentLength + ")"; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Skipping document '" + urlValue + "' because its length was rejected (" + documentLength + ")"); continue; } if (!activities.checkURLIndexable(documentIdentifier)) { activities.noDocument(documentIdentifier, versionString); errorCode = activities.EXCLUDED_URL; errorDesc = "Document rejected because of URL ('" + documentIdentifier + "')"; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Skipping document '" + urlValue + "' because its URL was rejected ('" + documentIdentifier + "')"); continue; } // Check if it's a recognized content type String contentType = cache.getContentType(documentIdentifier); // Some sites have multiple content types. We just look at the LAST one in that case. if (contentType != null) { String[] contentTypes = contentType.split(","); if (contentTypes.length > 0) contentType = contentTypes[contentTypes.length - 1].trim(); else contentType = null; } if (!activities.checkMimeTypeIndexable(contentType)) { activities.noDocument(documentIdentifier, versionString); errorCode = activities.EXCLUDED_MIMETYPE; errorDesc = "Document rejected because of mime type (" + contentType + ")"; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Skipping document '" + urlValue + "' because its mime type was rejected ('" + contentType + "')"); continue; } // Treat it as an ingestable document. long dataSize = cache.getDataLength(urlValue); RepositoryDocument rd = new RepositoryDocument(); // Set content type if (contentType != null) rd.setMimeType(contentType); // Turn into acls and add into description String[] denyAcls; if (acls == null) denyAcls = null; else if (acls.length == 0) denyAcls = new String[0]; else denyAcls = new String[] { defaultAuthorityDenyToken }; if (acls != null && denyAcls != null) rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT, acls, denyAcls); if (titles != null && titles.length > 0) rd.addField("title", titles); if (authorNames != null && authorNames.length > 0) rd.addField("authorname", authorNames); if (authorEmails != null && authorEmails.length > 0) rd.addField("authoremail", authorEmails); if (descriptions != null && descriptions.length > 0) rd.addField("summary", descriptions); if (sources != null && sources.length > 0) rd.addField("source", sources); if (categories != null && categories.length > 0) rd.addField("category", categories); // The pubdates are a ms since epoch value; we want the minimum one for the origination time. Long minimumOrigTime = null; if (pubDates != null && pubDates.length > 0) { String[] pubDateValuesISO = new String[pubDates.length]; TimeZone tz = TimeZone.getTimeZone("UTC"); DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm'Z'"); df.setTimeZone(tz); for (int k = 0; k < pubDates.length; k++) { String pubDate = pubDates[k]; try { Long pubDateLong = new Long(pubDate); if (minimumOrigTime == null || pubDateLong.longValue() < minimumOrigTime.longValue()) minimumOrigTime = pubDateLong; pubDateValuesISO[k] = df.format(new Date(pubDateLong.longValue())); } catch (NumberFormatException e) { // Do nothing; the version string seems to not mean anything pubDateValuesISO[k] = ""; } } rd.addField("pubdate", pubDates); rd.addField("pubdateiso", pubDateValuesISO); } if (minimumOrigTime != null) activities.setDocumentOriginationTime(urlValue, minimumOrigTime); InputStream is = cache.getData(urlValue); if (is != null) { try { rd.setBinary(is, dataSize); try { activities.ingestDocumentWithException(documentIdentifier, versionString, ingestURL, rd); errorCode = "OK"; fileLengthLong = new Long(dataSize); } catch (IOException e) { errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT); errorDesc = e.getMessage(); handleIOException(e, "reading data"); } } finally { try { is.close(); } catch (IOException e) { errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT); errorDesc = e.getMessage(); handleIOException(e, "closing stream"); } } } } catch (ManifoldCFException e) { if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) errorCode = null; throw e; } finally { if (errorCode != null) activities.recordActivity(new Long(startTime), ACTIVITY_PROCESS, null, urlValue, errorCode, errorDesc, null); } } } } finally { for (CharacterInput ci : dechromedData) { if (ci != null) ci.discard(); } } } finally { // Remove any fetched documents. cache.deleteData(documentIdentifier); } } } protected static void handleIOException(IOException e, String context) throws ManifoldCFException, ServiceInterruption { if (e instanceof java.net.SocketTimeoutException) throw new ManifoldCFException("IO error " + context + ": " + e.getMessage(), e); else if (e instanceof InterruptedIOException) throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED); else throw new ManifoldCFException("IO error " + context + ": " + e.getMessage(), e); } // UI support methods. // // These support methods come in two varieties. The first bunch is involved in setting up connection configuration information. The second bunch // is involved in presenting and editing document specification information for a job. The two kinds of methods are accordingly treated differently, // in that the first bunch cannot assume that the current connector object is connected, while the second bunch can. That is why the first bunch // receives a thread context argument for all UI methods, while the second bunch does not need one (since it has already been applied via the connect() // method, above). /** Output the configuration header section. * This method is called in the head section of the connector's configuration page. Its purpose is to add the required tabs to the list, and to output any * javascript methods that might be needed by the configuration editing HTML. *@param threadContext is the local thread context. *@param out is the output to which any HTML should be sent. *@param parameters are the configuration parameters, as they currently exist, for this connection being configured. *@param tabsArray is an array of tab names. Add to this array any tab names that are specific to the connector. */ @Override public void outputConfigurationHeader(IThreadContext threadContext, IHTTPOutput out, Locale locale, ConfigParams parameters, List<String> tabsArray) throws ManifoldCFException, IOException { tabsArray.add(Messages.getString(locale, "RSSConnector.Email")); tabsArray.add(Messages.getString(locale, "RSSConnector.Robots")); tabsArray.add(Messages.getString(locale, "RSSConnector.Bandwidth")); tabsArray.add(Messages.getString(locale, "RSSConnector.Proxy")); out.print("<script type=\"text/javascript\">\n" + "<!--\n" + "function checkConfig()\n" + "{\n" + " if (editconnection.email.value != \"\" && editconnection.email.value.indexOf(\"@\") == -1)\n" + " {\n" + " alert(\"" + Messages.getBodyJavascriptString(locale, "RSSConnector.NeedAValidEmailAddress") + "\");\n" + " editconnection.email.focus();\n" + " return false;\n" + " }\n" + " if (editconnection.bandwidth.value != \"\" && !isInteger(editconnection.bandwidth.value))\n" + " {\n" + " alert(\"" + Messages.getBodyJavascriptString(locale, "RSSConnector.EnterAValidNumberOrBlankForNoLimit") + "\");\n" + " editconnection.bandwidth.focus();\n" + " return false;\n" + " }\n" + " if (editconnection.connections.value == \"\" || !isInteger(editconnection.connections.value))\n" + " {\n" + " alert(\"" + Messages.getBodyJavascriptString(locale, "RSSConnector.EnterAValidNumberForTheMaxNumberOfOpenConnectionsPerServer") + "\");\n" + " editconnection.connections.focus();\n" + " return false;\n" + " }\n" + " if (editconnection.fetches.value != \"\" && !isInteger(editconnection.fetches.value))\n" + " {\n" + " alert(\"" + Messages.getBodyJavascriptString(locale, "RSSConnector.EnterAValidNumberOrBlankForNoLimit") + "\");\n" + " editconnection.fetches.focus();\n" + " return false;\n" + " }\n" + " return true;\n" + "}\n" + "\n" + "function checkConfigForSave()\n" + "{\n" + " if (editconnection.email.value == \"\")\n" + " {\n" + " alert(\"" + Messages.getBodyJavascriptString(locale, "RSSConnector.EmailAddressRequiredToBeIncludedInAllRequestHeaders") + "\");\n" + " SelectTab(\"" + Messages.getBodyJavascriptString(locale, "RSSConnector.Email") + "\");\n" + " editconnection.email.focus();\n" + " return false;\n" + " }\n" + " return true;\n" + "}\n" + "\n" + "//-->\n" + "</script>\n"); } /** Output the configuration body section. * This method is called in the body section of the connector's configuration page. Its purpose is to present the required form elements for editing. * The coder can presume that the HTML that is output from this configuration will be within appropriate <html>, <body>, and <form> tags. The name of the * form is "editconnection". *@param threadContext is the local thread context. *@param out is the output to which any HTML should be sent. *@param parameters are the configuration parameters, as they currently exist, for this connection being configured. *@param tabName is the current tab name. */ @Override public void outputConfigurationBody(IThreadContext threadContext, IHTTPOutput out, Locale locale, ConfigParams parameters, String tabName) throws ManifoldCFException, IOException { String email = parameters.getParameter(RSSConfig.PARAMETER_EMAIL); if (email == null) email = ""; String robotsUsage = parameters.getParameter(RSSConfig.PARAMETER_ROBOTSUSAGE); if (robotsUsage == null) robotsUsage = RSSConfig.VALUE_ALL; String bandwidth = parameters.getParameter(RSSConfig.PARAMETER_BANDWIDTH); if (bandwidth == null) bandwidth = "64"; String connections = parameters.getParameter(RSSConfig.PARAMETER_MAXOPEN); if (connections == null) connections = "2"; String fetches = parameters.getParameter(RSSConfig.PARAMETER_MAXFETCHES); if (fetches == null) fetches = "12"; String throttleGroup = parameters.getParameter(RSSConfig.PARAMETER_THROTTLEGROUP); if (throttleGroup == null) throttleGroup = ""; String proxyHost = parameters.getParameter(RSSConfig.PARAMETER_PROXYHOST); if (proxyHost == null) proxyHost = ""; String proxyPort = parameters.getParameter(RSSConfig.PARAMETER_PROXYPORT); if (proxyPort == null) proxyPort = ""; String proxyAuthDomain = parameters.getParameter(RSSConfig.PARAMETER_PROXYAUTHDOMAIN); if (proxyAuthDomain == null) proxyAuthDomain = ""; String proxyAuthUsername = parameters.getParameter(RSSConfig.PARAMETER_PROXYAUTHUSERNAME); if (proxyAuthUsername == null) proxyAuthUsername = ""; String proxyAuthPassword = parameters.getObfuscatedParameter(RSSConfig.PARAMETER_PROXYAUTHPASSWORD); if (proxyAuthPassword == null) proxyAuthPassword = ""; else proxyAuthPassword = out.mapPasswordToKey(proxyAuthPassword); // Email tab if (tabName.equals(Messages.getString(locale, "RSSConnector.Email"))) { out.print("<table class=\"displaytable\">\n" + " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.EmailAddressToContactColon") + "</nobr></td><td class=\"value\"><input type=\"text\" size=\"32\" name=\"email\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(email) + "\"/></td>\n" + " </tr>\n" + "</table>\n"); } else { out.print("<input type=\"hidden\" name=\"email\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(email) + "\"/>\n"); } // Robots tab if (tabName.equals(Messages.getString(locale, "RSSConnector.Robots"))) { out.print("<table class=\"displaytable\">\n" + " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.RobotsTxtUsageColon") + "</nobr></td>\n" + " <td class=\"value\">\n" + " <select name=\"robotsusage\" size=\"3\">\n" + " <option value=\"none\" " + (robotsUsage.equals(RSSConfig.VALUE_NONE) ? "selected=\"selected\"" : "") + ">" + Messages.getBodyString(locale, "RSSConnector.DontLookAtRobotsTxt") + "</option>\n" + " <option value=\"data\" " + (robotsUsage.equals(RSSConfig.VALUE_DATA) ? "selected=\"selected\"" : "") + ">" + Messages.getBodyString(locale, "RSSConnector.ObeyRobotsTxtForDataFetchesOnly") + "</option>\n" + " <option value=\"all\" " + (robotsUsage.equals(RSSConfig.VALUE_ALL) ? "selected=\"selected\"" : "") + ">" + Messages.getBodyString(locale, "RSSConnector.ObeyRobotsTxtForAllFetches") + "</option>\n" + " </select>\n" + " </td>\n" + " </tr>\n" + "</table>\n"); } else { out.print("<input type=\"hidden\" name=\"robotsusage\" value=\"" + robotsUsage + "\"/>\n"); } // Bandwidth tab if (tabName.equals(Messages.getString(locale, "RSSConnector.Bandwidth"))) { out.print("<table class=\"displaytable\">\n" + " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.MaxKBytesPerSecondPerServerColon") + "</nobr></td>\n" + " <td class=\"value\"><input type=\"text\" size=\"6\" name=\"bandwidth\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(bandwidth) + "\"/></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.MaxConnectionsPerServerColon") + "</nobr></td>\n" + " <td class=\"value\"><input type=\"text\" size=\"4\" name=\"connections\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(connections) + "\"/></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.MaxFetchesPerMinutePerServerColon") + "</nobr></td>\n" + " <td class=\"value\"><input type=\"text\" size=\"4\" name=\"fetches\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(fetches) + "\"/></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.ThrottleGroupNameColon") + "</nobr></td>\n" + " <td class=\"value\"><input type=\"text\" size=\"32\" name=\"throttlegroup\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(throttleGroup) + "\"/></td>\n" + " </tr>\n" + "</table>\n"); } else { out.print("<input type=\"hidden\" name=\"bandwidth\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(bandwidth) + "\"/>\n" + "<input type=\"hidden\" name=\"connections\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(connections) + "\"/>\n" + "<input type=\"hidden\" name=\"fetches\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(fetches) + "\"/>\n" + "<input type=\"hidden\" name=\"throttlegroup\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(throttleGroup) + "\"/>\n"); } // Proxy tab if (tabName.equals(Messages.getString(locale, "RSSConnector.Proxy"))) { out.print("<table class=\"displaytable\">\n" + " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.ProxyHostColon") + "</nobr></td>\n" + " <td class=\"value\"><input type=\"text\" size=\"40\" name=\"proxyhost\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyHost) + "\"/></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.ProxyPortColon") + "</nobr></td>\n" + " <td class=\"value\"><input type=\"text\" size=\"5\" name=\"proxyport\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyPort) + "\"/></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.ProxyAuthenticationDomainColon") + "</nobr></td>\n" + " <td class=\"value\"><input type=\"text\" size=\"32\" name=\"proxyauthdomain\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthDomain) + "\"/></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.ProxyAuthenticationUserNameColon") + "</nobr></td>\n" + " <td class=\"value\"><input type=\"text\" size=\"32\" name=\"proxyauthusername\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthUsername) + "\"/></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.ProxyAuthenticationPasswordColon") + "</nobr></td>\n" + " <td class=\"value\"><input type=\"password\" size=\"16\" name=\"proxyauthpassword\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthPassword) + "\"/></td>\n" + " </tr>\n" + "</table>\n"); } else { out.print("<input type=\"hidden\" name=\"proxyhost\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyHost) + "\"/>\n" + "<input type=\"hidden\" name=\"proxyport\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyPort) + "\"/>\n" + "<input type=\"hidden\" name=\"proxyauthusername\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthUsername) + "\"/>\n" + "<input type=\"hidden\" name=\"proxyauthdomain\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthDomain) + "\"/>\n" + "<input type=\"hidden\" name=\"proxyauthpassword\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthPassword) + "\"/>\n"); } } /** Process a configuration post. * This method is called at the start of the connector's configuration page, whenever there is a possibility that form data for a connection has been * posted. Its purpose is to gather form information and modify the configuration parameters accordingly. * The name of the posted form is "editconnection". *@param threadContext is the local thread context. *@param variableContext is the set of variables available from the post, including binary file post information. *@param parameters are the configuration parameters, as they currently exist, for this connection being configured. *@return null if all is well, or a string error message if there is an error that should prevent saving of the connection (and cause a redirection to an error page). */ @Override public String processConfigurationPost(IThreadContext threadContext, IPostParameters variableContext, Locale locale, ConfigParams parameters) throws ManifoldCFException { String email = variableContext.getParameter("email"); if (email != null) parameters.setParameter(RSSConfig.PARAMETER_EMAIL, email); String robotsUsage = variableContext.getParameter("robotsusage"); if (robotsUsage != null) parameters.setParameter(RSSConfig.PARAMETER_ROBOTSUSAGE, robotsUsage); String bandwidth = variableContext.getParameter("bandwidth"); if (bandwidth != null) parameters.setParameter(RSSConfig.PARAMETER_BANDWIDTH, bandwidth); String connections = variableContext.getParameter("connections"); if (connections != null) parameters.setParameter(RSSConfig.PARAMETER_MAXOPEN, connections); String fetches = variableContext.getParameter("fetches"); if (fetches != null) parameters.setParameter(RSSConfig.PARAMETER_MAXFETCHES, fetches); String throttleGroup = variableContext.getParameter("throttlegroup"); if (throttleGroup != null) parameters.setParameter(RSSConfig.PARAMETER_THROTTLEGROUP, throttleGroup); String proxyHost = variableContext.getParameter("proxyhost"); if (proxyHost != null) parameters.setParameter(RSSConfig.PARAMETER_PROXYHOST, proxyHost); String proxyPort = variableContext.getParameter("proxyport"); if (proxyPort != null) parameters.setParameter(RSSConfig.PARAMETER_PROXYPORT, proxyPort); String proxyAuthDomain = variableContext.getParameter("proxyauthdomain"); if (proxyAuthDomain != null) parameters.setParameter(RSSConfig.PARAMETER_PROXYAUTHDOMAIN, proxyAuthDomain); String proxyAuthUsername = variableContext.getParameter("proxyauthusername"); if (proxyAuthUsername != null) parameters.setParameter(RSSConfig.PARAMETER_PROXYAUTHUSERNAME, proxyAuthUsername); String proxyAuthPassword = variableContext.getParameter("proxyauthpassword"); if (proxyAuthPassword != null) parameters.setObfuscatedParameter(RSSConfig.PARAMETER_PROXYAUTHPASSWORD, variableContext.mapKeyToPassword(proxyAuthPassword)); return null; } /** View configuration. * This method is called in the body section of the connector's view configuration page. Its purpose is to present the connection information to the user. * The coder can presume that the HTML that is output from this configuration will be within appropriate <html> and <body> tags. *@param threadContext is the local thread context. *@param out is the output to which any HTML should be sent. *@param parameters are the configuration parameters, as they currently exist, for this connection being configured. */ @Override public void viewConfiguration(IThreadContext threadContext, IHTTPOutput out, Locale locale, ConfigParams parameters) throws ManifoldCFException, IOException { out.print("<table class=\"displaytable\">\n" + " <tr>\n" + " <td class=\"description\" colspan=\"1\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.ParametersColon") + "</nobr></td>\n" + " <td class=\"value\" colspan=\"3\">\n"); Iterator iter = parameters.listParameters(); while (iter.hasNext()) { String param = (String) iter.next(); String value = parameters.getParameter(param); if (param.length() >= "password".length() && param.substring(param.length() - "password".length()).equalsIgnoreCase("password")) { out.print(" <nobr>" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(param) + "=********</nobr><br/>\n"); } else if (param.length() >= "keystore".length() && param.substring(param.length() - "keystore".length()).equalsIgnoreCase("keystore")) { IKeystoreManager kmanager = KeystoreManagerFactory.make("", value); out.print(" <nobr>" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(param) + "=<" + Integer.toString(kmanager.getContents().length) + Messages.getBodyString(locale, "RSSConnector.certificates") + "></nobr><br/>\n"); } else { out.print(" <nobr>" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(param) + "=" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(value) + "</nobr><br/>\n"); } } out.print(" </td>\n" + " </tr>\n" + "</table>\n"); } /** Output the specification header section. * This method is called in the head section of a job page which has selected a repository connection of the * current type. Its purpose is to add the required tabs to the list, and to output any javascript methods * that might be needed by the job editing HTML. * The connector will be connected before this method can be called. *@param out is the output to which any HTML should be sent. *@param locale is the locale the output is preferred to be in. *@param ds is the current document specification for this job. *@param connectionSequenceNumber is the unique number of this connection within the job. *@param tabsArray is an array of tab names. Add to this array any tab names that are specific to the connector. */ @Override public void outputSpecificationHeader(IHTTPOutput out, Locale locale, Specification ds, int connectionSequenceNumber, List<String> tabsArray) throws ManifoldCFException, IOException { tabsArray.add(Messages.getString(locale, "RSSConnector.URLs")); tabsArray.add(Messages.getString(locale, "RSSConnector.Canonicalization")); tabsArray.add(Messages.getString(locale, "RSSConnector.URLMappings")); tabsArray.add(Messages.getString(locale, "RSSConnector.Exclusions")); tabsArray.add(Messages.getString(locale, "RSSConnector.TimeValues")); tabsArray.add(Messages.getString(locale, "RSSConnector.Security")); tabsArray.add(Messages.getString(locale, "RSSConnector.DechromedContent")); String seqPrefix = "s" + connectionSequenceNumber + "_"; out.print("<script type=\"text/javascript\">\n" + "<!--\n" + "function " + seqPrefix + "SpecOp(n, opValue, anchorvalue)\n" + "{\n" + " eval(\"editjob.\"+n+\".value = \\\"\"+opValue+\"\\\"\");\n" + " postFormSetAnchor(anchorvalue);\n" + "}\n" + "\n" + "function " + seqPrefix + "AddRegexp(anchorvalue)\n" + "{\n" + " if (editjob." + seqPrefix + "rssmatch.value == \"\")\n" + " {\n" + " alert(\"" + Messages.getBodyJavascriptString(locale, "RSSConnector.MatchMustHaveARegexpValue") + "\");\n" + " editjob." + seqPrefix + "rssmatch.focus();\n" + " return;\n" + " }\n" + "\n" + " " + seqPrefix + "SpecOp(\"" + seqPrefix + "rssop\",\"Add\",anchorvalue);\n" + "}\n" + "\n" + "function " + seqPrefix + "RemoveRegexp(index, anchorvalue)\n" + "{\n" + " editjob." + seqPrefix + "rssindex.value = index;\n" + " " + seqPrefix + "SpecOp(\"" + seqPrefix + "rssop\",\"Delete\",anchorvalue);\n" + "}\n" + "\n" + "function " + seqPrefix + "SpecAddToken(anchorvalue)\n" + "{\n" + " if (editjob." + seqPrefix + "spectoken.value == \"\")\n" + " {\n" + " alert(\"" + Messages.getBodyJavascriptString(locale, "RSSConnector.TypeInAnAccessToken") + "\");\n" + " editjob." + seqPrefix + "spectoken.focus();\n" + " return;\n" + " }\n" + " " + seqPrefix + "SpecOp(\"" + seqPrefix + "accessop\",\"Add\",anchorvalue);\n" + "}\n" + "\n" + "function " + seqPrefix + "URLRegexpDelete(index, anchorvalue)\n" + "{\n" + " editjob." + seqPrefix + "urlregexpnumber.value = index;\n" + " " + seqPrefix + "SpecOp(\"" + seqPrefix + "urlregexpop\",\"Delete\",anchorvalue);\n" + "}\n" + "\n" + "function " + seqPrefix + "URLRegexpAdd(anchorvalue)\n" + "{\n" + " " + seqPrefix + "SpecOp(\"" + seqPrefix + "urlregexpop\",\"Add\",anchorvalue);\n" + "}\n" + "\n" + "function " + seqPrefix + "checkSpecification()\n" + "{\n" + " if (editjob." + seqPrefix + "feedtimeout.value == \"\" || !isInteger(editjob." + seqPrefix + "feedtimeout.value))\n" + " {\n" + " alert(\"" + Messages.getBodyJavascriptString(locale, "RSSConnector.ATimeoutValueInSecondsIsRequired") + "\");\n" + " editjob." + seqPrefix + "feedtimeout.focus();\n" + " return false;\n" + " }\n" + " if (editjob." + seqPrefix + "feedrefetch.value == \"\" || !isInteger(editjob." + seqPrefix + "feedrefetch.value))\n" + " {\n" + " alert(\"" + Messages.getBodyJavascriptString(locale, "RSSConnector.ARefetchIntervalInMinutesIsRequired") + "\");\n" + " editjob." + seqPrefix + "feedrefetch.focus();\n" + " return false;\n" + " }\n" + " if (editjob." + seqPrefix + "minfeedrefetch.value == \"\" || !isInteger(editjob." + seqPrefix + "minfeedrefetch.value))\n" + " {\n" + " alert(\"" + Messages.getBodyJavascriptString(locale, "RSSConnector.AMinimumRefetchIntervalInMinutesIsRequire") + "\");\n" + " editjob." + seqPrefix + "minfeedrefetch.focus();\n" + " return false;\n" + " }\n" + " if (editjob." + seqPrefix + "badfeedrefetch.value != \"\" && !isInteger(editjob." + seqPrefix + "badfeedrefetch.value))\n" + " {\n" + " alert(\"" + Messages.getBodyJavascriptString(locale, "RSSConnector.ABadFeedRefetchIntervalInMinutesIsRequired") + "\");\n" + " editjob." + seqPrefix + "badfeedrefetch.focus();\n" + " return false;\n" + " }\n" + "\n" + " return true;\n" + "}\n" + "\n" + "//-->\n" + "</script>\n"); } /** Output the specification body section. * This method is called in the body section of a job page which has selected a repository connection of the * current type. Its purpose is to present the required form elements for editing. * The coder can presume that the HTML that is output from this configuration will be within appropriate * <html>, <body>, and <form> tags. The name of the form is always "editjob". * The connector will be connected before this method can be called. *@param out is the output to which any HTML should be sent. *@param locale is the locale the output is preferred to be in. *@param ds is the current document specification for this job. *@param connectionSequenceNumber is the unique number of this connection within the job. *@param actualSequenceNumber is the connection within the job that has currently been selected. *@param tabName is the current tab name. (actualSequenceNumber, tabName) form a unique tuple within * the job. */ @Override public void outputSpecificationBody(IHTTPOutput out, Locale locale, Specification ds, int connectionSequenceNumber, int actualSequenceNumber, String tabName) throws ManifoldCFException, IOException { String seqPrefix = "s" + connectionSequenceNumber + "_"; int i; int k; // Build the url seed string, and the url regexp match and map StringBuilder sb = new StringBuilder(); ArrayList regexp = new ArrayList(); ArrayList matchStrings = new ArrayList(); int feedTimeoutValue = 60; int feedRefetchValue = 60; int minFeedRefetchValue = 15; Integer badFeedRefetchValue = null; String exclusions = ""; // Now, loop through paths i = 0; while (i < ds.getChildCount()) { SpecificationNode sn = ds.getChild(i++); if (sn.getType().equals(RSSConfig.NODE_FEED)) { String rssURL = sn.getAttributeValue(RSSConfig.ATTR_URL); if (rssURL != null) { sb.append(rssURL).append("\n"); } } else if (sn.getType().equals(RSSConfig.NODE_EXCLUDES)) { exclusions = sn.getValue(); if (exclusions == null) exclusions = ""; } else if (sn.getType().equals(RSSConfig.NODE_MAP)) { String match = sn.getAttributeValue(RSSConfig.ATTR_MATCH); String map = sn.getAttributeValue(RSSConfig.ATTR_MAP); if (match != null) { regexp.add(match); if (map == null) map = ""; matchStrings.add(map); } } else if (sn.getType().equals(RSSConfig.NODE_FEEDTIMEOUT)) { String value = sn.getAttributeValue(RSSConfig.ATTR_VALUE); feedTimeoutValue = Integer.parseInt(value); } else if (sn.getType().equals(RSSConfig.NODE_FEEDRESCAN)) { String value = sn.getAttributeValue(RSSConfig.ATTR_VALUE); feedRefetchValue = Integer.parseInt(value); } else if (sn.getType().equals(RSSConfig.NODE_MINFEEDRESCAN)) { String value = sn.getAttributeValue(RSSConfig.ATTR_VALUE); minFeedRefetchValue = Integer.parseInt(value); } else if (sn.getType().equals(RSSConfig.NODE_BADFEEDRESCAN)) { String value = sn.getAttributeValue(RSSConfig.ATTR_VALUE); badFeedRefetchValue = new Integer(value); } } // URLs tab if (tabName.equals(Messages.getString(locale, "RSSConnector.URLs")) && connectionSequenceNumber == actualSequenceNumber) { out.print("<table class=\"displaytable\">\n" + " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" + " <tr>\n" + " <td class=\"value\" colspan=\"2\">\n" + " <textarea rows=\"25\" cols=\"80\" name=\"" + seqPrefix + "rssurls\">" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(sb.toString()) + "</textarea>\n" + " </td>\n" + " </tr>\n" + "</table>\n"); } else { out.print("<input type=\"hidden\" name=\"" + seqPrefix + "rssurls\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(sb.toString()) + "\"/>\n"); } // Exclusions tab if (tabName.equals(Messages.getString(locale, "RSSConnector.Exclusions")) && connectionSequenceNumber == actualSequenceNumber) { out.print("<table class=\"displaytable\">\n" + " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" + " <tr>\n" + " <td class=\"description\" colspan=\"1\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.Exclude") + "</nobr></td>\n" + " <td class=\"value\" colspan=\"1\">\n" + " <textarea rows=\"25\" cols=\"60\" name=\"" + seqPrefix + "exclusions\">" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(exclusions) + "</textarea>\n" + " </td>\n" + " </tr>\n" + "</table>\n"); } else { out.print("<input type=\"hidden\" name=\"" + seqPrefix + "exclusions\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(exclusions) + "\"/>\n"); } // Canonicalization tab if (tabName.equals(Messages.getString(locale, "RSSConnector.Canonicalization")) && connectionSequenceNumber == actualSequenceNumber) { out.print("<table class=\"displaytable\">\n" + " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" + " <tr>\n" + " <td class=\"boxcell\" colspan=\"2\">\n" + " <input type=\"hidden\" name=\"" + seqPrefix + "urlregexpop\" value=\"Continue\"/>\n" + " <input type=\"hidden\" name=\"" + seqPrefix + "urlregexpnumber\" value=\"\"/>\n" + " <table class=\"formtable\">\n" + " <tr class=\"formheaderrow\">\n" + " <td class=\"formcolumnheader\"></td>\n" + " <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.URLRegularExpression") + "</nobr></td>\n" + " <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.Description") + "</nobr></td>\n" + " <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.Reorder") + "</nobr></td>\n" + " <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.RemoveJSPSessions") + "</nobr></td>\n" + " <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.RemoveASPSessions") + "</nobr></td>\n" + " <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.RemovePHPSessions") + "</nobr></td>\n" + " <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.RemoveBVSessions") + "</nobr></td>\n" + " </tr>\n"); int q = 0; int l = 0; while (q < ds.getChildCount()) { SpecificationNode specNode = ds.getChild(q++); if (specNode.getType().equals(RSSConfig.NODE_URLSPEC)) { // Ok, this node matters to us String regexpString = specNode.getAttributeValue(RSSConfig.ATTR_REGEXP); String description = specNode.getAttributeValue(RSSConfig.ATTR_DESCRIPTION); if (description == null) description = ""; String allowReorder = specNode.getAttributeValue(RSSConfig.ATTR_REORDER); if (allowReorder == null || allowReorder.length() == 0) allowReorder = RSSConfig.VALUE_NO; String allowJavaSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_JAVASESSIONREMOVAL); if (allowJavaSessionRemoval == null || allowJavaSessionRemoval.length() == 0) allowJavaSessionRemoval = RSSConfig.VALUE_NO; String allowASPSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_ASPSESSIONREMOVAL); if (allowASPSessionRemoval == null || allowASPSessionRemoval.length() == 0) allowASPSessionRemoval = RSSConfig.VALUE_NO; String allowPHPSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_PHPSESSIONREMOVAL); if (allowPHPSessionRemoval == null || allowPHPSessionRemoval.length() == 0) allowPHPSessionRemoval = RSSConfig.VALUE_NO; String allowBVSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_BVSESSIONREMOVAL); if (allowBVSessionRemoval == null || allowBVSessionRemoval.length() == 0) allowBVSessionRemoval = RSSConfig.VALUE_NO; out.print(" <tr class=\"" + (((l % 2) == 0) ? "evenformrow" : "oddformrow") + "\">\n" + " <td class=\"formcolumncell\">\n" + " <a name=\"" + seqPrefix + "urlregexp_" + Integer.toString(l) + "\">\n" + " <input type=\"button\" value=\"Delete\" alt=\"" + Messages.getAttributeString(locale, "RSSConnector.DeleteUrlRegexp") + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(regexpString) + "\" onclick='javascript:" + seqPrefix + "URLRegexpDelete(" + Integer.toString(l) + ",\"" + seqPrefix + "urlregexp_" + Integer.toString(l) + "\");'/>\n" + " </a>\n" + " </td>\n" + " <td class=\"formcolumncell\">\n" + " <input type=\"hidden\" name=\"" + seqPrefix + "urlregexp_" + Integer.toString(l) + "\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(regexpString) + "\"/>\n" + " <input type=\"hidden\" name=\"" + seqPrefix + "urlregexpdesc_" + Integer.toString(l) + "\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(description) + "\"/>\n" + " <input type=\"hidden\" name=\"" + seqPrefix + "urlregexpreorder_" + Integer.toString(l) + "\" value=\"" + allowReorder + "\"/>\n" + " <input type=\"hidden\" name=\"" + seqPrefix + "urlregexpjava_" + Integer.toString(l) + "\" value=\"" + allowJavaSessionRemoval + "\"/>\n" + " <input type=\"hidden\" name=\"" + seqPrefix + "urlregexpasp_" + Integer.toString(l) + "\" value=\"" + allowASPSessionRemoval + "\"/>\n" + " <input type=\"hidden\" name=\"" + seqPrefix + "urlregexpphp_" + Integer.toString(l) + "\" value=\"" + allowPHPSessionRemoval + "\"/>\n" + " <input type=\"hidden\" name=\"" + seqPrefix + "urlregexpbv_" + Integer.toString(l) + "\" value=\"" + allowBVSessionRemoval + "\"/>\n" + " <nobr>" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(regexpString) + "</nobr>\n" + " </td>\n" + " <td class=\"formcolumncell\">" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(description) + "</td>\n" + " <td class=\"formcolumncell\">" + allowReorder + "</td>\n" + " <td class=\"formcolumncell\">" + allowJavaSessionRemoval + "</td>\n" + " <td class=\"formcolumncell\">" + allowASPSessionRemoval + "</td>\n" + " <td class=\"formcolumncell\">" + allowPHPSessionRemoval + "</td>\n" + " <td class=\"formcolumncell\">" + allowBVSessionRemoval + "</td>\n" + " </tr>\n"); l++; } } if (l == 0) { out.print(" <tr class=\"formrow\"><td colspan=\"8\" class=\"formcolumnmessage\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.NoCanonicalizationSpecified") + "</nobr></td></tr>\n"); } out.print(" <tr class=\"formrow\"><td colspan=\"8\" class=\"formseparator\"><hr/></td></tr>\n" + " <tr class=\"formrow\">\n" + " <td class=\"formcolumncell\">\n" + " <a name=\"" + seqPrefix + "urlregexp_" + Integer.toString(l) + "\">\n" + " <input type=\"button\" value=\"Add\" alt=\"" + Messages.getAttributeString(locale, "RSSConnector.AddUlRegexp") + "\" onclick='javascript:" + seqPrefix + "URLRegexpAdd(\"" + seqPrefix + "urlregexp_" + Integer.toString(l + 1) + "\");'/>\n" + " <input type=\"hidden\" name=\"" + seqPrefix + "urlregexpcount\" value=\"" + Integer.toString(l) + "\"/>\n" + " </a>\n" + " </td>\n" + " <td class=\"formcolumncell\"><input type=\"text\" name=\"" + seqPrefix + "urlregexp\" size=\"30\" value=\"\"/></td>\n" + " <td class=\"formcolumncell\"><input type=\"text\" name=\"" + seqPrefix + "urlregexpdesc\" size=\"30\" value=\"\"/></td>\n" + " <td class=\"formcolumncell\"><input type=\"checkbox\" name=\"" + seqPrefix + "urlregexpreorder\" value=\"yes\"/></td>\n" + " <td class=\"formcolumncell\"><input type=\"checkbox\" name=\"" + seqPrefix + "urlregexpjava\" value=\"yes\" checked=\"true\"/></td>\n" + " <td class=\"formcolumncell\"><input type=\"checkbox\" name=\"" + seqPrefix + "urlregexpasp\" value=\"yes\" checked=\"true\"/></td>\n" + " <td class=\"formcolumncell\"><input type=\"checkbox\" name=\"" + seqPrefix + "urlregexpphp\" value=\"yes\" checked=\"true\"/></td>\n" + " <td class=\"formcolumncell\"><input type=\"checkbox\" name=\"" + seqPrefix + "urlregexpbv\" value=\"yes\" checked=\"true\"/></td>\n" + " </tr>\n" + " </table>\n" + " </td>\n" + " </tr>\n" + "</table>\n"); } else { // Post the canonicalization specification int q = 0; int l = 0; while (q < ds.getChildCount()) { SpecificationNode specNode = ds.getChild(q++); if (specNode.getType().equals(RSSConfig.NODE_URLSPEC)) { // Ok, this node matters to us String regexpString = specNode.getAttributeValue(RSSConfig.ATTR_REGEXP); String description = specNode.getAttributeValue(RSSConfig.ATTR_DESCRIPTION); if (description == null) description = ""; String allowReorder = specNode.getAttributeValue(RSSConfig.ATTR_REORDER); if (allowReorder == null || allowReorder.length() == 0) allowReorder = RSSConfig.VALUE_NO; String allowJavaSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_JAVASESSIONREMOVAL); if (allowJavaSessionRemoval == null || allowJavaSessionRemoval.length() == 0) allowJavaSessionRemoval = RSSConfig.VALUE_NO; String allowASPSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_ASPSESSIONREMOVAL); if (allowASPSessionRemoval == null || allowASPSessionRemoval.length() == 0) allowASPSessionRemoval = RSSConfig.VALUE_NO; String allowPHPSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_PHPSESSIONREMOVAL); if (allowPHPSessionRemoval == null || allowPHPSessionRemoval.length() == 0) allowPHPSessionRemoval = RSSConfig.VALUE_NO; String allowBVSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_BVSESSIONREMOVAL); if (allowBVSessionRemoval == null || allowBVSessionRemoval.length() == 0) allowBVSessionRemoval = RSSConfig.VALUE_NO; out.print("<input type=\"hidden\" name=\"" + seqPrefix + "urlregexp_" + Integer.toString(l) + "\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(regexpString) + "\"/>\n" + "<input type=\"hidden\" name=\"" + seqPrefix + "urlregexpdesc_" + Integer.toString(l) + "\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(description) + "\"/>\n" + "<input type=\"hidden\" name=\"" + seqPrefix + "urlregexpreorder_" + Integer.toString(l) + "\" value=\"" + allowReorder + "\"/>\n" + "<input type=\"hidden\" name=\"" + seqPrefix + "urlregexpjava_" + Integer.toString(l) + "\" value=\"" + allowJavaSessionRemoval + "\"/>\n" + "<input type=\"hidden\" name=\"" + seqPrefix + "urlregexpasp_" + Integer.toString(l) + "\" value=\"" + allowASPSessionRemoval + "\"/>\n" + "<input type=\"hidden\" name=\"" + seqPrefix + "urlregexpphp_" + Integer.toString(l) + "\" value=\"" + allowPHPSessionRemoval + "\"/>\n" + "<input type=\"hidden\" name=\"" + seqPrefix + "urlregexpbv_" + Integer.toString(l) + "\" value=\"" + allowBVSessionRemoval + "\"/>\n"); l++; } } out.print("<input type=\"hidden\" name=\"" + seqPrefix + "urlregexpcount\" value=\"" + Integer.toString(l) + "\"/>\n"); } // Mappings tab if (tabName.equals(Messages.getString(locale, "RSSConnector.URLMappings")) && connectionSequenceNumber == actualSequenceNumber) { out.print("<input type=\"hidden\" name=\"" + seqPrefix + "rssop\" value=\"\"/>\n" + "<input type=\"hidden\" name=\"" + seqPrefix + "rssindex\" value=\"\"/>\n" + "<input type=\"hidden\" name=\"" + seqPrefix + "rssmapcount\" value=\"" + Integer.toString(regexp.size()) + "\"/>\n" + "\n" + "<table class=\"displaytable\">\n" + " <tr><td class=\"separator\" colspan=\"4\"><hr/></td></tr>\n"); i = 0; while (i < regexp.size()) { String prefix = seqPrefix + "rssregexp_" + Integer.toString(i) + "_"; out.print(" <tr>\n" + " <td class=\"value\">\n" + " <a name=\"" + seqPrefix + "regexp_" + Integer.toString(i) + "\">\n" + " <input type=\"button\" value=\"" + Messages.getAttributeString(locale, "RSSConnector.Remove") + "\" onclick='javascript:" + seqPrefix + "RemoveRegexp(" + Integer.toString(i) + ",\"" + seqPrefix + "regexp_" + Integer.toString(i) + "\")' alt=\"" + Messages.getAttributeString(locale, "RSSConnector.RemoveRegexp") + Integer.toString(i) + "\"/>\n" + " </a>\n" + " </td>\n" + " <td class=\"value\"><input type=\"hidden\" name=\"" + prefix + "match" + "\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape((String) regexp.get(i)) + "\"/>" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape((String) regexp.get(i)) + "</td>\n" + " <td class=\"value\">--></td>\n" + " <td class=\"value\">\n"); String match = (String) matchStrings.get(i); out.print(" <input type=\"hidden\" name=\"" + prefix + "map" + "\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(match) + "\"/>\n"); if (match.length() == 0) { out.print(" <as is>\n"); } else { out.print(" " + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(match) + "\n"); } out.print(" </td>\n" + " </tr>\n"); i++; } out.print(" <tr>\n" + " <td class=\"value\"><a name=\"" + seqPrefix + "regexp_" + Integer.toString(i) + "\"><input type=\"button\" value=\"" + Messages.getAttributeString(locale, "RSSConnector.Add") + "\" onclick='javascript:" + seqPrefix + "AddRegexp(\"" + seqPrefix + "regexp_" + Integer.toString(i + 1) + "\")' alt=\"" + Messages.getAttributeString(locale, "RSSConnector.AddRegexp") + "\"/></a></td>\n" + " <td class=\"value\"><input type=\"text\" name=\"" + seqPrefix + "rssmatch\" size=\"16\" value=\"\"/></td>\n" + " <td class=\"value\">--></td>\n" + " <td class=\"value\"><input type=\"text\" name=\"" + seqPrefix + "rssmap\" size=\"16\" value=\"\"/></td>\n" + " </tr>\n" + "</table>\n"); } else { out.print("<input type=\"hidden\" name=\"" + seqPrefix + "rssmapcount\" value=\"" + Integer.toString(regexp.size()) + "\"/>\n"); i = 0; while (i < regexp.size()) { String prefix = seqPrefix + "rssregexp_" + Integer.toString(i) + "_"; String match = (String) matchStrings.get(i); out.print("<input type=\"hidden\" name=\"" + prefix + "match" + "\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape((String) regexp.get(i)) + "\"/>\n" + "<input type=\"hidden\" name=\"" + prefix + "map" + "\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(match) + "\"/>\n"); i++; } } // Timeout Value tab if (tabName.equals(Messages.getString(locale, "RSSConnector.TimeValues")) && connectionSequenceNumber == actualSequenceNumber) { out.print("<table class=\"displaytable\">\n" + " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.FeedConnectTimeout") + "</nobr></td>\n" + " <td class=\"value\"><input type=\"text\" size=\"5\" name=\"" + seqPrefix + "feedtimeout\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(feedTimeoutValue)) + "\"/></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.DefaultFeedRefetchTime") + "</nobr></td>\n" + " <td class=\"value\"><input type=\"text\" size=\"5\" name=\"" + seqPrefix + "feedrefetch\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(feedRefetchValue)) + "\"/></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.MinimumFeedRefetchTime") + "</nobr></td>\n" + " <td class=\"value\"><input type=\"text\" size=\"5\" name=\"" + seqPrefix + "minfeedrefetch\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(minFeedRefetchValue)) + "\"/></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.BadFeedRefetchTime") + "</nobr></td>\n" + " <td class=\"value\">\n" + " <input type=\"hidden\" name=\"" + seqPrefix + "badfeedrefetch_present\" value=\"true\"/>\n" + " <input type=\"text\" size=\"5\" name=\"" + seqPrefix + "badfeedrefetch\" value=\"" + ((badFeedRefetchValue == null) ? "" : org.apache.manifoldcf.ui.util.Encoder.attributeEscape(badFeedRefetchValue.toString())) + "\"/>\n" + " </td>\n" + " </tr>\n" + "\n" + "</table>\n"); } else { out.print("<input type=\"hidden\" name=\"" + seqPrefix + "feedtimeout\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(feedTimeoutValue)) + "\"/>\n" + "<input type=\"hidden\" name=\"" + seqPrefix + "feedrefetch\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(feedRefetchValue)) + "\"/>\n" + "<input type=\"hidden\" name=\"" + seqPrefix + "minfeedrefetch\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(minFeedRefetchValue)) + "\"/>\n" + "<input type=\"hidden\" name=\"" + seqPrefix + "badfeedrefetch_present\" value=\"true\"/>\n" + "<input type=\"hidden\" name=\"" + seqPrefix + "badfeedrefetch\" value=\"" + ((badFeedRefetchValue == null) ? "" : org.apache.manifoldcf.ui.util.Encoder.attributeEscape(badFeedRefetchValue.toString())) + "\"/>\n"); } // Dechromed content tab String dechromedMode = RSSConfig.VALUE_NONE; String chromedMode = RSSConfig.VALUE_USE; i = 0; while (i < ds.getChildCount()) { SpecificationNode sn = ds.getChild(i++); if (sn.getType().equals(RSSConfig.NODE_DECHROMEDMODE)) dechromedMode = sn.getAttributeValue(RSSConfig.ATTR_MODE); else if (sn.getType().equals(RSSConfig.NODE_CHROMEDMODE)) chromedMode = sn.getAttributeValue(RSSConfig.ATTR_MODE); } if (tabName.equals(Messages.getString(locale, "RSSConnector.DechromedContent")) && connectionSequenceNumber == actualSequenceNumber) { out.print("<table class=\"displaytable\">\n" + " <tr><td class=\"separator\" colspan=\"1\"><hr/></td></tr>\n" + " <tr>\n" + " <td class=\"value\"><nobr><input type=\"radio\" name=\"" + seqPrefix + "dechromedmode\" value=\"none\" " + (dechromedMode.equals(RSSConfig.VALUE_NONE) ? "checked=\"true\"" : "") + "/>" + Messages.getBodyString(locale, "RSSConnector.NoDechromedContent") + "</nobr></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"value\"><nobr><input type=\"radio\" name=\"" + seqPrefix + "dechromedmode\" value=\"description\" " + (dechromedMode.equals(RSSConfig.VALUE_DESCRIPTION) ? "checked=\"true\"" : "") + "/>" + Messages.getBodyString(locale, "RSSConnector.DechromedContentIfPresentInDescriptionField") + "</nobr></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"value\"><nobr><input type=\"radio\" name=\"" + seqPrefix + "dechromedmode\" value=\"content\" " + (dechromedMode.equals(RSSConfig.VALUE_CONTENT) ? "checked=\"true\"" : "") + "/>" + Messages.getBodyString(locale, "RSSConnector.DechromedContentIfPresentInContentField") + "</nobr></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"separator\"><hr/></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"value\"><nobr><input type=\"radio\" name=\"" + seqPrefix + "chromedmode\" value=\"use\" " + (chromedMode.equals(RSSConfig.VALUE_USE) ? "checked=\"true\"" : "") + "/>" + Messages.getBodyString(locale, "RSSConnector.UseChromedContentIfNoDechromedContentFound") + "</nobr></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"value\"><nobr><input type=\"radio\" name=\"" + seqPrefix + "chromedmode\" value=\"skip\" " + (chromedMode.equals(RSSConfig.VALUE_SKIP) ? "checked=\"true\"" : "") + "/>" + Messages.getBodyString(locale, "RSSConnector.NeverUseChromedContent") + "</nobr></td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"value\"><nobr><input type=\"radio\" name=\"" + seqPrefix + "chromedmode\" value=\"metadata\" " + (chromedMode.equals(RSSConfig.VALUE_METADATA) ? "checked=\"true\"" : "") + "/>" + Messages.getBodyString(locale, "RSSConnector.NoContentMetadataOnly") + "</nobr></td>\n" + " </tr>\n" + "</table>\n"); } else { out.print("<input type=\"hidden\" name=\"" + seqPrefix + "dechromedmode\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(dechromedMode) + "\"/>\n" + "<input type=\"hidden\" name=\"" + seqPrefix + "chromedmode\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(chromedMode) + "\"/>\n"); } // Security tab // There is no native security, so all we care about are the tokens. i = 0; if (tabName.equals(Messages.getString(locale, "RSSConnector.Security")) && connectionSequenceNumber == actualSequenceNumber) { out.print("<table class=\"displaytable\">\n" + " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"); // Go through forced ACL i = 0; k = 0; while (i < ds.getChildCount()) { SpecificationNode sn = ds.getChild(i++); if (sn.getType().equals(RSSConfig.NODE_ACCESS)) { String accessDescription = "_" + Integer.toString(k); String accessOpName = seqPrefix + "accessop" + accessDescription; String token = sn.getAttributeValue(RSSConfig.ATTR_TOKEN); out.print(" <tr>\n" + " <td class=\"description\">\n" + " <input type=\"hidden\" name=\"" + accessOpName + "\" value=\"\"/>\n" + " <input type=\"hidden\" name=\"" + seqPrefix + "spectoken" + accessDescription + "\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(token) + "\"/>\n" + " <a name=\"" + seqPrefix + "token_" + Integer.toString(k) + "\">\n" + " <input type=\"button\" value=\"Delete\" onClick='Javascript:" + seqPrefix + "SpecOp(\"" + accessOpName + "\",\"Delete\",\"" + seqPrefix + "token_" + Integer.toString(k) + "\")' alt=\"" + Messages.getAttributeString(locale, "RSSConnector.DeleteToken") + Integer.toString(k) + "\"/>\n" + " </a> \n" + " </td>\n" + " <td class=\"value\">\n" + " " + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(token) + "\n" + " </td>\n" + " </tr>\n"); k++; } } if (k == 0) { out.print(" <tr>\n" + " <td class=\"message\" colspan=\"2\">" + Messages.getBodyString(locale, "RSSConnector.NoAccessTokensPresent") + "</td>\n" + " </tr>\n"); } out.print(" <tr><td class=\"lightseparator\" colspan=\"2\"><hr/></td></tr>\n" + " <tr>\n" + " <td class=\"description\">\n" + " <input type=\"hidden\" name=\"" + seqPrefix + "tokencount\" value=\"" + Integer.toString(k) + "\"/>\n" + " <input type=\"hidden\" name=\"" + seqPrefix + "accessop\" value=\"\"/>\n" + " <a name=\"" + seqPrefix + "token_" + Integer.toString(k) + "\">\n" + " <input type=\"button\" value=\"Add\" onClick='Javascript:" + seqPrefix + "SpecAddToken(\"" + seqPrefix + "token_" + Integer.toString(k + 1) + "\")' alt=\"" + Messages.getAttributeString(locale, "RSSConnector.AddAccessToken") + "\"/>\n" + " </a> \n" + " </td>\n" + " <td class=\"value\">\n" + " <input type=\"text\" size=\"30\" name=\"" + seqPrefix + "spectoken\" value=\"\"/>\n" + " </td>\n" + " </tr>\n" + "</table>\n"); } else { // Finally, go through forced ACL i = 0; k = 0; while (i < ds.getChildCount()) { SpecificationNode sn = ds.getChild(i++); if (sn.getType().equals(RSSConfig.NODE_ACCESS)) { String accessDescription = "_" + Integer.toString(k); String token = sn.getAttributeValue(RSSConfig.ATTR_TOKEN); out.print("<input type=\"hidden\" name=\"" + seqPrefix + "spectoken" + accessDescription + "\" value=\"" + org.apache.manifoldcf.ui.util.Encoder.attributeEscape(token) + "\"/>\n"); k++; } } out.print("<input type=\"hidden\" name=\"" + seqPrefix + "tokencount\" value=\"" + Integer.toString(k) + "\"/>\n"); } } /** Process a specification post. * This method is called at the start of job's edit or view page, whenever there is a possibility that form * data for a connection has been posted. Its purpose is to gather form information and modify the * document specification accordingly. The name of the posted form is always "editjob". * The connector will be connected before this method can be called. *@param variableContext contains the post data, including binary file-upload information. *@param locale is the locale the output is preferred to be in. *@param ds is the current document specification for this job. *@param connectionSequenceNumber is the unique number of this connection within the job. *@return null if all is well, or a string error message if there is an error that should prevent saving of * the job (and cause a redirection to an error page). */ @Override public String processSpecificationPost(IPostParameters variableContext, Locale locale, Specification ds, int connectionSequenceNumber) throws ManifoldCFException { String seqPrefix = "s" + connectionSequenceNumber + "_"; // Get the map String value = variableContext.getParameter(seqPrefix + "rssmapcount"); if (value != null) { int mapsize = Integer.parseInt(value); // Clear it first int j = 0; while (j < ds.getChildCount()) { SpecificationNode sn = ds.getChild(j); if (sn.getType().equals(RSSConfig.NODE_MAP)) ds.removeChild(j); else j++; } // Grab the map values j = 0; while (j < mapsize) { String prefix = seqPrefix + "rssregexp_" + Integer.toString(j) + "_"; String match = variableContext.getParameter(prefix + "match"); String map = variableContext.getParameter(prefix + "map"); if (map == null) map = ""; // Add to the documentum specification SpecificationNode node = new SpecificationNode(RSSConfig.NODE_MAP); node.setAttribute(RSSConfig.ATTR_MATCH, match); node.setAttribute(RSSConfig.ATTR_MAP, map); ds.addChild(ds.getChildCount(), node); j++; } } // Get the cgiPath String rssURLSequence = variableContext.getParameter(seqPrefix + "rssurls"); if (rssURLSequence != null) { // Delete all url specs first int i = 0; while (i < ds.getChildCount()) { SpecificationNode sn = ds.getChild(i); if (sn.getType().equals(RSSConfig.NODE_FEED)) ds.removeChild(i); else i++; } try { java.io.Reader str = new java.io.StringReader(rssURLSequence); try { java.io.BufferedReader is = new java.io.BufferedReader(str); try { while (true) { String nextString = is.readLine(); if (nextString == null) break; if (nextString.length() == 0) continue; SpecificationNode node = new SpecificationNode(RSSConfig.NODE_FEED); node.setAttribute(RSSConfig.ATTR_URL, nextString); ds.addChild(ds.getChildCount(), node); } } finally { is.close(); } } finally { str.close(); } } catch (java.io.IOException e) { throw new ManifoldCFException("IO error: " + e.getMessage(), e); } } // Read the url specs String urlRegexpCount = variableContext.getParameter(seqPrefix + "urlregexpcount"); if (urlRegexpCount != null && urlRegexpCount.length() > 0) { int regexpCount = Integer.parseInt(urlRegexpCount); int j = 0; while (j < ds.getChildCount()) { SpecificationNode sn = ds.getChild(j); if (sn.getType().equals(RSSConfig.NODE_URLSPEC)) ds.removeChild(j); else j++; } // Grab the operation and the index (if any) String operation = variableContext.getParameter(seqPrefix + "urlregexpop"); if (operation == null) operation = "Continue"; int opIndex = -1; if (operation.equals("Delete")) opIndex = Integer.parseInt(variableContext.getParameter(seqPrefix + "urlregexpnumber")); // Reconstruct urlspec nodes j = 0; while (j < regexpCount) { // For each index, first look for a delete operation if (!operation.equals("Delete") || j != opIndex) { // Add the jth node String regexp = variableContext.getParameter(seqPrefix + "urlregexp_" + Integer.toString(j)); String regexpDescription = variableContext .getParameter(seqPrefix + "urlregexpdesc_" + Integer.toString(j)); String reorder = variableContext .getParameter(seqPrefix + "urlregexpreorder_" + Integer.toString(j)); String javaSession = variableContext .getParameter(seqPrefix + "urlregexpjava_" + Integer.toString(j)); String aspSession = variableContext .getParameter(seqPrefix + "urlregexpasp_" + Integer.toString(j)); String phpSession = variableContext .getParameter(seqPrefix + "urlregexpphp_" + Integer.toString(j)); String bvSession = variableContext .getParameter(seqPrefix + "urlregexpbv_" + Integer.toString(j)); SpecificationNode newSn = new SpecificationNode(RSSConfig.NODE_URLSPEC); newSn.setAttribute(RSSConfig.ATTR_REGEXP, regexp); if (regexpDescription != null && regexpDescription.length() > 0) newSn.setAttribute(RSSConfig.VALUE_DESCRIPTION, regexpDescription); if (reorder != null && reorder.length() > 0) newSn.setAttribute(RSSConfig.ATTR_REORDER, reorder); if (javaSession != null && javaSession.length() > 0) newSn.setAttribute(RSSConfig.ATTR_JAVASESSIONREMOVAL, javaSession); if (aspSession != null && aspSession.length() > 0) newSn.setAttribute(RSSConfig.ATTR_ASPSESSIONREMOVAL, aspSession); if (phpSession != null && phpSession.length() > 0) newSn.setAttribute(RSSConfig.ATTR_PHPSESSIONREMOVAL, phpSession); if (bvSession != null && bvSession.length() > 0) newSn.setAttribute(RSSConfig.ATTR_BVSESSIONREMOVAL, bvSession); ds.addChild(ds.getChildCount(), newSn); } j++; } if (operation.equals("Add")) { String regexp = variableContext.getParameter(seqPrefix + "urlregexp"); String regexpDescription = variableContext.getParameter(seqPrefix + "urlregexpdesc"); String reorder = variableContext.getParameter(seqPrefix + "urlregexpreorder"); String javaSession = variableContext.getParameter(seqPrefix + "urlregexpjava"); String aspSession = variableContext.getParameter(seqPrefix + "urlregexpasp"); String phpSession = variableContext.getParameter(seqPrefix + "urlregexpphp"); String bvSession = variableContext.getParameter(seqPrefix + "urlregexpbv"); // Add a new node at the end SpecificationNode newSn = new SpecificationNode(RSSConfig.NODE_URLSPEC); newSn.setAttribute(RSSConfig.ATTR_REGEXP, regexp); if (regexpDescription != null && regexpDescription.length() > 0) newSn.setAttribute(RSSConfig.VALUE_DESCRIPTION, regexpDescription); if (reorder != null && reorder.length() > 0) newSn.setAttribute(RSSConfig.ATTR_REORDER, reorder); if (javaSession != null && javaSession.length() > 0) newSn.setAttribute(RSSConfig.ATTR_JAVASESSIONREMOVAL, javaSession); if (aspSession != null && aspSession.length() > 0) newSn.setAttribute(RSSConfig.ATTR_ASPSESSIONREMOVAL, aspSession); if (phpSession != null && phpSession.length() > 0) newSn.setAttribute(RSSConfig.ATTR_PHPSESSIONREMOVAL, phpSession); if (bvSession != null && bvSession.length() > 0) newSn.setAttribute(RSSConfig.ATTR_BVSESSIONREMOVAL, bvSession); ds.addChild(ds.getChildCount(), newSn); } } // Get the exclusions String exclusions = variableContext.getParameter(seqPrefix + "exclusions"); if (exclusions != null) { // Delete existing exclusions record first int i = 0; while (i < ds.getChildCount()) { SpecificationNode sn = ds.getChild(i); if (sn.getType().equals(RSSConfig.NODE_EXCLUDES)) ds.removeChild(i); else i++; } SpecificationNode cn = new SpecificationNode(RSSConfig.NODE_EXCLUDES); cn.setValue(exclusions); ds.addChild(ds.getChildCount(), cn); } // Read the feed timeout, if present String feedTimeoutValue = variableContext.getParameter(seqPrefix + "feedtimeout"); if (feedTimeoutValue != null && feedTimeoutValue.length() > 0) { int j = 0; while (j < ds.getChildCount()) { SpecificationNode sn = ds.getChild(j); if (sn.getType().equals(RSSConfig.NODE_FEEDTIMEOUT)) ds.removeChild(j); else j++; } SpecificationNode node = new SpecificationNode(RSSConfig.NODE_FEEDTIMEOUT); node.setAttribute(RSSConfig.ATTR_VALUE, feedTimeoutValue); ds.addChild(ds.getChildCount(), node); } // Read the feed refetch interval, if present String feedRefetchValue = variableContext.getParameter(seqPrefix + "feedrefetch"); if (feedRefetchValue != null && feedRefetchValue.length() > 0) { int j = 0; while (j < ds.getChildCount()) { SpecificationNode sn = ds.getChild(j); if (sn.getType().equals(RSSConfig.NODE_FEEDRESCAN)) ds.removeChild(j); else j++; } SpecificationNode node = new SpecificationNode(RSSConfig.NODE_FEEDRESCAN); node.setAttribute(RSSConfig.ATTR_VALUE, feedRefetchValue); ds.addChild(ds.getChildCount(), node); } // Read the minimum feed refetch interval, if present String minFeedRefetchValue = variableContext.getParameter(seqPrefix + "minfeedrefetch"); if (minFeedRefetchValue != null && minFeedRefetchValue.length() > 0) { int j = 0; while (j < ds.getChildCount()) { SpecificationNode sn = ds.getChild(j); if (sn.getType().equals(RSSConfig.NODE_MINFEEDRESCAN)) ds.removeChild(j); else j++; } SpecificationNode node = new SpecificationNode(RSSConfig.NODE_MINFEEDRESCAN); node.setAttribute(RSSConfig.ATTR_VALUE, minFeedRefetchValue); ds.addChild(ds.getChildCount(), node); } // Read the bad feed refetch interval (which is allowed to be null) String badFeedRefetchValuePresent = variableContext.getParameter(seqPrefix + "badfeedrefetch_present"); if (badFeedRefetchValuePresent != null && badFeedRefetchValuePresent.length() > 0) { String badFeedRefetchValue = variableContext.getParameter(seqPrefix + "badfeedrefetch"); int k = 0; while (k < ds.getChildCount()) { SpecificationNode sn = ds.getChild(k); if (sn.getType().equals(RSSConfig.NODE_BADFEEDRESCAN)) ds.removeChild(k); else k++; } if (badFeedRefetchValue != null && badFeedRefetchValue.length() > 0) { SpecificationNode node = new SpecificationNode(RSSConfig.NODE_BADFEEDRESCAN); node.setAttribute(RSSConfig.ATTR_VALUE, badFeedRefetchValue); ds.addChild(ds.getChildCount(), node); } } // Read the dechromed mode String dechromedMode = variableContext.getParameter(seqPrefix + "dechromedmode"); if (dechromedMode != null && dechromedMode.length() > 0) { int j = 0; while (j < ds.getChildCount()) { SpecificationNode sn = ds.getChild(j); if (sn.getType().equals(RSSConfig.NODE_DECHROMEDMODE)) ds.removeChild(j); else j++; } SpecificationNode node = new SpecificationNode(RSSConfig.NODE_DECHROMEDMODE); node.setAttribute(RSSConfig.ATTR_MODE, dechromedMode); ds.addChild(ds.getChildCount(), node); } // Read the chromed mode String chromedMode = variableContext.getParameter(seqPrefix + "chromedmode"); if (chromedMode != null && chromedMode.length() > 0) { int j = 0; while (j < ds.getChildCount()) { SpecificationNode sn = ds.getChild(j); if (sn.getType().equals(RSSConfig.NODE_CHROMEDMODE)) ds.removeChild(j); else j++; } SpecificationNode node = new SpecificationNode(RSSConfig.NODE_CHROMEDMODE); node.setAttribute(RSSConfig.ATTR_MODE, chromedMode); ds.addChild(ds.getChildCount(), node); } // Now, do whatever action we were told to do. String rssop = variableContext.getParameter(seqPrefix + "rssop"); if (rssop != null && rssop.equals("Add")) { // Add a match to the end String match = variableContext.getParameter(seqPrefix + "rssmatch"); String map = variableContext.getParameter(seqPrefix + "rssmap"); SpecificationNode node = new SpecificationNode(RSSConfig.NODE_MAP); node.setAttribute(RSSConfig.ATTR_MATCH, match); node.setAttribute(RSSConfig.ATTR_MAP, map); ds.addChild(ds.getChildCount(), node); } else if (rssop != null && rssop.equals("Delete")) { int index = Integer.parseInt(variableContext.getParameter(seqPrefix + "rssindex")); int j = 0; while (j < ds.getChildCount()) { SpecificationNode sn = ds.getChild(j); if (sn.getType().equals(RSSConfig.NODE_MAP)) { if (index == 0) { ds.removeChild(j); break; } index--; } j++; } } String xc = variableContext.getParameter(seqPrefix + "tokencount"); if (xc != null) { // Delete all tokens first int i = 0; while (i < ds.getChildCount()) { SpecificationNode sn = ds.getChild(i); if (sn.getType().equals(RSSConfig.NODE_ACCESS)) ds.removeChild(i); else i++; } int accessCount = Integer.parseInt(xc); i = 0; while (i < accessCount) { String accessDescription = "_" + Integer.toString(i); String accessOpName = seqPrefix + "accessop" + accessDescription; xc = variableContext.getParameter(accessOpName); if (xc != null && xc.equals("Delete")) { // Next row i++; continue; } // Get the stuff we need String accessSpec = variableContext.getParameter(seqPrefix + "spectoken" + accessDescription); SpecificationNode node = new SpecificationNode(RSSConfig.NODE_ACCESS); node.setAttribute(RSSConfig.ATTR_TOKEN, accessSpec); ds.addChild(ds.getChildCount(), node); i++; } String op = variableContext.getParameter(seqPrefix + "accessop"); if (op != null && op.equals("Add")) { String accessspec = variableContext.getParameter(seqPrefix + "spectoken"); SpecificationNode node = new SpecificationNode(RSSConfig.NODE_ACCESS); node.setAttribute(RSSConfig.ATTR_TOKEN, accessspec); ds.addChild(ds.getChildCount(), node); } } return null; } /** View specification. * This method is called in the body section of a job's view page. Its purpose is to present the document * specification information to the user. The coder can presume that the HTML that is output from * this configuration will be within appropriate <html> and <body> tags. * The connector will be connected before this method can be called. *@param out is the output to which any HTML should be sent. *@param locale is the locale the output is preferred to be in. *@param ds is the current document specification for this job. *@param connectionSequenceNumber is the unique number of this connection within the job. */ @Override public void viewSpecification(IHTTPOutput out, Locale locale, Specification ds, int connectionSequenceNumber) throws ManifoldCFException, IOException { String exclusions = ""; out.print("<table class=\"displaytable\">\n"); int i = 0; boolean seenAny = false; while (i < ds.getChildCount()) { SpecificationNode sn = ds.getChild(i++); if (sn.getType().equals(RSSConfig.NODE_FEED)) { if (seenAny == false) { out.print(" <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.RSSUrls") + "</nobr></td>\n" + " <td class=\"value\">\n"); seenAny = true; } out.print(" <nobr>" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(sn.getAttributeValue(RSSConfig.ATTR_URL)) + "</nobr><br/>\n"); } else if (sn.getType().equals(RSSConfig.NODE_EXCLUDES)) { exclusions = sn.getValue(); if (exclusions == null) exclusions = ""; } } if (seenAny) { out.print(" </td>\n" + " </tr>\n"); } else { out.print(" <tr><td class=\"message\" colspan=\"2\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.NoRSSUrlsSpecified") + "</nobr></td></tr>\n"); } out.print(" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"); i = 0; int l = 0; seenAny = false; while (i < ds.getChildCount()) { SpecificationNode sn = ds.getChild(i++); if (sn.getType().equals(RSSConfig.NODE_URLSPEC)) { if (l == 0) { out.print(" <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.URLCanonicalization") + "</nobr></td>\n" + " <td class=\"value\">\n" + " <table class=\"formtable\">\n" + " <tr class=\"formheaderrow\">\n" + " <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.URLRegexp") + "</nobr></td>\n" + " <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.Description") + "</nobr></td>\n" + " <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.Reorder") + "</nobr></td>\n" + " <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.RemoveJSPSessions") + "</nobr></td>\n" + " <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.RemoveASPSessions") + "</nobr></td>\n" + " <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.RemovePHPSessions") + "</nobr></td>\n" + " <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.RemoveBVSessions") + "</nobr></td>\n" + " </tr>\n"); } String regexpString = sn.getAttributeValue(RSSConfig.ATTR_REGEXP); String description = sn.getAttributeValue(RSSConfig.ATTR_DESCRIPTION); if (description == null) description = ""; String allowReorder = sn.getAttributeValue(RSSConfig.ATTR_REORDER); if (allowReorder == null || allowReorder.length() == 0) allowReorder = RSSConfig.VALUE_NO; String allowJavaSessionRemoval = sn.getAttributeValue(RSSConfig.ATTR_JAVASESSIONREMOVAL); if (allowJavaSessionRemoval == null || allowJavaSessionRemoval.length() == 0) allowJavaSessionRemoval = RSSConfig.VALUE_NO; String allowASPSessionRemoval = sn.getAttributeValue(RSSConfig.ATTR_ASPSESSIONREMOVAL); if (allowASPSessionRemoval == null || allowASPSessionRemoval.length() == 0) allowASPSessionRemoval = RSSConfig.VALUE_NO; String allowPHPSessionRemoval = sn.getAttributeValue(RSSConfig.ATTR_PHPSESSIONREMOVAL); if (allowPHPSessionRemoval == null || allowPHPSessionRemoval.length() == 0) allowPHPSessionRemoval = RSSConfig.VALUE_NO; String allowBVSessionRemoval = sn.getAttributeValue(RSSConfig.ATTR_BVSESSIONREMOVAL); if (allowBVSessionRemoval == null || allowBVSessionRemoval.length() == 0) allowBVSessionRemoval = RSSConfig.VALUE_NO; out.print(" <tr class=\"" + (((l % 2) == 0) ? "evenformrow" : "oddformrow") + "\">\n" + " <td class=\"formcolumncell\"><nobr>" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(regexpString) + "</nobr></td>\n" + " <td class=\"formcolumncell\">" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(description) + "</td>\n" + " <td class=\"formcolumncell\"><nobr>" + allowReorder + "</nobr></td>\n" + " <td class=\"formcolumncell\"><nobr>" + allowJavaSessionRemoval + "</nobr></td>\n" + " <td class=\"formcolumncell\"><nobr>" + allowASPSessionRemoval + "</nobr></td>\n" + " <td class=\"formcolumncell\"><nobr>" + allowPHPSessionRemoval + "</nobr></td>\n" + " <td class=\"formcolumncell\"><nobr>" + allowBVSessionRemoval + "</nobr></td>\n" + " </tr>\n"); l++; } } if (l > 0) { out.print(" </table>\n" + " </td>\n" + " </tr>\n"); } else { out.print(" <tr><td class=\"message\" colspan=\"2\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.NoCanonicalizationSpecified") + "</nobr></td></tr>\n"); } out.print(" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"); i = 0; seenAny = false; while (i < ds.getChildCount()) { SpecificationNode sn = ds.getChild(i++); if (sn.getType().equals(RSSConfig.NODE_MAP)) { if (seenAny == false) { out.print(" <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.URLMappingsColon") + "</nobr></td>\n" + " <td class=\"value\">\n"); seenAny = true; } String match = sn.getAttributeValue(RSSConfig.ATTR_MATCH); String map = sn.getAttributeValue(RSSConfig.ATTR_MAP); out.print(" <nobr>" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(match) + "</nobr>\n"); if (map != null && map.length() > 0) { out.print(" --> <nobr>" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(map) + "</nobr>\n"); } out.print(" <br/>\n"); } } if (seenAny) { out.print(" </td>\n" + " </tr>\n"); } else { out.print(" <tr><td class=\"message\" colspan=\"2\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.NoMappingsSpecifiedWillAcceptAllUrls") + "</nobr></td></tr>\n"); } out.print(" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.Exclude") + "</nobr></td>\n" + " <td class=\"value\">\n"); try { java.io.Reader str = new java.io.StringReader(exclusions); try { java.io.BufferedReader is = new java.io.BufferedReader(str); try { while (true) { String nextString = is.readLine(); if (nextString == null) break; if (nextString.length() == 0) continue; out.print(" <nobr>" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(nextString) + "</nobr><br/>\n"); } } finally { is.close(); } } finally { str.close(); } } catch (java.io.IOException e) { throw new ManifoldCFException("IO error: " + e.getMessage(), e); } out.print(" </td>\n" + " </tr>\n"); out.print(" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"); String feedTimeoutValue = "60"; String feedRefetchValue = "60"; String minFeedRefetchValue = "15"; String badFeedRefetchValue = null; String dechromedMode = RSSConfig.VALUE_NONE; String chromedMode = RSSConfig.VALUE_USE; i = 0; while (i < ds.getChildCount()) { SpecificationNode sn = ds.getChild(i++); if (sn.getType().equals(RSSConfig.NODE_FEEDTIMEOUT)) { feedTimeoutValue = sn.getAttributeValue(RSSConfig.ATTR_VALUE); } else if (sn.getType().equals(RSSConfig.NODE_FEEDRESCAN)) { feedRefetchValue = sn.getAttributeValue(RSSConfig.ATTR_VALUE); } else if (sn.getType().equals(RSSConfig.NODE_MINFEEDRESCAN)) { minFeedRefetchValue = sn.getAttributeValue(RSSConfig.ATTR_VALUE); } else if (sn.getType().equals(RSSConfig.NODE_BADFEEDRESCAN)) { badFeedRefetchValue = sn.getAttributeValue(RSSConfig.ATTR_VALUE); } else if (sn.getType().equals(RSSConfig.NODE_DECHROMEDMODE)) { dechromedMode = sn.getAttributeValue(RSSConfig.ATTR_MODE); } else if (sn.getType().equals(RSSConfig.NODE_CHROMEDMODE)) { chromedMode = sn.getAttributeValue(RSSConfig.ATTR_MODE); } } out.print( " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.FeedConnectionTimeout") + "</nobr></td>\n" + " <td class=\"value\">" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(feedTimeoutValue) + "</td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.DefaultFeedRescanInterval") + "</nobr></td>\n" + " <td class=\"value\">" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(feedRefetchValue) + "</td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.MinimumFeedRescanInterval") + "</nobr></td>\n" + " <td class=\"value\">" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(minFeedRefetchValue) + "</td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.BadFeedRescanInterval") + "</nobr></td>\n" + " <td class=\"value\">" + ((badFeedRefetchValue == null) ? "(Default feed rescan value)" : org.apache.manifoldcf.ui.util.Encoder.bodyEscape(badFeedRefetchValue)) + "</td>\n" + " </tr>\n" + " \n" + " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" + "\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.DechromedContentSource") + "</nobr></td>\n" + " <td class=\"value\">" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(dechromedMode) + "</td>\n" + " </tr>\n" + " <tr>\n" + " <td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.ChromedContent") + "</nobr></td>\n" + " <td class=\"value\">" + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(chromedMode) + "</td>\n" + " </tr>\n" + "\n"); out.print(" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"); // Go through looking for access tokens seenAny = false; i = 0; while (i < ds.getChildCount()) { SpecificationNode sn = ds.getChild(i++); if (sn.getType().equals(RSSConfig.NODE_ACCESS)) { if (seenAny == false) { out.print(" <tr><td class=\"description\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.AccessTokens") + "</nobr></td>\n" + " <td class=\"value\">\n"); seenAny = true; } String token = sn.getAttributeValue(RSSConfig.ATTR_TOKEN); out.print(" " + org.apache.manifoldcf.ui.util.Encoder.bodyEscape(token) + "<br/>\n"); } } if (seenAny) { out.print(" </td>\n" + " </tr>\n"); } else { out.print(" <tr><td class=\"message\" colspan=\"2\"><nobr>" + Messages.getBodyString(locale, "RSSConnector.NoAccessTokensSpecified") + "</nobr></td></tr>\n"); } out.print("</table>\n"); } /** Handle an RSS feed document, using SAX to limit the memory impact */ protected void handleRSSFeedSAX(String documentIdentifier, IProcessActivity activities, Filter filter) throws ManifoldCFException, ServiceInterruption { // The SAX model uses parsing events to control parsing, which allows me to manage memory usage much better. // This is essential for when a feed contains dechromed content as well as links. // First, catch all flavors of IO exception, and handle them properly try { // Open the input stream, and set up the parse InputStream is = cache.getData(documentIdentifier); if (is == null) { Logging.connectors.error("RSS: Document '" + documentIdentifier + "' should be in cache but isn't"); return; } try { Parser p = new Parser(); // Parse the document. This will cause various things to occur, within the instantiated XMLParsingContext class. XMLFuzzyHierarchicalParseState x = new XMLFuzzyHierarchicalParseState(); OuterContextClass c = new OuterContextClass(x, documentIdentifier, activities, filter); x.setContext(c); try { // Believe it or not, there are no parsing errors we can get back now. p.parseWithCharsetDetection(null, is, x); c.checkIfValidFeed(); c.setDefaultRescanTimeIfNeeded(); } finally { x.cleanup(); } } finally { is.close(); } } catch (java.net.SocketTimeoutException e) { throw new ManifoldCFException("Socket timeout error: " + e.getMessage(), e); } catch (ConnectTimeoutException e) { throw new ManifoldCFException("Socket connect timeout error: " + e.getMessage(), e); } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED); } catch (IOException e) { throw new ManifoldCFException("IO error: " + e.getMessage(), e); } } /** This class handles the outermost XML context for the feed document. */ protected class OuterContextClass extends XMLParsingContext { /** Keep track of the number of valid feed signals we saw */ protected int outerTagCount = 0; /** The document identifier */ protected String documentIdentifier; /** Activities interface */ protected IProcessActivity activities; /** Filter */ protected Filter filter; /** Flag indicating the the rescan time was set for this feed */ protected boolean rescanTimeSet = false; public OuterContextClass(XMLFuzzyHierarchicalParseState theStream, String documentIdentifier, IProcessActivity activities, Filter filter) { super(theStream); this.documentIdentifier = documentIdentifier; this.activities = activities; this.filter = filter; } /** Check if feed was valid */ public void checkIfValidFeed() { if (outerTagCount == 0) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: RSS document '" + documentIdentifier + "' does not have rss, feed, or rdf:RDF tag - not valid feed"); } } /** Check if the rescan flag was set or not, and if not, make sure it gets set properly */ public void setDefaultRescanTimeIfNeeded() throws ManifoldCFException { if (rescanTimeSet == false) { // Set it! // Need to set the requeue parameters appropriately, since otherwise the feed reverts to default document // rescan or expire behavior. long currentTime = System.currentTimeMillis(); Long rescanTime = filter.getBadFeedRescanTime(currentTime); if (rescanTime == null) rescanTime = filter.getDefaultRescanTime(currentTime); if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug( "RSS: In RSS document '" + documentIdentifier + "' setting default rescan time to " + ((rescanTime == null) ? "null" : rescanTime.toString())); activities.setDocumentScheduleBounds(documentIdentifier, rescanTime, rescanTime, null, null); rescanTimeSet = true; } } /** Handle the tag beginning to set the correct second-level parsing context */ @Override protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String, String> atts) throws ManifoldCFException { if (localName.equals("rss")) { // RSS feed detected outerTagCount++; if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("RSS: Parsed bottom-level XML for RSS document '" + documentIdentifier + "'"); return new RSSContextClass(theStream, namespace, localName, qName, atts, documentIdentifier, activities, filter); } else if (localName.equals("RDF")) { // RDF/Atom feed detected outerTagCount++; return new RDFContextClass(theStream, namespace, localName, qName, atts, documentIdentifier, activities, filter); } else if (localName.equals("feed")) { // Basic feed detected outerTagCount++; return new FeedContextClass(theStream, namespace, localName, qName, atts, documentIdentifier, activities, filter); } else if (localName.equals("urlset") || localName.equals("sitemapindex")) { // Sitemap detected outerTagCount++; return new UrlsetContextClass(theStream, namespace, localName, qName, atts, documentIdentifier, activities, filter); } // The default action is to establish a new default context. return super.beginTag(namespace, localName, qName, atts); } /** Handle the tag ending */ @Override protected void endTag() throws ManifoldCFException { XMLParsingContext context = theStream.getContext(); String tagName = context.getLocalname(); if (tagName.equals("rss")) { rescanTimeSet = ((RSSContextClass) context).process(); } else if (tagName.equals("RDF")) { rescanTimeSet = ((RDFContextClass) context).process(); } else if (tagName.equals("feed")) { rescanTimeSet = ((FeedContextClass) context).process(); } else if (tagName.equals("urlset") || tagName.equals("sitemapindex")) { rescanTimeSet = ((UrlsetContextClass) context).process(); } else super.endTag(); } } protected class RSSContextClass extends XMLParsingContext { /** The document identifier */ protected String documentIdentifier; /** Activities interface */ protected IProcessActivity activities; /** Filter */ protected Filter filter; /** Rescan time set flag */ protected boolean rescanTimeSet = false; public RSSContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String, String> atts, String documentIdentifier, IProcessActivity activities, Filter filter) { super(theStream, namespace, localName, qName, atts); this.documentIdentifier = documentIdentifier; this.activities = activities; this.filter = filter; } @Override protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String, String> atts) throws ManifoldCFException { // Handle each channel if (localName.equals("channel")) { // Channel detected return new RSSChannelContextClass(theStream, namespace, localName, qName, atts, documentIdentifier, activities, filter); } // Skip everything else. return super.beginTag(namespace, localName, qName, atts); } @Override protected void endTag() throws ManifoldCFException { // If it's our channel tag, process global channel information XMLParsingContext context = theStream.getContext(); String tagName = context.getLocalname(); if (tagName.equals("channel")) { rescanTimeSet = ((RSSChannelContextClass) context).process(); } else super.endTag(); } /** Process this data */ protected boolean process() throws ManifoldCFException { return rescanTimeSet; } } protected class RSSChannelContextClass extends XMLParsingContext { /** The document identifier */ protected String documentIdentifier; /** Activities interface */ protected IProcessActivity activities; /** Filter */ protected Filter filter; /** TTL value is set on a per-channel basis */ protected String ttlValue = null; public RSSChannelContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String, String> atts, String documentIdentifier, IProcessActivity activities, Filter filter) { super(theStream, namespace, localName, qName, atts); this.documentIdentifier = documentIdentifier; this.activities = activities; this.filter = filter; } @Override protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String, String> atts) throws ManifoldCFException { // The tags we care about are "ttl" and "item", nothing else. if (localName.equals("ttl")) { // TTL value seen. Prepare to record it, as a string. return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("item")) { // Item seen. We don't need any of the attributes etc., but we need to start a new context. return new RSSItemContextClass(theStream, namespace, localName, qName, atts, filter.getDechromedContentMode()); } // Skip everything else. return super.beginTag(namespace, localName, qName, atts); } @Override protected void endTag() throws ManifoldCFException { XMLParsingContext theContext = theStream.getContext(); String theTag = theContext.getLocalname(); if (theTag.equals("ttl")) // If the current context must be the TTL one, record its data value. ttlValue = ((XMLStringParsingContext) theContext).getValue(); else if (theTag.equals("item")) { // It's an item. RSSItemContextClass itemContext = (RSSItemContextClass) theContext; // Presumably, since we are done parsing, we've recorded all the information we need in the context, object including: // (1) File name (if any), containing dechromed content // (2) Link name(s) // (3) Pubdate // (4) Title // The job now is to pull this info out and call the activities interface appropriately. // NOTE: After this endTag() method is called, tagCleanup() will be called for the item context. This should clean up // all dangling files etc. that need to be removed. // If an exception or error is thrown during the parse, this endTag() method will NOT be called, but the tagCleanup() // method will be called regardless. itemContext.process(documentIdentifier, activities, filter); } else super.endTag(); } /** Process this data, return true if rescan time was set */ protected boolean process() throws ManifoldCFException { // Deal with the ttlvalue, if it was found // Use the ttl value as a signal for when we ought to look at this feed again. If not present, use the default. long currentTime = System.currentTimeMillis(); Long rescanTime = filter.getDefaultRescanTime(currentTime); if (ttlValue != null) { try { int minutes = Integer.parseInt(ttlValue); long nextTime = currentTime + minutes * 60000L; rescanTime = new Long(nextTime); // Set the upper bound time; we want to scan the feeds aggressively. if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In RSS document '" + documentIdentifier + "', found a ttl value of " + ttlValue + "; setting refetch time accordingly"); } catch (NumberFormatException e) { Logging.connectors.warn("RSS: RSS document '" + documentIdentifier + "' has illegal ttl value '" + ttlValue + "'"); } } if (rescanTime != null) { Long minimumTime = filter.getMinimumRescanTime(currentTime); if (minimumTime != null) { if (rescanTime.longValue() < minimumTime.longValue()) rescanTime = minimumTime; } } if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In RSS document '" + documentIdentifier + "' setting rescan time to " + ((rescanTime == null) ? "null" : rescanTime.toString())); activities.setDocumentScheduleBounds(documentIdentifier, rescanTime, rescanTime, null, null); return true; } } protected class RSSItemContextClass extends XMLParsingContext { protected int dechromedContentMode; protected String guidField = null; protected String linkField = null; protected String pubDateField = null; protected String titleField = null; protected String descriptionField = null; protected String authorEmailField = null; protected String authorNameField = null; protected ArrayList categoryField = new ArrayList(); protected File contentsFile = null; public RSSItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String, String> atts, int dechromedContentMode) { super(theStream, namespace, localName, qName, atts); this.dechromedContentMode = dechromedContentMode; } @Override protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String, String> atts) throws ManifoldCFException { // The tags we care about are "ttl" and "item", nothing else. if (localName.equals("link")) { // "link" tag return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("guid")) { // "guid" tag return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("pubdate")) { // "pubDate" tag return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("title")) { // "title" tag return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("category")) { // "category" tag return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("author")) { // "author" tag, which contains email return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("creator")) { // "creator" tag which contains name (like dc:creator) return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else { // Handle potentially longer fields. Both "description" and "content" fields can potentially be large; they are thus // processed as temporary files. But the dance is complicated because (a) we only want one PRIMARY content source, // and (b) we want access to the description field, if it is not used as primary content. switch (dechromedContentMode) { case DECHROMED_NONE: if (localName.equals("description")) { return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } break; case DECHROMED_DESCRIPTION: if (localName.equals("description")) { try { File tempFile = File.createTempFile("_rssdata_", "tmp"); return new XMLFileParsingContext(theStream, namespace, localName, qName, atts, tempFile); } catch (java.net.SocketTimeoutException e) { throw new ManifoldCFException("IO exception creating temp file: " + e.getMessage(), e); } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED); } catch (IOException e) { throw new ManifoldCFException("IO exception creating temp file: " + e.getMessage(), e); } } break; case DECHROMED_CONTENT: if (localName.equals("content")) { try { File tempFile = File.createTempFile("_rssdata_", "tmp"); return new XMLFileParsingContext(theStream, namespace, localName, qName, atts, tempFile); } catch (java.net.SocketTimeoutException e) { throw new ManifoldCFException("IO exception creating temp file: " + e.getMessage(), e); } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED); } catch (IOException e) { throw new ManifoldCFException("IO exception creating temp file: " + e.getMessage(), e); } } else if (localName.equals("description")) { return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } break; default: break; } // Skip everything else. return super.beginTag(namespace, localName, qName, atts); } } /** Convert the individual sub-fields of the item context into their final forms */ @Override protected void endTag() throws ManifoldCFException { XMLParsingContext theContext = theStream.getContext(); String theTag = theContext.getLocalname(); if (theTag.equals("link")) { linkField = ((XMLStringParsingContext) theContext).getValue(); } else if (theTag.equals("guid")) { guidField = ((XMLStringParsingContext) theContext).getValue(); } else if (theTag.equals("pubdate")) { pubDateField = ((XMLStringParsingContext) theContext).getValue(); } else if (theTag.equals("title")) { titleField = ((XMLStringParsingContext) theContext).getValue(); } else if (theTag.equals("category")) { categoryField.add(((XMLStringParsingContext) theContext).getValue()); } else if (theTag.equals("author")) { authorEmailField = ((XMLStringParsingContext) theContext).getValue(); } else if (theTag.equals("creator")) { authorNameField = ((XMLStringParsingContext) theContext).getValue(); } else { // What we want is: (a) if dechromed mode is NONE, just put the description file in the description field; (b) // if dechromed mode is "description", put the description field in the primary content field; (c) // if dechromed mode is "content", put the content field in the primary content field, and the description field in the description field. switch (dechromedContentMode) { case DECHROMED_NONE: if (theTag.equals("description")) { descriptionField = ((XMLStringParsingContext) theContext).getValue(); } break; case DECHROMED_DESCRIPTION: if (theTag.equals("description")) { // Content file has been written; retrieve it (being sure not to leak any files already hanging around!) tagCleanup(); contentsFile = ((XMLFileParsingContext) theContext).getCompletedFile(); return; } break; case DECHROMED_CONTENT: if (theTag.equals("content")) { tagCleanup(); // Retrieve content file contentsFile = ((XMLFileParsingContext) theContext).getCompletedFile(); return; } else if (theTag.equals("description")) { descriptionField = ((XMLStringParsingContext) theContext).getValue(); } break; default: break; } super.endTag(); } } protected void tagCleanup() throws ManifoldCFException { // Delete the contents file if it is there. if (contentsFile != null) { contentsFile.delete(); contentsFile = null; } } /** Process the data accumulated for this item */ public void process(String documentIdentifier, IProcessActivity activities, Filter filter) throws ManifoldCFException { if (linkField == null || linkField.length() == 0) linkField = guidField; if (linkField != null && linkField.length() > 0) { Date origDateDate = null; if (pubDateField != null && pubDateField.length() > 0) { origDateDate = DateParser.parseRFC822Date(pubDateField); // Special for China Daily News if (origDateDate == null) origDateDate = DateParser.parseChinaDate(pubDateField); // Special for LL if (origDateDate == null) origDateDate = DateParser.parseISO8601Date(pubDateField); } Long origDate; if (origDateDate != null) origDate = new Long(origDateDate.getTime()); else origDate = null; String[] links = linkField.split(", "); int l = 0; while (l < links.length) { String rawURL = links[l++].trim(); // Process the link String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(), documentIdentifier, rawURL); if (newIdentifier != null) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In RSS document '" + documentIdentifier + "', found a link to '" + newIdentifier + "', which has origination date " + ((origDate == null) ? "null" : origDate.toString())); if (filter.isLegalURL(newIdentifier)) { if (contentsFile == null && filter.getChromedContentMode() != CHROMED_METADATA_ONLY) { // It's a reference! Add it. String[] dataNames = new String[] { "pubdate", "title", "source", "authoremail", "authorname", "category", "description" }; String[][] dataValues = new String[dataNames.length][]; if (origDate != null) dataValues[0] = new String[] { origDate.toString() }; if (titleField != null) dataValues[1] = new String[] { titleField }; dataValues[2] = new String[] { documentIdentifier }; if (authorEmailField != null) dataValues[3] = new String[] { authorEmailField }; if (authorNameField != null) dataValues[4] = new String[] { authorNameField }; dataValues[5] = new String[categoryField.size()]; int q = 0; while (q < categoryField.size()) { (dataValues[5])[q] = (String) categoryField.get(q); q++; } if (descriptionField != null) dataValues[6] = new String[] { descriptionField }; // Add document reference, not including the data to pass down, but including a description activities.addDocumentReference(newIdentifier, documentIdentifier, null, dataNames, dataValues, origDate); } else { // The issue here is that if a document is ingested without a jobqueue entry, the document will not // be cleaned up if the job is deleted; nor is there any expiration possibility. So, we really do need to make // sure a jobqueue entry gets created somehow. Therefore I can't just ingest the document // right here. // Since the dechromed data is available from the feed, the possibility remains of passing the document // Now, set up the carrydown info String[] dataNames = new String[] { "pubdate", "title", "source", "authoremail", "authorname", "category", "data", "description" }; Object[][] dataValues = new Object[dataNames.length][]; if (origDate != null) dataValues[0] = new String[] { origDate.toString() }; if (titleField != null) dataValues[1] = new String[] { titleField }; dataValues[2] = new String[] { documentIdentifier }; if (authorEmailField != null) dataValues[3] = new String[] { authorEmailField }; if (authorNameField != null) dataValues[4] = new String[] { authorNameField }; dataValues[5] = new String[categoryField.size()]; int q = 0; while (q < categoryField.size()) { (dataValues[5])[q] = (String) categoryField.get(q); q++; } if (descriptionField != null) dataValues[7] = new String[] { descriptionField }; if (contentsFile == null) { CharacterInput ci = new NullCharacterInput(); try { dataValues[6] = new Object[] { ci }; // Add document reference, including the data to pass down, and the dechromed content too activities.addDocumentReference(newIdentifier, documentIdentifier, null, dataNames, dataValues, origDate); } finally { ci.discard(); } } else { CharacterInput ci = new TempFileCharacterInput(contentsFile); try { contentsFile = null; dataValues[6] = new Object[] { ci }; // Add document reference, including the data to pass down, and the dechromed content too activities.addDocumentReference(newIdentifier, documentIdentifier, null, dataNames, dataValues, origDate); } finally { ci.discard(); } } } } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Identifier '" + newIdentifier + "' is excluded"); } } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In RSS document '" + documentIdentifier + "', found an unincluded URL '" + rawURL + "'"); } } } } } protected class RDFContextClass extends XMLParsingContext { /** The document identifier */ protected String documentIdentifier; /** Activities interface */ protected IProcessActivity activities; /** Filter */ protected Filter filter; /** ttl value */ protected String ttlValue = null; public RDFContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String, String> atts, String documentIdentifier, IProcessActivity activities, Filter filter) { super(theStream, namespace, localName, qName, atts); this.documentIdentifier = documentIdentifier; this.activities = activities; this.filter = filter; } @Override protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String, String> atts) throws ManifoldCFException { // The tags we care about are "ttl" and "item", nothing else. if (localName.equals("ttl")) { // TTL value seen. Prepare to record it, as a string. return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("item")) { // Item seen. We don't need any of the attributes etc., but we need to start a new context. return new RDFItemContextClass(theStream, namespace, localName, qName, atts, filter.getDechromedContentMode()); } // Skip everything else. return super.beginTag(namespace, localName, qName, atts); } @Override protected void endTag() throws ManifoldCFException { XMLParsingContext theContext = theStream.getContext(); String theTag = theContext.getLocalname(); if (theTag.equals("ttl")) // If the current context must be the TTL one, record its data value. ttlValue = ((XMLStringParsingContext) theContext).getValue(); else if (theTag.equals("item")) { // It's an item. RDFItemContextClass itemContext = (RDFItemContextClass) theContext; // Presumably, since we are done parsing, we've recorded all the information we need in the context, object including: // (1) File name (if any), containing dechromed content // (2) Link name(s) // (3) Pubdate // (4) Title // The job now is to pull this info out and call the activities interface appropriately. // NOTE: After this endTag() method is called, tagCleanup() will be called for the item context. This should clean up // all dangling files etc. that need to be removed. // If an exception or error is thrown during the parse, this endTag() method will NOT be called, but the tagCleanup() // method will be called regardless. itemContext.process(documentIdentifier, activities, filter); } else super.endTag(); } /** Process this data */ protected boolean process() throws ManifoldCFException { // Deal with the ttlvalue, if it was found // Use the ttl value as a signal for when we ought to look at this feed again. If not present, use the default. long currentTime = System.currentTimeMillis(); Long rescanTime = filter.getDefaultRescanTime(currentTime); if (ttlValue != null) { try { int minutes = Integer.parseInt(ttlValue); long nextTime = currentTime + minutes * 60000L; rescanTime = new Long(nextTime); // Set the upper bound time; we want to scan the feeds aggressively. if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In RDF document '" + documentIdentifier + "', found a ttl value of " + ttlValue + "; setting refetch time accordingly"); } catch (NumberFormatException e) { Logging.connectors.warn("RSS: RDF document '" + documentIdentifier + "' has illegal ttl value '" + ttlValue + "'"); } } if (rescanTime != null) { Long minimumTime = filter.getMinimumRescanTime(currentTime); if (minimumTime != null) { if (rescanTime.longValue() < minimumTime.longValue()) rescanTime = minimumTime; } } if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In RDF document '" + documentIdentifier + "' setting rescan time to " + ((rescanTime == null) ? "null" : rescanTime.toString())); activities.setDocumentScheduleBounds(documentIdentifier, rescanTime, rescanTime, null, null); return true; } } protected class RDFItemContextClass extends XMLParsingContext { protected int dechromedContentMode; protected String linkField = null; protected String pubDateField = null; protected String titleField = null; protected String authorNameField = null; protected String descriptionField = null; protected File contentsFile = null; public RDFItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String, String> atts, int dechromedContentMode) { super(theStream, namespace, localName, qName, atts); this.dechromedContentMode = dechromedContentMode; } @Override protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String, String> atts) throws ManifoldCFException { // The tags we care about are "ttl" and "item", nothing else. if (localName.equals("link")) { // "link" tag return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("date")) { // "dc:date" tag return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("title")) { // "title" tag return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("creator")) { // "creator" tag (e.g. "dc:creator") return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else { switch (dechromedContentMode) { case DECHROMED_NONE: if (localName.equals("description")) { return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } break; case DECHROMED_DESCRIPTION: if (localName.equals("description")) { try { File tempFile = File.createTempFile("_rssdata_", "tmp"); return new XMLFileParsingContext(theStream, namespace, localName, qName, atts, tempFile); } catch (java.net.SocketTimeoutException e) { throw new ManifoldCFException("IO exception creating temp file: " + e.getMessage(), e); } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED); } catch (IOException e) { throw new ManifoldCFException("IO exception creating temp file: " + e.getMessage(), e); } } break; case DECHROMED_CONTENT: if (localName.equals("content")) { try { File tempFile = File.createTempFile("_rssdata_", "tmp"); return new XMLFileParsingContext(theStream, namespace, localName, qName, atts, tempFile); } catch (java.net.SocketTimeoutException e) { throw new ManifoldCFException("IO exception creating temp file: " + e.getMessage(), e); } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED); } catch (IOException e) { throw new ManifoldCFException("IO exception creating temp file: " + e.getMessage(), e); } } else if (localName.equals("description")) { return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } break; default: break; } // Skip everything else. return super.beginTag(namespace, localName, qName, atts); } } /** Convert the individual sub-fields of the item context into their final forms */ @Override protected void endTag() throws ManifoldCFException { XMLParsingContext theContext = theStream.getContext(); String theTag = theContext.getLocalname(); if (theTag.equals("link")) { linkField = ((XMLStringParsingContext) theContext).getValue(); } else if (theTag.equals("date")) { pubDateField = ((XMLStringParsingContext) theContext).getValue(); } else if (theTag.equals("title")) { titleField = ((XMLStringParsingContext) theContext).getValue(); } else if (theTag.equals("creator")) { authorNameField = ((XMLStringParsingContext) theContext).getValue(); } else { switch (dechromedContentMode) { case DECHROMED_NONE: if (theTag.equals("description")) { descriptionField = ((XMLStringParsingContext) theContext).getValue(); } break; case DECHROMED_DESCRIPTION: if (theTag.equals("description")) { // Content file has been written; retrieve it (being sure not to leak any files already hanging around!) tagCleanup(); contentsFile = ((XMLFileParsingContext) theContext).getCompletedFile(); return; } break; case DECHROMED_CONTENT: if (theTag.equals("dc:content")) { // Retrieve content file tagCleanup(); contentsFile = ((XMLFileParsingContext) theContext).getCompletedFile(); return; } else if (theTag.equals("description")) { descriptionField = ((XMLStringParsingContext) theContext).getValue(); } break; default: break; } super.endTag(); } } protected void tagCleanup() throws ManifoldCFException { // Delete the contents file if it is there. if (contentsFile != null) { contentsFile.delete(); contentsFile = null; } } /** Process the data accumulated for this item */ public void process(String documentIdentifier, IProcessActivity activities, Filter filter) throws ManifoldCFException { if (linkField != null && linkField.length() > 0) { Date origDateDate = null; if (pubDateField != null && pubDateField.length() > 0) origDateDate = DateParser.parseISO8601Date(pubDateField); Long origDate; if (origDateDate != null) origDate = new Long(origDateDate.getTime()); else origDate = null; String[] links = linkField.split(", "); int l = 0; while (l < links.length) { String rawURL = links[l++].trim(); // Process the link String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(), documentIdentifier, rawURL); if (newIdentifier != null) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In RDF document '" + documentIdentifier + "', found a link to '" + newIdentifier + "', which has origination date " + ((origDate == null) ? "null" : origDate.toString())); if (filter.isLegalURL(newIdentifier)) { if (contentsFile == null && filter.getChromedContentMode() != CHROMED_METADATA_ONLY) { // It's a reference! Add it. String[] dataNames = new String[] { "pubdate", "title", "source", "authorname", "description" }; String[][] dataValues = new String[dataNames.length][]; if (origDate != null) dataValues[0] = new String[] { origDate.toString() }; if (titleField != null) dataValues[1] = new String[] { titleField }; dataValues[2] = new String[] { documentIdentifier }; if (authorNameField != null) dataValues[3] = new String[] { authorNameField }; if (descriptionField != null) dataValues[4] = new String[] { descriptionField }; // Add document reference, including the data to pass down activities.addDocumentReference(newIdentifier, documentIdentifier, null, dataNames, dataValues, origDate); } else { // The issue here is that if a document is ingested without a jobqueue entry, the document will not // be cleaned up if the job is deleted; nor is there any expiration possibility. So, we really do need to make // sure a jobqueue entry gets created somehow. Therefore I can't just ingest the document // right here. // Now, set up the carrydown info String[] dataNames = new String[] { "pubdate", "title", "source", "authorname", "data", "description" }; Object[][] dataValues = new Object[dataNames.length][]; if (origDate != null) dataValues[0] = new String[] { origDate.toString() }; if (titleField != null) dataValues[1] = new String[] { titleField }; dataValues[2] = new String[] { documentIdentifier }; if (authorNameField != null) dataValues[3] = new String[] { authorNameField }; if (descriptionField != null) dataValues[5] = new String[] { descriptionField }; if (contentsFile == null) { CharacterInput ci = new NullCharacterInput(); try { dataValues[4] = new Object[] { ci }; // Add document reference, including the data to pass down, and the dechromed content too activities.addDocumentReference(newIdentifier, documentIdentifier, null, dataNames, dataValues, origDate); } finally { ci.discard(); } } else { CharacterInput ci = new TempFileCharacterInput(contentsFile); try { contentsFile = null; dataValues[4] = new Object[] { ci }; // Add document reference, including the data to pass down, and the dechromed content too activities.addDocumentReference(newIdentifier, documentIdentifier, null, dataNames, dataValues, origDate); } finally { ci.discard(); } } } } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Identifier '" + newIdentifier + "' is excluded"); } } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In RSS document '" + documentIdentifier + "', found an unincluded URL '" + rawURL + "'"); } } } } } protected class FeedContextClass extends XMLParsingContext { /** The document identifier */ protected String documentIdentifier; /** Activities interface */ protected IProcessActivity activities; /** Filter */ protected Filter filter; /** ttl value */ protected String ttlValue = null; public FeedContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String, String> atts, String documentIdentifier, IProcessActivity activities, Filter filter) { super(theStream, namespace, localName, qName, atts); this.documentIdentifier = documentIdentifier; this.activities = activities; this.filter = filter; } @Override protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String, String> atts) throws ManifoldCFException { // The tags we care about are "ttl" and "item", nothing else. if (localName.equals("ttl")) { // TTL value seen. Prepare to record it, as a string. return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("entry")) { // Item seen. We don't need any of the attributes etc., but we need to start a new context. return new FeedItemContextClass(theStream, namespace, localName, qName, atts, filter.getDechromedContentMode()); } // Skip everything else. return super.beginTag(namespace, localName, qName, atts); } @Override protected void endTag() throws ManifoldCFException { XMLParsingContext theContext = theStream.getContext(); String theTag = theContext.getLocalname(); if (theTag.equals("ttl")) // If the current context must be the TTL one, record its data value. ttlValue = ((XMLStringParsingContext) theContext).getValue(); else if (theTag.equals("entry")) { // It's an item. FeedItemContextClass itemContext = (FeedItemContextClass) theContext; // Presumably, since we are done parsing, we've recorded all the information we need in the context, object including: // (1) File name (if any), containing dechromed content // (2) Link name(s) // (3) Pubdate // (4) Title // The job now is to pull this info out and call the activities interface appropriately. // NOTE: After this endTag() method is called, tagCleanup() will be called for the item context. This should clean up // all dangling files etc. that need to be removed. // If an exception or error is thrown during the parse, this endTag() method will NOT be called, but the tagCleanup() // method will be called regardless. itemContext.process(documentIdentifier, activities, filter); } else super.endTag(); } /** Process this data */ protected boolean process() throws ManifoldCFException { // Deal with the ttlvalue, if it was found // Use the ttl value as a signal for when we ought to look at this feed again. If not present, use the default. long currentTime = System.currentTimeMillis(); Long rescanTime = filter.getDefaultRescanTime(currentTime); if (ttlValue != null) { try { int minutes = Integer.parseInt(ttlValue); long nextTime = currentTime + minutes * 60000L; rescanTime = new Long(nextTime); // Set the upper bound time; we want to scan the feeds aggressively. if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In Atom document '" + documentIdentifier + "', found a ttl value of " + ttlValue + "; setting refetch time accordingly"); } catch (NumberFormatException e) { Logging.connectors.warn("RSS: Atom document '" + documentIdentifier + "' has illegal ttl value '" + ttlValue + "'"); } } if (rescanTime != null) { Long minimumTime = filter.getMinimumRescanTime(currentTime); if (minimumTime != null) { if (rescanTime.longValue() < minimumTime.longValue()) rescanTime = minimumTime; } } if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In Atom document '" + documentIdentifier + "' setting rescan time to " + ((rescanTime == null) ? "null" : rescanTime.toString())); activities.setDocumentScheduleBounds(documentIdentifier, rescanTime, rescanTime, null, null); return true; } } protected class FeedItemContextClass extends XMLParsingContext { protected int dechromedContentMode; protected List<String> linkField = new ArrayList<String>(); protected String pubDateField = null; protected String titleField = null; protected String authorNameField = null; protected String authorEmailField = null; protected ArrayList categoryField = new ArrayList(); protected File contentsFile = null; protected String descriptionField = null; public FeedItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String, String> atts, int dechromedContentMode) { super(theStream, namespace, localName, qName, atts); this.dechromedContentMode = dechromedContentMode; } @Override protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String, String> atts) throws ManifoldCFException { // The tags we care about are "ttl" and "item", nothing else. if (localName.equals("link")) { // "link" tag String ref = atts.get("href"); if (ref != null && ref.length() > 0) linkField.add(ref); return super.beginTag(namespace, localName, qName, atts); } else if (localName.equals("published") || localName.equals("updated")) { // "published" pr "updated" tag return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("title")) { // "title" tag return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("author")) { return new FeedAuthorContextClass(theStream, namespace, localName, qName, atts); } else if (localName.equals("category")) { String category = atts.get("term"); if (category != null && category.length() > 0) categoryField.add(category); return super.beginTag(namespace, localName, qName, atts); } else { switch (dechromedContentMode) { case DECHROMED_NONE: if (localName.equals("subtitle")) { return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } break; case DECHROMED_DESCRIPTION: if (localName.equals("subtitle")) { try { File tempFile = File.createTempFile("_rssdata_", "tmp"); return new XMLFileParsingContext(theStream, namespace, localName, qName, atts, tempFile); } catch (java.net.SocketTimeoutException e) { throw new ManifoldCFException("IO exception creating temp file: " + e.getMessage(), e); } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED); } catch (IOException e) { throw new ManifoldCFException("IO exception creating temp file: " + e.getMessage(), e); } } break; case DECHROMED_CONTENT: if (localName.equals("content")) { try { File tempFile = File.createTempFile("_rssdata_", "tmp"); return new XMLFileParsingContext(theStream, namespace, localName, qName, atts, tempFile); } catch (java.net.SocketTimeoutException e) { throw new ManifoldCFException("IO exception creating temp file: " + e.getMessage(), e); } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED); } catch (IOException e) { throw new ManifoldCFException("IO exception creating temp file: " + e.getMessage(), e); } } else if (localName.equals("subtitle")) { return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } break; default: break; } // Skip everything else. return super.beginTag(namespace, localName, qName, atts); } } /** Convert the individual sub-fields of the item context into their final forms */ @Override protected void endTag() throws ManifoldCFException { XMLParsingContext theContext = theStream.getContext(); String theTag = theContext.getLocalname(); if (theTag.equals("published") || theTag.equals("updated")) { pubDateField = ((XMLStringParsingContext) theContext).getValue(); } else if (theTag.equals("title")) { titleField = ((XMLStringParsingContext) theContext).getValue(); } else if (theTag.equals("author")) { FeedAuthorContextClass authorContext = (FeedAuthorContextClass) theContext; authorEmailField = authorContext.getAuthorEmail(); authorNameField = authorContext.getAuthorName(); } else { switch (dechromedContentMode) { case DECHROMED_NONE: if (theTag.equals("subtitle")) { titleField = ((XMLStringParsingContext) theContext).getValue(); } break; case DECHROMED_DESCRIPTION: if (theTag.equals("subtitle")) { // Content file has been written; retrieve it (being sure not to leak any files already hanging around!) tagCleanup(); contentsFile = ((XMLFileParsingContext) theContext).getCompletedFile(); return; } break; case DECHROMED_CONTENT: if (theTag.equals("content")) { // Retrieve content file tagCleanup(); contentsFile = ((XMLFileParsingContext) theContext).getCompletedFile(); return; } else if (theTag.equals("subtitle")) { titleField = ((XMLStringParsingContext) theContext).getValue(); } break; default: break; } super.endTag(); } } protected void tagCleanup() throws ManifoldCFException { // Delete the contents file if it is there. if (contentsFile != null) { contentsFile.delete(); contentsFile = null; } } /** Process the data accumulated for this item */ public void process(String documentIdentifier, IProcessActivity activities, Filter filter) throws ManifoldCFException { if (linkField.size() > 0) { Date origDateDate = null; if (pubDateField != null && pubDateField.length() > 0) origDateDate = DateParser.parseISO8601Date(pubDateField); Long origDate; if (origDateDate != null) origDate = new Long(origDateDate.getTime()); else origDate = null; for (String linkValue : linkField) { String[] links = linkValue.split(", "); int l = 0; while (l < links.length) { String rawURL = links[l++].trim(); // Process the link String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(), documentIdentifier, rawURL); if (newIdentifier != null) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In Atom document '" + documentIdentifier + "', found a link to '" + newIdentifier + "', which has origination date " + ((origDate == null) ? "null" : origDate.toString())); if (filter.isLegalURL(newIdentifier)) { if (contentsFile == null && filter.getChromedContentMode() != CHROMED_METADATA_ONLY) { // It's a reference! Add it. String[] dataNames = new String[] { "pubdate", "title", "source", "category", "description" }; String[][] dataValues = new String[dataNames.length][]; if (origDate != null) dataValues[0] = new String[] { origDate.toString() }; if (titleField != null) dataValues[1] = new String[] { titleField }; dataValues[2] = new String[] { documentIdentifier }; dataValues[3] = new String[categoryField.size()]; int q = 0; while (q < categoryField.size()) { (dataValues[3])[q] = (String) categoryField.get(q); q++; } if (descriptionField != null) dataValues[4] = new String[] { descriptionField }; // Add document reference, including the data to pass down activities.addDocumentReference(newIdentifier, documentIdentifier, null, dataNames, dataValues, origDate); } else { // The issue here is that if a document is ingested without a jobqueue entry, the document will not // be cleaned up if the job is deleted; nor is there any expiration possibility. So, we really do need to make // sure a jobqueue entry gets created somehow. Therefore I can't just ingest the document // right here. // Now, set up the carrydown info String[] dataNames = new String[] { "pubdate", "title", "source", "category", "data", "description" }; Object[][] dataValues = new Object[dataNames.length][]; if (origDate != null) dataValues[0] = new String[] { origDate.toString() }; if (titleField != null) dataValues[1] = new String[] { titleField }; dataValues[2] = new String[] { documentIdentifier }; dataValues[3] = new String[categoryField.size()]; int q = 0; while (q < categoryField.size()) { (dataValues[3])[q] = (String) categoryField.get(q); q++; } if (descriptionField != null) dataValues[5] = new String[] { descriptionField }; if (contentsFile == null) { CharacterInput ci = new NullCharacterInput(); try { dataValues[4] = new Object[] { ci }; // Add document reference, including the data to pass down, and the dechromed content too activities.addDocumentReference(newIdentifier, documentIdentifier, null, dataNames, dataValues, origDate); } finally { ci.discard(); } } else { CharacterInput ci = new TempFileCharacterInput(contentsFile); try { contentsFile = null; dataValues[4] = new Object[] { ci }; // Add document reference, including the data to pass down, and the dechromed content too activities.addDocumentReference(newIdentifier, documentIdentifier, null, dataNames, dataValues, origDate); } finally { ci.discard(); } } } } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Identifier '" + newIdentifier + "' is excluded"); } } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In Atom document '" + documentIdentifier + "', found an unincluded URL '" + rawURL + "'"); } } } } } } protected class FeedAuthorContextClass extends XMLParsingContext { protected String authorNameField = null; protected String authorEmailField = null; public FeedAuthorContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String, String> atts) { super(theStream, namespace, localName, qName, atts); } @Override protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String, String> atts) throws ManifoldCFException { if (localName.equals("name")) { // "name" tag return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("email")) { // "email" tag return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else { // Skip everything else. return super.beginTag(namespace, localName, qName, atts); } } /** Convert the individual sub-fields of the item context into their final forms */ @Override protected void endTag() throws ManifoldCFException { XMLParsingContext theContext = theStream.getContext(); String theTag = theContext.getLocalname(); if (theTag.equals("name")) { authorNameField = ((XMLStringParsingContext) theContext).getValue(); } else if (theTag.equals("email")) { authorEmailField = ((XMLStringParsingContext) theContext).getValue(); } else { super.endTag(); } } public String getAuthorName() { return authorNameField; } public String getAuthorEmail() { return authorEmailField; } } protected class UrlsetContextClass extends XMLParsingContext { /** The document identifier */ protected String documentIdentifier; /** Activities interface */ protected IProcessActivity activities; /** Filter */ protected Filter filter; /** ttl value */ protected String ttlValue = null; public UrlsetContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String, String> atts, String documentIdentifier, IProcessActivity activities, Filter filter) { super(theStream, namespace, localName, qName, atts); this.documentIdentifier = documentIdentifier; this.activities = activities; this.filter = filter; } @Override protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String, String> atts) throws ManifoldCFException { // The tags we care about are "url", nothing else. if (localName.equals("url") || localName.equals("sitemap")) { // Item seen. We don't need any of the attributes etc., but we need to start a new context. return new UrlsetItemContextClass(theStream, namespace, localName, qName, atts); } // Skip everything else. return super.beginTag(namespace, localName, qName, atts); } @Override protected void endTag() throws ManifoldCFException { XMLParsingContext theContext = theStream.getContext(); String theTag = theContext.getLocalname(); if (theTag.equals("url") || theTag.equals("sitemap")) { // It's an item. UrlsetItemContextClass itemContext = (UrlsetItemContextClass) theContext; // Presumably, since we are done parsing, we've recorded all the information we need in the context, object including: // (1) File name (if any), containing dechromed content // (2) Link name(s) // (3) Pubdate // (4) Title // The job now is to pull this info out and call the activities interface appropriately. // NOTE: After this endTag() method is called, tagCleanup() will be called for the item context. This should clean up // all dangling files etc. that need to be removed. // If an exception or error is thrown during the parse, this endTag() method will NOT be called, but the tagCleanup() // method will be called regardless. itemContext.process(documentIdentifier, activities, filter); } else super.endTag(); } /** Process this data */ protected boolean process() throws ManifoldCFException { // Deal with the ttlvalue, if it was found // Use the ttl value as a signal for when we ought to look at this feed again. If not present, use the default. long currentTime = System.currentTimeMillis(); Long rescanTime = filter.getDefaultRescanTime(currentTime); if (ttlValue != null) { try { int minutes = Integer.parseInt(ttlValue); long nextTime = currentTime + minutes * 60000L; rescanTime = new Long(nextTime); // Set the upper bound time; we want to scan the feeds aggressively. if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In SiteMap document '" + documentIdentifier + "', found a ttl value of " + ttlValue + "; setting refetch time accordingly"); } catch (NumberFormatException e) { Logging.connectors.warn("RSS: SiteMap document '" + documentIdentifier + "' has illegal ttl value '" + ttlValue + "'"); } } if (rescanTime != null) { Long minimumTime = filter.getMinimumRescanTime(currentTime); if (minimumTime != null) { if (rescanTime.longValue() < minimumTime.longValue()) rescanTime = minimumTime; } } if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In SiteMap document '" + documentIdentifier + "' setting rescan time to " + ((rescanTime == null) ? "null" : rescanTime.toString())); activities.setDocumentScheduleBounds(documentIdentifier, rescanTime, rescanTime, null, null); return true; } } protected class UrlsetItemContextClass extends XMLParsingContext { protected String linkField = null; protected String pubDateField = null; public UrlsetItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String, String> atts) { super(theStream, namespace, localName, qName, atts); } @Override protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String, String> atts) throws ManifoldCFException { // The tags we care about are "loc" and "lastmod", nothing else. if (localName.equals("loc")) { // "loc" tag return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else if (localName.equals("lastmod")) { // "lastmod" tag return new XMLStringParsingContext(theStream, namespace, localName, qName, atts); } else { // Skip everything else. return super.beginTag(namespace, localName, qName, atts); } } /** Convert the individual sub-fields of the item context into their final forms */ @Override protected void endTag() throws ManifoldCFException { XMLParsingContext theContext = theStream.getContext(); String theTag = theContext.getLocalname(); if (theTag.equals("loc")) { linkField = ((XMLStringParsingContext) theContext).getValue(); } else if (theTag.equals("lastmod")) { pubDateField = ((XMLStringParsingContext) theContext).getValue(); } else { super.endTag(); } } protected void tagCleanup() throws ManifoldCFException { } /** Process the data accumulated for this item */ public void process(String documentIdentifier, IProcessActivity activities, Filter filter) throws ManifoldCFException { if (linkField != null && linkField.length() > 0) { Date origDateDate = null; if (pubDateField != null && pubDateField.length() > 0) origDateDate = DateParser.parseISO8601Date(pubDateField); Long origDate; if (origDateDate != null) origDate = new Long(origDateDate.getTime()); else origDate = null; String[] links = linkField.split(", "); int l = 0; while (l < links.length) { String rawURL = links[l++].trim(); // Process the link String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(), documentIdentifier, rawURL); if (newIdentifier != null) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In SiteMap document '" + documentIdentifier + "', found a link to '" + newIdentifier + "', which has origination date " + ((origDate == null) ? "null" : origDate.toString())); if (filter.isLegalURL(newIdentifier)) { // It's a reference! Add it. String[] dataNames = new String[] { "pubdate", "source" }; String[][] dataValues = new String[dataNames.length][]; if (origDate != null) dataValues[0] = new String[] { origDate.toString() }; dataValues[1] = new String[] { documentIdentifier }; // Add document reference, including the data to pass down activities.addDocumentReference(newIdentifier, documentIdentifier, null, dataNames, dataValues, origDate); } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Identifier '" + newIdentifier + "' is excluded"); } } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In SiteMap document '" + documentIdentifier + "', found an unincluded URL '" + rawURL + "'"); } } } } } /** Get the maximum number of documents to amalgamate together into one batch, for this connector. *@return the maximum number. 0 indicates "unlimited". */ public int getMaxDocumentRequest() { // RSS and the web in general do not batch well. Multiple chunks have no advantage over one-at-a-time requests. return 1; } // Protected methods and classes /** Given the current parameters, find the correct throttled fetcher object * (or create one if not there). */ protected ThrottledFetcher getFetcher() { synchronized (fetcherMap) { ThrottledFetcher tf = fetcherMap.get(throttleGroupName); if (tf == null) { tf = new ThrottledFetcher(); fetcherMap.put(throttleGroupName, tf); } return tf; } } /** Read a string as a sequence of individual expressions, urls, etc. */ protected static List<String> stringToArray(String input) { List<String> list = new ArrayList<String>(); try { java.io.Reader str = new java.io.StringReader(input); try { java.io.BufferedReader is = new java.io.BufferedReader(str); try { while (true) { String nextString = is.readLine(); if (nextString == null) break; if (nextString.length() == 0) continue; nextString.trim(); if (nextString.startsWith("#")) continue; list.add(nextString); } } finally { is.close(); } } finally { str.close(); } } catch (java.io.IOException e) { // Eat the exception and exit. } return list; } /** Compile all regexp entries in the passed in list, and add them to the output * list. */ protected static void compileList(List<Pattern> output, List<String> input) throws ManifoldCFException { for (String inputString : input) { try { output.add(Pattern.compile(inputString)); } catch (PatternSyntaxException e) { throw new ManifoldCFException( "Mapping regular expression '" + inputString + "' is illegal: " + e.getMessage(), e); } } } /** Given the current parameters, find the correct robots object (or create * one if none found). */ protected Robots getRobots(ThrottledFetcher fetcher) { synchronized (robotsMap) { Robots r = (Robots) robotsMap.get(throttleGroupName); if (r == null) { r = new Robots(fetcher); robotsMap.put(throttleGroupName, r); } return r; } } // Protected classes /** The throttle specification class. Each server name is a different bin in this model. */ protected static class ThrottleSpec implements IThrottleSpec { protected final int maxOpenConnectionsPerServer; protected final long minimumMillisecondsPerFetchPerServer; protected final double minimumMillisecondsPerBytePerServer; public ThrottleSpec(int maxOpenConnectionsPerServer, long minimumMillisecondsPerFetchPerServer, double minimumMillisecondsPerBytePerServer) { this.maxOpenConnectionsPerServer = maxOpenConnectionsPerServer; this.minimumMillisecondsPerFetchPerServer = minimumMillisecondsPerFetchPerServer; this.minimumMillisecondsPerBytePerServer = minimumMillisecondsPerBytePerServer; } /** Given a bin name, find the max open connections to use for that bin. *@return Integer.MAX_VALUE if no limit found. */ public int getMaxOpenConnections(String binName) { return maxOpenConnectionsPerServer; } /** Look up minimum milliseconds per byte for a bin. *@return 0.0 if no limit found. */ public double getMinimumMillisecondsPerByte(String binName) { return minimumMillisecondsPerBytePerServer; } /** Look up minimum milliseconds for a fetch for a bin. *@return 0 if no limit found. */ public long getMinimumMillisecondsPerFetch(String binName) { return minimumMillisecondsPerFetchPerServer; } } /** Name/value class */ protected static class NameValue { protected String name; protected String value; public NameValue(String name, String value) { this.name = name; this.value = value; } public String getName() { return name; } public String getValue() { return value; } } /** Evaluator token. */ protected static class EvaluatorToken { public final static int TYPE_GROUP = 0; public final static int TYPE_TEXT = 1; public final static int TYPE_COMMA = 2; public final static int GROUPSTYLE_NONE = 0; public final static int GROUPSTYLE_LOWER = 1; public final static int GROUPSTYLE_UPPER = 2; public final static int GROUPSTYLE_MIXED = 3; protected int type; protected int groupNumber = -1; protected int groupStyle = GROUPSTYLE_NONE; protected String textValue = null; public EvaluatorToken() { type = TYPE_COMMA; } public EvaluatorToken(int groupNumber, int groupStyle) { type = TYPE_GROUP; this.groupNumber = groupNumber; this.groupStyle = groupStyle; } public EvaluatorToken(String text) { type = TYPE_TEXT; this.textValue = text; } public int getType() { return type; } public int getGroupNumber() { return groupNumber; } public int getGroupStyle() { return groupStyle; } public String getTextValue() { return textValue; } } /** Token stream. */ protected static class EvaluatorTokenStream { protected String text; protected int pos; protected EvaluatorToken token = null; /** Constructor. */ public EvaluatorTokenStream(String text) { this.text = text; this.pos = 0; } /** Get current token. */ public EvaluatorToken peek() throws ManifoldCFException { if (token == null) { token = nextToken(); } return token; } /** Go on to next token. */ public void advance() { token = null; } protected EvaluatorToken nextToken() throws ManifoldCFException { char x; // Fetch the next token while (true) { if (pos == text.length()) return null; x = text.charAt(pos); if (x > ' ') break; pos++; } StringBuilder sb; if (x == '"') { // Parse text pos++; sb = new StringBuilder(); while (true) { if (pos == text.length()) break; x = text.charAt(pos); pos++; if (x == '"') { break; } if (x == '\\') { if (pos == text.length()) break; x = text.charAt(pos++); } sb.append(x); } return new EvaluatorToken(sb.toString()); } if (x == ',') { pos++; return new EvaluatorToken(); } // Eat number at beginning sb = new StringBuilder(); while (true) { if (pos == text.length()) break; x = text.charAt(pos); if (x >= '0' && x <= '9') { sb.append(x); pos++; continue; } break; } String numberValue = sb.toString(); int groupNumber = 0; if (numberValue.length() > 0) groupNumber = new Integer(numberValue).intValue(); // Save the next char position int modifierPos = pos; // Go to the end of the word while (true) { if (pos == text.length()) break; x = text.charAt(pos); if (x == ',' || x >= '0' && x <= '9' || x <= ' ' && x >= 0) break; pos++; } int style = EvaluatorToken.GROUPSTYLE_NONE; if (modifierPos != pos) { String modifier = text.substring(modifierPos, pos); if (modifier.startsWith("u")) style = EvaluatorToken.GROUPSTYLE_UPPER; else if (modifier.startsWith("l")) style = EvaluatorToken.GROUPSTYLE_LOWER; else if (modifier.startsWith("m")) style = EvaluatorToken.GROUPSTYLE_MIXED; else throw new ManifoldCFException("Unknown style: " + modifier); } return new EvaluatorToken(groupNumber, style); } } /** Class representing a URL regular expression match, for the purposes of determining canonicalization policy */ protected static class CanonicalizationPolicy { protected final Pattern matchPattern; protected final boolean reorder; protected final boolean removeJavaSession; protected final boolean removeAspSession; protected final boolean removePhpSession; protected final boolean removeBVSession; public CanonicalizationPolicy(Pattern matchPattern, boolean reorder, boolean removeJavaSession, boolean removeAspSession, boolean removePhpSession, boolean removeBVSession) { this.matchPattern = matchPattern; this.reorder = reorder; this.removeJavaSession = removeJavaSession; this.removeAspSession = removeAspSession; this.removePhpSession = removePhpSession; this.removeBVSession = removeBVSession; } public boolean checkMatch(String url) { Matcher matcher = matchPattern.matcher(url); return matcher.find(); } public boolean canReorder() { return reorder; } public boolean canRemoveJavaSession() { return removeJavaSession; } public boolean canRemoveAspSession() { return removeAspSession; } public boolean canRemovePhpSession() { return removePhpSession; } public boolean canRemoveBvSession() { return removeBVSession; } } /** Class representing a list of canonicalization rules */ protected static class CanonicalizationPolicies { protected final List<CanonicalizationPolicy> rules = new ArrayList<CanonicalizationPolicy>(); public CanonicalizationPolicies() { } public void addRule(CanonicalizationPolicy rule) { rules.add(rule); } public CanonicalizationPolicy findMatch(String url) { for (CanonicalizationPolicy rule : rules) { if (rule.checkMatch(url)) return rule; } return null; } } /** Class representing a mapping rule */ protected static class MappingRule { protected final Pattern matchPattern; protected final String evalExpression; public MappingRule(Pattern matchPattern, String evalExpression) { this.matchPattern = matchPattern; this.evalExpression = evalExpression; } public boolean checkMatch(String url) { Matcher matcher = matchPattern.matcher(url); return matcher.matches(); } public String map(String url) throws ManifoldCFException { // Create a matcher, and attempt to do a match Matcher matcher = matchPattern.matcher(url); if (!matcher.matches()) { return null; } // A match! Now, interpret the output expression if (evalExpression == null || evalExpression.length() == 0) return url; StringBuilder sb = new StringBuilder(); EvaluatorTokenStream et = new EvaluatorTokenStream(evalExpression); while (true) { EvaluatorToken t = et.peek(); if (t == null) break; switch (t.getType()) { case EvaluatorToken.TYPE_COMMA: et.advance(); break; case EvaluatorToken.TYPE_GROUP: et.advance(); String groupValue = matcher.group(t.getGroupNumber()); switch (t.getGroupStyle()) { case EvaluatorToken.GROUPSTYLE_NONE: sb.append(groupValue); break; case EvaluatorToken.GROUPSTYLE_LOWER: sb.append(groupValue.toLowerCase()); break; case EvaluatorToken.GROUPSTYLE_UPPER: sb.append(groupValue.toUpperCase()); break; case EvaluatorToken.GROUPSTYLE_MIXED: if (groupValue.length() > 0) { sb.append(groupValue.substring(0, 1).toUpperCase()); sb.append(groupValue.substring(1).toLowerCase()); } break; default: throw new ManifoldCFException("Illegal group style"); } break; case EvaluatorToken.TYPE_TEXT: et.advance(); sb.append(t.getTextValue()); break; default: throw new ManifoldCFException("Illegal token type"); } } return sb.toString(); } } /** Class that represents all mappings */ protected static class MappingRules { protected final List<MappingRule> mappings = new ArrayList<MappingRule>(); public MappingRules() { } public void add(MappingRule rule) { mappings.add(rule); } public boolean isMatch(String url) { if (mappings.size() == 0) return true; for (MappingRule p : mappings) { if (p.checkMatch(url)) return true; } return false; } public String map(String url) throws ManifoldCFException { if (mappings.size() == 0) return url; for (MappingRule p : mappings) { String rval = p.map(url); if (rval != null) return rval; } return null; } } /** Class that handles parsing and interpretation of the document specification. * Note that I believe it to be faster to do this once, gathering all the data, than to scan the document specification multiple times. * Therefore, this class contains the *entire* interpreted set of data from a document specification. */ protected static class Filter { protected final MappingRules mappings = new MappingRules(); protected final Set<String> seeds; protected Integer defaultRescanInterval = null; protected Integer minimumRescanInterval = null; protected Integer badFeedRescanInterval = null; protected int dechromedContentMode = DECHROMED_NONE; protected int chromedContentMode = CHROMED_USE; protected int feedTimeoutValue = 60000; protected final Set<String> acls = new HashSet<String>(); protected final CanonicalizationPolicies canonicalizationPolicies = new CanonicalizationPolicies(); /** The arraylist of exclude patterns */ protected final List<Pattern> excludePatterns = new ArrayList<Pattern>(); /** Constructor. */ public Filter(Specification spec, boolean warnOnBadSeed) throws ManifoldCFException { String excludes = ""; // To save allocation, preallocate the seeds map assuming that it will require 1.5x the number of nodes in the spec int initialSize = spec.getChildCount(); if (initialSize == 0) initialSize = 1; seeds = new HashSet<String>((initialSize * 3) >> 1); int i = 0; // First pass. Find all of the rules (which are necessary to canonicalize the seeds, etc.) while (i < spec.getChildCount()) { SpecificationNode n = spec.getChild(i++); if (n.getType().equals(RSSConfig.NODE_MAP)) { String match = n.getAttributeValue(RSSConfig.ATTR_MATCH); String map = n.getAttributeValue(RSSConfig.ATTR_MAP); if (match != null && match.length() > 0) { Pattern p; try { p = Pattern.compile(match); } catch (java.util.regex.PatternSyntaxException e) { throw new ManifoldCFException( "Regular expression '" + match + "' is illegal: " + e.getMessage(), e); } if (map == null) map = ""; mappings.add(new MappingRule(p, map)); } } else if (n.getType().equals(RSSConfig.NODE_EXCLUDES)) { excludes = n.getValue(); if (excludes == null) excludes = ""; } else if (n.getType().equals(RSSConfig.NODE_URLSPEC)) { String urlRegexp = n.getAttributeValue(RSSConfig.ATTR_REGEXP); if (urlRegexp == null) urlRegexp = ""; String reorder = n.getAttributeValue(RSSConfig.ATTR_REORDER); boolean reorderValue; if (reorder == null) reorderValue = false; else { if (reorder.equals(RSSConfig.VALUE_YES)) reorderValue = true; else reorderValue = false; } String javaSession = n.getAttributeValue(RSSConfig.ATTR_JAVASESSIONREMOVAL); boolean javaSessionValue; if (javaSession == null) javaSessionValue = false; else { if (javaSession.equals(RSSConfig.VALUE_YES)) javaSessionValue = true; else javaSessionValue = false; } String aspSession = n.getAttributeValue(RSSConfig.ATTR_ASPSESSIONREMOVAL); boolean aspSessionValue; if (aspSession == null) aspSessionValue = false; else { if (aspSession.equals(RSSConfig.VALUE_YES)) aspSessionValue = true; else aspSessionValue = false; } String phpSession = n.getAttributeValue(RSSConfig.ATTR_PHPSESSIONREMOVAL); boolean phpSessionValue; if (phpSession == null) phpSessionValue = false; else { if (phpSession.equals(RSSConfig.VALUE_YES)) phpSessionValue = true; else phpSessionValue = false; } String bvSession = n.getAttributeValue(RSSConfig.ATTR_BVSESSIONREMOVAL); boolean bvSessionValue; if (bvSession == null) bvSessionValue = false; else { if (bvSession.equals(RSSConfig.VALUE_YES)) bvSessionValue = true; else bvSessionValue = false; } try { canonicalizationPolicies.addRule(new CanonicalizationPolicy(Pattern.compile(urlRegexp), reorderValue, javaSessionValue, aspSessionValue, phpSessionValue, bvSessionValue)); } catch (java.util.regex.PatternSyntaxException e) { throw new ManifoldCFException("Canonicalization regular expression '" + urlRegexp + "' is illegal: " + e.getMessage(), e); } } } compileList(excludePatterns, stringToArray(excludes)); // Second pass. Do the rest of the work, i = 0; while (i < spec.getChildCount()) { SpecificationNode n = spec.getChild(i++); if (n.getType().equals(RSSConfig.NODE_FEED)) { String rssURL = n.getAttributeValue(RSSConfig.ATTR_URL); if (rssURL != null && rssURL.length() > 0) { String canonicalURL = makeDocumentIdentifier(canonicalizationPolicies, null, rssURL); if (canonicalURL != null) { seeds.add(canonicalURL); } else { if (warnOnBadSeed) Logging.connectors.warn("RSS: Illegal seed feed '" + rssURL + "'"); } } } else if (n.getType().equals(RSSConfig.NODE_ACCESS)) { String token = n.getAttributeValue(RSSConfig.ATTR_TOKEN); acls.add(token); } else if (n.getType().equals(RSSConfig.NODE_FEEDRESCAN)) { String interval = n.getAttributeValue(RSSConfig.ATTR_VALUE); if (interval != null && interval.length() > 0) { try { defaultRescanInterval = new Integer(interval); } catch (NumberFormatException e) { throw new ManifoldCFException("Bad number: " + e.getMessage(), e); } } } else if (n.getType().equals(RSSConfig.NODE_MINFEEDRESCAN)) { String interval = n.getAttributeValue(RSSConfig.ATTR_VALUE); if (interval != null && interval.length() > 0) { try { minimumRescanInterval = new Integer(interval); } catch (NumberFormatException e) { throw new ManifoldCFException("Bad number: " + e.getMessage(), e); } } } else if (n.getType().equals(RSSConfig.NODE_BADFEEDRESCAN)) { String interval = n.getAttributeValue(RSSConfig.ATTR_VALUE); if (interval != null && interval.length() > 0) { try { badFeedRescanInterval = new Integer(interval); } catch (NumberFormatException e) { throw new ManifoldCFException("Bad number: " + e.getMessage(), e); } } } else if (n.getType().equals(RSSConfig.NODE_FEEDTIMEOUT)) { String value = n.getAttributeValue(RSSConfig.ATTR_VALUE); if (value != null && value.length() > 0) { try { feedTimeoutValue = Integer.parseInt(value) * 1000; } catch (NumberFormatException e) { throw new ManifoldCFException("Bad number: " + e.getMessage(), e); } } } else if (n.getType().equals(RSSConfig.NODE_DECHROMEDMODE)) { String mode = n.getAttributeValue(RSSConfig.ATTR_MODE); if (mode != null && mode.length() > 0) { if (mode.equals(RSSConfig.VALUE_NONE)) dechromedContentMode = DECHROMED_NONE; else if (mode.equals(RSSConfig.VALUE_DESCRIPTION)) dechromedContentMode = DECHROMED_DESCRIPTION; else if (mode.equals(RSSConfig.VALUE_CONTENT)) dechromedContentMode = DECHROMED_CONTENT; } } else if (n.getType().equals(RSSConfig.NODE_CHROMEDMODE)) { String mode = n.getAttributeValue(RSSConfig.ATTR_MODE); if (mode != null && mode.length() > 0) { if (mode.equals(RSSConfig.VALUE_USE)) chromedContentMode = CHROMED_USE; else if (mode.equals(RSSConfig.VALUE_SKIP)) chromedContentMode = CHROMED_SKIP; else if (mode.equals(RSSConfig.VALUE_METADATA)) chromedContentMode = CHROMED_METADATA_ONLY; } } } } /** Check if document is a seed */ public boolean isSeed(String canonicalUrl) { return seeds.contains(canonicalUrl); } /** Iterate over all canonicalized seeds */ public Iterator<String> getSeeds() { return seeds.iterator(); } /** Get the acls */ public String[] getAcls() { String[] rval = new String[acls.size()]; Iterator<String> iter = acls.iterator(); int i = 0; while (iter.hasNext()) { rval[i++] = iter.next(); } return rval; } /** Get the feed timeout value */ public int getFeedTimeoutValue() { return feedTimeoutValue; } /** Get the dechromed content mode */ public int getDechromedContentMode() { return dechromedContentMode; } /** Get the chromed content mode */ public int getChromedContentMode() { return chromedContentMode; } /** Get the next time (by default) a feed should be scanned */ public Long getDefaultRescanTime(long currentTime) { if (defaultRescanInterval == null) return null; return new Long(defaultRescanInterval.intValue() * 60000L + currentTime); } /** Get the minimum next time a feed should be scanned */ public Long getMinimumRescanTime(long currentTime) { if (minimumRescanInterval == null) return null; return new Long(minimumRescanInterval.intValue() * 60000L + currentTime); } /** Get the next time a "bad feed" should be rescanned */ public Long getBadFeedRescanTime(long currentTime) { if (badFeedRescanInterval == null) return null; return new Long(badFeedRescanInterval.intValue() * 60000L + currentTime); } /** Check for legality of a url. * @return true if the passed-in url is either a seed, or a legal url, according to this * filter. */ public boolean isLegalURL(String url) { if (seeds.contains(url)) return true; if (mappings.isMatch(url) == false) { if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("RSS: Url '" + url + "' is illegal because it did not match a mapping rule"); return false; } // Now make sure it's not in the exclude list. for (Pattern p : excludePatterns) { Matcher m = p.matcher(url); if (m.find()) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Url '" + url + "' is illegal because exclude pattern '" + p.toString() + "' matched it"); return false; } } return true; } /** Scan patterns and return the one that matches first. * @return null if the url doesn't match or should not be ingested, or the new string if it does. */ public String mapDocumentURL(String url) throws ManifoldCFException { if (seeds.contains(url)) return null; return mappings.map(url); } /** Get canonicalization policies */ public CanonicalizationPolicies getCanonicalizationPolicies() { return canonicalizationPolicies; } } }