Usage examples for java.util.regex.Pattern#split(CharSequence), collected from open-source projects.
Method signature: public String[] split(CharSequence input)
From source file:org.apache.ctakes.ytex.uima.annotators.NegexAnnotator.java
/**
 * NegEx-style negation detection for one identified annotation within a sentence.
 * The sentence is tagged in place as a string: the target phrase is wrapped in
 * [PHRASE]...[PHRASE], every matching negation rule is wrapped in its tag
 * (e.g. [PREN], [POST]), and then forward/backward scans convert [PHRASE] to
 * [NEGATED] (or [POSSIBLE]) depending on which trigger tags govern it.
 * Precedence: PREN over POST over PREP over POSP.
 *
 * @param aJCas   the CAS (NOTE(review): unused in this method -- result is only printed)
 * @param s       sentence containing the annotation
 * @param ne      the annotation whose negation status is being decided
 * @param negPoss if true, POSSIBLE negation triggers ([PREP]/[POSP]) are also applied
 */
private void checkNegation2(JCas aJCas, Sentence s, IdentifiedAnnotation ne, boolean negPoss) {
    String sToReturn = "";
    String sScope = "";
    // Placeholder substituted for spaces inside tagged spans so whitespace
    // splitting below keeps each tagged span as a single token.
    // NOTE(review): an input sentence that itself contains "_" will be
    // corrupted by the replaceAll(filler, " ") near the end -- confirm.
    String filler = "_";
    // Rules are assumed pre-sorted longest-first so longer legitimate negation
    // rules match before shorter overlapping (PSEUDO/POSSIBLE) ones.
    // Pad one character on each side so boundary-anchored rules can match;
    // the padding is stripped again at the end.
    String sentence = "." + s.getCoveredText() + ".";
    // Tag the phrase we want to decide negation for; must happen before rule tagging.
    String phrase = ne.getCoveredText();
    // NOTE(review): the covered text is compiled as a raw regex; metacharacters
    // such as "(" or "+" in the phrase would throw PatternSyntaxException or
    // mis-match. Pattern.quote(...) would be safer -- verify before changing.
    Pattern pph = Pattern.compile(phrase.trim(), Pattern.CASE_INSENSITIVE);
    Matcher mph = pph.matcher(sentence);
    CharBuffer buf = CharBuffer.wrap(sentence.toCharArray()); // NOTE(review): never read afterwards
    while (mph.find() == true) {
        // replaceAll rewrites every occurrence in one call and leaves the
        // matcher exhausted, so this loop body effectively runs once.
        sentence = mph.replaceAll(" [PHRASE]" + mph.group().trim().replaceAll(" ", filler) + "[PHRASE]");
    }
    // Wrap every negation-rule hit in that rule's tag, spaces replaced by filler.
    for (NegexRule rule : this.listNegexRules) {
        Matcher m = rule.getPattern().matcher(sentence);
        while (m.find() == true) {
            sentence = m.replaceAll(
                    " " + rule.getTag() + m.group().trim().replaceAll(" ", filler) + rule.getTag() + " ");
        }
    }
    // NOTE(review): "[\\s+]" is a character class meaning "whitespace OR '+'",
    // not "one or more whitespace chars"; presumably "\\s+" was intended.
    // Splitting on single whitespace still works because tags are space-separated.
    Pattern pSpace = Pattern.compile("[\\s+]");
    String[] sentenceTokens = pSpace.split(sentence);
    StringBuilder sb = new StringBuilder();
    // Forward scan: a [PREN] trigger negates following [PHRASE] tokens until a
    // terminating control tag ([CONJ]/[PSEU]/[POST]/[PREP]/[POSP]) is reached.
    for (int i = 0; i < sentenceTokens.length; i++) {
        sb.append(" " + sentenceTokens[i].trim());
        if (sentenceTokens[i].trim().startsWith("[PREN]")) {
            for (int j = i + 1; j < sentenceTokens.length; j++) {
                if (sentenceTokens[j].trim().startsWith("[CONJ]") || sentenceTokens[j].trim().startsWith("[PSEU]")
                        || sentenceTokens[j].trim().startsWith("[POST]")
                        || sentenceTokens[j].trim().startsWith("[PREP]")
                        || sentenceTokens[j].trim().startsWith("[POSP]")) {
                    break;
                }
                if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
                    sentenceTokens[j] = sentenceTokens[j].trim().replaceAll("\\[PHRASE\\]", "[NEGATED]");
                }
            }
        }
    }
    sentence = sb.toString();
    pSpace = Pattern.compile("[\\s+]");
    sentenceTokens = pSpace.split(sentence);
    StringBuilder sb2 = new StringBuilder();
    // Backward scan: a [POST] trigger negates preceding [PHRASE] tokens.
    // NOTE(review): both backward scans run with i > 0 / j > 0, so token 0 is
    // neither re-appended to the sentence nor eligible for retagging -- confirm
    // this is intended (token 0 is normally the padding period).
    for (int i = sentenceTokens.length - 1; i > 0; i--) {
        sb2.insert(0, sentenceTokens[i] + " ");
        if (sentenceTokens[i].trim().startsWith("[POST]")) {
            for (int j = i - 1; j > 0; j--) {
                if (sentenceTokens[j].trim().startsWith("[CONJ]") || sentenceTokens[j].trim().startsWith("[PSEU]")
                        || sentenceTokens[j].trim().startsWith("[PREN]")
                        || sentenceTokens[j].trim().startsWith("[PREP]")
                        || sentenceTokens[j].trim().startsWith("[POSP]")) {
                    break;
                }
                if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
                    sentenceTokens[j] = sentenceTokens[j].trim().replaceAll("\\[PHRASE\\]", "[NEGATED]");
                }
            }
        }
    }
    sentence = sb2.toString();
    // When POSSIBLE negation counts as negation, repeat both scans for
    // [PREP]/[POSP], marking phrases [POSSIBLE] instead of [NEGATED].
    if (negPoss == true) {
        pSpace = Pattern.compile("[\\s+]");
        sentenceTokens = pSpace.split(sentence);
        StringBuilder sb3 = new StringBuilder();
        // Forward scan for [PREP].
        for (int i = 0; i < sentenceTokens.length; i++) {
            sb3.append(" " + sentenceTokens[i].trim());
            if (sentenceTokens[i].trim().startsWith("[PREP]")) {
                for (int j = i + 1; j < sentenceTokens.length; j++) {
                    if (sentenceTokens[j].trim().startsWith("[CONJ]")
                            || sentenceTokens[j].trim().startsWith("[PSEU]")
                            || sentenceTokens[j].trim().startsWith("[POST]")
                            || sentenceTokens[j].trim().startsWith("[PREN]")
                            || sentenceTokens[j].trim().startsWith("[POSP]")) {
                        break;
                    }
                    if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
                        sentenceTokens[j] = sentenceTokens[j].trim().replaceAll("\\[PHRASE\\]", "[POSSIBLE]");
                    }
                }
            }
        }
        sentence = sb3.toString();
        pSpace = Pattern.compile("[\\s+]");
        sentenceTokens = pSpace.split(sentence);
        StringBuilder sb4 = new StringBuilder();
        // Backward scan for [POSP].
        for (int i = sentenceTokens.length - 1; i > 0; i--) {
            sb4.insert(0, sentenceTokens[i] + " ");
            if (sentenceTokens[i].trim().startsWith("[POSP]")) {
                for (int j = i - 1; j > 0; j--) {
                    if (sentenceTokens[j].trim().startsWith("[CONJ]")
                            || sentenceTokens[j].trim().startsWith("[PSEU]")
                            || sentenceTokens[j].trim().startsWith("[PREN]")
                            || sentenceTokens[j].trim().startsWith("[PREP]")
                            || sentenceTokens[j].trim().startsWith("[POST]")) {
                        break;
                    }
                    if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
                        sentenceTokens[j] = sentenceTokens[j].trim().replaceAll("\\[PHRASE\\]", "[POSSIBLE]");
                    }
                }
            }
        }
        sentence = sb4.toString();
    }
    // Undo the filler substitution (see NOTE above about literal "_" input).
    sentence = sentence.replaceAll(filler, " ");
    // Strip the padding periods added at the start of the method.
    // NOTE(review): replaceFirst(".", "") uses "." as a regex (any character);
    // it happens to drop the leading character, which is the intended period.
    sentence = sentence.substring(0, sentence.trim().lastIndexOf('.'));
    sentence = sentence.replaceFirst(".", "");
    // Scope of a pre-negation ([PREN]/[PREP]): from the trigger tag up to the
    // next terminating tag, or to the end of the sentence.
    if (sentence.contains("[PREN]") || sentence.contains("[PREP]")) {
        int startOffset = sentence.indexOf("[PREN]");
        if (startOffset == -1) {
            startOffset = sentence.indexOf("[PREP]");
        }
        int endOffset = sentence.indexOf("[CONJ]");
        if (endOffset == -1) {
            endOffset = sentence.indexOf("[PSEU]");
        }
        if (endOffset == -1) {
            endOffset = sentence.indexOf("[POST]");
        }
        if (endOffset == -1) {
            endOffset = sentence.indexOf("[POSP]");
        }
        if (endOffset == -1 || endOffset < startOffset) {
            endOffset = sentence.length() - 1;
        }
        sScope = sentence.substring(startOffset, endOffset + 1);
    }
    // Scope of a post-negation ([POST]/[POSP]): from the last terminating tag
    // (or sentence start) up to the trigger tag. Overwrites any PREN/PREP scope.
    if (sentence.contains("[POST]") || sentence.contains("[POSP]")) {
        int endOffset = sentence.lastIndexOf("[POST]");
        if (endOffset == -1) {
            endOffset = sentence.lastIndexOf("[POSP]");
        }
        int startOffset = sentence.lastIndexOf("[CONJ]");
        if (startOffset == -1) {
            startOffset = sentence.lastIndexOf("[PSEU]");
        }
        if (startOffset == -1) {
            startOffset = sentence.lastIndexOf("[PREN]");
        }
        if (startOffset == -1) {
            startOffset = sentence.lastIndexOf("[PREP]");
        }
        if (startOffset == -1) {
            startOffset = 0;
        }
        sScope = sentence.substring(startOffset, endOffset);
    }
    // Classify as negated / possible / affirmed and append the detected scope.
    if (sentence.contains("[NEGATED]")) {
        sentence = sentence + "\t" + "negated" + "\t" + sScope;
    } else if (sentence.contains("[POSSIBLE]")) {
        sentence = sentence + "\t" + "possible" + "\t" + sScope;
    } else {
        sentence = sentence + "\t" + "affirmed" + "\t" + sScope;
    }
    sToReturn = sentence;
    // NOTE(review): the classification is only printed, never stored on the
    // annotation -- presumably debug output; verify against the caller.
    System.out.println(sToReturn);
}
From source file:org.soas.solr.update.processor.WhereDifferentUPF.java
/**
 * Builds an UpdateRequestProcessor that, for each tag listed in the document's
 * tag field, compares whitespace-separated POS tokens against a compare field
 * and records the positions where the POS tag is absent from the candidate
 * tag list ("[a][b]..." after the '|' in each compare token).
 *
 * For each tag {@code next}: reads {@code posFieldName_next} and
 * {@code compareFieldName_next}; when both have the same token count (and the
 * compare value is genuinely multi-token), writes a diff summary into
 * {@code differentFieldName_next} and {@code changeFieldName_next}.
 */
@Override
public final UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
        UpdateRequestProcessor next) {
    return new UpdateRequestProcessor(next) {
        @Override
        public void processAdd(AddUpdateCommand cmd) throws IOException {
            final SolrInputDocument doc = cmd.getSolrInputDocument();
            Collection c = doc.getFieldValues(tagFieldName);
            if (c != null) {
                Iterator it = c.iterator();
                while (it.hasNext()) {
                    // Each value of the tag field selects a pair of suffixed fields.
                    String next = (String) it.next();
                    if (doc.containsKey(posFieldName + "_" + next)
                            && doc.containsKey(compareFieldName + "_" + next)) {
                        String posFieldValue = (String) doc.getFieldValue(posFieldName + "_" + next);
                        String[] pos = posFieldValue.split("\\s+");
                        String compareFieldValue = (String) doc.getFieldValue(compareFieldName + "_" + next);
                        String[] compare = compareFieldValue.split("\\s+");
                        // Only diff when token counts line up and the compare value
                        // is not a single token identical to the whole field.
                        if (compare.length == pos.length
                                && !(compare.length == 1 && compare[0].equals(compareFieldValue))) {
                            // Candidate tags are packed as "...|[t1][t2]..."; "][" separates them.
                            Pattern splitter = Pattern.compile("\\]\\[");
                            StringBuffer sbDiff = new StringBuffer();
                            StringBuffer sbChange = new StringBuffer();
                            for (int i = 0; i < compare.length; i++) {
                                sbDiff.append(pos[i]);
                                // Candidate tag list follows the '|' separator.
                                String tags = compare[i].substring(compare[i].indexOf('|') + 1);
                                if (tags.charAt(0) == '[') {
                                    tags = tags.substring(1, tags.length() - 1); // strip [ and ]
                                }
                                String[] tagList = splitter.split(tags);
                                // Reference POS tag also follows '|' in the pos token.
                                String posRef = pos[i].substring(pos[i].indexOf('|') + 1);
                                boolean match = false;
                                for (int k = 0; k < tagList.length; k++) {
                                    if (tagList[k].equals(posRef)) {
                                        match = true;
                                        break;
                                    }
                                }
                                // Reference tag missing from candidates: record all candidates.
                                if (!match) {
                                    sbDiff.append(diffDelim);
                                    sbDiff.append(StringUtils.join(tagList, "~"));
                                }
                                sbDiff.append(' ');
                                // NOTE(review): only spaces are ever appended to sbChange --
                                // the code that populated it was commented out upstream, so
                                // the change field is written as all-blanks. Confirm intended.
                                sbChange.append(' ');
                            }
                            sbDiff.deleteCharAt(sbDiff.length() - 1); // remove final space
                            sbChange.deleteCharAt(sbChange.length() - 1); // remove final space
                            if (differentFieldName != null) {
                                SolrInputField differentField = new SolrInputField(
                                        differentFieldName + "_" + next);
                                differentField.setValue(sbDiff.toString(), 1.0f);
                                doc.put(differentFieldName + "_" + next, differentField);
                            }
                            if (changeFieldName != null) {
                                SolrInputField changeField = new SolrInputField(changeFieldName + "_" + next);
                                changeField.setValue(sbChange.toString(), 1.0f);
                                doc.put(changeFieldName + "_" + next, changeField);
                            }
                        }
                    }
                }
            }
            super.processAdd(cmd);
        }
    };
}
From source file:net.duckling.ddl.service.export.impl.ExportServiceImpl.java
private String processFileOrPageLink(String html, String type, VWBContext context, String path, ArchiveOutputStream out, Map<String, String> id2Title, List<String> allPages, boolean isEpub) { String regex;/*from w w w . j a v a 2 s . c o m*/ if (LynxConstants.TYPE_FILE.equals(type)) { regex = "/file/([0-9]+)"; } else { regex = "/page/([0-9]+)"; } Pattern p = Pattern.compile(regex); String[] cells = p.split(html); if (cells.length == 1) { return html; } for (int i = 0; i < cells.length; i++) { int indexHref = cells[i].lastIndexOf("href=\""); if (i < cells.length - 1 && indexHref > 0) { cells[i] = cells[i].substring(0, indexHref); } int indexQuote = cells[i].indexOf('"'); if (i > 0) { cells[i] = cells[i].substring(indexQuote + 1); } } Matcher m = p.matcher(html); StringBuilder sb = new StringBuilder(); sb.append(cells[0]); int index = 1; while (m.find()) { int attId = Integer.parseInt(m.group(1)); String resKey = attId + "_" + VWBContext.getCurrentTid() + "_" + type; String tagname = path.substring(0, path.lastIndexOf("/")); String resPath = getRelativeResPath(resKey, tagname); if (null == resPath) {// ??? if (regex.contains("file")) { writeAttFile(path, VWBContext.getCurrentTid(), attId, context, out); } else { Resource res = resourceService.getResource(attId, context.getTid()); if (null != res) { List<FileVersion> attFiles = fileVersionService.getFilesOfPage(res.getRid(), VWBContext.getCurrentTid()); for (FileVersion file : attFiles) { writeAttFile(path, VWBContext.getCurrentTid(), file.getRid(), context, out); } writePage(path, res.getRid(), context, out, id2Title, allPages, isEpub); } } resPath = getResNoTagPath(resKey); resPath = (null == resPath) ? "#" : resPath; } sb.append("href=\"" + resPath + "\""); sb.append(cells[index++]); } return sb.toString(); }
From source file:org.codehaus.mojo.VeraxxMojo.java
/**
 * Creates the stderr-capture stream for the vera++ process: opens the XML
 * report file, writes the checkstyle header, and returns an OutputStream that
 * parses each vera++ diagnostic line ("path:line: (RULE) message") and emits a
 * checkstyle &lt;error&gt; element. flush() must be called once at end-of-stream to
 * close the last &lt;file&gt; element and the document.
 *
 * @return a line-buffering filter stream, or System.err if the report file
 *         cannot be created
 */
protected OutputStream getOutputStreamErr() {
    String OutputReportName = new String();
    if (reportsfileDir.isAbsolute()) {
        OutputReportName = reportsfileDir.getAbsolutePath() + "/" + getReportFileName();
    } else {
        OutputReportName = basedir.getAbsolutePath() + "/" + reportsfileDir.getPath() + "/"
                + getReportFileName();
    }
    getLog().info("Vera++ report location " + OutputReportName);
    OutputStream output = System.err;
    File file = new File(OutputReportName);
    try {
        new File(file.getParent()).mkdirs();
        file.createNewFile();
        output = new FileOutputStream(file);
    } catch (IOException e) {
        // Degrade gracefully: report goes to stderr instead of the file.
        getLog().error("Vera++ report redirected to stderr since " + OutputReportName + " can't be opened");
        return output;
    }
    final DataOutputStream out = new DataOutputStream(output);
    try {
        out.writeBytes("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        out.writeBytes("<checkstyle version=\"5.0\">\n");
    } catch (IOException e) {
        getLog().error("Vera++ xml report write failure");
    }
    OutputStream outErrFilter = new OutputStream() {
        // Accumulates the current diagnostic line until a newline arrives.
        StringBuffer sb = new StringBuffer();

        public void write(int b) throws IOException {
            if ((b == '\n') || (b == '\r')) {
                transformCurrentLine();
                // cleanup for next line
                sb.delete(0, sb.length());
            } else {
                sb.append((char) b);
            }
        }

        public void flush() throws IOException {
            // Handle a final line without trailing newline, then close the document.
            transformCurrentLine();
            getLog().debug("Vera++ xml flush() called");
            if (!StringUtils.isEmpty(lastfile)) {
                out.writeBytes("\t</file>\n");
            }
            out.writeBytes("</checkstyle>\n");
            out.flush();
        }

        // File name of the most recently opened <file> element, for grouping.
        String lastfile;

        private void transformCurrentLine() {
            if (sb.length() > 0) {
                // Rewrite ' (RULE) ' into 'RULE:' so the line splits on colons.
                String p = "^(.+) \\((.+)\\) (.+)$";
                Pattern pattern = Pattern.compile(p);
                Matcher matcher = pattern.matcher(sb);
                getLog().debug("match " + sb + " on " + p);
                // Windows drive letters ("C:...") would add a spurious colon;
                // mask the drive colon and restore it after splitting.
                // NOTE(review): sb.charAt(1) throws if the line is a single
                // character -- confirm vera++ never emits such lines.
                boolean bWinPath = false;
                if (sb.charAt(1) == ':') {
                    bWinPath = true;
                    sb.setCharAt(1, '_');
                }
                if (matcher.matches()) {
                    String sLine = matcher.group(1) + matcher.group(2) + ":" + matcher.group(3);
                    getLog().debug("rebuild line = " + sLine);
                    // Extract file:line:rule:message fields.
                    pattern = Pattern.compile(":");
                    String[] items = pattern.split(sLine);
                    String file, line, rule, comment, severity;
                    file = items.length > 0 ? items[0] : "";
                    line = items.length > 1 ? items[1] : "";
                    rule = items.length > 2 ? items[2] : "";
                    comment = items.length > 3 ? items[3] : "";
                    severity = "warning";
                    if (bWinPath) {
                        StringBuilder s = new StringBuilder(file);
                        s.setCharAt(1, ':');
                        file = s.toString();
                    }
                    // Emit the checkstyle XML for this diagnostic.
                    try {
                        // Open a new <file> element whenever the file changes.
                        if (!file.equals(lastfile)) {
                            if (!StringUtils.isEmpty(lastfile)) {
                                out.writeBytes("\t</file>\n");
                            }
                            out.writeBytes("\t<file name=\"" + file + "\">\n");
                            lastfile = file;
                        }
                        // NOTE(review): message/file values are not XML-escaped;
                        // a quote or '<' in a diagnostic would break the report.
                        out.writeBytes("\t\t<error line=\"" + line + "\" severity=\"" + severity
                                + "\" message=\"" + comment + "\" source=\"" + rule + "\"/>\n");
                    } catch (IOException e) {
                        getLog().error("Vera++ xml report write failure");
                    }
                }
            }
        }
    };
    return outErrFilter;
}
From source file:org.apache.maven.plugin.cxx.VeraxxMojo.java
/**
 * Creates the stderr-capture stream for the vera++ process: opens the XML
 * report file, writes the checkstyle header, and returns an OutputStream that
 * parses each vera++ diagnostic line ("path:line: (RULE) message") into a
 * checkstyle &lt;error&gt; element. flush() must be called once at end-of-stream
 * to close the last &lt;file&gt; element and the document.
 *
 * @return a line-buffering filter stream, or System.err if the report file
 *         cannot be created
 */
@Override
protected OutputStream getOutputStreamErr() {
    String outputReportName = new String();
    if (reportsfileDir.isAbsolute()) {
        outputReportName = reportsfileDir.getAbsolutePath() + File.separator + getReportFileName();
    } else {
        outputReportName = basedir.getAbsolutePath() + File.separator + reportsfileDir.getPath()
                + File.separator + getReportFileName();
    }
    getLog().info("Vera++ report location " + outputReportName);
    OutputStream output = System.err;
    File file = new File(outputReportName);
    try {
        new File(file.getParent()).mkdirs();
        file.createNewFile();
        output = new FileOutputStream(file);
    } catch (IOException e) {
        // Degrade gracefully: report goes to stderr instead of the file.
        getLog().error("Vera++ report redirected to stderr since " + outputReportName + " can't be opened");
        return output;
    }
    final DataOutputStream out = new DataOutputStream(output);
    try {
        out.writeBytes("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        out.writeBytes("<checkstyle version=\"5.0\">\n");
    } catch (IOException e) {
        getLog().error("Vera++ xml report write failure");
    }
    OutputStream outErrFilter = new OutputStream() {
        // Accumulates the current diagnostic line until a newline arrives.
        StringBuffer sb = new StringBuffer();

        public void write(int b) throws IOException {
            if ((b == '\n') || (b == '\r')) {
                transformCurrentLine();
                // cleanup for next line
                sb.delete(0, sb.length());
            } else {
                sb.append((char) b);
            }
        }

        public void flush() throws IOException {
            // Handle a final line without trailing newline, then close the document.
            transformCurrentLine();
            getLog().debug("Vera++ xml flush() called");
            if (!StringUtils.isEmpty(lastfile)) {
                out.writeBytes("\t</file>\n");
            }
            out.writeBytes("</checkstyle>\n");
            out.flush();
        }

        // File name of the most recently opened <file> element, for grouping.
        String lastfile;

        private void transformCurrentLine() {
            if (sb.length() > 0) {
                // Rewrite ' (RULE) ' into 'RULE:' so the line splits on colons.
                String p = "^(.+) \\((.+)\\) (.+)$";
                Pattern pattern = Pattern.compile(p);
                Matcher matcher = pattern.matcher(sb);
                getLog().debug("match " + sb + " on " + p);
                // Windows drive letters ("C:...") would add a spurious colon;
                // mask the drive colon and restore it after splitting.
                // NOTE(review): sb.charAt(1) throws if the line is a single
                // character -- confirm vera++ never emits such lines.
                boolean bWinPath = false;
                if (sb.charAt(1) == ':') {
                    bWinPath = true;
                    sb.setCharAt(1, '_');
                }
                if (matcher.matches()) {
                    String sLine = matcher.group(1) + matcher.group(2) + ":" + matcher.group(3);
                    getLog().debug("rebuild line = " + sLine);
                    // Extract file:line:rule:message fields.
                    pattern = Pattern.compile(":");
                    String[] items = pattern.split(sLine);
                    String file, line, rule, comment, severity;
                    file = items.length > 0 ? items[0] : "";
                    line = items.length > 1 ? items[1] : "";
                    rule = items.length > 2 ? items[2] : "";
                    comment = items.length > 3 ? items[3] : "";
                    severity = "warning";
                    if (bWinPath) {
                        StringBuilder s = new StringBuilder(file);
                        s.setCharAt(1, ':');
                        file = s.toString();
                    }
                    // Emit the checkstyle XML for this diagnostic.
                    try {
                        // Open a new <file> element whenever the file changes.
                        if (!file.equals(lastfile)) {
                            if (!StringUtils.isEmpty(lastfile)) {
                                out.writeBytes("\t</file>\n");
                            }
                            out.writeBytes("\t<file name=\"" + file + "\">\n");
                            lastfile = file;
                        }
                        // NOTE(review): message/file values are not XML-escaped;
                        // a quote or '<' in a diagnostic would break the report.
                        out.writeBytes("\t\t<error line=\"" + line + "\" severity=\"" + severity
                                + "\" message=\"" + comment + "\" source=\"" + rule + "\"/>\n");
                    } catch (IOException e) {
                        getLog().error("Vera++ xml report write failure");
                    }
                }
            }
        }
    };
    return outErrFilter;
}
From source file:com.cyberway.issue.crawler.frontier.AbstractFrontier.java
/**
 * Registers this frontier's configurable settings (politeness delays, retries,
 * bandwidth limits, queue-assignment policy, pause/recovery flags) with the
 * settings framework.
 *
 * @param name Name of this frontier.
 * @param description Description for this frontier.
 */
public AbstractFrontier(String name, String description) {
    super(name, description);
    addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
            "How many multiples of last fetch elapsed time to wait before " + "recontacting same server",
            DEFAULT_DELAY_FACTOR));
    addElementToDefinition(
            new SimpleType(ATTR_MAX_DELAY, "Never wait more than this long.", DEFAULT_MAX_DELAY));
    addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
            "Always wait this long after one completion before recontacting " + "same server.",
            DEFAULT_MIN_DELAY));
    addElementToDefinition(new SimpleType(ATTR_RESPECT_CRAWL_DELAY_UP_TO_SECS,
            "Respect a Crawl-Delay directive in a site's robots.txt " + "up to this value in seconds. (If longer, simply "
                    + "respect this value.) Default is 300 seconds (5 minutes).",
            DEFAULT_RESPECT_CRAWL_DELAY_UP_TO_SECS));
    addElementToDefinition(
            new SimpleType(ATTR_MAX_RETRIES, "How often to retry fetching a URI that failed to be retrieved. "
                    + "If zero, the crawler will get the robots.txt only.", DEFAULT_MAX_RETRIES));
    addElementToDefinition(
            new SimpleType(ATTR_RETRY_DELAY, "How long to wait by default until we retry fetching a"
                    + " URI that failed to be retrieved (seconds). ", DEFAULT_RETRY_DELAY));
    addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,
            "Number of embedded (or redirected) hops up to which " + "a URI has higher priority scheduling. For example, if set "
                    + "to 1 (the default), items such as inline images (1-hop "
                    + "embedded resources) will be scheduled ahead of all regular "
                    + "links (or many-hop resources, like nested frames). If set to "
                    + "zero, no preferencing will occur, and embeds/redirects are "
                    + "scheduled the same as regular links.",
            DEFAULT_PREFERENCE_EMBED_HOPS));
    Type t;
    // NOTE(review): several operator-visible strings below contain typos
    // ("to high", "you you") that are preserved here because they are runtime
    // text, not comments.
    t = addElementToDefinition(new SimpleType(ATTR_MAX_OVERALL_BANDWIDTH_USAGE,
            "The maximum average bandwidth the crawler is allowed to use. "
                    + "The actual read speed is not affected by this setting, it only "
                    + "holds back new URIs from being processed when the bandwidth "
                    + "usage has been to high. 0 means no bandwidth limitation.",
            DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE));
    t.setOverrideable(false);
    t = addElementToDefinition(new SimpleType(ATTR_MAX_HOST_BANDWIDTH_USAGE,
            "The maximum average bandwidth the crawler is allowed to use per "
                    + "host. The actual read speed is not affected by this setting, "
                    + "it only holds back new URIs from being processed when the "
                    + "bandwidth usage has been to high. 0 means no bandwidth " + "limitation.",
            DEFAULT_MAX_HOST_BANDWIDTH_USAGE));
    t.setExpertSetting(true);
    // Read the list of permissible queue-assignment policies from
    // heritrix.properties: a space- or comma-separated class-name list, with a
    // built-in default covering the standard policies.
    String queueStr = System.getProperty(AbstractFrontier.class.getName() + "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
            HostnameQueueAssignmentPolicy.class.getName() + " " + IPQueueAssignmentPolicy.class.getName() + " "
                    + BucketQueueAssignmentPolicy.class.getName() + " "
                    + SurtAuthorityQueueAssignmentPolicy.class.getName() + " "
                    + TopmostAssignedSurtQueueAssignmentPolicy.class.getName());
    Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
    String[] queues = p.split(queueStr);
    // NOTE(review): split never returns an empty array for a non-null input,
    // so this guard is effectively dead -- confirm before removing.
    if (queues.length <= 0) {
        throw new RuntimeException("Failed parse of " + " assignment queue policy string: " + queueStr);
    }
    // First listed policy becomes the default choice.
    t = addElementToDefinition(new SimpleType(ATTR_QUEUE_ASSIGNMENT_POLICY,
            "Defines how to assign URIs to queues. Can assign by host, "
                    + "by ip, and into one of a fixed set of buckets (1k).",
            queues[0], queues));
    t.setExpertSetting(true);
    t.setOverrideable(true);
    t = addElementToDefinition(new SimpleType(ATTR_FORCE_QUEUE,
            "The queue name into which to force URIs. Should " + "be left blank at global level. Specify a "
                    + "per-domain/per-host override to force URIs into "
                    + "a particular named queue, regardless of the assignment "
                    + "policy in effect (domain or ip-based politeness). "
                    + "This could be used on domains known to all be from "
                    + "the same small set of IPs (eg blogspot, dailykos, etc.) "
                    + "to simulate IP-based politeness, or it could be used if "
                    + "you wanted to enforce politeness over a whole domain, even "
                    + "though the subdomains are split across many IPs.",
            DEFAULT_FORCE_QUEUE));
    t.setOverrideable(true);
    t.setExpertSetting(true);
    t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE, Level.WARNING,
            "This field must contain only alphanumeric "
                    + "characters plus period, dash, comma, colon, or underscore."));
    t = addElementToDefinition(new SimpleType(ATTR_PAUSE_AT_START,
            "Whether to pause when the crawl begins, before any URIs "
                    + "are tried. This gives the operator a chance to verify or "
                    + "adjust the crawl before actual work begins. " + "Default is false.",
            DEFAULT_PAUSE_AT_START));
    t = addElementToDefinition(new SimpleType(ATTR_PAUSE_AT_FINISH,
            "Whether to pause when the crawl appears finished, rather "
                    + "than immediately end the crawl. This gives the operator an "
                    + "opportunity to view crawl results, and possibly add URIs or "
                    + "adjust settings, while the crawl state is still available. " + "Default is false.",
            DEFAULT_PAUSE_AT_FINISH));
    t.setOverrideable(false);
    t = addElementToDefinition(new SimpleType(ATTR_SOURCE_TAG_SEEDS,
            "Whether to tag seeds with their own URI as a heritable "
                    + "'source' String, which will be carried-forward to all URIs "
                    + "discovered on paths originating from that seed. When "
                    + "present, such source tags appear in the second-to-last " + "crawl.log field.",
            DEFAULT_SOURCE_TAG_SEEDS));
    t.setOverrideable(false);
    t = addElementToDefinition(new SimpleType(ATTR_RECOVERY_ENABLED,
            "Set to false to disable recovery log writing. Do this if "
                    + "you you are using the checkpoint feature for recovering " + "crashed crawls.",
            DEFAULT_ATTR_RECOVERY_ENABLED));
    t.setExpertSetting(true);
    // No sense in it being overrideable.
    t.setOverrideable(false);
}
From source file:com.cyberway.issue.crawler.frontier.AdaptiveRevisitFrontier.java
/**
 * Registers this frontier's configurable settings (politeness delays, retries,
 * host valence, queue-assignment policy, uniq-filter) and the persistent
 * CrawlURI keys used by adaptive revisiting.
 *
 * @param name        frontier name -- NOTE(review): not forwarded; the
 *                    superclass is given the constant Frontier.ATTR_NAME
 *                    instead. Verify this is intentional.
 * @param description frontier description
 */
public AdaptiveRevisitFrontier(String name, String description) {
    super(Frontier.ATTR_NAME, description);
    addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
            "How many multiples of last fetch elapsed time to wait before " + "recontacting same server",
            DEFAULT_DELAY_FACTOR));
    addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
            "Never wait more than this long, regardless of multiple", DEFAULT_MAX_DELAY));
    addElementToDefinition(
            new SimpleType(ATTR_MIN_DELAY, "Always wait this long after one completion before recontacting "
                    + "same server, regardless of multiple", DEFAULT_MIN_DELAY));
    addElementToDefinition(
            new SimpleType(ATTR_MAX_RETRIES, "How often to retry fetching a URI that failed to be retrieved.\n"
                    + "If zero, the crawler will get the robots.txt only.", DEFAULT_MAX_RETRIES));
    addElementToDefinition(
            new SimpleType(ATTR_RETRY_DELAY, "How long to wait by default until we retry fetching a"
                    + " URI that failed to be retrieved (seconds). ", DEFAULT_RETRY_DELAY));
    addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,
            "Number of embedded (or redirected) hops up to which " + "a URI has higher priority scheduling. For example, if set "
                    + "to 1 (the default), items such as inline images (1-hop "
                    + "embedded resources) will be scheduled ahead of all regular "
                    + "links (or many-hop resources, like nested frames). If set to "
                    + "zero, no preferencing will occur, and embeds/redirects are "
                    + "scheduled the same as regular links.",
            DEFAULT_PREFERENCE_EMBED_HOPS));
    Type t;
    t = addElementToDefinition(new SimpleType(ATTR_HOST_VALENCE,
            "Maximum number of simultaneous requests to a single" + " host.", DEFAULT_HOST_VALENCE));
    t.setExpertSetting(true);
    t = addElementToDefinition(
            new SimpleType(ATTR_QUEUE_IGNORE_WWW, "If true then documents from x.com, www.x.com and any "
                    + "www[0-9]+.x.com will be assigned to the same queue.", DEFAULT_QUEUE_IGNORE_WWW));
    t.setExpertSetting(true);
    t = addElementToDefinition(new SimpleType(ATTR_FORCE_QUEUE,
            "The queue name into which to force URIs. Should " + "be left blank at global level. Specify a "
                    + "per-domain/per-host override to force URIs into "
                    + "a particular named queue, regardless of the assignment "
                    + "policy in effect (domain or ip-based politeness). "
                    + "This could be used on domains known to all be from "
                    + "the same small set of IPs (eg blogspot, dailykos, etc.) "
                    + "to simulate IP-based politeness, or it could be used if "
                    + "you wanted to enforce politeness over a whole domain, even "
                    + "though the subdomains are split across many IPs.",
            DEFAULT_FORCE_QUEUE));
    t.setOverrideable(true);
    t.setExpertSetting(true);
    t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE, Level.WARNING,
            "This field must contain only alphanumeric "
                    + "characters plus period, dash, comma, colon, or underscore."));
    t = addElementToDefinition(new SimpleType(ATTR_USE_URI_UNIQ_FILTER,
            "If true then the Frontier will use a seperate " + "datastructure to detect and eliminate duplicates.\n"
                    + "This is required for Canonicalization rules to work.",
            DEFAULT_USE_URI_UNIQ_FILTER));
    t.setExpertSetting(true);
    t.setOverrideable(false);
    // Read the list of permissible queue-assignment policies from
    // heritrix.properties: a space- or comma-separated class-name list
    // (property is keyed on AbstractFrontier, shared with that class).
    String queueStr = System.getProperty(AbstractFrontier.class.getName() + "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
            HostnameQueueAssignmentPolicy.class.getName() + " " + IPQueueAssignmentPolicy.class.getName() + " "
                    + BucketQueueAssignmentPolicy.class.getName() + " "
                    + SurtAuthorityQueueAssignmentPolicy.class.getName() + " "
                    + TopmostAssignedSurtQueueAssignmentPolicy.class.getName());
    Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
    String[] queues = p.split(queueStr);
    // NOTE(review): split never returns an empty array for a non-null input,
    // so this guard is effectively dead -- confirm before removing.
    if (queues.length <= 0) {
        throw new RuntimeException("Failed parse of " + " assignment queue policy string: " + queueStr);
    }
    t = addElementToDefinition(new SimpleType(ATTR_QUEUE_ASSIGNMENT_POLICY,
            "Defines how to assign URIs to queues. Can assign by host, "
                    + "by ip, and into one of a fixed set of buckets (1k). NOTE: "
                    + "Use of policies other than the default "
                    + "HostnameQueueAssignmentPolicy is untested and provided "
                    + "for use at your own risk. Further, changing this policy "
                    + "during a crawl, or between restarts using the same data "
                    + "directory, is likely to cause unrecoverable problems.",
            DEFAULT_QUEUE_ASSIGNMENT_POLICY, queues));
    t.setExpertSetting(true);
    // Register persistent CrawlURI items
    CrawlURI.addAlistPersistentMember(A_CONTENT_STATE_KEY);
    CrawlURI.addAlistPersistentMember(A_TIME_OF_NEXT_PROCESSING);
}
From source file:edu.cornell.mannlib.vitro.webapp.utils.jena.JenaIngestUtils.java
/** * Splits values for a given data property URI on a supplied regex and * asserts each value using newPropertyURI. New statements returned in * a Jena Model. Split values may be optionally trim()ed. * @param inModel//from w w w . ja v a 2 s.com * @param propertyURI * @param splitRegex * @param newPropertyURI * @param trim * @return outModel */ public Model splitPropertyValues(Model inModel, String propertyURI, String splitRegex, String newPropertyURI, boolean trim) { Model outModel = ModelFactory.createDefaultModel(); Pattern delimiterPattern = Pattern.compile(splitRegex); Property theProp = ResourceFactory.createProperty(propertyURI); Property newProp = ResourceFactory.createProperty(newPropertyURI); inModel.enterCriticalSection(Lock.READ); try { StmtIterator stmtIt = inModel.listStatements((Resource) null, theProp, (RDFNode) null); try { while (stmtIt.hasNext()) { Statement stmt = stmtIt.nextStatement(); Resource subj = stmt.getSubject(); RDFNode obj = stmt.getObject(); if (obj.isLiteral()) { Literal lit = (Literal) obj; String unsplitStr = lit.getLexicalForm(); String[] splitPieces = delimiterPattern.split(unsplitStr); for (int i = 0; i < splitPieces.length; i++) { String newLexicalForm = splitPieces[i]; if (trim) { newLexicalForm = newLexicalForm.trim(); } if (newLexicalForm.length() > 0) { Literal newLiteral = null; if (lit.getDatatype() != null) { newLiteral = outModel.createTypedLiteral(newLexicalForm, lit.getDatatype()); } else { if (lit.getLanguage() != null) { newLiteral = outModel.createLiteral(newLexicalForm, lit.getLanguage()); } else { newLiteral = outModel.createLiteral(newLexicalForm); } } outModel.add(subj, newProp, newLiteral); } } } } } finally { stmtIt.close(); } } finally { inModel.leaveCriticalSection(); } return outModel; }
From source file:com.krawler.formbuilder.servlet.workflowHandler.java
/**
 * Builds the XPDL DOM from serialized diagram state. Each entry in the "data"
 * array is a colon-delimited record ("kind:objId:name:x:y:w:h:parent:...");
 * pool and lane records go into poolContainer, everything else (tasks,
 * start/end events) into taskContainer. Afterwards the XPDL Package root and
 * PackageHeader are created and pools/workflow are appended.
 *
 * NOTE(review): splitting on ":" means any field value containing a colon
 * (e.g. an object name) shifts all later fields -- confirm upstream encoding
 * guarantees colon-free values.
 *
 * @param jsonobj     serialized shapes ({"data": [...]})
 * @param containerId id passed through to addPools
 * @param processId   id passed through to addPools/addWorkflow
 * @param linejson    serialized connector lines ({"data": [...]})
 */
private void writeXml(JSONObject jsonobj, String containerId, String processId, JSONObject linejson)
        throws TransformerConfigurationException, TransformerException, JSONException {
    JSONArray jarr = jsonobj.getJSONArray("data");
    JSONArray linearr = linejson.getJSONArray("data");
    String split = ":";
    for (int i = 0; i < jarr.length(); i++) {
        // NOTE(review): the delimiter pattern is recompiled every iteration;
        // it could be hoisted out of the loop (behavior unchanged).
        Pattern p = Pattern.compile(split);
        JSONObject jobj = jarr.getJSONObject(i);
        String id = jobj.getString("id"); // NOTE(review): read but never used
        String value = jobj.getString("value");
        String[] ObjectVal = null;
        ObjectVal = p.split(value);
        if (ObjectVal[0].equals("process-swim")) {
            // Pool record: kind:objId:name:x:y:w:h:parent:refId
            ObjectInfo obj = new ObjectInfo();
            obj.objId = ObjectVal[1];
            obj.name = ObjectVal[2];
            obj.xpos = ObjectVal[3];
            obj.ypos = ObjectVal[4];
            obj.width = ObjectVal[5];
            obj.height = ObjectVal[6];
            obj.parentId = ObjectVal[7];
            obj.type = "Pool";
            obj.handId = "";
            obj.refId = ObjectVal[8];
            this.poolContainer.add(obj);
        } else if (ObjectVal[0].equals("lane-swim")) {
            // Lane record: kind:objId:name:x:y:w:h:parent:processId:refId
            ObjectInfo obj = new ObjectInfo();
            obj.objId = ObjectVal[1];
            obj.name = ObjectVal[2];
            obj.xpos = ObjectVal[3];
            obj.ypos = ObjectVal[4];
            obj.width = ObjectVal[5];
            obj.height = ObjectVal[6];
            obj.parentId = ObjectVal[7];
            obj.processId = ObjectVal[8];
            obj.refId = ObjectVal[9];
            obj.type = "Lane";
            obj.handId = "";
            this.poolContainer.add(obj);
        } else {
            // Task / start-event / end-event record (16 fields).
            ObjectInfo obj = new ObjectInfo();
            if (ObjectVal[0].equals("task-activity")) {
                obj.type = "task";
            } else if (ObjectVal[0].equals("start")) {
                obj.type = "start";
                obj.handId = ObjectVal[7];
            } else if (ObjectVal[0].equals("end")) {
                obj.type = "end";
                obj.handId = ObjectVal[7];
            }
            obj.objId = ObjectVal[1];
            obj.name = ObjectVal[2];
            obj.xpos = ObjectVal[3];
            obj.ypos = ObjectVal[4];
            obj.width = ObjectVal[5];
            obj.height = ObjectVal[6];
            obj.parentId = ObjectVal[7];
            obj.processId = ObjectVal[8];
            obj.refId = ObjectVal[9];
            obj.hasStart = ObjectVal[10];
            obj.hasEnd = ObjectVal[11];
            obj.startRefId = ObjectVal[12];
            obj.endRefId = ObjectVal[13];
            obj.derivationRule = ObjectVal[14];
            obj.domEl = ObjectVal[15];
            this.taskContainer.add(obj);
        }
    }
    // Build the XPDL skeleton: <Package><PackageHeader><XPDLVersion>2.1
    Element rootElet = dom.createElement("Package");
    rootElet.setAttribute("xmlns", "http://www.wfmc.org/2008/XPDL2.1");
    dom.appendChild(rootElet);
    Element ele = dom.createElement("PackageHeader");
    Element childElement = dom.createElement("XPDLVersion");
    Text text = dom.createTextNode("2.1");
    childElement.appendChild(text);
    ele.appendChild(childElement);
    rootElet.appendChild(ele);
    addPools(rootElet, containerId, processId);
    addWorkflow(rootElet, processId, linearr);
}