List of usage examples for java.util.regex.Matcher.toMatchResult()
public MatchResult toMatchResult()
From source file:uk.ac.kcl.at.ElasticGazetteerAcceptanceTest.java
@Test public void deidentificationPerformanceTest() { dbmsTestUtils.createBasicInputTable(); dbmsTestUtils.createBasicOutputTable(); dbmsTestUtils.createDeIdInputTable(); List<Mutant> mutants = testUtils.insertTestDataForDeidentification(env.getProperty("tblIdentifiers"), env.getProperty("tblInputDocs"), mutatortype, true); int totalTruePositives = 0; int totalFalsePositives = 0; int totalFalseNegatives = 0; for (Mutant mutant : mutants) { Set<Pattern> mutatedPatterns = new HashSet<>(); mutant.setDeidentifiedString(elasticGazetteerService.deIdentifyString(mutant.getFinalText(), String.valueOf(mutant.getDocumentid()))); Set<String> set = new HashSet<>(mutant.getOutputTokens()); mutatedPatterns.addAll(//from www.ja v a2 s . c o m set.stream().map(string -> Pattern.compile(Pattern.quote(string), Pattern.CASE_INSENSITIVE)) .collect(Collectors.toSet())); List<MatchResult> results = new ArrayList<>(); for (Pattern pattern : mutatedPatterns) { Matcher matcher = pattern.matcher(mutant.getFinalText()); while (matcher.find()) { results.add(matcher.toMatchResult()); } } int truePositives = getTruePositiveTokenCount(mutant); int falsePositives = getFalsePositiveTokenCount(mutant); int falseNegatives = getFalseNegativeTokenCount(mutant); System.out.println("Doc ID " + mutant.getDocumentid() + " has " + falseNegatives + " unmasked identifiers from a total of " + (falseNegatives + truePositives)); System.out.println("Doc ID " + mutant.getDocumentid() + " has " + falsePositives + " inaccurately masked tokens from a total of " + (falsePositives + truePositives)); System.out.println("TP: " + truePositives + " FP: " + falsePositives + " FN: " + falseNegatives); System.out.println("Doc ID precision " + calcPrecision(falsePositives, truePositives)); System.out.println("Doc ID recall " + calcRecall(falseNegatives, truePositives)); System.out.println(mutant.getDeidentifiedString()); System.out.println(mutant.getFinalText()); System.out.println(mutant.getInputTokens()); 
System.out.println(mutant.getOutputTokens()); System.out.println(); if (env.getProperty("elasticgazetteerTestOutput") != null) { try { try (BufferedWriter bw = new BufferedWriter( new FileWriter(new File(env.getProperty("elasticgazetteerTestOutput") + File.separator + mutant.getDocumentid())))) { bw.write("Doc ID " + mutant.getDocumentid() + " has " + falseNegatives + " unmasked identifiers from a total of " + (falseNegatives + truePositives)); bw.newLine(); bw.write("Doc ID " + mutant.getDocumentid() + " has " + falsePositives + " inaccurately masked tokens from a total of " + (falsePositives + truePositives)); bw.newLine(); bw.write("TP: " + truePositives + " FP: " + falsePositives + " FN: " + falseNegatives); bw.newLine(); bw.write("Doc ID precision " + calcPrecision(falsePositives, truePositives)); bw.newLine(); bw.write("Doc ID recall " + calcRecall(falseNegatives, truePositives)); bw.newLine(); bw.write(mutant.getDeidentifiedString()); bw.newLine(); bw.write(mutant.getFinalText()); bw.newLine(); bw.write(mutant.getInputTokens().toString()); bw.newLine(); bw.write(mutant.getOutputTokens().toString()); } } catch (IOException e) { e.printStackTrace(); } } totalTruePositives += truePositives; totalFalsePositives += falsePositives; totalFalseNegatives += falseNegatives; } DecimalFormat df = new DecimalFormat("#.#"); df.setRoundingMode(RoundingMode.CEILING); System.out.println(); System.out.println(); System.out.println("THIS RUN TP: " + totalTruePositives + " FP: " + totalFalsePositives + " FN: " + totalFalseNegatives); System.out.println("Doc ID precision " + calcPrecision(totalFalsePositives, totalTruePositives)); System.out.println("Doc ID recall " + calcRecall(totalFalseNegatives, totalTruePositives)); System.out.println(totalTruePositives + " & " + totalFalsePositives + " & " + totalFalseNegatives + " & " + df.format(calcPrecision(totalFalsePositives, totalTruePositives)) + " & " + df.format(calcRecall(totalFalseNegatives, totalTruePositives)) + " \\\\"); if 
(env.getProperty("elasticgazetteerTestOutput") != null) { try { try (BufferedWriter bw = new BufferedWriter(new FileWriter( new File(env.getProperty("elasticgazetteerTestOutput") + File.separator + "summary")))) { bw.write("THIS RUN TP: " + totalTruePositives + " FP: " + totalFalsePositives + " FN: " + totalFalseNegatives); bw.newLine(); bw.write("Doc ID precision " + calcPrecision(totalFalsePositives, totalTruePositives)); bw.newLine(); bw.write("Doc ID recall " + calcRecall(totalFalseNegatives, totalTruePositives)); } } catch (IOException e) { e.printStackTrace(); } } }
From source file:uk.ac.kcl.it.DeIdentificationPKPartitionWithoutScheduling.java
/**
 * Measures de-identification quality over a generated set of mutated documents.
 *
 * <p>For every mutant the text is de-identified through {@code elasticGazetteerService}, the
 * true-positive / false-positive / false-negative token counts are computed, and a per-document
 * report is printed. When the {@code elasticgazetteerTestOutput} property is set, the same report
 * is also written to a file named after the document id in that directory, and a {@code summary}
 * file with the run totals is written at the end.
 */
@Test
public void deidentificationPerformanceTest() {
    dbmsTestUtils.createBasicInputTable();
    dbmsTestUtils.createBasicOutputTable();
    dbmsTestUtils.createDeIdInputTable();
    List<Mutant> mutants = testUtils.insertTestDataForDeidentification(env.getProperty("tblIdentifiers"),
            env.getProperty("tblInputDocs"), mutatortype);

    // Optional report directory; null means "console output only".
    String outputDir = env.getProperty("elasticgazetteerTestOutput");

    int totalTruePositives = 0;
    int totalFalsePositives = 0;
    int totalFalseNegatives = 0;
    for (Mutant mutant : mutants) {
        mutant.setDeidentifiedString(elasticGazetteerService.deIdentifyString(mutant.getFinalText(),
                String.valueOf(mutant.getDocumentid())));

        int truePositives = getTruePositiveTokenCount(mutant);
        int falsePositives = getFalsePositiveTokenCount(mutant);
        int falseNegatives = getFalseNegativeTokenCount(mutant);

        System.out.println("Doc ID " + mutant.getDocumentid() + " has " + falseNegatives
                + " unmasked identifiers from a total of " + (falseNegatives + truePositives));
        System.out.println("Doc ID " + mutant.getDocumentid() + " has " + falsePositives
                + " inaccurately masked tokens from a total of " + (falsePositives + truePositives));
        System.out.println("TP: " + truePositives + " FP: " + falsePositives + " FN: " + falseNegatives);
        System.out.println("Doc ID precision " + calcPrecision(falsePositives, truePositives));
        System.out.println("Doc ID recall " + calcRecall(falseNegatives, truePositives));
        System.out.println(mutant.getDeidentifiedString());
        System.out.println(mutant.getFinalText());
        System.out.println(mutant.getInputTokens());
        System.out.println(mutant.getOutputTokens());
        System.out.println();

        if (outputDir != null) {
            // One report file per document, named after the document id.
            try (BufferedWriter bw = new BufferedWriter(
                    new FileWriter(new File(outputDir + File.separator + mutant.getDocumentid())))) {
                bw.write("Doc ID " + mutant.getDocumentid() + " has " + falseNegatives
                        + " unmasked identifiers from a total of " + (falseNegatives + truePositives));
                bw.newLine();
                bw.write("Doc ID " + mutant.getDocumentid() + " has " + falsePositives
                        + " inaccurately masked tokens from a total of " + (falsePositives + truePositives));
                bw.newLine();
                bw.write("TP: " + truePositives + " FP: " + falsePositives + " FN: " + falseNegatives);
                bw.newLine();
                bw.write("Doc ID precision " + calcPrecision(falsePositives, truePositives));
                bw.newLine();
                bw.write("Doc ID recall " + calcRecall(falseNegatives, truePositives));
                bw.newLine();
                bw.write(mutant.getDeidentifiedString());
                bw.newLine();
                bw.write(mutant.getFinalText());
                bw.newLine();
                bw.write(mutant.getInputTokens().toString());
                bw.newLine();
                bw.write(mutant.getOutputTokens().toString());
            } catch (IOException e) {
                // Report files are best-effort; a write failure must not abort the measurement run.
                e.printStackTrace();
            }
        }

        totalTruePositives += truePositives;
        totalFalsePositives += falsePositives;
        totalFalseNegatives += falseNegatives;
    }

    System.out.println();
    System.out.println();
    System.out.println("THIS RUN TP: " + totalTruePositives + " FP: " + totalFalsePositives + " FN: "
            + totalFalseNegatives);
    System.out.println("Doc ID precision " + calcPrecision(totalFalsePositives, totalTruePositives));
    System.out.println("Doc ID recall " + calcRecall(totalFalseNegatives, totalTruePositives));

    if (outputDir != null) {
        try (BufferedWriter bw = new BufferedWriter(
                new FileWriter(new File(outputDir + File.separator + "summary")))) {
            bw.write("THIS RUN TP: " + totalTruePositives + " FP: " + totalFalsePositives + " FN: "
                    + totalFalseNegatives);
            bw.newLine();
            bw.write("Doc ID precision " + calcPrecision(totalFalsePositives, totalTruePositives));
            bw.newLine();
            bw.write("Doc ID recall " + calcRecall(totalFalseNegatives, totalTruePositives));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
From source file:org.nanocom.console.formatter.OutputFormatter.java
/** * Tries to create new style instance from string. * * @param string// ww w. ja v a2s. c o m * * @return Null if string is not format string */ private OutputFormatterStyle createStyleFromString(String string) { Matcher matcher = STYLE_PATTERN.matcher(string.toLowerCase()); OutputFormatterStyle style = new OutputFormatterStyle(); MatchResult result; if (!matcher.find()) { return null; } do { result = matcher.toMatchResult(); String match1 = result.group(1); // fg String match2 = result.group(2); // blue if ("fg".equals(result.group(1))) { style.setForeground(result.group(2)); } else if ("bg".equals(result.group(1))) { style.setBackground(result.group(2)); } else { style.setOption(result.group(2)); } } while (matcher.find()); return style; }
From source file:at.ac.tuwien.big.testsuite.impl.validator.XhtmlValidator.java
/**
 * Validates a file by uploading it to the validator at {@code W3C_XHTML_VALIDATOR_URL} and
 * converting the returned HTML report into validation result entries.
 *
 * <p>An error entry is added when the reported doctype is neither "XHTML 1.1" nor contains
 * "XHTML+ARIA 1.0". Children of the element with id "warnings" (except those whose id is in
 * {@code IGNORED_MESSAGES}) and children of the element with id "error_loop" are each turned
 * into one entry. For exercise {@code TestsuiteConstants.EX_ID_LAB3}, "input not allowed here"
 * errors that point at a line matching one of the hidden-input patterns are skipped.
 *
 * @param fileToValidate the file that is uploaded and validated
 * @param exerciseId the exercise id; only compared against {@code TestsuiteConstants.EX_ID_LAB3}
 * @return a result named "XHTML Validation" carrying all collected entries
 * @throws Exception if the request, the report parsing or reading the file fails
 */
@Override
public ValidationResult validate(File fileToValidate, String exerciseId) throws Exception {
    HttpPost request = new HttpPost(W3C_XHTML_VALIDATOR_URL);
    List<ValidationResultEntry> validationResultEntries = new ArrayList<>();

    try {
        // Build the multipart upload form for the validator.
        MultipartEntity multipartEntity = new MultipartEntity();
        multipartEntity.addPart("uploaded_file", new FileBody(fileToValidate, "text/html"));
        multipartEntity.addPart("charset", new StringBody("(detect automatically)"));
        multipartEntity.addPart("doctype", new StringBody("Inline"));
        multipartEntity.addPart("group", new StringBody("0"));

        request.setEntity(multipartEntity);
        Document doc = httpClient.execute(request, new DomResponseHandler(httpClient, request));

        // The doctype guessed by the validator is scraped from a fixed table cell of the report.
        // NOTE(review): doctype.trim() would throw if textByXpath can return null - confirm.
        String doctype = DomUtils.textByXpath(doc.getDocumentElement(),
                "//form[@id='form']/table//tr[4]/td[1]");

        if (!"XHTML 1.1".equals(doctype.trim()) && !doctype.contains("XHTML+ARIA 1.0")) {
            validationResultEntries.add(new DefaultValidationResultEntry("Doctype Validation",
                    "The given document is not XHTML 1.1 compatible, instead the guessed doctype is '"
                            + doctype + "'",
                    ValidationResultEntryType.ERROR));
        }

        // NOTE(review): this document is created lazily below but never read in this method.
        Document fileToValidateDocument = null;

        // --- Warnings: one entry per child of the "warnings" container ---
        Element warningsContainer = DomUtils.byId(doc.getDocumentElement(), "warnings");

        if (warningsContainer != null) {
            for (Element warningChildElement : DomUtils.asList(warningsContainer.getChildNodes())) {
                if (IGNORED_MESSAGES.contains(warningChildElement.getAttribute("id"))) {
                    continue;
                }

                ValidationResultEntryType type = getEntryType(warningChildElement.getAttribute("class"));
                String title = getTitle(
                        DomUtils.firstByClass(warningChildElement.getElementsByTagName("span"), "msg"));
                StringBuilder descriptionSb = new StringBuilder();

                // The description is the concatenated text of every <p> after the first one.
                for (Element descriptionElement : DomUtils.listByXpath(warningChildElement,
                        ".//p[position()>1]")) {
                    descriptionSb.append(descriptionElement.getTextContent());
                }

                validationResultEntries
                        .add(new DefaultValidationResultEntry(title, descriptionSb.toString(), type));
            }
        }

        // --- Errors: one entry per child of the "error_loop" container ---
        Element errorsContainer = DomUtils.byId(doc.getDocumentElement(), "error_loop");

        if (errorsContainer != null) {
            for (Element errorChildElement : DomUtils.asList(errorsContainer.getChildNodes())) {
                ValidationResultEntryType type = getEntryType(errorChildElement.getAttribute("class"));
                StringBuilder titleSb = new StringBuilder();
                NodeList errorEms = errorChildElement.getElementsByTagName("em");

                // The title is "<first em text>: <msg span text>" when an <em> exists.
                if (errorEms.getLength() > 0) {
                    titleSb.append(getTitle((Element) errorEms.item(0)));
                    titleSb.append(": ");
                }

                titleSb.append(
                        getTitle(DomUtils.firstByClass(errorChildElement.getElementsByTagName("span"), "msg")));
                StringBuilder descriptionSb = new StringBuilder();

                for (Element descriptionElement : DomUtils.listByXpath(errorChildElement, ".//div/p")) {
                    descriptionSb.append(descriptionElement.getTextContent());
                }

                String title = titleSb.toString();

                if (TestsuiteConstants.EX_ID_LAB3.equals(exerciseId)) {
                    // This is more a hack than anything else, but we have to ignore the errors that were produced by
                    // JSF-specific artifacts. We basically extract the line and column number from the reported errors,
                    // look for the 2 elements that match these numbers and check if they really are the input elements
                    // produced by forms that can't be wrapped by block containers. More specifically we check for inputs
                    // with type hidden: one is for the ViewState of JSF and the other is for recognition of the form
                    // that was submitted.
                    Matcher matcher = LINE_AND_COLUMN_NUMBER_PATTERN.matcher(title);

                    if (title.contains("document type does not allow element \"input\" here")
                            && matcher.matches()) {
                        if (fileToValidateDocument == null) {
                            fileToValidateDocument = DomUtils.createDocument(fileToValidate);
                        }

                        boolean excludeEntry = false;
                        // Group 1 is used as the line number, group 2 as the column number.
                        int expectedLineNumber = Integer.parseInt(matcher.group(1));
                        int expectedColumnNumber = Integer.parseInt(matcher.group(2));

                        // NOTE(review): InputStreamReader without a charset decodes with the platform
                        // default - confirm this matches the encoding of the validated files.
                        try (BufferedReader reader = new BufferedReader(
                                new InputStreamReader(new FileInputStream(fileToValidate)))) {
                            String line;

                            while ((line = reader.readLine()) != null) {
                                // Counts down to the reported line; only that single line is inspected.
                                if (--expectedLineNumber == 0) {
                                    Matcher lineMatcher = HIDDEN_FORM_INPUT_PATTERN.matcher(line);

                                    if (lineMatcher.matches()) {
                                        MatchResult matchResult = lineMatcher.toMatchResult();
                                        // Exclude when the reported column lies within group 1 of the match.
                                        if (matchResult.start(1) <= expectedColumnNumber
                                                && matchResult.end(1) >= expectedColumnNumber) {
                                            excludeEntry = true;
                                            break;
                                        }
                                    }

                                    lineMatcher = HIDDEN_VIEW_STATE_INPUT_PATTERN.matcher(line);

                                    if (lineMatcher.matches()) {
                                        MatchResult matchResult = lineMatcher.toMatchResult();
                                        if (matchResult.start(1) <= expectedColumnNumber
                                                && matchResult.end(1) >= expectedColumnNumber) {
                                            excludeEntry = true;
                                            break;
                                        }
                                    }

                                    // Neither hidden-input pattern covered the column: keep the error.
                                    System.out.println("Could not match potential wrong error.");
                                    break;
                                }
                            }
                        }

                        if (excludeEntry) {
                            // Skip adding an entry for this error.
                            continue;
                        }
                    }
                }

                validationResultEntries
                        .add(new DefaultValidationResultEntry(title, descriptionSb.toString(), type));
            }
        }
    } finally {
        // Always release the HTTP connection, even when parsing fails.
        request.releaseConnection();
    }

    return new DefaultValidationResult("XHTML Validation", fileToValidate.getName(),
            new DefaultValidationResultType("XHTML"), validationResultEntries);
}
From source file:org.cosmo.common.util.Util.java
/**
 * Wraps every case-insensitive occurrence of {@code keyword} in {@code src} with
 * {@code prefix} and {@code suffix}, keeping the matched text exactly as it appears in
 * {@code src}.
 *
 * <p>{@code keyword} is compiled as a regular expression. The matched text, {@code prefix}
 * and {@code suffix} are inserted literally: {@code $} and {@code \} in them are not
 * interpreted as group references or escapes.
 *
 * @param src the text to search
 * @param keyword the regular expression to look for (matched case-insensitively)
 * @param prefix literal text inserted before each match
 * @param suffix literal text inserted after each match
 * @return {@code src} with every match surrounded by {@code prefix} and {@code suffix}
 * @throws java.util.regex.PatternSyntaxException if {@code keyword} is not a valid regex
 */
public static String replaceKeyword(String src, String keyword, String prefix, String suffix) {
    Pattern pattern = Pattern.compile(keyword, Pattern.CASE_INSENSITIVE);
    Matcher source = pattern.matcher(src);
    StringBuffer sb = new StringBuffer();
    while (source.find()) {
        // appendReplacement treats '$' and '\' specially, so the replacement must be quoted;
        // otherwise text such as "$5" is read as a reference to capture group 5.
        source.appendReplacement(sb, Matcher.quoteReplacement(prefix + source.group() + suffix));
    }
    source.appendTail(sb);
    return sb.toString();
}
From source file:com.log4ic.compressor.utils.Compressor.java
/** * URL?//from ww w .ja v a2 s . c o m * * @param code * @param fileUrl * @param type * @param fileDomain * @return */ public static String fixUrlPath(HttpServletRequest req, String code, String fileUrl, FileType type, String fileDomain) { StringBuilder codeBuffer = new StringBuilder(); switch (type) { case GSS: case CSS: case LESS: case MSS: logger.debug("URL?..."); Pattern pattern = Pattern.compile("url\\(\\s*(?!['\"]?(?:data:|about:|#|@))([^)]+)\\)", Pattern.CASE_INSENSITIVE); Matcher matcher = pattern.matcher(code); String[] codeFragments = pattern.split(code); fileUrl = fileUrl.substring(0, fileUrl.lastIndexOf("/") + 1); int i = 0; while (matcher.find()) { codeBuffer.append(codeFragments[i]); codeBuffer.append("url("); MatchResult result = matcher.toMatchResult(); String url = result.group(1).replaceAll("'|\"", ""); //??? if (!HttpUtils.isHttpProtocol(url) && !url.startsWith("/")) { url = URI.create(fileUrl + url).normalize().toASCIIString();//?URL } //??url?http?)?????? if (StringUtils.isNotBlank(fileDomain) && !HttpUtils.isHttpProtocol(url)) { if (!fileDomain.endsWith("/") && !url.startsWith("/")) { fileDomain = fileDomain + "/"; } else if (fileDomain.endsWith("/") && url.startsWith("/")) { url = url.substring(1); } if (!HttpUtils.isHttpProtocol(fileDomain)) { fileDomain = "http://" + fileDomain; } url = fileDomain + url; } else { url = req.getContextPath() + (url.startsWith("/") ? url : "/" + url); } codeBuffer.append(url); codeBuffer.append(")"); i++; } if (i == 0) { return code; } else { if (codeFragments.length > i && StringUtils.isNotBlank(codeFragments[i])) { codeBuffer.append(codeFragments[i]); } } logger.debug("URL?..."); break; default: return code; } return codeBuffer.toString(); }
From source file:org.apache.jackrabbit.core.security.authorization.acl.CachingPentahoEntryCollector.java
/**
 * Invokes {@code callable} on every {@code EntryCache} stored in the session region cache
 * under a key that matches {@code SESSION_KEY_PATTERN} and whose first group equals
 * {@code ENTRY_COLLECTOR}.
 *
 * @param callable the action to run against each matching entry cache
 */
private void performAgainstAllInCache(CacheCallable callable) {
    Set<?> sessionKeys = cacheManager.getAllKeysFromRegionCache(ICacheManager.SESSION);
    if (sessionKeys == null) {
        // presumably the cache manager may report "no such region" as null - nothing to do then
        return;
    }
    for (Object compositeKey : sessionKeys) {
        Matcher matcher = SESSION_KEY_PATTERN.matcher(compositeKey.toString());
        if (!matcher.matches() || !ENTRY_COLLECTOR.equals(matcher.group(1))) {
            continue;
        }
        Object fromRegionCache = cacheManager.getFromRegionCache(ICacheManager.SESSION, compositeKey);
        // instanceof is null-safe: the lookup can come back empty for a key that was listed
        // a moment ago, and that must not fail with a NullPointerException.
        if (fromRegionCache instanceof EntryCache) {
            callable.call((EntryCache) fromRegionCache);
        }
    }
}
From source file:com.edgenius.wiki.render.filter.MacroFilter.java
/**
 * Renders the macros contained in {@code input}.
 *
 * <p>First {@code singleMacroProvider} visits every match and hands it to {@code handleMacro},
 * which also receives the {@code pairRegion} list. Afterwards the collected regions are paired
 * up (same content, compatible keys), each paired span is run through
 * {@code pairedMacroProvider}, and finally the processed spans are substituted back into the
 * text from the last region to the first.
 *
 * @param input the text to filter
 * @param context the render context passed through to {@code handleMacro}
 * @return the filtered text
 */
public String filter(String input, final RenderContext context) {

    final List<Region> pairRegion = new ArrayList<Region>();

    // Pass 1: single macros. NOTE(review): pairRegion is presumably filled by handleMacro
    // with the regions that may be one half of a paired macro - confirm in handleMacro.
    String result = singleMacroProvider.replaceByTokenVisitor(input, new TokenVisitor<Matcher>() {
        public void handleMatch(StringBuffer buffer, Matcher matcher) {
            handleMacro(false, buffer, matcher.toMatchResult(), context, pairRegion, null);
        }
    });

    // Scenarios (s = start, u = unknown - the macro has no attributes, so it could be a start or an end):
    // A B B A - B is processed recursively inside A
    // A B A B - B is treated as invalid - ignored
    // Au Au Au Au - the first 2 Au are paired, the last 2 Au are paired
    // As1 As2 Au2 Au1 - OK, this case is special and cannot be processed by the original paired regex pattern,
    //                   because that would treat As1 and Au2 as paired but ignore As2 and Au1.
    //                   Here we just try to resolve this problem.
    // Exceptions:
    // As1 As2 Au1 - As1 cannot find its pair and is not processed, but As2 can match with Au1

    int size = pairRegion.size();
    if (size > 0) {
        StringBuffer inputBuf = new StringBuffer(result);
        for (int idx = 0; idx < size; idx++) {
            Region reg = pairRegion.get(idx);
            // Nesting depth: number of same-named start macros seen that still need their own end.
            int deep = 0;
            Region pair = null;
            // Look for the matching pair among the following regions...
            for (int chIdx = idx + 1; chIdx < size; chIdx++) {
                Region next = pairRegion.get(chIdx);
                if (StringUtils.equalsIgnoreCase(reg.getContent(), next.getContent())) {
                    // The start is unknown (no attribute), so the end must be unknown as well.
                    if (MACRO_REGION_KEY_UNKNOWN.equals(reg.getKey())
                            && MACRO_REGION_KEY_UNKNOWN.equals(next.getKey())) {
                        // matched
                        pair = next;
                        // Skip all inner regions - they are handled by the embedded recursion.
                        idx = chIdx;
                        break;
                    }

                    if (MACRO_REGION_KEY_START.equals(reg.getKey())
                            && MACRO_REGION_KEY_UNKNOWN.equals(next.getKey())) {
                        if (deep == 0) {
                            // matched
                            pair = next;
                            // Skip all inner regions - they are handled by the embedded recursion.
                            idx = chIdx;
                            break;
                        } else {
                            // This end closes an inner macro of the same name; decrease the depth.
                            deep--;
                        }
                    }
                    if (MACRO_REGION_KEY_START.equals(next.getKey())) {
                        // Another start of the same name (4th scenario above); increase the depth.
                        deep++;
                    }
                }
            }
            // A pair was found.
            if (pair != null) {
                // The span from the start of this region to the end of its pair.
                CharSequence macroContent = inputBuf.subSequence(reg.getStart(), pair.getEnd());
                // For example, for {font:size=12}abc{font} the value is the start markup string, i.e. {font:size=12}.
                // So far this text is used to display the markup if it has errors - which needs highlighting and
                // hover text.
                // NOTE(review): the value is read from the paired (later) region's body - confirm it holds the
                // start markup.
                final String pairStartMarkup = pair.getBody();
                result = pairedMacroProvider.replaceByTokenVisitor(macroContent, new TokenVisitor<Matcher>() {
                    public void handleMatch(StringBuffer buffer, Matcher matcher) {
                        handleMacro(true, buffer, matcher.toMatchResult(), context, null, pairStartMarkup);
                    }
                });
                // Remember the rendered text and the pair; the substitution happens in the loop below.
                reg.setBody(result);
                reg.setSubRegion(pair);
                reg.setKey(MACRO_REGION_KEY_PROCESSED);
            }
        }

        // Walk the regions in reverse and replace each processed span with its rendered body
        // (region.getBody()).
        for (int idx = size - 1; idx >= 0; idx--) {
            Region reg = pairRegion.get(idx);
            if (!MACRO_REGION_KEY_PROCESSED.equals(reg.getKey()))
                continue;
            inputBuf.replace(reg.getStart(), reg.getSubRegion().getEnd(), reg.getBody());
        }

        return inputBuf.toString();
    }
    return result;
}
From source file:org.openhab.binding.lutron.internal.net.TelnetSession.java
/**
 * Blocks until the last (possibly incomplete) line held in {@code charBuffer} contains a
 * match for {@code prompt}, or until the timeout elapses.
 *
 * <p>While waiting, everything before the last line is dropped from the buffer. When the
 * method returns, the buffer has been cleared.
 *
 * @param prompt regular expression to look for in the last line of the buffered data
 * @param timeout maximum time to wait in milliseconds; zero or a negative value means wait
 *        without a time limit
 * @return the result of the last match attempt; if the timeout elapsed first, this is the
 *         result of a failed match
 * @throws InterruptedException if the thread is interrupted while waiting on the buffer
 */
public MatchResult waitFor(String prompt, long timeout) throws InterruptedException {
    Pattern regex = Pattern.compile(prompt);
    boolean timed = timeout > 0;
    long startTime = timed ? System.currentTimeMillis() : 0;

    synchronized (this.charBuffer) {
        this.charBuffer.flip();
        String lastLine = lastLineOf(this.charBuffer.toString());
        Matcher matcher = regex.matcher(lastLine);

        while (!matcher.find()) {
            long elapsed = timed ? (System.currentTimeMillis() - startTime) : 0;
            if (timed && elapsed >= timeout) {
                break;
            }

            // Keep only the unfinished last line, then wait for more data to arrive.
            this.charBuffer.clear();
            this.charBuffer.put(lastLine);

            // Object.wait rejects negative values, so a non-positive timeout is mapped to 0
            // ("no time limit"), consistent with the checks above.
            this.charBuffer.wait(timed ? timeout - elapsed : 0);

            this.charBuffer.flip();
            lastLine = lastLineOf(this.charBuffer.toString());
            matcher = regex.matcher(lastLine);
        }

        this.charBuffer.clear();
        return matcher.toMatchResult();
    }
}

/** Returns the text after the last line feed in {@code data}, or all of it if there is none. */
private static String lastLineOf(String data) {
    int n = data.lastIndexOf('\n');
    return n != -1 ? data.substring(n + 1) : data;
}
From source file:gate.creole.splitter.RegexSentenceSplitter.java
@Override public void execute() throws ExecutionException { interrupted = false;/*from www . j a v a2s . co m*/ int lastProgress = 0; fireProgressChanged(lastProgress); //get pointers to the annotation sets AnnotationSet outputAS = (outputASName == null || outputASName.trim().length() == 0) ? document.getAnnotations() : document.getAnnotations(outputASName); String docText = document.getContent().toString(); /* If the document's content is empty or contains only whitespace, * we drop out right here, since there's nothing to sentence-split. */ if (docText.trim().length() < 1) { return; } Matcher internalSplitMatcher = internalSplitsPattern.matcher(docText); Matcher externalSplitMatcher = externalSplitsPattern.matcher(docText); Matcher nonSplitMatcher = nonSplitsPattern.matcher(docText); //store all non split locations in a list of pairs List<int[]> nonSplits = new LinkedList<int[]>(); while (nonSplitMatcher.find()) { nonSplits.add(new int[] { nonSplitMatcher.start(), nonSplitMatcher.end() }); } //this lists holds the next matches at each step List<MatchResult> nextSplitMatches = new ArrayList<MatchResult>(); //initialise matching process MatchResult internalMatchResult = null; if (internalSplitMatcher.find()) { internalMatchResult = internalSplitMatcher.toMatchResult(); nextSplitMatches.add(internalMatchResult); } MatchResult externalMatchResult = null; if (externalSplitMatcher.find()) { externalMatchResult = externalSplitMatcher.toMatchResult(); nextSplitMatches.add(externalMatchResult); } MatchResultComparator comparator = new MatchResultComparator(); int lastSentenceEnd = 0; while (!nextSplitMatches.isEmpty()) { //see which one matches first Collections.sort(nextSplitMatches, comparator); MatchResult nextMatch = nextSplitMatches.remove(0); if (nextMatch == internalMatchResult) { //we have a new internal split; see if it's vetoed or not if (!veto(nextMatch, nonSplits)) { //split is not vetoed try { //add the split annotation FeatureMap features = 
Factory.newFeatureMap(); features.put("kind", "internal"); outputAS.add(new Long(nextMatch.start()), new Long(nextMatch.end()), "Split", features); //generate the sentence annotation int endOffset = nextMatch.end(); //find the first non whitespace character starting from where the //last sentence ended while (lastSentenceEnd < endOffset && Character.isWhitespace(Character.codePointAt(docText, lastSentenceEnd))) { lastSentenceEnd++; } //if there is any useful text between the two offsets, generate //a new sentence if (lastSentenceEnd < nextMatch.start()) { outputAS.add(new Long(lastSentenceEnd), new Long(endOffset), ANNIEConstants.SENTENCE_ANNOTATION_TYPE, Factory.newFeatureMap()); } //store the new sentence end lastSentenceEnd = endOffset; } catch (InvalidOffsetException e) { // this should never happen throw new ExecutionException(e); } } //prepare for next step if (internalSplitMatcher.find()) { internalMatchResult = internalSplitMatcher.toMatchResult(); nextSplitMatches.add(internalMatchResult); } else { internalMatchResult = null; } } else if (nextMatch == externalMatchResult) { //we have a new external split; see if it's vetoed or not if (!veto(nextMatch, nonSplits)) { //split is not vetoed try { //generate the split FeatureMap features = Factory.newFeatureMap(); features.put("kind", "external"); outputAS.add(new Long(nextMatch.start()), new Long(nextMatch.end()), "Split", features); //generate the sentence annotation //find the last non whitespace character, going backward from //where the external skip starts int endOffset = nextMatch.start(); while (endOffset > lastSentenceEnd && Character.isSpaceChar(Character.codePointAt(docText, endOffset - 1))) { endOffset--; } //find the first non whitespace character starting from where the //last sentence ended while (lastSentenceEnd < endOffset && Character.isSpaceChar(Character.codePointAt(docText, lastSentenceEnd))) { lastSentenceEnd++; } //if there is any useful text between the two offsets, generate //a new 
sentence if (lastSentenceEnd < endOffset) { outputAS.add(new Long(lastSentenceEnd), new Long(endOffset), ANNIEConstants.SENTENCE_ANNOTATION_TYPE, Factory.newFeatureMap()); } //store the new sentence end lastSentenceEnd = nextMatch.end(); } catch (InvalidOffsetException e) { // this should never happen throw new ExecutionException(e); } } //prepare for next step if (externalSplitMatcher.find()) { externalMatchResult = externalSplitMatcher.toMatchResult(); nextSplitMatches.add(externalMatchResult); } else { externalMatchResult = null; } } else { //malfunction throw new ExecutionException("Invalid state - cannot identify match!"); } //report progress int newProgress = 100 * lastSentenceEnd / docText.length(); if (newProgress - lastProgress > 20) { lastProgress = newProgress; fireProgressChanged(lastProgress); } } //while(!nextMatches.isEmpty()){ fireProcessFinished(); }