List of usage examples for java.util.regex Matcher start
public int start()
From source file:me.Wundero.Ray.utils.TextUtils.java
/**
 * Splits a text at every match of a regular expression.
 *
 * <p>Only the text's own literal content is scanned here; children are split
 * recursively and their pieces are stitched onto the pieces produced for the
 * parent. Every piece built from the parent's content is given the parent's
 * format and passed through {@code apply(b, text)}.
 *
 * <p>Early exits, as implemented:
 * <ul>
 * <li>{@code lit(t)} is false: the text is returned unsplit.</li>
 * <li>The pattern is not found in the content: the text is returned unsplit
 * (children are not examined in this case).</li>
 * <li>The pattern matches the whole content: an empty list if {@code skip},
 * otherwise the text itself.</li>
 * </ul>
 *
 * @param t    the text to split
 * @param p    the pattern marking the split points
 * @param skip if {@code true} the matched separators are dropped, otherwise each
 *             match is emitted as its own piece
 * @return the resulting pieces, in order
 */
public static List<Text> split(Text t, Pattern p, boolean skip) {
    if (!lit(t)) {
        return Utils.al(t);
    }
    List<Text> out = Utils.al();
    List<Text> children = t.getChildren();
    LiteralText.Builder text = ((LiteralText) t).toBuilder();
    String content = text.getContent();
    if (!p.matcher(content).find()) {
        return Utils.al(t);
    }
    if (p.matcher(content).matches()) {
        return skip ? Utils.al() : Utils.al(t);
    }
    Matcher m = p.matcher(content);
    while ((m = m.reset(content)).find()) {
        int s = m.start();
        int e = m.end();
        if (e == 0) {
            // Empty match at the very start: nothing would be consumed, so the
            // loop could never make progress. Stop and keep the rest as one piece.
            break;
        }
        Text.Builder b = Text.builder(content.substring(0, s)).format(text.getFormat());
        b = apply(b, text);
        out.add(b.build());
        if (!skip) {
            b = Text.builder(m.group()).format(text.getFormat());
            b = apply(b, text);
            out.add(b.build());
        }
        // Continue with what FOLLOWS the match. The previous code kept
        // substring(0, e), i.e. the part already processed, and therefore
        // matched the same separator forever.
        content = content.substring(e);
    }
    if (!content.isEmpty()) {
        Text.Builder b = Text.builder(content).format(text.getFormat());
        b = apply(b, text);
        out.add(b.build());
    }
    // The last parent piece stays "open" so that leading child pieces can be
    // appended to it.
    Text.Builder tx = out.get(out.size() - 1).toBuilder();
    out.remove(out.size() - 1);
    for (Text child : children) {
        List<Text> lt = split(child, p, skip);
        if (lt.isEmpty()) {
            // The child was nothing but a skipped separator: close the open piece.
            // tx may already be null when the previous child was skipped as well.
            if (tx != null) {
                out.add(tx.build());
                tx = null;
            }
        } else if (lt.size() == 1) {
            tx = tx == null ? lt.get(0).toBuilder() : tx.append(lt.get(0));
        } else {
            // First child piece closes the open piece, middle pieces stand alone,
            // the last one becomes the new open piece.
            out.add(tx == null ? lt.get(0) : tx.append(lt.get(0)).build());
            for (int i = 1; i < lt.size() - 1; i++) {
                out.add(lt.get(i));
            }
            tx = lt.get(lt.size() - 1).toBuilder();
        }
    }
    if (tx != null) {
        out.add(tx.build());
    }
    return out;
}
From source file:com.novartis.opensource.yada.plugin.Gatekeeper.java
/** * Modifies the original query by appending a dynamic predicate * <p>Recall the {@link Service#engagePreprocess} method * will recall {@link QueryManager#endowQuery} to * reconform the code after this {@link Preprocess} * disengages./*from w w w. ja v a 2 s. c o m*/ * * * @throws YADASecurityException when token retrieval fails */ @Override public void applyContentPolicy() throws YADASecurityException { // TODO make it impossible to reset args and preargs dynamically if pl class implements SecurityPolicy // this will close an attack vector String SPACE = " "; StringBuilder contentPolicy = new StringBuilder(); Pattern rxInjection = Pattern.compile(RX_COL_INJECTION); String rawPolicy = getArgumentValue(CONTENT_POLICY_PREDICATE); Matcher m1 = rxInjection.matcher(rawPolicy); int start = 0; // field = getToken // field = getCookie(string) // field = getHeader(string) // field = getUser() // field = getRandom(string) if (!m1.find()) { String msg = "Unathorized. Injected method invocation failed."; throw new YADASecurityException(msg); } m1.reset(); while (m1.find()) { int rxStart = m1.start(); int rxEnd = m1.end(); contentPolicy.append(rawPolicy.substring(start, rxStart)); String frag = rawPolicy.substring(rxStart, rxEnd); String method = frag.substring(0, frag.indexOf('(')); String arg = frag.substring(frag.indexOf('(') + 1, frag.indexOf(')')); Object val = null; try { if (arg.equals("")) val = getClass().getMethod(method).invoke(this, new Object[] {}); else val = getClass().getMethod(method, new Class[] { java.lang.String.class }).invoke(this, new Object[] { arg }); } catch (NoSuchMethodException | SecurityException | IllegalAccessException | IllegalArgumentException | InvocationTargetException e) { String msg = "Unathorized. 
Injected method invocation failed."; throw new YADASecurityException(msg, e); } contentPolicy.append((String) val + SPACE); start = rxEnd; } Expression parsedContentPolicy; try { parsedContentPolicy = CCJSqlParserUtil.parseCondExpression(contentPolicy.toString()); } catch (JSQLParserException e) { String msg = "Unauthorized. Content policy is not valid."; throw new YADASecurityException(msg, e); } PlainSelect sql = (PlainSelect) ((Select) getYADAQuery().getStatement()).getSelectBody(); Expression where = sql.getWhere(); if (where != null) { AndExpression and = new AndExpression(where, parsedContentPolicy); sql.setWhere(and); } else { sql.setWhere(parsedContentPolicy); } try { CCJSqlParserManager parserManager = new CCJSqlParserManager(); sql = (PlainSelect) ((Select) parserManager.parse(new StringReader(sql.toString()))).getSelectBody(); } catch (JSQLParserException e) { String msg = "Unauthorized. Content policy is not valid."; throw new YADASecurityException(msg, e); } getYADAQuery().setCoreCode(sql.toString()); this.clearSecurityPolicy(); }
From source file:com.hp.autonomy.frontend.reports.powerpoint.PowerPointServiceImpl.java
/**
 * Internal implementation to add a list of documents to a presentation; either as a single slide or a series of slides.
 *
 * <p>Each document is laid out top to bottom as an optional icon followed by a text box holding the title,
 * optional date, optional reference and the summary; text wrapped in
 * {@code <HavenSearch-QueryText-Placeholder>} tags inside the summary is rendered bold. When an element
 * overflows the bottom of {@code anchor} and it is not the only element on the slide, its shapes are removed
 * and the same document is laid out again on a fresh slide (if {@code paginate}); if pagination is off,
 * rendering stops at that point.
 *
 * @param imageSource the image source to convert images to data.
 * @param ppt the presentation to add to.
 * @param sl the slide to add to (can be null if pagination is enabled).
 * @param anchor bounding rectangle to draw onto, in PowerPoint coordinates.
 * @param paginate whether to render results as multiple slides if they don't fit on one slide.
 * @param data the documents to render.
 * @param results optional string to render into the top-left corner of the available space.
 *                Will appear on each page if pagination is enabled.
 * @param sortBy optional string to render into the top-right corner of the available space.
 *               Will appear on each page if pagination is enabled.
 */
private static void addList(final ImageSource imageSource, final XMLSlideShow ppt, XSLFSlide sl,
        final Rectangle2D.Double anchor, final boolean paginate, final ListData data, final String results,
        final String sortBy) {
    final double
    // How much space to leave at the left and right edge of the slide
    xMargin = 20,
            // How much space to leave at the top
            yMargin = 5,
            // Size of the icon
            iconWidth = 20, iconHeight = 24,
            // Find's thumbnail height is 97px by 55px, hardcoded in the CSS in .document-thumbnail
            thumbScale = 0.8, thumbW = 97 * thumbScale, thumbH = 55 * thumbScale,
            // Margin around the thumbnail
            thumbMargin = 4.,
            // Space between list items
            listItemMargin = 5.;

    // Group 1 is the highlighted text between the placeholder tags (reluctant, so adjacent highlights stay separate).
    final Pattern highlightPattern = Pattern
            .compile("<HavenSearch-QueryText-Placeholder>(.*?)</HavenSearch-QueryText-Placeholder>");

    // Current drawing position; xCursor is reset to the left margin after every element.
    double yCursor = yMargin + anchor.getMinY(), xCursor = xMargin + anchor.getMinX();

    // Number of list elements drawn on the current slide, including the one being laid out.
    int docsOnPage = 0;

    final Document[] docs = data.getDocs();
    for (int docIdx = 0; docIdx < docs.length; ++docIdx) {
        final Document doc = docs[docIdx];

        if (sl == null) {
            // Start a new slide: reset the cursors and draw the optional header strings.
            sl = ppt.createSlide();
            yCursor = yMargin + anchor.getMinY();
            xCursor = xMargin + anchor.getMinX();
            docsOnPage = 0;

            // Height consumed by the header row (the taller of the two header text boxes).
            double yStep = 0;

            if (StringUtils.isNotBlank(results)) {
                final XSLFTextBox textBox = sl.createTextBox();
                textBox.clearText();
                final Rectangle2D.Double textBounds = new Rectangle2D.Double(xCursor, yCursor,
                        Math.max(0, anchor.getMaxX() - xCursor - xMargin), 20);
                textBox.setAnchor(textBounds);

                addTextRun(textBox.addNewTextParagraph(), results, 12., Color.LIGHT_GRAY);

                yStep = textBox.getTextHeight();
            }

            if (StringUtils.isNotBlank(sortBy)) {
                // Same bounds as the results box, but right-aligned.
                final XSLFTextBox sortByEl = sl.createTextBox();
                sortByEl.clearText();
                final XSLFTextParagraph sortByText = sortByEl.addNewTextParagraph();
                sortByText.setTextAlign(TextParagraph.TextAlign.RIGHT);

                addTextRun(sortByText, sortBy, 12., Color.LIGHT_GRAY);

                sortByEl.setAnchor(new Rectangle2D.Double(xCursor, yCursor,
                        Math.max(0, anchor.getMaxX() - xCursor - xMargin), 20));

                yStep = Math.max(sortByEl.getTextHeight(), yStep);
            }

            if (yStep > 0) {
                yCursor += listItemMargin + yStep;
            }
        }

        XSLFAutoShape icon = null;
        if (data.isDrawIcons()) {
            icon = sl.createAutoShape();
            icon.setShapeType(ShapeType.SNIP_1_RECT);
            icon.setAnchor(new Rectangle2D.Double(xCursor, yCursor + listItemMargin, iconWidth, iconHeight));
            icon.setLineColor(Color.decode("#888888"));
            icon.setLineWidth(2.0);
            // The text box starts to the right of the icon.
            xCursor += iconWidth;
        }

        // The text box is given all remaining space; its real height is measured via getTextHeight() below.
        final XSLFTextBox listEl = sl.createTextBox();
        listEl.clearText();
        listEl.setAnchor(new Rectangle2D.Double(xCursor, yCursor,
                Math.max(0, anchor.getMaxX() - xCursor - xMargin), Math.max(0, anchor.getMaxY() - yCursor)));

        final XSLFTextParagraph titlePara = listEl.addNewTextParagraph();
        addTextRun(titlePara, doc.getTitle(), data.getTitleFontSize(), Color.BLACK).setBold(true);

        if (StringUtils.isNotBlank(doc.getDate())) {
            final XSLFTextParagraph datePara = listEl.addNewTextParagraph();
            datePara.setLeftMargin(5.);
            addTextRun(datePara, doc.getDate(), data.getDateFontSize(), Color.GRAY).setItalic(true);
        }

        if (StringUtils.isNotBlank(doc.getRef())) {
            addTextRun(listEl.addNewTextParagraph(), doc.getRef(), data.getRefFontSize(), Color.GRAY);
        }

        // Text height measured before the summary paragraph is added; used as the thumbnail's vertical offset.
        final double thumbnailOffset = listEl.getTextHeight();

        final XSLFTextParagraph contentPara = listEl.addNewTextParagraph();

        Rectangle2D.Double pictureAnchor = null;
        XSLFPictureData pictureData = null;

        if (StringUtils.isNotBlank(doc.getThumbnail())) {
            try {
                // Picture reuse is automatic
                pictureData = addPictureData(imageSource, ppt, doc.getThumbnail());
                // We reserve space for the picture, but we don't actually add it yet.
                // The reason is we may have to remove it later if it doesn't fit; but due to a quirk of OpenOffice,
                //   deleting the picture shape removes the pictureData as well; which is a problem since the
                //   pictureData can be shared between multiple pictures.
                pictureAnchor = new Rectangle2D.Double(xCursor, yCursor + thumbnailOffset + thumbMargin, thumbW,
                        thumbH);

                // If there is enough horizontal space, put the text summary to the right of the thumbnail image,
                //    otherwise put it under the thumbnail,
                if (listEl.getAnchor().getWidth() > 2.5 * thumbW) {
                    contentPara.setLeftMargin(thumbW);
                } else {
                    // A line break sized to the thumbnail height pushes the summary down past the picture.
                    contentPara.addLineBreak().setFontSize(thumbH);
                }

            } catch (RuntimeException e) {
                // if there's any errors, we'll just ignore the image
            }
        }

        final String rawSummary = doc.getSummary();
        if (StringUtils.isNotBlank(rawSummary)) {
            // HTML treats newlines and multiple whitespace as a single whitespace.
            final String summary = rawSummary.replaceAll("\\s+", " ");
            final Matcher matcher = highlightPattern.matcher(summary);
            // Index of the first summary character not yet emitted.
            int idx = 0;

            while (matcher.find()) {
                final int start = matcher.start();

                if (idx < start) {
                    // Plain text between the previous highlight and this one.
                    addTextRun(contentPara, summary.substring(idx, start), data.getSummaryFontSize(),
                            Color.DARK_GRAY);
                }

                // The highlighted text itself, without the placeholder tags, in bold.
                addTextRun(contentPara, matcher.group(1), data.getSummaryFontSize(), Color.DARK_GRAY)
                        .setBold(true);
                idx = matcher.end();
            }

            if (idx < summary.length()) {
                addTextRun(contentPara, summary.substring(idx), data.getSummaryFontSize(), Color.DARK_GRAY);
            }
        }

        // The element is as tall as the tallest of: its text, the icon, and the reserved thumbnail area.
        double elHeight = Math.max(listEl.getTextHeight(), iconHeight);
        if (pictureAnchor != null) {
            elHeight = Math.max(elHeight, pictureAnchor.getMaxY() - yCursor);
        }

        yCursor += elHeight;
        xCursor = xMargin + anchor.getMinX();

        docsOnPage++;

        if (yCursor > anchor.getMaxY()) {
            if (docsOnPage > 1) {
                // If we drew more than one list element on this page; and we exceeded the available space,
                //   delete the last element's shapes and redraw it on the next page.
                // We don't have to remove the picture since we never added it.
                sl.removeShape(listEl);
                if (icon != null) {
                    sl.removeShape(icon);
                }

                // Step back so the loop increment revisits this document.
                --docIdx;
            } else if (pictureAnchor != null) {
                // We've confirmed we need the picture, add it.
                sl.createPicture(pictureData).setAnchor(pictureAnchor);
            }

            // Forces a new slide to be created on the next iteration.
            sl = null;

            if (!paginate) {
                break;
            }
        } else {
            yCursor += listItemMargin;

            if (pictureAnchor != null) {
                // We've confirmed we need the picture, add it.
                sl.createPicture(pictureData).setAnchor(pictureAnchor);
            }
        }
    }
}
From source file:net.antoinecomte.regex.RegExTesterApplication.java
/**
 * Rebuilds the result panel for the given regular expression and sample text.
 *
 * <p>The panel is shown unless the expression is the empty string. It always
 * starts with a "match"/"no match" headline; when the whole text matches, one
 * label per capturing group follows. If the expression is found anywhere in
 * the text, the first hit's start and end offsets are listed, and finally the
 * expression is shown as an escaped Java string literal. Any exception replaces
 * the panel content with a single error label carrying the exception message.
 *
 * @param regexValue the regular expression typed by the user
 * @param textValue  the text to test the expression against
 */
private void showResult(String regexValue, String textValue) {
    try {
        final boolean hasExpression = !"".equals(regexValue);
        result.setVisible(hasExpression);

        final Label headline = new Label("no match");
        headline.addStyleName("h3 color");
        result.removeAllComponents();
        result.addComponent(headline);

        final Pattern compiled = Pattern.compile(regexValue);
        final Matcher tester = compiled.matcher(textValue);

        // Full-match section: one label per capturing group (none when there are no groups).
        if (tester.matches()) {
            final int groups = tester.groupCount();
            int index = 1;
            while (index <= groups) {
                final Label groupLabel = new Label("group " + index + " = " + tester.group(index));
                groupLabel.addStyleName("h3 color");
                groupLabel.setSizeUndefined();
                result.addComponent(groupLabel);
                index++;
            }
            headline.setValue("match");
        }

        // Partial-match section: start over and report the first occurrence.
        tester.reset();
        final boolean found = tester.find();
        if (found) {
            final String description = "find=true, start = " + tester.start() + " end = " + tester.end();
            final Label findLabel = new Label(description);
            findLabel.addStyleName("h3 color");
            result.addComponent(findLabel);
        }

        final String escaped = StringEscapeUtils.escapeJava(regexValue);
        final Label literalLabel = new Label("java string : \"" + escaped + "\"");
        literalLabel.addStyleName("small color");
        result.addComponent(literalLabel);
    } catch (Exception e) {
        result.removeAllComponents();
        final Label errorLabel = new Label(e.getMessage());
        errorLabel.addStyleName("error");
        result.addComponent(errorLabel);
    }
}
From source file:com.zimbra.common.util.TemplateCompiler.java
public static void compile(File ifile, File ofile, String format, String pkg, boolean authoritative, boolean define) throws IOException { BufferedReader in = null;/*from ww w . ja va 2 s. co m*/ PrintWriter out = null; try { boolean isProperties = format.equals("properties"); in = new BufferedReader(new FileReader(ifile)); out = new PrintWriter(new FileWriter(ofile)); String lines = readLines(in); Matcher matcher = RE_TEMPLATE.matcher(lines); if (matcher.find()) { boolean first = true; do { Map<String, String> attrs = parseAttrs(matcher.group(1)); String templateId = attrs.get("id"); String packageId = pkg; // NOTE: Template ids can be specified absolutely (i.e. // overriding the default package) if the id starts // with a forward slash (/), or if the id contains // a hash mark (#). This allows a template file to // override both types of template files (i.e. a // single template per file or multiple templates // per file). if (templateId != null && (templateId.indexOf('#') != -1 || templateId.startsWith("/"))) { if (templateId.indexOf('#') == -1) templateId += "#"; packageId = templateId.replaceAll("#.*$", "").replaceAll("^/", "").replace('/', '.'); templateId = templateId.replaceAll("^.*#", ""); } String id = templateId != null && !templateId.equals("") ? 
packageId + "#" + templateId : packageId; // copy to .properties file if (isProperties) { printEscaped(out, id); String body = lines.substring(matcher.start(), matcher.end()); if (body.indexOf('\n') == -1) { out.print(" = "); printEscaped(out, body); } else { out.print(" ="); String[] bodylines = body.split("\n"); for (String bodyline : bodylines) { out.print("\\\n\t"); printEscaped(out, bodyline); } } out.println(); continue; } // compile to JavaScript String body = matcher.group(2); String stripWsAttr = attrs.get(A_XML_SPACE); if (stripWsAttr == null || !stripWsAttr.equals(V_XML_SPACE_PRESERVE)) { body = body.replaceAll(S_GT_LINESEP_LT, "><").trim(); } convertLines(out, id, body, attrs, authoritative); if (first && define) { out.print("AjxPackage.define(\""); out.print(packageId); out.println("\");"); } if (first) { first = false; out.print("AjxTemplate.register(\""); out.print(packageId); out.print("\", "); out.print("AjxTemplate.getTemplate(\""); out.print(id); out.print("\"), "); out.print("AjxTemplate.getParams(\""); out.print(id); out.println("\"));"); } out.println(); } while (matcher.find()); } else { convertLines(out, pkg, lines, null, authoritative); } } finally { if (in != null) { try { in.close(); } catch (Exception e) { // ignore } } if (out != null) { out.close(); } } }
From source file:com.joliciel.frenchTreebank.export.FrenchTreebankTokenReader.java
/**
 * Reads the next treebank sentence and converts it into a pos-tag sequence.
 *
 * <p>Steps, in order:
 * <ol>
 * <li>Collapse runs of white space in the sentence text and cut the text into raw tokens at every
 * match of {@code Tokeniser.SEPARATORS} (separators are tokens too).</li>
 * <li>Walk the raw tokens in parallel with the strings delivered by the phrase-unit reader. A token
 * sequence entry and a pos-tagged token are created whenever the accumulated raw tokens equal the
 * current phrase unit; zero-length phrase units become empty tokens.</li>
 * <li>Apply the token sequence filters, finalise the sequence and copy the collected pos-tagged tokens
 * into a {@code PosTagSequence}, giving {@code PosTag.NULL_POS_TAG} to empty tokens that have no
 * collected counterpart.</li>
 * <li>If {@code useCompoundPosTags} is set, fold empty tokens into their neighbours
 * (tags {@code P+D} and {@code P+PRO}) and rebuild the sequence without them.</li>
 * <li>Apply the pos-tag sequence filters.</li>
 * </ol>
 *
 * @return the pos-tag sequence for the next sentence
 * @throws RuntimeException if a token cannot be aligned with the phrase units and no CSV error writer
 *         is configured, if writing to the CSV error writer fails, or if a non-empty token has no
 *         matching pos-tagged token
 */
PosTagSequence nextSentenceInternal() {
    MONITOR.startTask("nextSentenceInternal");
    try {
        Sentence sentence = treebankReader.nextSentence();
        LOG.debug("Sentence " + sentence.getSentenceNumber());
        // Character offsets at which the sentence text is split; only used for the debug renderings below.
        List<Integer> tokenSplits = new ArrayList<Integer>();
        PosTagSet posTagSet = TalismaneSession.getPosTagSet();

        String text = sentence.getText();
        // get rid of duplicate white space
        Pattern duplicateWhiteSpace = Pattern.compile("\\s[\\s]+");
        text = duplicateWhiteSpace.matcher(text).replaceAll(" ");

        // there's no guarantee that the phrase units align to the original sentence text
        // given the issues we had for aligning sentences in the first place
        List<PhraseUnit> phraseUnits = sentence.getAllPhraseUnits();
        LOG.trace("Phrase units: " + phraseUnits.size());

        Pattern separators = Tokeniser.SEPARATORS;
        Pattern whitespace = Pattern.compile("\\s+");

        // Cut the text into raw tokens: each separator match, plus the text between separator matches.
        Matcher matcher = separators.matcher(text);
        List<String> allTokens = new ArrayList<String>();
        int currentPos = 0;
        while (matcher.find()) {
            if (matcher.start() > currentPos) {
                String leftoverToken = text.substring(currentPos, matcher.start());
                allTokens.add(leftoverToken);
            }
            String token = text.substring(matcher.start(), matcher.end());
            allTokens.add(token);
            currentPos = matcher.end();
        }
        if (currentPos < text.length())
            allTokens.add(text.substring(currentPos));

        com.joliciel.talismane.filters.Sentence oneSentence = this.filterService.getSentence(text);
        TokenSequence tokenSequence = this.tokeniserService.getTokenSequence(oneSentence);
        // Pos-tagged tokens in creation order; matched against the finalised token sequence further down.
        List<PosTaggedToken> posTaggedTokens = new ArrayList<PosTaggedToken>();

        PhraseUnitReader phraseUnitReader = new ComplexPhraseUnitReaderWithEmptyTokens(phraseUnits);
        phraseUnitReader.setTreebankService(treebankService);
        if (ftbPosTagMapper != null)
            phraseUnitReader.setFtbPosTagMapper(ftbPosTagMapper);

        String phraseUnitText = phraseUnitReader.nextString();
        LOG.trace("phrase unit: " + phraseUnitText);
        // From here on currentPos is the offset just past the raw token being processed.
        currentPos = 0;
        int lastSplit = 0;
        tokenSplits.add(0);

        // Zero-length phrase units at the very start become empty tokens at position 0.
        while (phraseUnitText != null && phraseUnitText.length() == 0) {
            tokenSplits.add(currentPos);
            Token aToken = tokenSequence.addEmptyToken(currentPos);
            PosTag posTag = phraseUnitReader.getPosTag();
            Decision<PosTag> corpusDecision = posTagSet.createDefaultDecision(posTag);

            PosTaggedToken posTaggedToken = posTaggerService.getPosTaggedToken(aToken, corpusDecision);
            posTaggedTokens.add(posTaggedToken);
            phraseUnitText = phraseUnitReader.nextString();
        }

        // True while the raw tokens seen so far form only a prefix of the current phrase unit.
        boolean inPhraseUnit = false;
        boolean addEmptyTokenBeforeNextToken = false;
        PosTag emptyTokenPosTag = null;
        for (String token : allTokens) {
            if (LOG.isTraceEnabled())
                LOG.trace("token: " + token);
            currentPos += token.length();
            // NOTE(review): phraseUnitText is dereferenced here without a null check, although the loops
            // around nextString() treat null as a possible value - confirm the reader cannot run out first.
            if ((!ignoreCase && phraseUnitText.equals(token))
                    || (ignoreCase && phraseUnitText.equalsIgnoreCase(token))) {
                // exact match
                if (addEmptyTokenBeforeNextToken) {
                    // The pending empty token is placed at the start of the current token.
                    if (LOG.isTraceEnabled())
                        LOG.trace("Adding empty token at " + (currentPos - token.length()));
                    tokenSplits.add((currentPos - token.length()));
                    Token emptyToken = tokenSequence.addEmptyToken((currentPos - token.length()));
                    Decision<PosTag> emptyTokenDecision = posTagSet.createDefaultDecision(emptyTokenPosTag);
                    PosTaggedToken posTaggedToken2 = posTaggerService.getPosTaggedToken(emptyToken,
                            emptyTokenDecision);
                    posTaggedTokens.add(posTaggedToken2);
                    addEmptyTokenBeforeNextToken = false;
                }

                if (LOG.isTraceEnabled())
                    LOG.trace("Adding split " + currentPos);
                tokenSplits.add(currentPos);

                Token aToken = tokenSequence.addToken(lastSplit, currentPos);
                PosTag posTag = phraseUnitReader.getPosTag();
                Decision<PosTag> corpusDecision = posTagSet.createDefaultDecision(posTag);
                PosTaggedToken posTaggedToken = posTaggerService.getPosTaggedToken(aToken, corpusDecision);
                posTaggedTokens.add(posTaggedToken);

                lastSplit = currentPos;
                phraseUnitText = phraseUnitReader.nextString();
                if (LOG.isTraceEnabled())
                    LOG.trace("phrase unit: " + phraseUnitText);
                while (phraseUnitText != null && phraseUnitText.length() == 0) {
                    Token emptyToken = null;
                    emptyTokenPosTag = phraseUnitReader.getPosTag();
                    phraseUnitText = phraseUnitReader.nextString();
                    if (LOG.isTraceEnabled())
                        LOG.trace("phrase unit: " + phraseUnitText);

                    // Empty tokens need to be attached either to the right (auquel, duquel)
                    // or to the left (du, des)
                    // NOTE(review): phraseUnitText was just re-read and may be null here; the loop
                    // condition only guards the value from the previous read - confirm.
                    if (phraseUnitText.equals("duquel") || phraseUnitText.equals("auquel")
                            || phraseUnitText.equals("desquels") || phraseUnitText.equals("auxquels")
                            || phraseUnitText.equals("desquelles") || phraseUnitText.equals("auxquelles")) {
                        // attach empty token to the "duquel" that follows it
                        addEmptyTokenBeforeNextToken = true;
                    } else {
                        if (LOG.isTraceEnabled())
                            LOG.trace("Adding empty token at " + currentPos);
                        tokenSplits.add(currentPos);
                        emptyToken = tokenSequence.addEmptyToken(currentPos);
                        Decision<PosTag> emptyTokenDecision = posTagSet.createDefaultDecision(emptyTokenPosTag);
                        PosTaggedToken posTaggedToken2 = posTaggerService.getPosTaggedToken(emptyToken,
                                emptyTokenDecision);
                        posTaggedTokens.add(posTaggedToken2);
                    }
                }
                inPhraseUnit = false;
            } else if (phraseUnitText.length() >= token.length()
                    && ((!ignoreCase && phraseUnitText.substring(0, token.length()).equals(token))
                            || (ignoreCase
                                    && phraseUnitText.substring(0, token.length()).equalsIgnoreCase(token)))) {
                // the current phrase unit text starts with this token
                // Consume the prefix and keep the rest of the phrase unit for the following tokens.
                phraseUnitText = phraseUnitText.substring(token.length());
                if (LOG.isTraceEnabled())
                    LOG.trace("phrase unit: " + phraseUnitText);
                inPhraseUnit = true;
            } else if (token.length() == 1 && whitespace.matcher(token).matches()) {
                // white space, always add split unless we're already inside white space
                if (!inPhraseUnit) {
                    if (LOG.isTraceEnabled())
                        LOG.trace("Adding split " + currentPos);
                    tokenSplits.add(currentPos);
                    tokenSequence.addToken(lastSplit, currentPos);
                    lastSplit = currentPos;
                }
            } else {
                // non-white space, what to do? either we skip the token, or we skip the phrase unit!
                // for now let's assume it never happens and see what results!
                // Log the splits made so far as |token|token|... to help locate the misalignment.
                int pos = 0;
                StringBuilder sb = new StringBuilder();
                for (int split : tokenSplits) {
                    String aToken = text.substring(pos, split);
                    sb.append('|');
                    sb.append(aToken);
                    pos = split;
                }
                LOG.info(sb.toString());
                LOG.info("File: " + sentence.getFile().getFileName());
                LOG.info("Sentence: " + text);
                if (csvFileErrorWriter != null) {
                    try {
                        csvFileErrorWriter.write(CSVFormatter.format(phraseUnitText) + ",");
                        for (String info : phraseUnitReader.getCurrentInfo())
                            csvFileErrorWriter.write(CSVFormatter.format(info) + ",");
                        csvFileErrorWriter.write(CSVFormatter.format(token) + ",");
                        csvFileErrorWriter.write(sentence.getFile().getFileName() + ",");
                        csvFileErrorWriter.write(sentence.getSentenceNumber() + ",");
                        csvFileErrorWriter.write(CSVFormatter.format(sentence.getText()) + ",");
                        csvFileErrorWriter.write("\n");
                        csvFileErrorWriter.flush();
                    } catch (IOException ioe) {
                        throw new RuntimeException(ioe);
                    }
                    // The error has been recorded; stop aligning the remaining tokens of this sentence.
                    break;
                } else {
                    // instead of throwing an error, write these to a file (or do both)
                    // so we can catch them all in one fell swoop
                    throw new RuntimeException("Unexpected text: " + token);
                }
            }
        }
        if (lastSplit < currentPos) {
            tokenSplits.add(currentPos);
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug(text);

            int pos = 0;
            StringBuilder sb = new StringBuilder();
            for (int split : tokenSplits) {
                String aToken = text.substring(pos, split);
                sb.append('|');
                sb.append(aToken);
                pos = split;
            }
            LOG.debug(sb.toString());
        }

        for (TokenSequenceFilter tokenSequenceFilter : this.tokenSequenceFilters) {
            if (LOG.isTraceEnabled())
                LOG.trace("Applying filter: " + tokenSequenceFilter.getClass().getSimpleName());
            tokenSequenceFilter.apply(tokenSequence);
        }

        // The wrapper around the token filters is created on first use and kept for later sentences.
        if (tokenFilterWrapper == null) {
            tokenFilterWrapper = tokenFilterService.getTokenSequenceFilter(this.tokenFilters);
        }
        tokenFilterWrapper.apply(tokenSequence);

        tokenSequence.finalise();

        PosTagSequence posTagSequence = this.posTaggerService.getPosTagSequence(tokenSequence,
                allTokens.size() / 2);
        // i indexes posTaggedTokens; it only advances when the current token is the one that was tagged.
        int i = 0;
        for (Token token : tokenSequence) {
            if (LOG.isTraceEnabled())
                LOG.trace("Token : \"" + token.getText() + "\" (was \"" + token.getOriginalText() + "\")");
            // NOTE(review): get(i) assumes there is still a collected pos-tagged token left - confirm this
            // holds when the sequence ends with empty tokens that have no collected counterpart.
            PosTaggedToken posTaggedToken = posTaggedTokens.get(i);
            if (token.equals(posTaggedToken.getToken())) {
                posTagSequence.addPosTaggedToken(posTaggedToken);
                i++;
            } else if (token.getStartIndex() == token.getEndIndex()) {
                LOG.debug("Adding null pos tag at position " + token.getStartIndex());
                Decision<PosTag> nullPosTagDecision = posTagSet.createDefaultDecision(PosTag.NULL_POS_TAG);
                PosTaggedToken emptyTagToken = posTaggerService.getPosTaggedToken(token, nullPosTagDecision);
                posTagSequence.addPosTaggedToken(emptyTagToken);
            } else {
                throw new RuntimeException("Expected only empty tokens added. Postag Token = "
                        + posTaggedToken.getToken().getText() + ", start: " + token.getStartIndex() + ", end:"
                        + token.getEndIndex());
            }
        }

        if (useCompoundPosTags) {
            // Rebuild the sequence without empty tokens, moving their information onto the neighbours.
            PosTagSequence newSequence = this.posTaggerService.getPosTagSequence(tokenSequence,
                    allTokens.size() / 2);
            PosTaggedToken lastPosTaggedToken = null;
            i = 0;
            for (PosTaggedToken posTaggedToken : posTagSequence) {
                boolean removed = false;
                if (posTaggedToken.getToken().isEmpty()) {
                    String lastWord = "";
                    if (lastPosTaggedToken != null)
                        lastWord = lastPosTaggedToken.getToken().getOriginalText().toLowerCase();
                    if (lastWord.equals("des") || lastWord.equals("du") || lastWord.equals("aux")
                            || lastWord.equals("au") || lastWord.endsWith(" des") || lastWord.endsWith(" du")
                            || lastWord.endsWith(" aux") || lastWord.endsWith(" au")
                            || lastWord.endsWith("'aux") || lastWord.endsWith("'au")) {
                        // The empty token follows a contracted form: only a stand-alone contraction
                        // tagged "P" is retagged "P+D"; the empty token is dropped in every case.
                        if (lastWord.equals("des") || lastWord.equals("du") || lastWord.equals("aux")
                                || lastWord.equals("au")) {
                            if (lastPosTaggedToken.getTag().getCode().equals("P")) {
                                lastPosTaggedToken.setTag(posTagSet.getPosTag("P+D"));
                                lastPosTaggedToken.getToken().setText(lastWord);
                            }
                        }
                        posTaggedToken.setTag(PosTag.NULL_POS_TAG);
                        tokenSequence.removeEmptyToken(posTaggedToken.getToken());
                        removed = true;
                    } else if (i == posTagSequence.size() - 1) {
                        // last token in sequence
                        // need to remove it now, since it won't get removed in the next iteration
                        tokenSequence.removeEmptyToken(posTaggedToken.getToken());
                        removed = true;
                    }
                } else {
                    newSequence.addPosTaggedToken(posTaggedToken);
                }

                // An empty token kept from the previous iteration is resolved against the current word.
                if (lastPosTaggedToken != null && lastPosTaggedToken.getToken().isEmpty()) {
                    String word = posTaggedToken.getToken().getOriginalText().toLowerCase();
                    if (word.equals("duquel") || word.equals("desquels") || word.equals("desquelles")
                            || word.equals("auquel") || word.equals("auxquels") || word.equals("auxquelles")) {
                        posTaggedToken.setTag(posTagSet.getPosTag("P+PRO"));
                        lastPosTaggedToken.setTag(PosTag.NULL_POS_TAG);
                        posTaggedToken.getToken().setText(word);
                        tokenSequence.removeEmptyToken(lastPosTaggedToken.getToken());
                    } else if (word.equals("dudit")) {
                        posTaggedToken.setTag(posTagSet.getPosTag("P+D"));
                        lastPosTaggedToken.setTag(PosTag.NULL_POS_TAG);
                        posTaggedToken.getToken().setText(word);
                        tokenSequence.removeEmptyToken(lastPosTaggedToken.getToken());
                    } else {
                        LOG.info("Not expecting empty token here (index "
                                + lastPosTaggedToken.getToken().getIndex() + ", next token = " + word + "): "
                                + posTagSequence);
                        lastPosTaggedToken.setTag(PosTag.NULL_POS_TAG);
                        tokenSequence.removeEmptyToken(lastPosTaggedToken.getToken());
                    }
                }

                // A removed empty token must not become the "previous token" of the next iteration.
                if (!removed)
                    lastPosTaggedToken = posTaggedToken;
                i++;
            }
            posTagSequence = newSequence;
            tokenSequence.finalise();
        }
        for (PosTagSequenceFilter posTagSequenceFilter : this.posTagSequenceFilters) {
            posTagSequenceFilter.apply(posTagSequence);
        }

        return posTagSequence;
    } finally {
        MONITOR.endTask("nextSentenceInternal");
    }
}
From source file:com.stealthyone.mcb.mcml.MCMLBuilder.java
public MCMLBuilder(String input, Map<String, Object> replacements) { Validate.notNull(input, "Input cannot be null."); this.rawText = input; if (this.replacements != null && replacements != null) { this.replacements.putAll(replacements); for (Entry<String, Object> entry : replacements.entrySet()) { if (entry.getValue() instanceof String) { this.rawText = rawText.replace(entry.getKey(), (String) entry.getValue()); }/*w w w . j a v a2 s . c o m*/ } } this.rawText = rawText.replace(ChatColor.COLOR_CHAR, '&'); // Identify text groups int lastIndex = 0; final Matcher matcher = PATTERN_TEXT_GROUP.matcher(input); while (matcher.find()) { TempPart part = new TempPart(matcher.group(1)); if (!parts.isEmpty()) { TempPart prevPart = parts.get(parts.size() - 1); TextPiece lastPiece = prevPart.text.get(prevPart.text.size() - 1); for (TextPiece piece : part.text) { if (piece.color == null) { piece.color = lastPiece.color; piece.italicize = lastPiece.italicize; piece.bold = lastPiece.bold; piece.underline = lastPiece.underline; piece.strikeout = lastPiece.strikeout; piece.magic = lastPiece.magic; } } } if (matcher.start() > lastIndex) { // Handle ungrouped text TempPart ungroupedPart = new TempPart(rawText.substring(lastIndex, matcher.start())); parts.add(ungroupedPart); } lastIndex = matcher.end(); // Check for event int offset = rawText.length() - input.substring(lastIndex).length(); final Matcher eventMatcher = PATTERN_EVENT.matcher(input.substring(lastIndex)); if (eventMatcher.find()) { handleEvent(part, eventMatcher); lastIndex = eventMatcher.end() + offset; offset = rawText.length() - input.substring(lastIndex).length(); final Matcher secEventMatcher = PATTERN_EVENT.matcher(input.substring(lastIndex)); if (secEventMatcher.find()) { handleEvent(part, secEventMatcher); lastIndex = secEventMatcher.end() + offset; } } parts.add(part); } if (lastIndex != rawText.length()) { TempPart ungroupedPart = new TempPart(rawText.substring(lastIndex)); if (!parts.contains(ungroupedPart)) { 
parts.add(ungroupedPart); } } }
From source file:tr.edu.gsu.nerwip.recognition.internal.modelless.subee.Subee.java
/** * Handles the name of the person described in the processed article. For this matter, * we consider the article title and name, as well as the first sentence, which generally * starts with the full name of the person. * /*from w w w . j a v a2 s . c o m*/ * @param article * Article to process. * @return * List of possible entities based on the analysis of the article title and name. * * @throws ClientProtocolException * Problem while accessing Freebase. * @throws ParseException * Problem while accessing Freebase. * @throws IOException * Problem while accessing Freebase. * @throws org.json.simple.parser.ParseException * Problem while accessing Freebase. */ private List<AbstractEntity<?>> processMainName(Article article) throws ClientProtocolException, ParseException, IOException, org.json.simple.parser.ParseException { logger.increaseOffset(); List<AbstractEntity<?>> result = new ArrayList<AbstractEntity<?>>(); String rawText = article.getRawText(); // init candidate strings with article name and title Set<String> candidateStrings = new TreeSet<String>(); String articleTitle = article.getTitle(); //debug //if(articleTitle.equals("Alfred Lothar Wegener")) // System.out.print(""); logger.log("Article title: " + articleTitle); candidateStrings.add(articleTitle); String articleName = article.getName(); logger.log("Article name: " + articleName); articleName = articleName.replace('_', ' ').trim(); candidateStrings.add(articleName); // process the beginning of the first sentence // we look for the string before the first parenthesis (usually containing birth info) // if there's none, we just ignore this potential information source Pattern p = Pattern.compile("^[^\\.]+?\\("); Matcher m = p.matcher(rawText); if (m.find()) { int startPos = m.start(); if (startPos == 0) { int endPos = m.end(); String persName = rawText.substring(0, endPos - 1); persName = persName.trim(); int wordCount = persName.length() - persName.replaceAll(" ", "").length(); if (wordCount > 6) 
logger.log( "Not able to extract person name from first sentence (too many words before the parenthesis): \"" + rawText.substring(0, 75) + "\""); else { logger.log("Person name: " + persName); candidateStrings.add(persName); } } } else logger.log("Not able to extract person name from first sentence (can't find the parenthesis): \"" + rawText.substring(0, 75) + "\""); // possibly remove double quotes (especially for the nicknames) List<String> nickFull = new ArrayList<String>(); Set<String> copy = new TreeSet<String>(candidateStrings); candidateStrings.clear(); for (String candidateString : copy) { if (candidateString.contains("\"")) { nickFull.add(candidateString); candidateString = candidateString.replaceAll("\"", ""); } candidateStrings.add(candidateString); } // possibly remove an indication in parenthesis at the end (especially for the titles) copy = new TreeSet<String>(candidateStrings); candidateStrings.clear(); for (String candidateString : copy) { if (candidateString.endsWith(")")) { String temp[] = candidateString.split("\\("); candidateString = temp[0].trim(); } candidateStrings.add(candidateString); } // add the lastname alone; only with the preceeding word; only with the 2 preeceding words, etc. 
copy = new TreeSet<String>(candidateStrings); for (String candidateString : copy) { String split[] = candidateString.split(" "); for (int i = split.length - 1; i >= 0; i--) { String temp = ""; for (int j = i; j < split.length; j++) temp = temp + split[j] + " "; temp = temp.trim(); candidateStrings.add(temp); } } // add very first and very last names (for more than 2 words) copy = new TreeSet<String>(candidateStrings); for (String candidateString : copy) { String split[] = candidateString.split(" "); if (split.length > 2) { String temp = split[0] + " " + split[split.length - 1]; candidateStrings.add(temp); } } // add variants with initials instead of firstnames copy = new TreeSet<String>(candidateStrings); for (String candidateString : copy) { String split[] = candidateString.split(" "); if (split.length > 1) { String initials1 = ""; String initials2 = ""; for (int i = 0; i < split.length - 1; i++) { initials1 = initials1 + split[i].substring(0, 1).toUpperCase(Locale.ENGLISH) + ". "; initials2 = initials2 + split[i].substring(0, 1).toUpperCase(Locale.ENGLISH) + "."; } initials1 = initials1 + split[split.length - 1]; initials2 = initials2 + " " + split[split.length - 1]; candidateStrings.add(initials1); candidateStrings.add(initials2); } } // add the original version of the nicknames candidateStrings.addAll(nickFull); // look for similar strings in the text for (String expr : candidateStrings) { String escapedStr = Pattern.quote(expr); p = Pattern.compile("\\b" + escapedStr + "\\b"); m = p.matcher(rawText); while (m.find()) { int startPos = m.start(); int endPos = m.end(); String valueStr = m.group(); AbstractEntity<?> ent = AbstractEntity.build(EntityType.PERSON, startPos, endPos, RecognizerName.SUBEE, valueStr); result.add(ent); } } if (result.isEmpty()) logger.log("WARNING: title not found at all in the text, which is unusual"); logger.decreaseOffset(); return result; }
From source file:com.manydesigns.portofino.pageactions.text.TextAction.java
protected String restoreAttachmentUrls(String content) { Pattern pattern = Pattern.compile(PORTOFINO_ATTACHMENT_PATTERN); Matcher matcher = pattern.matcher(content); int lastEnd = 0; StringBuilder sb = new StringBuilder(); while (matcher.find()) { String attachmentId = matcher.group(1); //Default to src for old texts String hrefAttribute = (matcher.groupCount() >= 3 && matcher.group(3) != null) ? matcher.group(3) : "src"; sb.append(content.substring(lastEnd, matcher.start())).append(hrefAttribute).append("=\"") .append(StringEscapeUtils.escapeHtml(generateViewAttachmentUrl(attachmentId))).append("\""); lastEnd = matcher.end();/*from w w w . ja v a 2 s . c o m*/ } sb.append(content.substring(lastEnd)); return sb.toString(); }
From source file:gtu._work.ui.RegexReplacer.java
/**
 * Rewrites {@code replaceText} by substituting every match of {@code fromPattern} with text
 * built from {@code toFormat}; text between matches is copied through unchanged.
 * <p>
 * How the substitution is built depends on {@code getTradeOffConfig().fremarkerKey}:
 * <ul>
 * <li>blank key: each {@code #i#} placeholder in {@code toFormat} is replaced by capture
 * group {@code i}, for {@code i} from 0 up to {@code groupCount()};</li>
 * <li>non-blank key: the capture-group values, ordered by group index, are stored under the
 * trimmed key and handed to {@code FreeMarkerSimpleUtil.replace} together with
 * {@code toFormat}.</li>
 * </ul>
 * Any exception is reported through an error dialog showing its message, and the original
 * {@code replaceText} is returned instead.
 * <p>
 * NOTE(review): in the blank-key branch a group that did not take part in the match yields
 * {@code null} from {@code matcher.group(ii)}; passing that to
 * {@code Matcher.quoteReplacement} presumably throws, which would end up in the catch block
 * and return the input unchanged - confirm, and consider substituting an empty string.
 * <p>
 * NOTE(review): the line breaks around the {@code multiLineCheckBox} check were lost, so it
 * is unclear here whether the {@code DOTALL | MULTILINE} assignment is live or commented
 * out - verify against the original file before relying on either behavior.
 *
 * @param fromPattern
 *            regular expression to search for
 * @param toFormat
 *            replacement template applied to each match
 * @param replaceText
 *            text to search and rewrite
 * @return the rewritten text, or the unmodified {@code replaceText} if an exception occurred
 */
String replacer(String fromPattern, String toFormat, String replaceText) { String errorRtn = replaceText.toString(); try { int patternFlag = 0; // if (multiLineCheckBox.isSelected()) { patternFlag = Pattern.DOTALL | Pattern.MULTILINE; } Pattern pattern = Pattern.compile(fromPattern, patternFlag); Matcher matcher = pattern.matcher(replaceText); StringBuffer sb = new StringBuffer(); String tempStr = null; TradeOffConfig config = this.getTradeOffConfig(); { int startPos = 0; for (; matcher.find();) { tempStr = toFormat.toString(); sb.append(replaceText.substring(startPos, matcher.start())); // ---------------------------------------------- if (StringUtils.isBlank(config.fremarkerKey)) { // regex for (int ii = 0; ii <= matcher.groupCount(); ii++) { System.out.println(ii + " -- " + matcher.group(ii)); tempStr = tempStr.replaceAll("#" + ii + "#", Matcher.quoteReplacement(matcher.group(ii))); } } else if (StringUtils.isNotBlank(config.fremarkerKey)) { // freemarker Map<String, Object> root = new HashMap<String, Object>(); TreeMap<Integer, Object> lstMap = new TreeMap<Integer, Object>(); for (int ii = 0; ii <= matcher.groupCount(); ii++) { lstMap.put(ii, matcher.group(ii)); } root.put(StringUtils.trimToEmpty(config.fremarkerKey), lstMap.values()); System.out.println("template Map : " + root); tempStr = FreeMarkerSimpleUtil.replace(tempStr, root); } // ---------------------------------------------- sb.append(tempStr); startPos = matcher.end(); } sb.append(replaceText.substring(startPos)); } return sb.toString(); } catch (Exception ex) { JOptionPaneUtil.newInstance().iconErrorMessage().showMessageDialog(ex.getMessage(), getTitle()); return errorRtn; } }