Example usage for java.util.regex.Matcher.start()

Introduction

On this page you can find example usages of java.util.regex.Matcher.start().

Prototype

public int start()

Source Link

Document

Returns the start index of the previous match.

Usage

From source file:me.Wundero.Ray.utils.TextUtils.java

/**
 * Splits a text at every match of a regular expression.
 * <p>
 * Only the literal content of {@code t} is scanned directly. If that content
 * contains no match at all, {@code t} is returned unchanged and its children
 * are not examined. If the whole content is a single match, the result is
 * empty when {@code skip} is set and {@code [t]} otherwise. In every other
 * case the content is cut at each match and every child is split recursively;
 * the first piece of a child is glued onto the piece that precedes it.
 * <p>
 * The matcher is re-run on the remaining content after each match, so anchors
 * such as {@code ^} are evaluated against the remainder, not the full string.
 *
 * @param t    the text to split; returned as a single-element list when
 *             {@code lit(t)} is false (presumably "is not a literal text" —
 *             the cast to {@code LiteralText} below relies on it)
 * @param p    the pattern to split at
 * @param skip if {@code true} the matched text is dropped; otherwise every
 *             match becomes its own element of the result
 * @return the pieces of {@code t}, in order
 */
public static List<Text> split(Text t, Pattern p, boolean skip) {
    if (!lit(t)) {
        return Utils.al(t);
    }
    List<Text> out = Utils.al();
    List<Text> children = t.getChildren();
    LiteralText.Builder text = ((LiteralText) t).toBuilder();
    String content = text.getContent();
    if (!p.matcher(content).find()) {
        return Utils.al(t);
    }
    if (p.matcher(content).matches()) {
        return skip ? Utils.al() : Utils.al(t);
    }
    Matcher m = p.matcher(content);
    while ((m = m.reset(content)).find()) {
        int s = m.start();
        int e = m.end();
        if (e == 0) {
            // Empty match at the very start: consuming it would not shorten
            // the content, so the loop could never terminate. Stop splitting
            // and let the remainder be emitted below.
            break;
        }
        // Everything before the match becomes its own piece.
        Text.Builder b = Text.builder(content.substring(0, s)).format(text.getFormat());
        b = apply(b, text);
        out.add(b.build());
        if (!skip) {
            // Keep the delimiter itself as a separate piece.
            b = Text.builder(m.group()).format(text.getFormat());
            b = apply(b, text);
            out.add(b.build());
        }
        // Continue with what FOLLOWS the match. (Keeping the prefix
        // "substring(0, e)" would re-find the same match forever.)
        content = content.substring(e);
    }
    if (!content.isEmpty()) {
        Text.Builder b = Text.builder(content).format(text.getFormat());
        b = apply(b, text);
        out.add(b.build());
    }
    // The last piece stays "open" as a builder so that children can be
    // appended to it. The loop above always adds at least one piece unless it
    // breaks immediately, in which case the non-empty remainder was added.
    Text.Builder tx = out.get(out.size() - 1).toBuilder();
    out.remove(out.size() - 1);
    for (Text child : children) {
        List<Text> lt = split(child, p, skip);
        if (lt.isEmpty()) {
            // The child was one skipped match: it closes the open piece.
            // tx may already be null if the previous child did the same.
            if (tx != null) {
                out.add(tx.build());
                tx = null;
            }
        } else if (lt.size() == 1) {
            // The child was not split: it simply extends the open piece.
            tx = tx == null ? lt.get(0).toBuilder() : tx.append(lt.get(0));
        } else {
            // First piece of the child closes the open piece, middle pieces
            // stand alone, and the last piece becomes the new open piece.
            out.add(tx == null ? lt.get(0) : tx.append(lt.get(0)).build());
            for (int i = 1; i < lt.size() - 1; i++) {
                out.add(lt.get(i));
            }
            tx = lt.get(lt.size() - 1).toBuilder();
        }
    }
    if (tx != null) {
        out.add(tx.build());
    }
    return out;
}

From source file:com.novartis.opensource.yada.plugin.Gatekeeper.java

/**
 * Modifies the original query by appending a dynamic predicate
 * <p>Recall the {@link Service#engagePreprocess} method
 * will recall {@link QueryManager#endowQuery} to 
 * reconform the code after this {@link Preprocess} 
 * disengages./*from w  w w. ja  v a 2 s. c o  m*/
 * 
 * 
 * @throws YADASecurityException when token retrieval fails
 */
@Override
public void applyContentPolicy() throws YADASecurityException {

    // TODO make it impossible to reset args and preargs dynamically if pl class implements SecurityPolicy
    //   this will close an attack vector

    String SPACE = " ";
    StringBuilder contentPolicy = new StringBuilder();
    Pattern rxInjection = Pattern.compile(RX_COL_INJECTION);
    String rawPolicy = getArgumentValue(CONTENT_POLICY_PREDICATE);
    Matcher m1 = rxInjection.matcher(rawPolicy);
    int start = 0;

    // field = getToken
    // field = getCookie(string)
    // field = getHeader(string)
    // field = getUser()
    // field = getRandom(string)

    if (!m1.find()) {
        String msg = "Unathorized. Injected method invocation failed.";
        throw new YADASecurityException(msg);
    }

    m1.reset();

    while (m1.find()) {
        int rxStart = m1.start();
        int rxEnd = m1.end();

        contentPolicy.append(rawPolicy.substring(start, rxStart));

        String frag = rawPolicy.substring(rxStart, rxEnd);
        String method = frag.substring(0, frag.indexOf('('));
        String arg = frag.substring(frag.indexOf('(') + 1, frag.indexOf(')'));
        Object val = null;
        try {
            if (arg.equals(""))
                val = getClass().getMethod(method).invoke(this, new Object[] {});
            else
                val = getClass().getMethod(method, new Class[] { java.lang.String.class }).invoke(this,
                        new Object[] { arg });
        } catch (NoSuchMethodException | SecurityException | IllegalAccessException | IllegalArgumentException
                | InvocationTargetException e) {
            String msg = "Unathorized. Injected method invocation failed.";
            throw new YADASecurityException(msg, e);
        }
        contentPolicy.append((String) val + SPACE);

        start = rxEnd;
    }

    Expression parsedContentPolicy;
    try {
        parsedContentPolicy = CCJSqlParserUtil.parseCondExpression(contentPolicy.toString());
    } catch (JSQLParserException e) {
        String msg = "Unauthorized. Content policy is not valid.";
        throw new YADASecurityException(msg, e);
    }

    PlainSelect sql = (PlainSelect) ((Select) getYADAQuery().getStatement()).getSelectBody();
    Expression where = sql.getWhere();

    if (where != null) {
        AndExpression and = new AndExpression(where, parsedContentPolicy);
        sql.setWhere(and);
    } else {
        sql.setWhere(parsedContentPolicy);
    }
    try {
        CCJSqlParserManager parserManager = new CCJSqlParserManager();
        sql = (PlainSelect) ((Select) parserManager.parse(new StringReader(sql.toString()))).getSelectBody();
    } catch (JSQLParserException e) {
        String msg = "Unauthorized. Content policy is not valid.";
        throw new YADASecurityException(msg, e);
    }

    getYADAQuery().setCoreCode(sql.toString());
    this.clearSecurityPolicy();
}

From source file:com.hp.autonomy.frontend.reports.powerpoint.PowerPointServiceImpl.java

/**
 * Internal implementation to add a list of documents to a presentation; either as a single slide or a series of slides.
 * <p>
 * Documents are laid out top to bottom inside {@code anchor}. When a document overflows the bottom of
 * the anchor and it is not the only one on the slide, its shapes are removed and it is drawn again on
 * a fresh slide. Without pagination, rendering stops at the first overflow.
 * @param imageSource the image source to convert images to data.
 * @param ppt the presentation to add to.
 * @param sl the slide to add to (can be null if pagination is enabled).
 * @param anchor bounding rectangle to draw onto, in PowerPoint coordinates.
 * @param paginate whether to render results as multiple slides if they don't fit on one slide.
 * @param data the documents to render.
 * @param results optional string to render into the top-left corner of the available space.
 *                  Will appear on each page if pagination is enabled.
 * @param sortBy optional string to render into the top-right corner of the available space.
 *                  Will appear on each page if pagination is enabled.
 */
private static void addList(final ImageSource imageSource, final XMLSlideShow ppt, XSLFSlide sl,
        final Rectangle2D.Double anchor, final boolean paginate, final ListData data, final String results,
        final String sortBy) {
    final double
    // How much space to leave at the left and right edge of the slide
    xMargin = 20,
            // How much space to leave at the top
            yMargin = 5,
            // Size of the icon
            iconWidth = 20, iconHeight = 24,
            // Find's thumbnail height is 97px by 55px, hardcoded in the CSS in .document-thumbnail
            thumbScale = 0.8, thumbW = 97 * thumbScale, thumbH = 55 * thumbScale,
            // Margin around the thumbnail
            thumbMargin = 4.,
            // Space between list items
            listItemMargin = 5.;

    // Matches text wrapped in query-text placeholder tags; group 1 is the wrapped text,
    // which is rendered in bold further down.
    final Pattern highlightPattern = Pattern
            .compile("<HavenSearch-QueryText-Placeholder>(.*?)</HavenSearch-QueryText-Placeholder>");

    // Drawing position of the next element; xCursor is reset after every document.
    double yCursor = yMargin + anchor.getMinY(), xCursor = xMargin + anchor.getMinX();

    // Number of documents drawn on the current slide, including the one in progress.
    int docsOnPage = 0;

    final Document[] docs = data.getDocs();
    for (int docIdx = 0; docIdx < docs.length; ++docIdx) {
        final Document doc = docs[docIdx];

        // No current slide (none supplied, or the previous one filled up): start a new one,
        // reset the cursors and draw the optional header strings.
        if (sl == null) {
            sl = ppt.createSlide();
            yCursor = yMargin + anchor.getMinY();
            xCursor = xMargin + anchor.getMinX();
            docsOnPage = 0;

            // Height of the header row; stays 0 when neither header string is present.
            double yStep = 0;

            if (StringUtils.isNotBlank(results)) {
                final XSLFTextBox textBox = sl.createTextBox();
                textBox.clearText();
                final Rectangle2D.Double textBounds = new Rectangle2D.Double(xCursor, yCursor,
                        Math.max(0, anchor.getMaxX() - xCursor - xMargin), 20);
                textBox.setAnchor(textBounds);

                addTextRun(textBox.addNewTextParagraph(), results, 12., Color.LIGHT_GRAY);

                yStep = textBox.getTextHeight();
            }

            if (StringUtils.isNotBlank(sortBy)) {
                // Same rectangle as the results text, but right-aligned.
                final XSLFTextBox sortByEl = sl.createTextBox();
                sortByEl.clearText();
                final XSLFTextParagraph sortByText = sortByEl.addNewTextParagraph();
                sortByText.setTextAlign(TextParagraph.TextAlign.RIGHT);

                addTextRun(sortByText, sortBy, 12., Color.LIGHT_GRAY);

                sortByEl.setAnchor(new Rectangle2D.Double(xCursor, yCursor,
                        Math.max(0, anchor.getMaxX() - xCursor - xMargin), 20));

                yStep = Math.max(sortByEl.getTextHeight(), yStep);
            }

            if (yStep > 0) {
                yCursor += listItemMargin + yStep;
            }
        }

        // Optional document icon, drawn to the left of the text; the text box is shifted right by its width.
        XSLFAutoShape icon = null;
        if (data.isDrawIcons()) {
            icon = sl.createAutoShape();
            icon.setShapeType(ShapeType.SNIP_1_RECT);
            icon.setAnchor(new Rectangle2D.Double(xCursor, yCursor + listItemMargin, iconWidth, iconHeight));
            icon.setLineColor(Color.decode("#888888"));
            icon.setLineWidth(2.0);

            xCursor += iconWidth;
        }

        // Text box for this document; it initially claims all remaining space in the anchor.
        final XSLFTextBox listEl = sl.createTextBox();
        listEl.clearText();
        listEl.setAnchor(new Rectangle2D.Double(xCursor, yCursor,
                Math.max(0, anchor.getMaxX() - xCursor - xMargin), Math.max(0, anchor.getMaxY() - yCursor)));

        final XSLFTextParagraph titlePara = listEl.addNewTextParagraph();
        addTextRun(titlePara, doc.getTitle(), data.getTitleFontSize(), Color.BLACK).setBold(true);

        if (StringUtils.isNotBlank(doc.getDate())) {
            final XSLFTextParagraph datePara = listEl.addNewTextParagraph();
            datePara.setLeftMargin(5.);
            addTextRun(datePara, doc.getDate(), data.getDateFontSize(), Color.GRAY).setItalic(true);
        }

        if (StringUtils.isNotBlank(doc.getRef())) {
            addTextRun(listEl.addNewTextParagraph(), doc.getRef(), data.getRefFontSize(), Color.GRAY);
        }

        // Text height measured before the summary paragraph is added; the thumbnail is placed at this offset.
        final double thumbnailOffset = listEl.getTextHeight();

        final XSLFTextParagraph contentPara = listEl.addNewTextParagraph();

        // Both stay null when there is no thumbnail or loading it failed.
        Rectangle2D.Double pictureAnchor = null;
        XSLFPictureData pictureData = null;

        if (StringUtils.isNotBlank(doc.getThumbnail())) {
            try {
                // Picture reuse is automatic
                pictureData = addPictureData(imageSource, ppt, doc.getThumbnail());
                // We reserve space for the picture, but we don't actually add it yet.
                // The reason is we may have to remove it later if it doesn't fit; but due to a quirk of OpenOffice,
                //   deleting the picture shape removes the pictureData as well; which is a problem since the
                //   pictureData can be shared between multiple pictures.
                pictureAnchor = new Rectangle2D.Double(xCursor, yCursor + thumbnailOffset + thumbMargin, thumbW,
                        thumbH);

                // If there is enough horizontal space, put the text summary to the right of the thumbnail image,
                //    otherwise put it under the thumbnail,
                if (listEl.getAnchor().getWidth() > 2.5 * thumbW) {
                    contentPara.setLeftMargin(thumbW);
                } else {
                    // A line break whose font size equals the thumbnail height reserves the vertical space.
                    contentPara.addLineBreak().setFontSize(thumbH);
                }

            } catch (RuntimeException e) {
                // if there's any errors, we'll just ignore the image
            }
        }

        final String rawSummary = doc.getSummary();
        if (StringUtils.isNotBlank(rawSummary)) {
            // HTML treats newlines and multiple whitespace as a single whitespace.
            final String summary = rawSummary.replaceAll("\\s+", " ");
            final Matcher matcher = highlightPattern.matcher(summary);
            // End of the previous highlight, i.e. start of the plain text not yet emitted.
            int idx = 0;

            while (matcher.find()) {
                final int start = matcher.start();

                // Plain text between the previous highlight and this one.
                if (idx < start) {
                    addTextRun(contentPara, summary.substring(idx, start), data.getSummaryFontSize(),
                            Color.DARK_GRAY);
                }

                // The highlighted text itself, without the placeholder tags, in bold.
                addTextRun(contentPara, matcher.group(1), data.getSummaryFontSize(), Color.DARK_GRAY)
                        .setBold(true);
                idx = matcher.end();
            }

            // Plain text after the last highlight (or the whole summary if there was none).
            if (idx < summary.length()) {
                addTextRun(contentPara, summary.substring(idx), data.getSummaryFontSize(), Color.DARK_GRAY);
            }
        }

        // Height used by this element: the tallest of text, icon and (if present) thumbnail.
        double elHeight = Math.max(listEl.getTextHeight(), iconHeight);
        if (pictureAnchor != null) {
            elHeight = Math.max(elHeight, pictureAnchor.getMaxY() - yCursor);
        }

        yCursor += elHeight;
        xCursor = xMargin + anchor.getMinX();

        docsOnPage++;

        if (yCursor > anchor.getMaxY()) {
            if (docsOnPage > 1) {
                // If we drew more than one list element on this page; and we exceeded the available space,
                //   delete the last element's shapes and redraw it on the next page.
                // We don't have to remove the picture since we never added it.
                sl.removeShape(listEl);
                if (icon != null) {
                    sl.removeShape(icon);
                }

                // Step back so the loop's ++docIdx revisits this same document.
                --docIdx;
            } else if (pictureAnchor != null) {
                // We've confirmed we need the picture, add it.
                sl.createPicture(pictureData).setAnchor(pictureAnchor);
            }

            // Force a new slide for the next document.
            sl = null;

            if (!paginate) {
                break;
            }
        } else {
            yCursor += listItemMargin;

            if (pictureAnchor != null) {
                // We've confirmed we need the picture, add it.
                sl.createPicture(pictureData).setAnchor(pictureAnchor);
            }
        }
    }
}

From source file:net.antoinecomte.regex.RegExTesterApplication.java

/**
 * Rebuilds the result panel for the given regular expression and sample text.
 * <p>
 * The panel is hidden when the expression is the empty string. Otherwise it
 * shows whether the whole text matches (with one line per capturing group),
 * the span of the first partial match if there is one, and the expression as
 * an escaped Java string. Any exception replaces the panel content with a
 * single label carrying the exception message.
 *
 * @param regexValue the regular expression typed by the user
 * @param textValue  the text to test the expression against
 */
private void showResult(String regexValue, String textValue) {
    try {
        result.setVisible(!"".equals(regexValue));

        final Label status = new Label("no match");
        status.addStyleName("h3 color");
        result.removeAllComponents();
        result.addComponent(status);

        final Matcher m = Pattern.compile(regexValue).matcher(textValue);

        // Full match: list every capturing group, then flip the status label.
        if (m.matches()) {
            final int groups = m.groupCount();
            for (int group = 1; group <= groups; group++) {
                final String captured = m.group(group);
                final Label groupLabel = new Label("group " + group + " = " + captured);
                groupLabel.addStyleName("h3 color");
                groupLabel.setSizeUndefined();
                result.addComponent(groupLabel);
            }
            status.setValue("match");
        }

        // Partial match: restart the matcher and report the first hit, if any.
        m.reset();
        if (m.find()) {
            final int from = m.start();
            final int to = m.end();
            final Label span = new Label("find=true, start = " + from + " end = " + to);
            span.addStyleName("h3 color");
            result.addComponent(span);
        }

        final String escaped = StringEscapeUtils.escapeJava(regexValue);
        final Label literal = new Label("java string : \"" + escaped + "\"");
        literal.addStyleName("small color");
        result.addComponent(literal);
    } catch (Exception e) {
        // Typically an invalid pattern: show the message instead of the results.
        final Label failure = new Label(e.getMessage());
        result.removeAllComponents();
        failure.addStyleName("error");
        result.addComponent(failure);
    }
}

From source file:com.zimbra.common.util.TemplateCompiler.java

/**
 * Compiles a template source file into the requested output format.
 * <p>
 * The input is read in full and scanned with {@code RE_TEMPLATE}. If nothing
 * matches, the whole file is converted as a single template named after
 * {@code pkg}. Otherwise each match is handled on its own: for the
 * {@code "properties"} format the complete matched text is written as an
 * escaped property value keyed by the template id; for any other format the
 * second capturing group is converted to JavaScript, and the first template
 * is additionally registered (and optionally defined) under its package.
 *
 * @param ifile         the template source file
 * @param ofile         the file to write
 * @param format        {@code "properties"} for a properties file; anything
 *                      else produces JavaScript
 * @param pkg           default package id for templates without an absolute id
 * @param authoritative passed through to {@code convertLines}
 * @param define        whether to emit an {@code AjxPackage.define} call for
 *                      the first template
 * @throws IOException if opening or reading the files fails
 */
public static void compile(File ifile, File ofile, String format, String pkg, boolean authoritative,
        boolean define) throws IOException {
    BufferedReader reader = null;
    PrintWriter writer = null;
    try {
        final boolean toProperties = format.equals("properties");
        reader = new BufferedReader(new FileReader(ifile));
        writer = new PrintWriter(new FileWriter(ofile));

        final String source = readLines(reader);
        final Matcher templates = RE_TEMPLATE.matcher(source);

        if (!templates.find()) {
            // No explicit template markup: the file as a whole is the template.
            convertLines(writer, pkg, source, null, authoritative);
            return;
        }

        boolean registrationPending = true;
        do {
            final Map<String, String> attrs = parseAttrs(templates.group(1));
            String templateId = attrs.get("id");
            String packageId = pkg;
            // NOTE: Template ids can be specified absolutely (i.e.
            //       overriding the default package) if the id starts
            //       with a forward slash (/), or if the id contains
            //       a hash mark (#). This allows a template file to
            //       override both types of template files (i.e. a
            //       single template per file or multiple templates
            //       per file).
            if (templateId != null) {
                final boolean hasHash = templateId.indexOf('#') != -1;
                if (hasHash || templateId.startsWith("/")) {
                    final String absolute = hasHash ? templateId : templateId + "#";
                    packageId = absolute.replaceAll("#.*$", "").replaceAll("^/", "").replace('/', '.');
                    templateId = absolute.replaceAll("^.*#", "");
                }
            }
            final String id;
            if (templateId == null || templateId.equals("")) {
                id = packageId;
            } else {
                id = packageId + "#" + templateId;
            }

            if (toProperties) {
                // copy to .properties file: key is the id, value is the full matched text
                printEscaped(writer, id);
                final String matched = templates.group();
                if (matched.indexOf('\n') == -1) {
                    writer.print(" = ");
                    printEscaped(writer, matched);
                } else {
                    // Multi-line value: one continuation line per source line.
                    writer.print(" =");
                    for (String piece : matched.split("\n")) {
                        writer.print("\\\n\t");
                        printEscaped(writer, piece);
                    }
                }
                writer.println();
            } else {
                // compile to JavaScript
                String body = templates.group(2);
                final String xmlSpace = attrs.get(A_XML_SPACE);
                final boolean preserve = xmlSpace != null && xmlSpace.equals(V_XML_SPACE_PRESERVE);
                if (!preserve) {
                    body = body.replaceAll(S_GT_LINESEP_LT, "><").trim();
                }
                convertLines(writer, id, body, attrs, authoritative);
                if (registrationPending) {
                    // Only the first template of the file defines/registers the package.
                    if (define) {
                        writer.println("AjxPackage.define(\"" + packageId + "\");");
                    }
                    registrationPending = false;
                    writer.print("AjxTemplate.register(\"" + packageId + "\", ");
                    writer.print("AjxTemplate.getTemplate(\"" + id + "\"), ");
                    writer.println("AjxTemplate.getParams(\"" + id + "\"));");
                }
                writer.println();
            }
        } while (templates.find());
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception e) {
                // ignore
            }
        }
        if (writer != null) {
            writer.close();
        }
    }
}

From source file:com.joliciel.frenchTreebank.export.FrenchTreebankTokenReader.java

PosTagSequence nextSentenceInternal() {
    MONITOR.startTask("nextSentenceInternal");
    try {//ww w  . java  2s .com
        Sentence sentence = treebankReader.nextSentence();
        LOG.debug("Sentence " + sentence.getSentenceNumber());
        List<Integer> tokenSplits = new ArrayList<Integer>();
        PosTagSet posTagSet = TalismaneSession.getPosTagSet();

        String text = sentence.getText();
        // get rid of duplicate white space
        Pattern duplicateWhiteSpace = Pattern.compile("\\s[\\s]+");
        text = duplicateWhiteSpace.matcher(text).replaceAll(" ");

        // there's no guarantee that the phrase units align to the original sentence text
        // given the issues we had for aligning sentences in the first place
        List<PhraseUnit> phraseUnits = sentence.getAllPhraseUnits();
        LOG.trace("Phrase units: " + phraseUnits.size());
        Pattern separators = Tokeniser.SEPARATORS;
        Pattern whitespace = Pattern.compile("\\s+");

        Matcher matcher = separators.matcher(text);
        List<String> allTokens = new ArrayList<String>();
        int currentPos = 0;
        while (matcher.find()) {
            if (matcher.start() > currentPos) {
                String leftoverToken = text.substring(currentPos, matcher.start());
                allTokens.add(leftoverToken);
            }
            String token = text.substring(matcher.start(), matcher.end());
            allTokens.add(token);
            currentPos = matcher.end();
        }
        if (currentPos < text.length())
            allTokens.add(text.substring(currentPos));

        com.joliciel.talismane.filters.Sentence oneSentence = this.filterService.getSentence(text);
        TokenSequence tokenSequence = this.tokeniserService.getTokenSequence(oneSentence);
        List<PosTaggedToken> posTaggedTokens = new ArrayList<PosTaggedToken>();

        PhraseUnitReader phraseUnitReader = new ComplexPhraseUnitReaderWithEmptyTokens(phraseUnits);

        phraseUnitReader.setTreebankService(treebankService);
        if (ftbPosTagMapper != null)
            phraseUnitReader.setFtbPosTagMapper(ftbPosTagMapper);

        String phraseUnitText = phraseUnitReader.nextString();

        LOG.trace("phrase unit: " + phraseUnitText);
        currentPos = 0;
        int lastSplit = 0;
        tokenSplits.add(0);

        while (phraseUnitText != null && phraseUnitText.length() == 0) {
            tokenSplits.add(currentPos);
            Token aToken = tokenSequence.addEmptyToken(currentPos);
            PosTag posTag = phraseUnitReader.getPosTag();
            Decision<PosTag> corpusDecision = posTagSet.createDefaultDecision(posTag);

            PosTaggedToken posTaggedToken = posTaggerService.getPosTaggedToken(aToken, corpusDecision);
            posTaggedTokens.add(posTaggedToken);
            phraseUnitText = phraseUnitReader.nextString();
        }
        boolean inPhraseUnit = false;
        boolean addEmptyTokenBeforeNextToken = false;
        PosTag emptyTokenPosTag = null;
        for (String token : allTokens) {
            if (LOG.isTraceEnabled())
                LOG.trace("token: " + token);
            currentPos += token.length();
            if ((!ignoreCase && phraseUnitText.equals(token))
                    || (ignoreCase && phraseUnitText.equalsIgnoreCase(token))) {
                // exact match

                if (addEmptyTokenBeforeNextToken) {
                    if (LOG.isTraceEnabled())
                        LOG.trace("Adding empty token at " + (currentPos - token.length()));
                    tokenSplits.add((currentPos - token.length()));
                    Token emptyToken = tokenSequence.addEmptyToken((currentPos - token.length()));
                    Decision<PosTag> emptyTokenDecision = posTagSet.createDefaultDecision(emptyTokenPosTag);
                    PosTaggedToken posTaggedToken2 = posTaggerService.getPosTaggedToken(emptyToken,
                            emptyTokenDecision);
                    posTaggedTokens.add(posTaggedToken2);
                    addEmptyTokenBeforeNextToken = false;
                }

                if (LOG.isTraceEnabled())
                    LOG.trace("Adding split " + currentPos);
                tokenSplits.add(currentPos);

                Token aToken = tokenSequence.addToken(lastSplit, currentPos);
                PosTag posTag = phraseUnitReader.getPosTag();
                Decision<PosTag> corpusDecision = posTagSet.createDefaultDecision(posTag);
                PosTaggedToken posTaggedToken = posTaggerService.getPosTaggedToken(aToken, corpusDecision);
                posTaggedTokens.add(posTaggedToken);

                lastSplit = currentPos;
                phraseUnitText = phraseUnitReader.nextString();
                if (LOG.isTraceEnabled())
                    LOG.trace("phrase unit: " + phraseUnitText);
                while (phraseUnitText != null && phraseUnitText.length() == 0) {
                    Token emptyToken = null;
                    emptyTokenPosTag = phraseUnitReader.getPosTag();
                    phraseUnitText = phraseUnitReader.nextString();
                    if (LOG.isTraceEnabled())
                        LOG.trace("phrase unit: " + phraseUnitText);

                    // Empty tokens need to be attached either to the right (auquel, duquel)
                    // or to the left (du, des)
                    if (phraseUnitText.equals("duquel") || phraseUnitText.equals("auquel")
                            || phraseUnitText.equals("desquels") || phraseUnitText.equals("auxquels")
                            || phraseUnitText.equals("desquelles") || phraseUnitText.equals("auxquelles")) {
                        // attach empty token to the "duquel" that follows it
                        addEmptyTokenBeforeNextToken = true;
                    } else {
                        if (LOG.isTraceEnabled())
                            LOG.trace("Adding empty token at " + currentPos);
                        tokenSplits.add(currentPos);
                        emptyToken = tokenSequence.addEmptyToken(currentPos);
                        Decision<PosTag> emptyTokenDecision = posTagSet.createDefaultDecision(emptyTokenPosTag);
                        PosTaggedToken posTaggedToken2 = posTaggerService.getPosTaggedToken(emptyToken,
                                emptyTokenDecision);
                        posTaggedTokens.add(posTaggedToken2);
                    }
                }
                inPhraseUnit = false;
            } else if (phraseUnitText.length() >= token.length() && ((!ignoreCase
                    && phraseUnitText.substring(0, token.length()).equals(token))
                    || (ignoreCase && phraseUnitText.substring(0, token.length()).equalsIgnoreCase(token)))) {
                // the current phrase unit text starts with this token
                phraseUnitText = phraseUnitText.substring(token.length());
                if (LOG.isTraceEnabled())
                    LOG.trace("phrase unit: " + phraseUnitText);
                inPhraseUnit = true;
            } else if (token.length() == 1 && whitespace.matcher(token).matches()) {
                // white space, always add split unless we're already inside white space
                if (!inPhraseUnit) {
                    if (LOG.isTraceEnabled())
                        LOG.trace("Adding split " + currentPos);
                    tokenSplits.add(currentPos);
                    tokenSequence.addToken(lastSplit, currentPos);
                    lastSplit = currentPos;
                }
            } else {
                // non-white space, what to do? either we skip the token, or we skip the phrase unit!
                // for now let's assume it never happens and see what results!
                int pos = 0;
                StringBuilder sb = new StringBuilder();
                for (int split : tokenSplits) {
                    String aToken = text.substring(pos, split);
                    sb.append('|');
                    sb.append(aToken);
                    pos = split;
                }
                LOG.info(sb.toString());
                LOG.info("File: " + sentence.getFile().getFileName());
                LOG.info("Sentence: " + text);
                if (csvFileErrorWriter != null) {
                    try {
                        csvFileErrorWriter.write(CSVFormatter.format(phraseUnitText) + ",");
                        for (String info : phraseUnitReader.getCurrentInfo())
                            csvFileErrorWriter.write(CSVFormatter.format(info) + ",");
                        csvFileErrorWriter.write(CSVFormatter.format(token) + ",");
                        csvFileErrorWriter.write(sentence.getFile().getFileName() + ",");
                        csvFileErrorWriter.write(sentence.getSentenceNumber() + ",");
                        csvFileErrorWriter.write(CSVFormatter.format(sentence.getText()) + ",");
                        csvFileErrorWriter.write("\n");
                        csvFileErrorWriter.flush();
                    } catch (IOException ioe) {
                        throw new RuntimeException(ioe);
                    }
                    break;
                } else {
                    // instead of throwing an error, write these to a file (or do both)
                    // so we can catch them all in one fell swoop
                    throw new RuntimeException("Unexpected text: " + token);
                }
            }
        }
        if (lastSplit < currentPos) {
            tokenSplits.add(currentPos);
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug(text);

            int pos = 0;
            StringBuilder sb = new StringBuilder();
            for (int split : tokenSplits) {
                String aToken = text.substring(pos, split);
                sb.append('|');
                sb.append(aToken);
                pos = split;
            }
            LOG.debug(sb.toString());
        }

        for (TokenSequenceFilter tokenSequenceFilter : this.tokenSequenceFilters) {
            if (LOG.isTraceEnabled())
                LOG.trace("Applying filter: " + tokenSequenceFilter.getClass().getSimpleName());
            tokenSequenceFilter.apply(tokenSequence);
        }

        if (tokenFilterWrapper == null) {
            tokenFilterWrapper = tokenFilterService.getTokenSequenceFilter(this.tokenFilters);
        }
        tokenFilterWrapper.apply(tokenSequence);

        tokenSequence.finalise();

        PosTagSequence posTagSequence = this.posTaggerService.getPosTagSequence(tokenSequence,
                allTokens.size() / 2);
        int i = 0;
        for (Token token : tokenSequence) {
            if (LOG.isTraceEnabled())
                LOG.trace("Token : \"" + token.getText() + "\" (was \"" + token.getOriginalText() + "\")");
            PosTaggedToken posTaggedToken = posTaggedTokens.get(i);
            if (token.equals(posTaggedToken.getToken())) {
                posTagSequence.addPosTaggedToken(posTaggedToken);
                i++;
            } else if (token.getStartIndex() == token.getEndIndex()) {
                LOG.debug("Adding null pos tag at position " + token.getStartIndex());
                Decision<PosTag> nullPosTagDecision = posTagSet.createDefaultDecision(PosTag.NULL_POS_TAG);
                PosTaggedToken emptyTagToken = posTaggerService.getPosTaggedToken(token, nullPosTagDecision);
                posTagSequence.addPosTaggedToken(emptyTagToken);
            } else {
                throw new RuntimeException("Expected only empty tokens added. Postag Token = "
                        + posTaggedToken.getToken().getText() + ", start: " + token.getStartIndex() + ", end:"
                        + token.getEndIndex());
            }
        }

        if (useCompoundPosTags) {
            PosTagSequence newSequence = this.posTaggerService.getPosTagSequence(tokenSequence,
                    allTokens.size() / 2);
            PosTaggedToken lastPosTaggedToken = null;
            i = 0;
            for (PosTaggedToken posTaggedToken : posTagSequence) {
                boolean removed = false;
                if (posTaggedToken.getToken().isEmpty()) {
                    String lastWord = "";
                    if (lastPosTaggedToken != null)
                        lastWord = lastPosTaggedToken.getToken().getOriginalText().toLowerCase();
                    if (lastWord.equals("des") || lastWord.equals("du") || lastWord.equals("aux")
                            || lastWord.equals("au") || lastWord.endsWith(" des") || lastWord.endsWith(" du")
                            || lastWord.endsWith(" aux") || lastWord.endsWith(" au")
                            || lastWord.endsWith("'aux") || lastWord.endsWith("'au")) {
                        if (lastWord.equals("des") || lastWord.equals("du") || lastWord.equals("aux")
                                || lastWord.equals("au")) {
                            if (lastPosTaggedToken.getTag().getCode().equals("P")) {
                                lastPosTaggedToken.setTag(posTagSet.getPosTag("P+D"));
                                lastPosTaggedToken.getToken().setText(lastWord);
                            }
                        }
                        posTaggedToken.setTag(PosTag.NULL_POS_TAG);
                        tokenSequence.removeEmptyToken(posTaggedToken.getToken());
                        removed = true;
                    } else if (i == posTagSequence.size() - 1) {
                        // last token in sequence
                        // need to remove it now, since it won't get removed in the next iteration
                        tokenSequence.removeEmptyToken(posTaggedToken.getToken());
                        removed = true;
                    }
                } else {
                    newSequence.addPosTaggedToken(posTaggedToken);
                }
                if (lastPosTaggedToken != null && lastPosTaggedToken.getToken().isEmpty()) {
                    String word = posTaggedToken.getToken().getOriginalText().toLowerCase();
                    if (word.equals("duquel") || word.equals("desquels") || word.equals("desquelles")
                            || word.equals("auquel") || word.equals("auxquels") || word.equals("auxquelles")) {
                        posTaggedToken.setTag(posTagSet.getPosTag("P+PRO"));
                        lastPosTaggedToken.setTag(PosTag.NULL_POS_TAG);
                        posTaggedToken.getToken().setText(word);
                        tokenSequence.removeEmptyToken(lastPosTaggedToken.getToken());
                    } else if (word.equals("dudit")) {
                        posTaggedToken.setTag(posTagSet.getPosTag("P+D"));
                        lastPosTaggedToken.setTag(PosTag.NULL_POS_TAG);
                        posTaggedToken.getToken().setText(word);
                        tokenSequence.removeEmptyToken(lastPosTaggedToken.getToken());
                    } else {
                        LOG.info("Not expecting empty token here (index "
                                + lastPosTaggedToken.getToken().getIndex() + ", next token = " + word + "): "
                                + posTagSequence);
                        lastPosTaggedToken.setTag(PosTag.NULL_POS_TAG);
                        tokenSequence.removeEmptyToken(lastPosTaggedToken.getToken());
                    }
                }
                if (!removed)
                    lastPosTaggedToken = posTaggedToken;
                i++;
            }
            posTagSequence = newSequence;
            tokenSequence.finalise();
        }
        for (PosTagSequenceFilter posTagSequenceFilter : this.posTagSequenceFilters) {
            posTagSequenceFilter.apply(posTagSequence);
        }

        return posTagSequence;
    } finally {
        MONITOR.endTask("nextSentenceInternal");
    }
}

From source file:com.stealthyone.mcb.mcml.MCMLBuilder.java

/**
 * Parses an MCML string into its parts.
 *
 * <p>String-valued replacements are substituted into the raw text first, and every
 * {@code ChatColor.COLOR_CHAR} is replaced with {@code '&'}. The resulting text is then scanned
 * for text groups; each group may be followed by up to two events. Text found between groups
 * (and after the last one) is stored as ungrouped parts.
 *
 * <p>All matching is performed on the processed {@code rawText}, so that every index used for
 * slicing refers to the same string that was matched.
 *
 * @param input        the raw MCML text; must not be null
 * @param replacements optional placeholder-to-value map; only {@code String} values are
 *                     substituted into the text, may be null
 */
public MCMLBuilder(String input, Map<String, Object> replacements) {
    Validate.notNull(input, "Input cannot be null.");

    this.rawText = input;

    if (this.replacements != null && replacements != null) {
        this.replacements.putAll(replacements);

        for (Entry<String, Object> entry : replacements.entrySet()) {
            if (entry.getValue() instanceof String) {
                this.rawText = rawText.replace(entry.getKey(), (String) entry.getValue());
            }
        }
    }

    this.rawText = rawText.replace(ChatColor.COLOR_CHAR, '&');

    // Identify text groups
    int lastIndex = 0;

    // Match against the processed text: matching the unprocessed input would yield indices
    // that no longer line up with rawText once a replacement has changed its length.
    final Matcher matcher = PATTERN_TEXT_GROUP.matcher(rawText);
    while (matcher.find()) {
        TempPart part = new TempPart(matcher.group(1));
        if (!parts.isEmpty()) {
            // Pieces without an explicit colour inherit the formatting of the last piece
            // of the previously added part.
            TempPart prevPart = parts.get(parts.size() - 1);
            TextPiece lastPiece = prevPart.text.get(prevPart.text.size() - 1);

            for (TextPiece piece : part.text) {
                if (piece.color == null) {
                    piece.color = lastPiece.color;
                    piece.italicize = lastPiece.italicize;
                    piece.bold = lastPiece.bold;
                    piece.underline = lastPiece.underline;
                    piece.strikeout = lastPiece.strikeout;
                    piece.magic = lastPiece.magic;
                }
            }
        }

        if (matcher.start() > lastIndex) {
            // Handle ungrouped text
            parts.add(new TempPart(rawText.substring(lastIndex, matcher.start())));
        }

        lastIndex = matcher.end();

        // Check for event; the event matchers work on the remainder of the text, so their
        // end positions are relative to lastIndex.
        final Matcher eventMatcher = PATTERN_EVENT.matcher(rawText.substring(lastIndex));
        if (eventMatcher.find()) {
            handleEvent(part, eventMatcher);
            lastIndex += eventMatcher.end();

            // A group may carry a second event
            final Matcher secEventMatcher = PATTERN_EVENT.matcher(rawText.substring(lastIndex));
            if (secEventMatcher.find()) {
                handleEvent(part, secEventMatcher);
                lastIndex += secEventMatcher.end();
            }
        }

        parts.add(part);
    }

    if (lastIndex != rawText.length()) {
        // Trailing text after the last group
        TempPart ungroupedPart = new TempPart(rawText.substring(lastIndex));

        if (!parts.contains(ungroupedPart)) {
            parts.add(ungroupedPart);
        }
    }
}

From source file:tr.edu.gsu.nerwip.recognition.internal.modelless.subee.Subee.java

/**
 * Handles the name of the person described in the processed article. For this matter,
 * we consider the article title and name, as well as the first sentence, which generally
 * starts with the full name of the person.
 * /*from w w  w .  j  a  v a2  s  .  c  o m*/
 * @param article 
 *       Article to process.
 * @return
 *       List of possible entities based on the analysis of the article title and name.
 * 
 * @throws ClientProtocolException
 *       Problem while accessing Freebase.
 * @throws ParseException
 *       Problem while accessing Freebase.
 * @throws IOException
 *       Problem while accessing Freebase.
 * @throws org.json.simple.parser.ParseException
 *       Problem while accessing Freebase.
 */
private List<AbstractEntity<?>> processMainName(Article article)
        throws ClientProtocolException, ParseException, IOException, org.json.simple.parser.ParseException {
    logger.increaseOffset();
    List<AbstractEntity<?>> result = new ArrayList<AbstractEntity<?>>();
    String rawText = article.getRawText();

    // init candidate strings with article name and title 
    Set<String> candidateStrings = new TreeSet<String>();
    String articleTitle = article.getTitle();
    //debug
    //if(articleTitle.equals("Alfred Lothar Wegener"))
    //   System.out.print("");
    logger.log("Article title: " + articleTitle);
    candidateStrings.add(articleTitle);
    String articleName = article.getName();
    logger.log("Article name: " + articleName);
    articleName = articleName.replace('_', ' ').trim();
    candidateStrings.add(articleName);

    // process the beginning of the first sentence
    // we look for the string before the first parenthesis (usually containing birth info)
    // if there's none, we just ignore this potential information source
    Pattern p = Pattern.compile("^[^\\.]+?\\(");
    Matcher m = p.matcher(rawText);
    if (m.find()) {
        int startPos = m.start();
        if (startPos == 0) {
            int endPos = m.end();
            String persName = rawText.substring(0, endPos - 1);
            persName = persName.trim();
            int wordCount = persName.length() - persName.replaceAll(" ", "").length();
            if (wordCount > 6)
                logger.log(
                        "Not able to extract person name from first sentence (too many words before the parenthesis): \""
                                + rawText.substring(0, 75) + "\"");
            else {
                logger.log("Person name: " + persName);
                candidateStrings.add(persName);
            }
        }
    } else
        logger.log("Not able to extract person name from first sentence (can't find the parenthesis): \""
                + rawText.substring(0, 75) + "\"");

    // possibly remove double quotes (especially for the nicknames)
    List<String> nickFull = new ArrayList<String>();
    Set<String> copy = new TreeSet<String>(candidateStrings);
    candidateStrings.clear();
    for (String candidateString : copy) {
        if (candidateString.contains("\"")) {
            nickFull.add(candidateString);
            candidateString = candidateString.replaceAll("\"", "");
        }
        candidateStrings.add(candidateString);
    }

    // possibly remove an indication in parenthesis at the end (especially for the titles)
    copy = new TreeSet<String>(candidateStrings);
    candidateStrings.clear();
    for (String candidateString : copy) {
        if (candidateString.endsWith(")")) {
            String temp[] = candidateString.split("\\(");
            candidateString = temp[0].trim();
        }
        candidateStrings.add(candidateString);
    }

    // add the lastname alone; only with the preceeding word; only with the 2 preeceding words, etc.
    copy = new TreeSet<String>(candidateStrings);
    for (String candidateString : copy) {
        String split[] = candidateString.split(" ");
        for (int i = split.length - 1; i >= 0; i--) {
            String temp = "";
            for (int j = i; j < split.length; j++)
                temp = temp + split[j] + " ";
            temp = temp.trim();
            candidateStrings.add(temp);
        }
    }

    // add very first and very last names (for more than 2 words)
    copy = new TreeSet<String>(candidateStrings);
    for (String candidateString : copy) {
        String split[] = candidateString.split(" ");
        if (split.length > 2) {
            String temp = split[0] + " " + split[split.length - 1];
            candidateStrings.add(temp);
        }
    }

    // add variants with initials instead of firstnames
    copy = new TreeSet<String>(candidateStrings);
    for (String candidateString : copy) {
        String split[] = candidateString.split(" ");
        if (split.length > 1) {
            String initials1 = "";
            String initials2 = "";
            for (int i = 0; i < split.length - 1; i++) {
                initials1 = initials1 + split[i].substring(0, 1).toUpperCase(Locale.ENGLISH) + ". ";
                initials2 = initials2 + split[i].substring(0, 1).toUpperCase(Locale.ENGLISH) + ".";
            }
            initials1 = initials1 + split[split.length - 1];
            initials2 = initials2 + " " + split[split.length - 1];
            candidateStrings.add(initials1);
            candidateStrings.add(initials2);
        }
    }

    // add the original version of the nicknames
    candidateStrings.addAll(nickFull);

    // look for similar strings in the text
    for (String expr : candidateStrings) {
        String escapedStr = Pattern.quote(expr);
        p = Pattern.compile("\\b" + escapedStr + "\\b");
        m = p.matcher(rawText);
        while (m.find()) {
            int startPos = m.start();
            int endPos = m.end();
            String valueStr = m.group();
            AbstractEntity<?> ent = AbstractEntity.build(EntityType.PERSON, startPos, endPos,
                    RecognizerName.SUBEE, valueStr);
            result.add(ent);
        }
    }

    if (result.isEmpty())
        logger.log("WARNING: title not found at all in the text, which is unusual");

    logger.decreaseOffset();
    return result;
}

From source file:com.manydesigns.portofino.pageactions.text.TextAction.java

/**
 * Replaces every occurrence of {@code PORTOFINO_ATTACHMENT_PATTERN} in the content with an
 * {@code attribute="url"} pair, where the URL is the HTML-escaped result of
 * {@code generateViewAttachmentUrl} for the attachment id captured in group 1.
 *
 * @param content the text containing the attachment references
 * @return the content with every attachment reference replaced; text outside the matches is
 *         copied unchanged
 */
protected String restoreAttachmentUrls(String content) {
    Matcher attachmentMatcher = Pattern.compile(PORTOFINO_ATTACHMENT_PATTERN).matcher(content);
    StringBuffer restored = new StringBuffer();
    while (attachmentMatcher.find()) {
        String attachmentId = attachmentMatcher.group(1);

        //Default to src for old texts
        String attributeName = "src";
        if (attachmentMatcher.groupCount() >= 3 && attachmentMatcher.group(3) != null) {
            attributeName = attachmentMatcher.group(3);
        }

        String escapedUrl = StringEscapeUtils.escapeHtml(generateViewAttachmentUrl(attachmentId));
        String attribute = attributeName + "=\"" + escapedUrl + "\"";

        // Quote the replacement so that '$' and '\' in the URL are inserted literally
        attachmentMatcher.appendReplacement(restored, Matcher.quoteReplacement(attribute));
    }
    attachmentMatcher.appendTail(restored);
    return restored.toString();
}

From source file:gtu._work.ui.RegexReplacer.java

/**
 * Replaces every match of a regular expression in a text with an expanded template.
 * <p>
 * For each match, the template is expanded in one of two ways, depending on the trade-off
 * configuration: when no FreeMarker key is configured, each {@code #n#} placeholder is
 * replaced with capture group {@code n} (group 0 being the whole match); otherwise the
 * template is processed by {@code FreeMarkerSimpleUtil.replace}, with the list of groups
 * bound to the configured key. A group that did not take part in the match is substituted
 * as an empty string in the placeholder mode.
 * <p>
 * If anything fails, an error dialog is shown and the original text is returned unchanged.
 *
 * @param fromPattern
 *            regular expression to search for
 * @param toFormat
 *            template used to build the replacement of each match
 * @param replaceText
 *            text in which the replacement is performed
 * @return the text with every match replaced, or the original text on error
 */
String replacer(String fromPattern, String toFormat, String replaceText) {
    String errorRtn = replaceText;
    try {
        int patternFlag = 0;

        // multi-line mode: '.' also matches line terminators, '^' and '$' match at line breaks
        if (multiLineCheckBox.isSelected()) {
            patternFlag = Pattern.DOTALL | Pattern.MULTILINE;
        }

        Pattern pattern = Pattern.compile(fromPattern, patternFlag);
        Matcher matcher = pattern.matcher(replaceText);

        StringBuilder sb = new StringBuilder();

        TradeOffConfig config = this.getTradeOffConfig();

        int startPos = 0;
        while (matcher.find()) {
            String tempStr = toFormat;
            // copy the unmatched text preceding this match
            sb.append(replaceText, startPos, matcher.start());

            if (StringUtils.isBlank(config.fremarkerKey)) {
                // regex
                for (int ii = 0; ii <= matcher.groupCount(); ii++) {
                    String group = matcher.group(ii);
                    System.out.println(ii + " -- " + group);
                    // an optional group that did not participate in the match is null;
                    // the placeholder is a literal, so a plain (non-regex) replace is enough
                    tempStr = tempStr.replace("#" + ii + "#", group == null ? "" : group);
                }
            } else {
                // freemarker
                Map<String, Object> root = new HashMap<String, Object>();
                TreeMap<Integer, Object> lstMap = new TreeMap<Integer, Object>();
                for (int ii = 0; ii <= matcher.groupCount(); ii++) {
                    lstMap.put(ii, matcher.group(ii));
                }
                root.put(StringUtils.trimToEmpty(config.fremarkerKey), lstMap.values());
                System.out.println("template Map : " + root);
                tempStr = FreeMarkerSimpleUtil.replace(tempStr, root);
            }

            sb.append(tempStr);
            startPos = matcher.end();
        }
        // copy whatever follows the last match
        sb.append(replaceText.substring(startPos));

        return sb.toString();
    } catch (Exception ex) {
        JOptionPaneUtil.newInstance().iconErrorMessage().showMessageDialog(ex.getMessage(), getTitle());
        return errorRtn;
    }
}