org.languagetool.rules.patterns.MatchState.java Source code

Introduction

Here is the source code for org.languagetool.rules.patterns.MatchState.java
Source

/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * Copyright (C) 2013 Stefan Lotties
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.patterns;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.TreeSet;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.Language;
import org.languagetool.rules.patterns.Match.IncludeRange;
import org.languagetool.synthesis.Synthesizer;
import org.languagetool.tools.StringTools;

import static org.languagetool.JLanguageTool.PARAGRAPH_END_TAGNAME;
import static org.languagetool.JLanguageTool.SENTENCE_END_TAGNAME;
import static org.languagetool.JLanguageTool.SENTENCE_START_TAGNAME;

/**
 * The state of a matching process. This state is <strong>NOT</strong> thread-safe.
 *
 * <p>An instance pairs a {@link Match} definition with the mutable data collected while
 * the match is applied: the token to be formatted, the originally matched token (only
 * tracked when the match uses a static lemma) and the text of any skipped tokens.
 *
 * @author Stefan Lotties
 * @since 2.3
 */
public class MatchState {

    /** The match definition that drives all formatting decisions. */
    private final Match match;
    /** Used to generate word forms; may be {@code null}, in which case nothing is synthesized. */
    private final Synthesizer synthesizer;

    /** The token that gets formatted; may be {@code null} (see {@link #setToken(AnalyzedTokenReadings[], int, int)}). */
    private AnalyzedTokenReadings formattedToken;
    /** The token handed to {@link #setToken(AnalyzedTokenReadings)}; only stored for static lemmas. */
    private AnalyzedTokenReadings matchedToken;
    /** Text of the skipped tokens to append to the result, or an empty string. */
    private String skippedTokens;

    /**
     * Creates the state for the given match. If the match defines a non-empty lemma, a
     * token built from that lemma and the match's POS tag becomes the token to format.
     *
     * @param match the match definition
     * @param synthesizer the synthesizer used to generate word forms, may be {@code null}
     */
    public MatchState(Match match, Synthesizer synthesizer) {
        this.match = match;
        this.synthesizer = synthesizer;
        String lemma = match.getLemma();
        if (!StringUtils.isEmpty(lemma)) {
            formattedToken = new AnalyzedTokenReadings(new AnalyzedToken(lemma, match.getPosTag(), lemma), 0);
        }
    }

    /**
     * Sets the token this state works on. For a static lemma the token is only remembered
     * as the matched token (the token to format stays untouched); otherwise it becomes the
     * token to format.
     *
     * @param token the matched token
     */
    public void setToken(AnalyzedTokenReadings token) {
        if (match.isStaticLemma()) {
            this.matchedToken = token;
        } else {
            this.formattedToken = token;
        }
    }

    /**
     * Sets the token to be formatted etc. and includes the support for
     * including the skipped tokens.
     *
     * <p>Indexes beyond the end of {@code tokens} are tolerated: the last token is used as
     * the token to format and only skipped tokens that actually exist are collected.
     *
     * @param tokens Array of tokens
     * @param index Index of the token to be formatted
     * @param next Position of the next token (the skipped tokens are the ones between the tokens[index] and tokens[next]
     */
    public final void setToken(AnalyzedTokenReadings[] tokens, int index, int next) {
        int idx = index;
        if (index >= tokens.length) {
            // TODO: hacky workaround, find a proper solution. See EnglishPatternRuleTest.testBug()
            idx = tokens.length - 1;
        }
        setToken(tokens[idx]);
        IncludeRange includeSkipped = match.getIncludeSkipped();
        if (next > 1 && includeSkipped != IncludeRange.NONE) {
            StringBuilder sb = new StringBuilder();
            if (includeSkipped == IncludeRange.FOLLOWING) {
                // only the skipped tokens are wanted, not the token itself
                formattedToken = null;
            }
            int firstSkipped = index + 1;
            // never read past the end of the array, even if index/next point beyond it
            int end = Math.min(index + next, tokens.length);
            for (int k = firstSkipped; k < end; k++) {
                // no leading space when the result consists of the skipped tokens only
                boolean leadingTokenOfFollowing = k == firstSkipped && includeSkipped == IncludeRange.FOLLOWING;
                if (tokens[k].isWhitespaceBefore() && !leadingTokenOfFollowing) {
                    sb.append(' ');
                }
                sb.append(tokens[k].getToken());
            }
            skippedTokens = sb.toString();
        } else {
            skippedTokens = "";
        }
    }

    /**
     * Builds new readings for the token to format, applying the match's regular expression
     * replacement, case conversion and POS tag settings.
     *
     * @return the newly built readings, or the current token to format (possibly
     *         {@code null}) if no new reading was produced
     */
    public final AnalyzedTokenReadings filterReadings() {
        List<AnalyzedToken> l = new ArrayList<>();
        if (formattedToken != null) {
            if (match.isStaticLemma()) {
                // Note: we want the token without ignored characters so we can't use matchedToken.getToken()
                matchedToken.leaveReading(new AnalyzedToken(matchedToken.getReadings().get(0).getToken(),
                        match.getPosTag(), formattedToken.getToken()));
                formattedToken = matchedToken;
            }
            // Note: we want the token without ignored characters so we can't use formattedToken.getToken()
            String token = formattedToken.getAnalyzedToken(0).getToken();
            Pattern regexMatch = match.getRegexMatch();
            String regexReplace = match.getRegexReplace();
            if (regexMatch != null && regexReplace != null) {
                /* only replace if it is something to replace */
                token = regexMatch.matcher(token).replaceAll(regexReplace);
            }
            token = convertCase(token, token, null);

            String posTag = match.getPosTag();
            if (posTag != null) {
                int numRead = formattedToken.getReadingsLength();
                if (match.isPostagRegexp()) {
                    Pattern pPosRegexMatch = match.getPosRegexMatch();
                    String posTagReplace = match.getPosTagReplace();
                    for (int i = 0; i < numRead; i++) {
                        AnalyzedToken reading = formattedToken.getAnalyzedToken(i);
                        String testTag = reading.getPOSTag();
                        if (testTag != null && pPosRegexMatch.matcher(testTag).matches()) {
                            String targetPosTag = testTag;
                            if (posTagReplace != null) {
                                targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(posTagReplace);
                            }
                            AnalyzedToken newReading = new AnalyzedToken(token, targetPosTag, reading.getLemma());
                            newReading.setWhitespaceBefore(formattedToken.isWhitespaceBefore());
                            l.add(newReading);
                        }
                    }
                    if (l.isEmpty()) {
                        // no reading matched the POS tag regexp: fall back to the plain POS tag
                        l.addAll(getNewToken(numRead, token));
                    }
                } else {
                    l.addAll(getNewToken(numRead, token));
                }
                // keep the sentence/paragraph end markers of the original token
                String lemma = formattedToken.getAnalyzedToken(0).getLemma();
                if (formattedToken.isSentenceEnd()) {
                    l.add(new AnalyzedToken(formattedToken.getToken(), SENTENCE_END_TAGNAME, lemma));
                }
                if (formattedToken.isParagraphEnd()) {
                    l.add(new AnalyzedToken(formattedToken.getToken(), PARAGRAPH_END_TAGNAME, lemma));
                }
            }
        }
        if (l.isEmpty()) {
            return formattedToken;
        }
        final AnalyzedTokenReadings anTkRead = new AnalyzedTokenReadings(l.toArray(new AnalyzedToken[0]),
                formattedToken.getStartPos());
        // TODO: in case original had ignored characters we want to restore readings.token
        // but there's no setToken() available :(
        //    anTkRead.setToken(formattedToken.getToken());

        anTkRead.setWhitespaceBefore(formattedToken.isWhitespaceBefore());
        if (!formattedToken.getChunkTags().isEmpty()) {
            anTkRead.setChunkTags(formattedToken.getChunkTags());
        }
        if (formattedToken.isImmunized()) {
            anTkRead.immunize();
        }
        return anTkRead;
    }

    /**
     * Converts case of the string token according to match element attributes.
     * @param s Token to be converted.
     * @param sample the sample string used to determine how the original string looks like (used only on case preservation)
     * @param lang language handed on to the case conversion; callers in this class may pass {@code null}
     * @return Converted string.
     */
    String convertCase(String s, String sample, Language lang) {
        return CaseConversionHelper.convertCase(match.getCaseConversionType(), s, sample, lang);
    }

    /**
     * Creates one reading carrying the match's POS tag for every reading of the token to
     * format that has a POS tag.
     *
     * @param numRead number of readings of the token to format
     * @param token the text of the new readings
     * @return the new readings, possibly empty
     */
    private List<AnalyzedToken> getNewToken(int numRead, String token) {
        String posTag = match.getPosTag();
        List<AnalyzedToken> list = new ArrayList<>();
        // deliberately declared outside the loop: a lemma found once is kept for later readings
        String lemma = "";
        for (int j = 0; j < numRead; j++) {
            AnalyzedToken reading = formattedToken.getAnalyzedToken(j);
            String tempPosTag = reading.getPOSTag();
            if (tempPosTag != null) {
                if (tempPosTag.equals(posTag) && reading.getLemma() != null) {
                    lemma = reading.getLemma();
                }
                if (StringTools.isEmpty(lemma)) {
                    lemma = formattedToken.getAnalyzedToken(0).getLemma();
                }
                AnalyzedToken newReading = new AnalyzedToken(token, posTag, lemma);
                newReading.setWhitespaceBefore(formattedToken.isWhitespaceBefore());
                list.add(newReading);
            }
        }
        return list;
    }

    /**
     * Gets all strings formatted using the match element.
     *
     * @param lang language used for case conversion and, if the match checks spelling, for
     *        tagging the results; if {@code null} the spelling check is skipped
     * @return the formatted strings; entries can be empty strings
     * @throws IOException declared for the synthesizer and tagger calls
     */
    public final String[] toFinalString(Language lang) throws IOException {
        String[] formattedString = new String[1];
        if (formattedToken != null) {
            int readingCount = formattedToken.getReadingsLength();
            formattedString[0] = formattedToken.getToken();

            Pattern pRegexMatch = match.getRegexMatch();
            String regexReplace = match.getRegexReplace();
            // same guard as in filterReadings(): a pattern without a replacement changes nothing
            if (pRegexMatch != null && regexReplace != null) {
                formattedString[0] = pRegexMatch.matcher(formattedString[0]).replaceAll(regexReplace);
            }

            String posTag = match.getPosTag();
            if (posTag != null) {
                if (synthesizer == null) {
                    formattedString[0] = formattedToken.getToken();
                } else if (match.isPostagRegexp()) {
                    formattedString = synthesizeByPosTagRegexp(readingCount);
                } else {
                    formattedString = synthesizeByPosTag(posTag, readingCount);
                }
            }
        }
        String original;
        if (match.isStaticLemma()) {
            original = matchedToken != null ? matchedToken.getToken() : "";
        } else {
            original = formattedToken != null ? formattedToken.getToken() : "";
        }
        for (int i = 0; i < formattedString.length; i++) {
            formattedString[i] = convertCase(formattedString[i], original, lang);
        }
        // TODO should case conversion happen before or after including skipped tokens?
        IncludeRange includeSkipped = match.getIncludeSkipped();
        if (includeSkipped != IncludeRange.NONE && skippedTokens != null && !skippedTokens.isEmpty()) {
            for (int i = 0; i < formattedString.length; i++) {
                String prefix = formattedString[i] == null ? "" : formattedString[i];
                formattedString[i] = prefix + skippedTokens;
            }
        }
        if (match.checksSpelling() && lang != null) {
            List<String> formattedStringElements = Arrays.asList(formattedString);
            // tagger-based speller
            List<AnalyzedTokenReadings> analyzed = lang.getTagger().tag(formattedStringElements);
            for (int i = 0; i < formattedString.length; i++) {
                AnalyzedToken analyzedToken = analyzed.get(i).getAnalyzedToken(0);
                if (analyzedToken.getLemma() == null && analyzedToken.hasNoTag()) {
                    formattedString[i] = "";
                }
            }
        }
        return formattedString;
    }

    /**
     * Synthesizes the word forms of the token to format for a POS tag given as regular
     * expression. Requires a non-null synthesizer and a non-null token to format.
     *
     * @param readingCount number of readings of the token to format
     * @return the sorted, distinct word forms; if there are none, a single-element array
     *         holding an empty string (match checks spelling) or the token in parentheses
     */
    private String[] synthesizeByPosTagRegexp(int readingCount) throws IOException {
        TreeSet<String> wordForms = new TreeSet<>();
        // true while the readings without lemma seen so far suggest the token has one form only
        boolean oneForm = false;
        for (int k = 0; k < readingCount; k++) {
            AnalyzedToken reading = formattedToken.getAnalyzedToken(k);
            if (reading.getLemma() != null) {
                continue;
            }
            String posUnique = reading.getPOSTag();
            if (posUnique == null) {
                wordForms.add(formattedToken.getToken());
                oneForm = true;
            } else if (SENTENCE_START_TAGNAME.equals(posUnique)
                    || SENTENCE_END_TAGNAME.equals(posUnique)
                    || PARAGRAPH_END_TAGNAME.equals(posUnique)) {
                if (!oneForm) {
                    wordForms.add(formattedToken.getToken());
                }
                oneForm = true;
            } else {
                oneForm = false;
            }
        }
        String targetPosTag = getTargetPosTag();
        if (!oneForm) {
            for (int i = 0; i < readingCount; i++) {
                String[] possibleWordForms = synthesizer.synthesize(formattedToken.getAnalyzedToken(i),
                        targetPosTag, true);
                if (possibleWordForms != null && possibleWordForms.length > 0) {
                    wordForms.addAll(Arrays.asList(possibleWordForms));
                }
            }
        }
        if (wordForms.isEmpty()) {
            String fallback = match.checksSpelling() ? "" : "(" + formattedToken.getToken() + ")";
            return new String[] {fallback};
        }
        return wordForms.toArray(new String[0]);
    }

    /**
     * Synthesizes the word forms of the token to format for a plain POS tag. Requires a
     * non-null synthesizer and a non-null token to format.
     *
     * @param posTag the POS tag to synthesize for
     * @param readingCount number of readings of the token to format
     * @return the sorted, distinct word forms, possibly an empty array
     */
    private String[] synthesizeByPosTag(String posTag, int readingCount) throws IOException {
        TreeSet<String> wordForms = new TreeSet<>();
        for (int i = 0; i < readingCount; i++) {
            String[] possibleWordForms = synthesizer.synthesize(formattedToken.getAnalyzedToken(i), posTag);
            if (possibleWordForms != null) {
                wordForms.addAll(Arrays.asList(possibleWordForms));
            }
        }
        return wordForms.toArray(new String[0]);
    }

    /**
     * Format POS tag using parameters already defined in the class.
     *
     * <p>A missing POS tag pattern or a missing token is treated as "no reading matches";
     * without a synthesizer the POS tag correction step is skipped.
     *
     * @return Formatted POS tag as String.
     */
    // FIXME: gets only the first POS tag that matches, this can be wrong
    // on the other hand, many POS tags = too many suggestions?
    public final String getTargetPosTag() {
        String targetPosTag = match.getPosTag();
        Pattern pPosRegexMatch = match.getPosRegexMatch();
        String posTagReplace = match.getPosTagReplace();
        boolean canReplace = pPosRegexMatch != null && posTagReplace != null;

        if (match.isStaticLemma()) {
            List<String> posTags = matchingPosTags(matchedToken, pPosRegexMatch);
            if (!posTags.isEmpty()) {
                // the last matching reading wins
                targetPosTag = posTags.get(posTags.size() - 1);
                if (canReplace) {
                    targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(posTagReplace);
                }
            }
            return targetPosTag;
        }

        List<String> posTags = matchingPosTags(formattedToken, pPosRegexMatch);
        if (!posTags.isEmpty()) {
            targetPosTag = posTags.get(posTags.size() - 1);
        }
        if (canReplace) {
            if (posTags.isEmpty()) {
                posTags.add(targetPosTag);
            }
            List<String> replacedTags = new ArrayList<>(posTags.size());
            for (String posTag : posTags) {
                String replaced = pPosRegexMatch.matcher(posTag).replaceAll(posTagReplace);
                if (match.setsPos() && synthesizer != null) {
                    replaced = synthesizer.getPosTagCorrection(replaced);
                }
                replacedTags.add(replaced);
            }
            targetPosTag = String.join("|", replacedTags);
        }
        return targetPosTag;
    }

    /**
     * Collects the POS tags of all readings that fully match the given pattern.
     *
     * @param readings the readings to inspect, may be {@code null}
     * @param posPattern the POS tag pattern, may be {@code null}
     * @return a new modifiable list of matching POS tags in reading order; empty if either
     *         argument is {@code null}
     */
    private static List<String> matchingPosTags(AnalyzedTokenReadings readings, Pattern posPattern) {
        List<String> result = new ArrayList<>();
        if (readings == null || posPattern == null) {
            return result;
        }
        for (AnalyzedToken reading : readings) {
            String tag = reading.getPOSTag();
            if (tag != null && posPattern.matcher(tag).matches()) {
                result.add(tag);
            }
        }
        return result;
    }

    /**
     * Method for getting the formatted match as a single string. In case of
     * multiple matches, it joins them using a regular expression operator "|".
     * @return Formatted string of the matched token.
     */
    final String toTokenString() throws IOException {
        String[] stringToFormat = toFinalString(null);
        return String.join("|", stringToFormat);
    }

    /**
     * @return the match definition this state was created for
     */
    public Match getMatch() {
        return match;
    }
}