org.languagetool.rules.UppercaseSentenceStartRule.java Source code

Introduction

Here is the source code for org.languagetool.rules.UppercaseSentenceStartRule.java
Source

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.ResourceBundle;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.Nullable;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.Language;
import org.languagetool.tokenizers.WordTokenizer;
import org.languagetool.tools.StringTools;

/**
 * Checks that a sentence starts with an uppercase letter.
 *
 * <p>The rule looks at the first non-whitespace token of every sentence (skipping one
 * leading quote character and, for Dutch, an apostrophe-contraction such as {@code 'k})
 * and reports it when its first character is lowercase. No match is reported for
 * lowercase enumerations ({@code a)}, {@code iv.}), URLs, e-mail addresses, immunized
 * tokens, or when neither the previous nor the current sentence ends with sentence-final
 * punctuation.
 *
 * <p>All non-ASCII characters in this class are written as Unicode escapes so that the
 * rule does not depend on the encoding used to read or compile the source file.
 *
 * @author Daniel Naber
 */
public class UppercaseSentenceStartRule extends TextLevelRule {

    /** A single lowercase letter or a lowercase Roman numeral, as used in enumerations. */
    private static final Pattern NUMERALS_EN = Pattern
            .compile("[a-z]|(m{0,4}(c[md]|d?c{0,3})(x[cl]|l?x{0,3})(i[xv]|v?i{0,3}))$");

    /**
     * A single trailing token that should be ignored when looking for the sentence end:
     * space, newline, ASCII quotes, and the typographic quotes U+201E, U+00AB, U+00BB,
     * U+2018, U+2019, U+201C and U+201D.
     */
    private static final Pattern WHITESPACE_OR_QUOTE = Pattern
            .compile("[ \"'\u201E\u00AB\u00BB\u2018\u2019\u201C\u201D\\n]"); //only ending quote is necessary?

    /**
     * Sentence-final punctuation (period, question mark, exclamation mark, ellipsis U+2026)
     * or the empty string; the empty alternative matches the state before the first sentence.
     */
    private static final Pattern SENTENCE_END1 = Pattern.compile("[.?!\u2026]|");

    private final Language language;

    /**
     * Creates the rule, assigns it to the casing category and optionally registers an
     * example pair.
     *
     * @param messages         resource bundle used for the rule description and match message
     * @param language         language being checked; its short code enables the Dutch special case
     * @param incorrectExample example of a sentence the rule flags, or {@code null}
     * @param correctExample   corrected form of the example, or {@code null}; the pair is only
     *                         added when both examples are non-null
     * @since 3.3
     */
    public UppercaseSentenceStartRule(ResourceBundle messages, Language language, IncorrectExample incorrectExample,
            CorrectExample correctExample) {
        super(messages);
        super.setCategory(Categories.CASING.getCategory(messages));
        this.language = language;
        setLocQualityIssueType(ITSIssueType.Typographical);
        if (incorrectExample != null && correctExample != null) {
            addExamplePair(incorrectExample, correctExample);
        }
    }

    /**
     * Creates the rule without an example pair.
     *
     * @deprecated use {@link #UppercaseSentenceStartRule(ResourceBundle, Language, IncorrectExample, CorrectExample)} instead (deprecated since 3.3)
     */
    @Deprecated
    public UppercaseSentenceStartRule(ResourceBundle messages, Language language) {
        this(messages, language, null, null);
    }

    @Override
    public final String getId() {
        return "UPPERCASE_SENTENCE_START";
    }

    @Override
    public final String getDescription() {
        return messages.getString("desc_uppercase_sentence");
    }

    /**
     * Checks every sentence of the text and reports those that start with a lowercase letter.
     *
     * @param sentences the analyzed sentences of the text, in order
     * @return one match per offending sentence start, with the capitalized token as suggestion;
     *         match positions are offsets into the concatenated text of all sentences
     * @throws IOException declared by the overridden method
     */
    @Override
    public RuleMatch[] match(List<AnalyzedSentence> sentences) throws IOException {
        String lastParagraphString = "";
        List<RuleMatch> ruleMatches = new ArrayList<>();
        if (sentences.size() == 1 && sentences.get(0).getTokens().length == 2) {
            // Special case for a single "sentence" with a single word - it's not useful
            // to complain about this (and might hide a typo error):
            return toRuleMatchArray(ruleMatches);
        }
        int pos = 0;
        for (AnalyzedSentence sentence : sentences) {
            AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
            if (tokens.length < 2) {
                // Nothing but SENT_START: skip this sentence, but keep the offset in sync
                // and keep checking the sentences that follow.
                pos += sentence.getText().length();
                continue;
            }
            int matchTokenPos = 1; // 0 = SENT_START
            AnalyzedTokenReadings firstTokenObj = tokens[matchTokenPos];
            String firstToken = firstTokenObj.getToken();
            String secondToken = null;
            String thirdToken = null;
            // ignore quote characters:
            if (tokens.length >= 3 && isQuoteStart(firstToken)) {
                matchTokenPos = 2;
                secondToken = tokens[matchTokenPos].getToken();
            }
            // Dutch: for an apostrophe + single-letter contraction the word after it is checked.
            String firstDutchToken = dutchSpecialCase(firstToken, secondToken, tokens);
            if (firstDutchToken != null) {
                thirdToken = firstDutchToken;
                matchTokenPos = 3;
            }

            // The token to check is the last one selected above.
            String checkToken = firstToken;
            if (thirdToken != null) {
                checkToken = thirdToken;
            } else if (secondToken != null) {
                checkToken = secondToken;
            }

            String lastToken = tokens[tokens.length - 1].getToken();
            if (WHITESPACE_OR_QUOTE.matcher(lastToken).matches()) {
                // ignore trailing whitespace or quote
                lastToken = tokens[tokens.length - 2].getToken();
            }

            boolean preventError = false;
            // The previous sentence ended with a comma or semicolon: this one continues it.
            if (lastParagraphString.equals(",") || lastParagraphString.equals(";")) {
                preventError = true;
            }
            // Neither the previous nor the current sentence ends with sentence-final
            // punctuation (e.g. list items or headlines): do not complain.
            if (!SENTENCE_END1.matcher(lastParagraphString).matches() && !isSentenceEnd(lastToken)) {
                preventError = true;
            }

            lastParagraphString = lastToken;

            //allows enumeration with lowercase letters: a), iv., etc.
            if (matchTokenPos + 1 < tokens.length && NUMERALS_EN.matcher(tokens[matchTokenPos].getToken()).matches()
                    && (tokens[matchTokenPos + 1].getToken().equals(".")
                            || tokens[matchTokenPos + 1].getToken().equals(")"))) {
                preventError = true;
            }

            if (isUrl(checkToken) || isEMail(checkToken) || firstTokenObj.isImmunized()) {
                preventError = true;
            }

            if (checkToken.length() > 0) {
                char firstChar = checkToken.charAt(0);
                if (!preventError && Character.isLowerCase(firstChar)) {
                    RuleMatch ruleMatch = new RuleMatch(this, sentence, pos + tokens[matchTokenPos].getStartPos(),
                            pos + tokens[matchTokenPos].getEndPos(), messages.getString("incorrect_case"));
                    ruleMatch.setSuggestedReplacement(StringTools.uppercaseFirstChar(checkToken));
                    ruleMatches.add(ruleMatch);
                }
            }
            pos += sentence.getText().length();
        }
        return toRuleMatchArray(ruleMatches);
    }

    /**
     * Handles Dutch sentences starting with an apostrophe followed by a single-letter
     * contraction, where the word after the contraction is the one to check.
     *
     * @param firstToken  first token after SENT_START
     * @param secondToken second token after SENT_START, or {@code null} if the first token is not a quote
     * @param tokens      all non-whitespace tokens of the sentence
     * @return the token at index 3 when the language short code is {@code nl} and the sentence
     *         starts with such a contraction, otherwise {@code null}
     */
    @Nullable
    private String dutchSpecialCase(String firstToken, String secondToken, AnalyzedTokenReadings[] tokens) {
        if (!language.getShortCode().equals("nl")) {
            return null;
        }
        if (tokens.length > 3 && firstToken.equals("'") && isDutchSpecialCase(secondToken)) {
            return tokens[3].getToken();
        }
        return null;
    }

    /** Returns whether the token is a URL, delegating to {@link WordTokenizer#isUrl(String)}. */
    protected boolean isUrl(String token) {
        return WordTokenizer.isUrl(token);
    }

    /** Returns whether the token is an e-mail address, delegating to {@link WordTokenizer#isEMail(String)}. */
    protected boolean isEMail(String token) {
        return WordTokenizer.isEMail(token);
    }

    /** Returns whether the word is one of the single letters k, m, n, r, s or t. */
    private boolean isDutchSpecialCase(String word) {
        return StringUtils.equalsAny(word, "k", "m", "n", "r", "s", "t");
    }

    /** Returns whether the word is a period, question mark, exclamation mark or ellipsis (U+2026). */
    private boolean isSentenceEnd(String word) {
        return StringUtils.equalsAny(word, ".", "?", "!", "\u2026");
    }

    /**
     * Returns whether the word is a quote character that may open a sentence: an ASCII
     * double or single quote, or one of U+201E, U+00BB, U+00AB, U+201C, U+2018.
     */
    private boolean isQuoteStart(String word) {
        return StringUtils.equalsAny(word, "\"", "'", "\u201E", "\u00BB", "\u00AB", "\u201C", "\u2018");
    }

    @Override
    public int minToCheckParagraph() {
        return 0;
    }
}