org.languagetool.AnalyzedTokenReadings.java Source code

Introduction

Here is the source code for org.languagetool.AnalyzedTokenReadings.java
Source

/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */

package org.languagetool;

import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.languagetool.chunking.ChunkTag;
import org.languagetool.tools.StringTools;

import static org.languagetool.JLanguageTool.*;

/**
 * An array of {@link AnalyzedToken}s used to store multiple POS tags and lemmas
 * for a given single token.
 * 
 * @author Marcin Milkowski
 */
public final class AnalyzedTokenReadings implements Iterable<AnalyzedToken> {

    private static final Pattern NON_WORD_REGEX = Pattern
            .compile("[.?!:;,~'\"?()\\[\\]\\-*+/=]");

    private final boolean isWhitespace;
    private final boolean isLinebreak;
    private final boolean isSentStart;

    private AnalyzedToken[] anTokReadings;
    private int startPos;
    private String token;
    private List<ChunkTag> chunkTags = new ArrayList<>();
    private boolean isSentEnd;
    private boolean isParaEnd;
    private boolean isWhitespaceBefore;
    private boolean isPosTagUnknown;

    // If true, then the token is marked up as immune against tests:
    // it should never be matched by any rule. Used to have generalized
    // mechanism for exceptions in rules.
    private boolean isImmunized;

    // If true, then the token is marked up as ignored in all spelling rules:
    // other rules can freely match it.
    private boolean isIgnoredBySpeller;

    // Used to hold the string representation of the disambiguator actions on a token.
    private String historicalAnnotations = "";

    // True if the token has the same lemma value for all tokens.
    // Can be used internally to optimize matching.
    private boolean hasSameLemmas;

    public AnalyzedTokenReadings(AnalyzedToken[] tokens, int startPos) {
        this(Arrays.asList(tokens), startPos);
    }

    public AnalyzedTokenReadings(AnalyzedToken token, int startPos) {
        this(Collections.singletonList(token), startPos);
    }

    public AnalyzedTokenReadings(List<AnalyzedToken> tokens, int startPos) {
        anTokReadings = tokens.toArray(new AnalyzedToken[0]);
        this.startPos = startPos;
        token = anTokReadings[0].getToken();
        isWhitespace = StringTools.isWhitespace(token);
        isWhitespaceBefore = anTokReadings[0].isWhitespaceBefore();
        isLinebreak = "\n".equals(token) || "\r\n".equals(token) || "\r".equals(token) || "\n\r".equals(token);
        isSentStart = SENTENCE_START_TAGNAME.equals(anTokReadings[0].getPOSTag());
        isParaEnd = hasPosTag(PARAGRAPH_END_TAGNAME);
        isSentEnd = hasPosTag(SENTENCE_END_TAGNAME);
        isPosTagUnknown = tokens.size() == 1 && tokens.get(0).getPOSTag() == null;
        setNoRealPOStag();
        hasSameLemmas = areLemmasSame();
    }

    AnalyzedTokenReadings(AnalyzedToken token) {
        this(Collections.singletonList(token), 0);
    }

    public List<AnalyzedToken> getReadings() {
        return Arrays.asList(anTokReadings);
    }

    /**
     * Get a token reading.
     * @see #getReadingsLength() getReadingsLength() for how many token readings there are
     */
    public AnalyzedToken getAnalyzedToken(int idx) {
        return anTokReadings[idx];
    }

    /**
     * Checks if the token has a particular POS tag.
     * 
     * @param posTag POS tag to look for
     */
    public boolean hasPosTag(String posTag) {
        boolean found = false;
        for (AnalyzedToken reading : anTokReadings) {
            if (reading.getPOSTag() != null) {
                found = posTag.equals(reading.getPOSTag());
                if (found) {
                    break;
                }
            }
        }
        return found;
    }

    /**
     * Checks if one of the token's readings has a particular lemma.
     *
     * @param lemma lemma POS tag to look for
     */
    public boolean hasLemma(String lemma) {
        boolean found = false;
        for (AnalyzedToken reading : anTokReadings) {
            if (reading.getLemma() != null) {
                found = lemma.equals(reading.getLemma());
                if (found) {
                    break;
                }
            }
        }
        return found;
    }

    /**
     * Checks if one of the token's readings has one of the given lemmas
     *
     * @param lemmas to look for
     */
    public boolean hasAnyLemma(String... lemmas) {
        boolean found = false;
        for (String lemma : lemmas) {
            for (AnalyzedToken reading : anTokReadings) {
                if (reading.getLemma() != null) {
                    found = lemma.equals(reading.getLemma());
                    if (found) {
                        return found;
                    }
                }
            }
        }
        return found;
    }

    /**
     * Checks if the token has a particular POS tag, where only a part of the given POS tag needs to match.
     *
     * @param posTag POS tag substring to look for
     * @since 1.8
     */
    public boolean hasPartialPosTag(String posTag) {
        boolean found = false;
        for (AnalyzedToken reading : anTokReadings) {
            if (reading.getPOSTag() != null) {
                found = reading.getPOSTag().contains(posTag);
                if (found) {
                    break;
                }
            }
        }
        return found;
    }

    /**
     * Checks if the token has any of the given particular POS tags (only a part of the given POS tag needs to match)
     *
     * @param posTags POS tag substring to look for
     * @since 4.0
     */
    public boolean hasAnyPartialPosTag(String... posTags) {
        for (String posTag : posTags) {
            if (hasPartialPosTag(posTag)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks if the token has a POS tag starting with the given string.
     *
     * @param posTag POS tag substring to look for
     * @since 4.0
     */
    public boolean hasPosTagStartingWith(String posTag) {
        boolean found = false;
        for (AnalyzedToken reading : anTokReadings) {
            if (reading.getPOSTag() != null) {
                found = reading.getPOSTag().startsWith(posTag);
                if (found) {
                    break;
                }
            }
        }
        return found;
    }

    /**
     * Checks if at least one of the readings matches a given POS tag regex.
     *
     * @param posTagRegex POS tag regular expression to look for
     * @since 2.9
     */
    public boolean matchesPosTagRegex(String posTagRegex) {
        Pattern pattern = Pattern.compile(posTagRegex);
        boolean found = false;
        for (AnalyzedToken reading : anTokReadings) {
            if (reading.getPOSTag() != null) {
                found = pattern.matcher(reading.getPOSTag()).matches();
                if (found) {
                    break;
                }
            }
        }
        return found;
    }

    /**
     * Add a new reading.
     * @param token new reading, given as {@link AnalyzedToken}
     */
    public void addReading(AnalyzedToken token) {
        List<AnalyzedToken> l = new ArrayList<>(Arrays.asList(anTokReadings).subList(0, anTokReadings.length - 1));
        if (anTokReadings[anTokReadings.length - 1].getPOSTag() != null) {
            l.add(anTokReadings[anTokReadings.length - 1]);
        }
        token.setWhitespaceBefore(isWhitespaceBefore);
        l.add(token);
        anTokReadings = l.toArray(new AnalyzedToken[0]);
        if (token.getToken().length() > this.token.length()) { //in case a longer token is added
            this.token = token.getToken();
        }
        anTokReadings[anTokReadings.length - 1].setWhitespaceBefore(isWhitespaceBefore);
        isParaEnd = hasPosTag(PARAGRAPH_END_TAGNAME);
        isSentEnd = hasPosTag(SENTENCE_END_TAGNAME);
        setNoRealPOStag();
        hasSameLemmas = areLemmasSame();
    }

    /**
     * Removes a reading from the list of readings. Note: if the token
     * has only one reading, then a new reading with an empty POS tag
     * and an empty lemma is created.
     * @param token reading to be removed
     */
    public void removeReading(AnalyzedToken token) {
        List<AnalyzedToken> l = new ArrayList<>();
        AnalyzedToken tmpTok = new AnalyzedToken(token.getToken(), token.getPOSTag(), token.getLemma());
        tmpTok.setWhitespaceBefore(isWhitespaceBefore);
        boolean removedSentEnd = false;
        boolean removedParaEnd = false;
        for (AnalyzedToken anTokReading : anTokReadings) {
            if (!anTokReading.matches(tmpTok)) {
                l.add(anTokReading);
            } else if (SENTENCE_END_TAGNAME.equals(anTokReading.getPOSTag())) {
                removedSentEnd = true;
            } else if (PARAGRAPH_END_TAGNAME.equals(anTokReading.getPOSTag())) {
                removedParaEnd = true;
            }
        }
        if (l.isEmpty()) {
            l.add(new AnalyzedToken(this.token, null, null));
            l.get(0).setWhitespaceBefore(isWhitespaceBefore);
        }
        anTokReadings = l.toArray(new AnalyzedToken[0]);
        setNoRealPOStag();
        if (removedSentEnd) {
            isSentEnd = false;
            setSentEnd();
        }
        if (removedParaEnd) {
            isParaEnd = false;
            setParagraphEnd();
        }
        hasSameLemmas = areLemmasSame();
    }

    /**
     * Removes all readings but the one that matches the token given.
     * @param token Token to be matched
     * @since 1.5
     */
    public void leaveReading(AnalyzedToken token) {
        List<AnalyzedToken> l = new ArrayList<>();
        AnalyzedToken tmpTok = new AnalyzedToken(token.getToken(), token.getPOSTag(), token.getLemma());
        tmpTok.setWhitespaceBefore(isWhitespaceBefore);
        for (AnalyzedToken anTokReading : anTokReadings) {
            if (anTokReading.matches(tmpTok)) {
                l.add(anTokReading);
            }
        }
        if (l.isEmpty()) {
            l.add(new AnalyzedToken(this.token, null, null));
            l.get(0).setWhitespaceBefore(isWhitespaceBefore);
        }
        anTokReadings = l.toArray(new AnalyzedToken[0]);
        setNoRealPOStag();
        hasSameLemmas = areLemmasSame();
    }

    /**
     * Number of readings.
     */
    public int getReadingsLength() {
        return anTokReadings.length;
    }

    public boolean isWhitespace() {
        return isWhitespace;
    }

    /**
     * Returns true if the token equals {@code \n}, {@code \r}, {@code \n\r}, or {@code \r\n}.
     */
    public boolean isLinebreak() {
        return isLinebreak;
    }

    /**
     * @since 2.3
     */
    public boolean isSentenceStart() {
        return isSentStart;
    }

    /**
     * @return true when the token is a last token in a paragraph.
     * @since 2.3
     */
    public boolean isParagraphEnd() {
        return isParaEnd;
    }

    /**
     * Add a reading with a paragraph end token unless this is already a paragraph end.
     * @since 2.3
     */
    public void setParagraphEnd() {
        if (!isParagraphEnd()) {
            AnalyzedToken paragraphEnd = new AnalyzedToken(getToken(), PARAGRAPH_END_TAGNAME,
                    getAnalyzedToken(0).getLemma());
            addReading(paragraphEnd);
        }
    }

    /**
     * @return true when the token is a last token in a sentence.
     * @since 2.3
     */
    public boolean isSentenceEnd() {
        return isSentEnd;
    }

    /**
     * @return true if the token is LibreOffice/OpenOffice field code.
     * @since 0.9.9
     */
    public boolean isFieldCode() {
        return "\u0001".equals(token) || "\u0002".equals(token);
    }

    /**
     * Add a SENT_END tag.
     */
    public void setSentEnd() {
        if (!isSentenceEnd()) {
            AnalyzedToken sentenceEnd = new AnalyzedToken(getToken(), SENTENCE_END_TAGNAME,
                    getAnalyzedToken(0).getLemma());
            addReading(sentenceEnd);
        }
    }

    public int getStartPos() {
        return startPos;
    }

    /** @since 2.9 */
    public int getEndPos() {
        return startPos + token.length();
    }

    public void setStartPos(int position) {
        startPos = position;
    }

    public String getToken() {
        return token;
    }

    public void setWhitespaceBefore(boolean isWhiteSpaceBefore) {
        isWhitespaceBefore = isWhiteSpaceBefore;
        for (AnalyzedToken aTok : anTokReadings) {
            aTok.setWhitespaceBefore(isWhiteSpaceBefore);
        }
    }

    public boolean isWhitespaceBefore() {
        return isWhitespaceBefore;
    }

    public void immunize() {
        isImmunized = true;
    }

    public boolean isImmunized() {
        return isImmunized;
    }

    /**
     * Make the token ignored by all spelling rules.
     * @since 2.5
     */
    public void ignoreSpelling() {
        isIgnoredBySpeller = true;
    }

    /**
     * Test if the token can be ignored by spelling rules.
     * @return true if the token should be ignored.
     * @since 2.5
     */
    public boolean isIgnoredBySpeller() {
        return isIgnoredBySpeller;
    }

    /**
     * Test if the token's POStag equals null.
     * @return true if the token does not have a POStag
     * @since 3.9
     */
    public boolean isPosTagUnknown() {
        return isPosTagUnknown;
    }

    /**
     * Sets the flag on AnalyzedTokens to make matching
     * on {@code UNKNOWN} POS tag correct in the Element class.
     */
    private void setNoRealPOStag() {
        boolean hasNoPOStag = !isLinebreak();
        for (AnalyzedToken an : anTokReadings) {
            String posTag = an.getPOSTag();
            if (PARAGRAPH_END_TAGNAME.equals(posTag) || SENTENCE_END_TAGNAME.equals(posTag)) {
                continue;
            }
            if (posTag != null) {
                hasNoPOStag = false;
            }
        }
        for (AnalyzedToken an : anTokReadings) {
            an.setNoPOSTag(hasNoPOStag);
        }
    }

    /**
     * Used to track disambiguator actions.
     * @return the historicalAnnotations
     */
    public String getHistoricalAnnotations() {
        return historicalAnnotations;
    }

    /**
     * Used to track disambiguator actions.
     * @param historicalAnnotations the historicalAnnotations to set
     */
    public void setHistoricalAnnotations(String historicalAnnotations) {
        this.historicalAnnotations = historicalAnnotations;
    }

    /**
     * @since 2.3
     */
    public void setChunkTags(List<ChunkTag> chunkTags) {
        this.chunkTags = Objects.requireNonNull(chunkTags);
    }

    /**
     * @since 2.3
     */
    public List<ChunkTag> getChunkTags() {
        return chunkTags;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(token);
        sb.append('[');
        for (AnalyzedToken element : anTokReadings) {
            sb.append(element);
            if (!element.isWhitespaceBefore()) {
                sb.append('*');
            }
            sb.append(',');
        }
        sb.delete(sb.length() - 1, sb.length());
        if (chunkTags.size() > 0) {
            sb.append(',');
            sb.append(StringUtils.join(chunkTags, "|"));
        }
        sb.append(']');
        if (isImmunized()) {
            sb.append("{!},");
        }
        return sb.toString();
    }

    /**
     * @return true if AnalyzedTokenReadings has some real POS tag (= not null or a special tag)
     * @since 2.3
     */
    public boolean isTagged() {
        for (AnalyzedToken element : anTokReadings) {
            if (!element.hasNoTag()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Used to configure the internal variable for lemma equality.
     * @return true if all {@link AnalyzedToken} lemmas are the same.
     * @since 2.5
     */
    private boolean areLemmasSame() {
        String previousLemma = anTokReadings[0].getLemma();
        if (previousLemma == null) {
            for (AnalyzedToken element : anTokReadings) {
                if (element.getLemma() != null) {
                    return false;
                }
            }
            return true;
        }
        for (AnalyzedToken element : anTokReadings) {
            if (!previousLemma.equals(element.getLemma())) {
                return false;
            }
        }
        return true;
    }

    /**
     * Used to optimize pattern matching.
     * 
     * @return true if all {@link AnalyzedToken} lemmas are the same.
     */
    public boolean hasSameLemmas() {
        return hasSameLemmas;
    }

    /**
     * @return true if AnalyzedTokenReadings is a punctuation mark, bracket, etc
     * @since 4.4
     */
    public boolean isNonWord() {
        return NON_WORD_REGEX.matcher(token).matches();
    }

    @Override
    public int hashCode() {
        return Arrays.hashCode(anTokReadings) + Objects.hash(isLinebreak, isParaEnd, isSentEnd, isSentStart,
                isWhitespace, isWhitespaceBefore, chunkTags, startPos, token);
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null) {
            return false;
        }
        if (getClass() != obj.getClass()) {
            return false;
        }
        AnalyzedTokenReadings other = (AnalyzedTokenReadings) obj;
        return new EqualsBuilder().append(anTokReadings, other.anTokReadings).append(isLinebreak, other.isLinebreak)
                .append(isParaEnd, other.isParaEnd).append(isSentEnd, other.isSentEnd)
                .append(isSentStart, other.isSentStart).append(isWhitespace, other.isWhitespace)
                .append(isWhitespaceBefore, other.isWhitespaceBefore).append(isImmunized, other.isImmunized)
                .append(startPos, other.startPos).append(chunkTags, other.chunkTags)
                .append(hasSameLemmas, other.hasSameLemmas).append(isIgnoredBySpeller, other.isIgnoredBySpeller)
                .append(token, other.token).isEquals();
    }

    /**
     * @since 2.3
     */
    @Override
    public Iterator<AnalyzedToken> iterator() {
        AtomicInteger i = new AtomicInteger(0);
        return new Iterator<AnalyzedToken>() {
            @Override
            public boolean hasNext() {
                return i.get() < getReadingsLength();
            }

            @Override
            public AnalyzedToken next() {
                try {
                    return anTokReadings[i.getAndAdd(1)];
                } catch (ArrayIndexOutOfBoundsException e) {
                    throw new NoSuchElementException(
                            "No such element: " + i + ", element count: " + anTokReadings.length);
                }
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }
}