com.dgtlrepublic.anitomyj.ParserNumber.java Source code

Java tutorial

Introduction

Here is the source code for com.dgtlrepublic.anitomyj.ParserNumber.java

Source

/*
 * Copyright (c) 2014-2016, Eren Okka
 * Copyright (c) 2016, Paul Miller
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 */

package com.dgtlrepublic.anitomyj;

import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementAnimeSeason;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementAnimeType;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementEpisodeNumber;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementEpisodeNumberAlt;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementEpisodePrefix;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementReleaseVersion;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementVolumeNumber;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementVolumePrefix;
import static com.dgtlrepublic.anitomyj.Token.TokenCategory.kBracket;
import static com.dgtlrepublic.anitomyj.Token.TokenCategory.kDelimiter;
import static com.dgtlrepublic.anitomyj.Token.TokenCategory.kIdentifier;
import static com.dgtlrepublic.anitomyj.Token.TokenCategory.kUnknown;
import static com.dgtlrepublic.anitomyj.Token.TokenFlag.kFlagEnclosed;
import static com.dgtlrepublic.anitomyj.Token.TokenFlag.kFlagNotDelimiter;

import java.text.NumberFormat;
import java.text.ParseException;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.IntStream;

import org.apache.commons.lang3.StringUtils;

import com.dgtlrepublic.anitomyj.Element.ElementCategory;
import com.dgtlrepublic.anitomyj.KeywordManager.KeywordOptions;
import com.dgtlrepublic.anitomyj.Token.Result;
import com.dgtlrepublic.anitomyj.Token.TokenFlag;

/**
 * A Utility class to assist in number parsing.
 *
 * @author Paul Miller
 * @author Eren Okka
 */
public class ParserNumber {
    /** Lower year bound. NOTE(review): not used for any year check inside this class; confirm usage in callers. */
    public static final int kAnimeYearMin = 1900;
    /** Upper year bound. NOTE(review): not used for any year check inside this class; confirm usage in callers. */
    public static final int kAnimeYearMax = 2050;
    /**
     * Largest value accepted by {@link #isValidEpisodeNumber(String)}. Defined as one less than
     * {@link #kAnimeYearMin} (presumably so that year-like numbers are not taken for episodes).
     */
    public static final int kEpisodeNumberMax = kAnimeYearMin - 1;
    /** Largest value accepted by {@link #isValidVolumeNumber(String)}. */
    public static final int kVolumeNumberMax = 20;

    /** The parser whose tokens and elements are read and updated by the methods of this class. */
    private final Parser parser;

    /**
     * Creates a number helper bound to the given {@code parser}.
     *
     * @param parser the parser that supplies the token and element lists used by this class
     */
    public ParserNumber(Parser parser) {
        this.parser = parser;
    }

    /**
     * Checks whether {@code number} is small enough to be a volume number.
     *
     * @param number the candidate number, as text
     * @return true if its integer value does not exceed {@link #kVolumeNumberMax}
     */
    public boolean isValidVolumeNumber(String number) {
        final int volume = StringHelper.stringToInt(number);
        return volume <= kVolumeNumberMax;
    }

    /**
     * Returns whether or not the {@code number} is a valid episode number.
     *
     * <p>The text is parsed with a fixed (English) number format so that the result does not depend on
     * the JVM's default locale; e.g. a fractional episode such as "07.5" must be read as 7.5 everywhere,
     * including locales where '.' is a grouping separator.
     *
     * @param number the candidate number, as text; may be {@code null}
     * @return true if {@code number} could be parsed and its value does not exceed
     *         {@link #kEpisodeNumberMax}; false if it is {@code null}, unparsable or too large
     */
    public boolean isValidEpisodeNumber(String number) {
        // Explicit null check instead of catching the NullPointerException that parse(null) would raise
        if (number == null) {
            return false;
        }

        try {
            return NumberFormat.getInstance(Locale.ENGLISH).parse(number).doubleValue() <= kEpisodeNumberMax;
        } catch (ParseException | NumberFormatException e) {
            // Not a number at all, hence not an episode number
            return false;
        }
    }

    /************ S E T ********** */

    /**
     * Records {@code number} as the alternative episode number and marks {@code token} as identified.
     *
     * @param number the alternative episode number
     * @param token  the token which contains the number
     * @return always true
     */
    public boolean setAlternativeEpisodeNumber(String number, Token token) {
        final Element alternative = new Element(kElementEpisodeNumberAlt, number);
        parser.getElements().add(alternative);
        token.setCategory(kIdentifier);
        return true;
    }

    /**
     * Records {@code number} as a volume number and marks {@code token} as identified.
     *
     * @param number   the volume number
     * @param token    the token which contains the volume number
     * @param validate true to accept the number only if {@link #isValidVolumeNumber(String)} agrees;
     *                 false to accept it unconditionally
     * @return true if the volume number was recorded; false if validation rejected it
     */
    public boolean setVolumeNumber(String number, Token token, boolean validate) {
        final boolean accepted = !validate || isValidVolumeNumber(number);
        if (accepted) {
            parser.getElements().add(new Element(kElementVolumeNumber, number));
            token.setCategory(kIdentifier);
        }

        return accepted;
    }

    /**
     * Sets the anime episode number.
     *
     * <p>When episode keywords have been found and an episode number element already exists, the two
     * numbers are treated as equivalents: the smaller one stays the episode number, the larger one
     * becomes the alternative episode number, and an identical number is not added again.
     *
     * @param number   the episode number
     * @param token    the token which contains the episode number; it is marked as identified as soon
     *                 as validation passes
     * @param validate true if we should check if it's a valid episode number; false to disable validation
     * @return true if the episode number was set; false if validation failed or the same number was
     *         already present
     */
    public boolean setEpisodeNumber(String number, Token token, boolean validate) {
        if (validate && !isValidEpisodeNumber(number)) {
            return false;
        }

        token.setCategory(kIdentifier);
        ElementCategory category = kElementEpisodeNumber;

        // Handle equivalent numbers
        if (parser.isEpisodeKeywordsFound()) {
            for (Element element : parser.getElements()) {
                if (element.getCategory() != kElementEpisodeNumber) {
                    continue;
                }

                // The larger number gets to be the alternative one.
                // Integer.compare avoids the overflow pitfalls of comparing by subtraction.
                int comparison = Integer.compare(StringHelper.stringToInt(number),
                        StringHelper.stringToInt(element.getValue()));
                if (comparison > 0) {
                    category = kElementEpisodeNumberAlt;
                } else if (comparison < 0) {
                    element.setCategory(kElementEpisodeNumberAlt);
                } else {
                    return false; // No need to add the same number twice
                }

                // Only the first existing episode number is considered
                break;
            }
        }

        parser.getElements().add(new Element(category, number));
        return true;
    }

    /**
     * Checks if a number follows a known prefix inside the specified {@code token}, e.g. "EP.1", "Vol.1".
     *
     * <p>The token content is split at its first digit. If the upper-cased leading part is a keyword of
     * the given {@code category}, the remainder is recorded as an episode or volume number (first via
     * the pattern matchers, falling back to the raw number without validation).
     *
     * @param category the prefix category to look up; only {@code kElementEpisodePrefix} and
     *                 {@code kElementVolumePrefix} are handled
     * @param token    the token
     * @return true if a number follows a known prefix in the token; false otherwise
     */
    public boolean numberComesAfterPrefix(ElementCategory category, Token token) {
        String content = token.getContent();
        int numberBegin = ParserHelper.indexOfFirstDigit(content);

        // StringUtils.substring treats a negative index as an offset from the END of the string, so a
        // "not found" result (presumably -1; confirm against ParserHelper.indexOfFirstDigit) would
        // silently split the token before its last character instead of failing.
        if (numberBegin < 0) {
            return false;
        }

        String prefix = StringUtils.substring(content, 0, numberBegin).toUpperCase(Locale.ENGLISH);
        if (!KeywordManager.getInstance().contains(category, prefix)) {
            return false;
        }

        String number = StringUtils.substring(content, numberBegin);

        switch (category) {
        case kElementEpisodePrefix:
            if (!matchEpisodePatterns(number, token)) {
                setEpisodeNumber(number, token, false);
            }
            return true;
        case kElementVolumePrefix:
            if (!matchVolumePatterns(number, token)) {
                setVolumeNumber(number, token, false);
            }
            return true;
        default:
            // Any other category is not a number prefix
            return false;
        }
    }

    /**
     * Checks whether the number in {@code token} is followed by the word "of" and a total,
     * e.g. "8 of 12". On success the number is recorded as the episode number (without validation) and
     * both the "of" token and the total token are marked as identified.
     *
     * @param token           the token holding the candidate number
     * @param currentTokenIdx the index of the token.
     * @return true if the token is followed by "of" and a numeric token
     */
    public boolean numberComesBeforeTotalNumber(Token token, int currentTokenIdx) {
        Result ofToken = Token.findNextToken(parser.getTokens(), currentTokenIdx, kFlagNotDelimiter);
        if (ofToken.token == null || !StringUtils.equalsIgnoreCase(ofToken.token.getContent(), "of")) {
            return false;
        }

        Result totalToken = Token.findNextToken(parser.getTokens(), ofToken, kFlagNotDelimiter);
        if (totalToken.token == null || !StringHelper.isNumericString(totalToken.token.getContent())) {
            return false;
        }

        setEpisodeNumber(token.getContent(), token, false);
        ofToken.token.setCategory(kIdentifier);
        totalToken.token.setCategory(kIdentifier);
        return true;
    }

    /************ E P I S O D E  M A T C H E R S ********** */

    /**
     * Attempts to find an episode/season inside a {@code word}.
     *
     * <p>The word is trimmed of surrounding spaces and dashes and then offered to the individual
     * pattern matchers in a fixed order; the first matcher that succeeds wins. Which matchers are tried
     * depends on whether the word starts and/or ends with a digit.
     *
     * @param word  the word
     * @param token the token
     * @return true if the word was matched to an episode/season number
     */
    public boolean matchEpisodePatterns(String word, Token token) {
        // All patterns contain at least one non-numeric character
        if (StringHelper.isNumericString(word)) {
            return false;
        }

        word = StringHelper.trimAny(word, " -");

        // Nothing left after trimming (e.g. the word was only spaces/dashes): the charAt calls below
        // would throw, and no pattern can match an empty word anyway.
        if (StringUtils.isEmpty(word)) {
            return false;
        }

        boolean numericFront = Character.isDigit(word.charAt(0));
        boolean numericBack = Character.isDigit(word.charAt(word.length() - 1));
        boolean numericBothEnds = numericFront && numericBack;

        // e.g. "01v2"
        if (numericBothEnds && matchSingleEpisodePattern(word, token)) {
            return true;
        }
        // e.g. "01-02", "03-05v2"
        if (numericBothEnds && matchMultiEpisodePattern(word, token)) {
            return true;
        }
        // e.g. "2x01", "S01E03", "S01-02xE001-150"
        if (numericBack && matchSeasonAndEpisodePattern(word, token)) {
            return true;
        }
        // e.g. "ED1", "OP4a", "OVA2"
        if (!numericFront && matchTypeAndEpisodePattern(word, token)) {
            return true;
        }
        // e.g. "07.5"
        if (numericBothEnds && matchFractionalEpisodePattern(word, token)) {
            return true;
        }
        // e.g. "4a", "111C"
        if (numericFront && !numericBack && matchPartialEpisodePattern(word, token)) {
            return true;
        }
        // e.g. "#01", "#02-03v2"
        if (numericBack && matchNumberSignPattern(word, token)) {
            return true;
        }
        // U+8A71 is used as counter for stories, episodes of TV series, etc.
        return numericFront && matchJapaneseCounterPattern(word, token);
    }

    /**
     * Matches a single versioned episode such as "01v2": one to three digits, a 'v'/'V' and a one-digit
     * release version. The episode number is recorded without validation, followed by the version.
     *
     * @param word  the word
     * @param token the token
     * @return true if the whole word has that form
     */
    public boolean matchSingleEpisodePattern(String word, Token token) {
        Matcher versioned = Pattern.compile("(\\d{1,3})[vV](\\d)").matcher(word);
        if (!versioned.matches()) {
            return false;
        }

        setEpisodeNumber(versioned.group(1), token, false);
        parser.getElements().add(new Element(kElementReleaseVersion, versioned.group(2)));
        return true;
    }

    /**
     * Matches an episode range such as "01-02" or "03-05v2". Either bound may carry its own one-digit
     * release version. The range is accepted only when the first number is strictly smaller than the
     * second and the first number passes episode validation.
     *
     * @param word  the word
     * @param token the token
     * @return true if the word was recorded as an episode range
     */
    public boolean matchMultiEpisodePattern(String word, Token token) {
        Matcher range = Pattern.compile("(\\d{1,3})(?:[vV](\\d))?[-~&+](\\d{1,3})(?:[vV](\\d))?").matcher(word);
        if (!range.matches()) {
            return false;
        }

        final String first = range.group(1);
        final String last = range.group(3);

        // Expressions such as "009-1" or "5-2" are not ranges
        if (StringHelper.stringToInt(first) >= StringHelper.stringToInt(last)) {
            return false;
        }
        if (!setEpisodeNumber(first, token, true)) {
            return false;
        }

        setEpisodeNumber(last, token, true);

        // Groups 2 and 4 hold the optional release versions of the lower and upper bound
        for (int versionGroup : new int[] {2, 4}) {
            String version = range.group(versionGroup);
            if (StringUtils.isNotEmpty(version)) {
                parser.getElements().add(new Element(kElementReleaseVersion, version));
            }
        }

        return true;
    }

    /**
     * Match season and episode patterns. e.g. "2x01", "S01E03", "S01-02xE001-150".
     *
     * <p>Matching is case-insensitive. A season (or season range) is followed either by "x" or by an
     * optional separator (space, '.', '_', '-' or 'x') and "E", then an episode or episode range.
     * Seasons are recorded as anime-season elements and episodes via {@link #setEpisodeNumber} without
     * validation.
     *
     * @param word  the word
     * @param token the token
     * @return true if the token matched
     */
    public boolean matchSeasonAndEpisodePattern(String word, Token token) {
        // The '-' must sit at the end of the character class to be literal. Written as "[ ._-x]" it
        // forms the range '_'..'x', which accepts letters (e.g. "S01aE03") and rejects a real dash
        // (e.g. "S01-E03").
        String regexPattern = "S?(\\d{1,2})(?:-S?(\\d{1,2}))?(?:x|[ ._x-]?E)(\\d{1,3})(?:-E?(\\d{1,3}))?";
        Matcher matcher = Pattern.compile(regexPattern, Pattern.CASE_INSENSITIVE).matcher(word);
        if (!matcher.matches()) {
            return false;
        }

        parser.getElements().add(new Element(kElementAnimeSeason, matcher.group(1)));
        if (StringUtils.isNotEmpty(matcher.group(2))) {
            parser.getElements().add(new Element(kElementAnimeSeason, matcher.group(2)));
        }

        setEpisodeNumber(matcher.group(3), token, false);
        if (StringUtils.isNotEmpty(matcher.group(4))) {
            setEpisodeNumber(matcher.group(4), token, false);
        }

        return true;
    }

    /**
     * Match type and episode. e.g. "ED1", "OP4a", "OVA2".
     *
     * <p>The word is split at its first digit into a prefix and a number. If the keyword manager
     * recognises the normalized prefix, the prefix is recorded as an anime-type element and the number
     * is recorded as an episode (pattern matchers first, validated plain number as fallback). On
     * success the token is split in two: it keeps only the number, and a new token holding the prefix is
     * inserted in front of it in the parser's token list.
     *
     * @param word  the word
     * @param token the token
     * @return true if the token matched
     */
    public boolean matchTypeAndEpisodePattern(String word, Token token) {
        int numberBegin = ParserHelper.indexOfFirstDigit(word);
        String prefix = StringUtils.substring(word, 0, numberBegin);

        // Holders handed to findAndSet; presumably filled in by it on a successful lookup - TODO confirm
        AtomicReference<ElementCategory> category = new AtomicReference<>(kElementAnimeType);
        AtomicReference<KeywordOptions> options = new AtomicReference<>();

        if (KeywordManager.getInstance().findAndSet(KeywordManager.normalzie(prefix), category, options)) {
            // NOTE(review): the type element is added before the number is confirmed, so it stays in the
            // element list even when this method ends up returning false.
            parser.getElements().add(new Element(kElementAnimeType, prefix));
            String number = StringUtils.substring(word, numberBegin);
            if (matchEpisodePatterns(number, token) || setEpisodeNumber(number, token, true)) {
                int foundIdx = parser.getTokens().indexOf(token);
                if (foundIdx != -1) {
                    // Split the token: it keeps the number, the prefix becomes a new token inserted at the
                    // token's old index (i.e. directly before it).
                    token.setContent(number);
                    parser.getTokens().add(foundIdx, new Token(
                            options.get().isIdentifiable() ? kIdentifier : kUnknown, prefix, token.isEnclosed()));
                }

                return true;
            }
        }

        return false;
    }

    /**
     * Matches a "half" episode such as "07.5": one or more digits followed by ".5". The whole word is
     * recorded as the episode number, subject to validation.
     *
     * @param word  the word; {@code null} is treated like an empty word
     * @param token the token
     * @return true if the word has that form and was accepted as an episode number
     */
    public boolean matchFractionalEpisodePattern(String word, Token token) {
        final String candidate = StringUtils.isEmpty(word) ? "" : word;
        final Matcher fractional = Pattern.compile("\\d+\\.5", Pattern.CASE_INSENSITIVE).matcher(candidate);
        return fractional.matches() && setEpisodeNumber(candidate, token, true);
    }

    /**
     * Match partial episodes. e.g. "4a", "111C".
     *
     * <p>The word must consist of leading digits followed by exactly one suffix character in the range
     * A-C (either case). The whole word, suffix included, is recorded as the episode number, subject to
     * validation.
     *
     * @param word  the word
     * @param token the token
     * @return true if the token matched
     */
    public boolean matchPartialEpisodePattern(String word, Token token) {
        if (StringUtils.isEmpty(word)) {
            return false;
        }

        // Index of the first non-digit character, or word.length() when every character is a digit.
        // The bound is exclusive: probing index word.length() would throw for an all-digit word.
        int suffixBegin = 0;
        while (suffixBegin < word.length() && Character.isDigit(word.charAt(suffixBegin))) {
            suffixBegin++;
        }

        // Exactly one trailing character is allowed
        if (word.length() - suffixBegin != 1) {
            return false;
        }

        char suffix = word.charAt(suffixBegin);
        boolean validSuffix = (suffix >= 'A' && suffix <= 'C') || (suffix >= 'a' && suffix <= 'c');
        return validSuffix && setEpisodeNumber(word, token, true);
    }

    /**
     * Matches episodes introduced by a number sign, e.g. "#01", "#02-03v2": a '#', an episode number,
     * an optional second number and an optional one-digit release version. The first number must pass
     * episode validation; the second one is recorded without validation.
     *
     * @param word  the word
     * @param token the token
     * @return true if the word has that form and the first number was accepted
     */
    public boolean matchNumberSignPattern(String word, Token token) {
        // Without a leading '#' the pattern below can never match
        if (StringUtils.isEmpty(word) || word.charAt(0) != '#') {
            return false;
        }

        Matcher signed = Pattern.compile("#(\\d{1,3})(?:[-~&+](\\d{1,3}))?(?:[vV](\\d))?").matcher(word);
        if (!signed.matches() || !setEpisodeNumber(signed.group(1), token, true)) {
            return false;
        }

        final String secondNumber = signed.group(2);
        if (StringUtils.isNotEmpty(secondNumber)) {
            setEpisodeNumber(secondNumber, token, false);
        }

        final String version = signed.group(3);
        if (StringUtils.isNotEmpty(version)) {
            parser.getElements().add(new Element(kElementReleaseVersion, version));
        }

        return true;
    }

    /**
     * Matches a number followed by the Japanese counter character U+8A71 (used as counter for stories,
     * episodes of TV series, etc.). The number is recorded as the episode number without validation.
     *
     * @param word  the word
     * @param token the token
     * @return true if the word is one to three digits followed by U+8A71
     */
    public boolean matchJapaneseCounterPattern(String word, Token token) {
        boolean endsWithCounter = StringUtils.isNotEmpty(word) && word.charAt(word.length() - 1) == '\u8A71';
        if (!endsWithCounter) {
            return false;
        }

        Matcher counted = Pattern.compile("(\\d{1,3})\u8A71").matcher(word);
        if (!counted.matches()) {
            return false;
        }

        setEpisodeNumber(counted.group(1), token, false);
        return true;
    }

    /************ V O L U M E  M A T C H E R S ********** */

    /**
     * Attempts to find volume numbers inside a {@code word}.
     *
     * <p>The word is trimmed of surrounding spaces and dashes; if it then starts and ends with a digit
     * it is offered to the single-volume and multi-volume matchers, in that order.
     *
     * @param word  the word
     * @param token the token
     * @return true if the word was matched to a volume number
     */
    public boolean matchVolumePatterns(String word, Token token) {
        // All patterns contain at least one non-numeric character
        if (StringHelper.isNumericString(word)) {
            return false;
        }

        word = StringHelper.trimAny(word, " -");

        // Nothing left after trimming (e.g. the word was only spaces/dashes): the charAt calls below
        // would throw, and no pattern can match an empty word anyway.
        if (StringUtils.isEmpty(word)) {
            return false;
        }

        boolean numericFront = Character.isDigit(word.charAt(0));
        boolean numericBack = Character.isDigit(word.charAt(word.length() - 1));

        // Both patterns ("01v2" and "01-02", "03-05v2") start and end with a digit
        if (!numericFront || !numericBack) {
            return false;
        }

        return matchSingleVolumePattern(word, token) || matchMultiVolumePattern(word, token);
    }

    /**
     * Matches a single versioned volume such as "01v2": one or two digits, a 'v'/'V' and a one-digit
     * release version. The volume number is recorded without validation, followed by the version.
     *
     * @param word  the word; {@code null} is treated like an empty word
     * @param token the token
     * @return true if the whole word has that form
     */
    public boolean matchSingleVolumePattern(String word, Token token) {
        final String candidate = StringUtils.isEmpty(word) ? "" : word;
        final Matcher versioned = Pattern.compile("(\\d{1,2})[vV](\\d)").matcher(candidate);
        if (!versioned.matches()) {
            return false;
        }

        setVolumeNumber(versioned.group(1), token, false);
        parser.getElements().add(new Element(kElementReleaseVersion, versioned.group(2)));
        return true;
    }

    /**
     * Matches a volume range such as "01-02" or "03-05v2". The range is accepted only when the first
     * number is strictly smaller than the second and the first number passes volume validation; the
     * second number is then recorded without validation, followed by the optional release version.
     *
     * @param word  the word; {@code null} is treated like an empty word
     * @param token the token
     * @return true if the word was recorded as a volume range
     */
    public boolean matchMultiVolumePattern(String word, Token token) {
        final String candidate = StringUtils.isEmpty(word) ? "" : word;
        final Matcher range = Pattern.compile("(\\d{1,2})[-~&+](\\d{1,2})(?:[vV](\\d))?").matcher(candidate);
        if (!range.matches()) {
            return false;
        }

        final String first = range.group(1);
        final String last = range.group(2);
        if (StringHelper.stringToInt(first) >= StringHelper.stringToInt(last)) {
            return false;
        }
        if (!setVolumeNumber(first, token, true)) {
            return false;
        }

        setVolumeNumber(last, token, false);

        final String version = range.group(3);
        if (StringUtils.isNotEmpty(version)) {
            parser.getElements().add(new Element(kElementReleaseVersion, version));
        }

        return true;
    }

    /************ S E A R C H ********** */

    /**
     * Looks through {@code tokens} for the first enclosed, isolated token that is accepted as a valid
     * episode number.
     *
     * @param tokens the list of tokens
     * @return true if such a token was found and recorded as the episode number
     */
    public boolean searchForIsolatedNumbers(List<Result> tokens) {
        for (Result candidate : tokens) {
            boolean enclosedAndIsolated = candidate.token.isEnclosed()
                    && parser.getParserHelper().isTokenIsolated(candidate.pos);
            if (enclosedAndIsolated && setEpisodeNumber(candidate.token.getContent(), candidate.token, true)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Looks through {@code tokens} for a number that is preceded by a dash separator token. The first
     * such number that is accepted as a valid episode number is recorded, and the dash token is marked
     * as identified.
     *
     * @param tokens the list of tokens
     * @return true if a separated number was found
     */
    public boolean searchForSeparatedNumbers(List<Result> tokens) {
        for (Result candidate : tokens) {
            Result before = Token.findPrevToken(parser.getTokens(), candidate, TokenFlag.kFlagNotDelimiter);
            Token separator = before.token;

            // The previous non-delimiter token has to be an unknown token starting with a dash
            boolean dashBefore = ParserHelper.isTokenCategory(separator, kUnknown)
                    && ParserHelper.isDashCharacter(separator.getContent().charAt(0));
            if (!dashBefore) {
                continue;
            }

            if (setEpisodeNumber(candidate.token.getContent(), candidate.token, true)) {
                separator.setCategory(kIdentifier);
                return true;
            }
        }

        return false;
    }

    /**
     * Runs the episode pattern checks over {@code tokens} and stops at the first token that yields an
     * episode number. Tokens recognised as a volume prefix plus number are consumed and skipped.
     *
     * @param tokens the list of tokens
     * @return true if an episode number was found
     */
    public boolean searchForEpisodePatterns(List<Result> tokens) {
        for (Result candidate : tokens) {
            Token current = candidate.token;
            String content = current.getContent();
            boolean startsWithDigit = content.length() > 0 && Character.isDigit(content.charAt(0));

            if (startsWithDigit) {
                // e.g. "8 of 12"
                if (numberComesBeforeTotalNumber(current, candidate.pos)) {
                    return true;
                }
            } else {
                // e.g. "EP.1", "Vol.1"
                if (numberComesAfterPrefix(kElementEpisodePrefix, current)) {
                    return true;
                }
                if (numberComesAfterPrefix(kElementVolumePrefix, current)) {
                    continue;
                }
            }

            // Look for other patterns (content is read again from the token at this point)
            if (matchEpisodePatterns(current.getContent(), current)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Searches {@code tokens} for a number immediately followed by an equivalent number in brackets,
     * e.g. 08(114). The smaller of the two is recorded as the episode number and the larger one as the
     * alternative episode number.
     *
     * @param tokens the list of tokens
     * @return true if an equivalent number was found
     */
    public boolean searchForEquivalentNumbers(List<Result> tokens) {
        for (Result candidate : tokens) {
            // The leading number must not be isolated and has to be a valid episode number
            boolean usable = !parser.getParserHelper().isTokenIsolated(candidate.pos)
                    && isValidEpisodeNumber(candidate.token.getContent());
            if (!usable) {
                continue;
            }

            // A bracket has to follow, and after it the first enclosed, non-delimiter token
            Result enclosed = Token.findNextToken(parser.getTokens(), candidate.pos, kFlagNotDelimiter);
            if (!ParserHelper.isTokenCategory(enclosed, kBracket)) {
                continue;
            }
            enclosed = Token.findNextToken(parser.getTokens(), enclosed, kFlagEnclosed, kFlagNotDelimiter);
            if (!ParserHelper.isTokenCategory(enclosed, kUnknown)) {
                continue;
            }

            // The enclosed token has to be an isolated, numeric, valid episode number
            boolean equivalent = parser.getParserHelper().isTokenIsolated(enclosed.pos)
                    && StringHelper.isNumericString(enclosed.token.getContent())
                    && isValidEpisodeNumber(enclosed.token.getContent());
            if (!equivalent) {
                continue;
            }

            // Order the pair ascending; on a tie the leading token stays first
            Token smaller = candidate.token;
            Token larger = enclosed.token;
            if (StringHelper.stringToInt(larger.getContent()) < StringHelper.stringToInt(smaller.getContent())) {
                Token swap = smaller;
                smaller = larger;
                larger = swap;
            }

            setEpisodeNumber(smaller.getContent(), smaller, false);
            setAlternativeEpisodeNumber(larger.getContent(), larger);
            return true;
        }

        return false;
    }

    /**
     * Walks {@code tokens} from the end and records the last usable number as the episode number.
     * A token is skipped when it is the very first token, is enclosed, has no non-enclosed,
     * non-delimiter token before it, or directly follows the word "Movie" or "Part".
     *
     * @param tokens the list of tokens
     * @return true if the last number token was found
     */
    public boolean searchForLastNumber(List<Result> tokens) {
        for (int idx = tokens.size() - 1; idx >= 0; idx--) {
            Result candidate = tokens.get(idx);

            // The episode number is assumed to come after the title, so the first token cannot be it;
            // enclosed tokens are not considered either
            if (candidate.pos == 0 || candidate.token.isEnclosed()) {
                continue;
            }

            // Ignore if it's the first non-enclosed, non-delimiter token
            boolean precededByPlainToken = parser.getTokens().subList(0, candidate.pos).stream()
                    .anyMatch(earlier -> !earlier.isEnclosed() && earlier.getCategory() != kDelimiter);
            if (!precededByPlainToken) {
                continue;
            }

            // Ignore if the previous token is "Movie" or "Part"
            Result before = Token.findPrevToken(parser.getTokens(), candidate, TokenFlag.kFlagNotDelimiter);
            if (ParserHelper.isTokenCategory(before, kUnknown)) {
                String label = before.token.getContent();
                if (StringUtils.equalsIgnoreCase(label, "Movie") || StringUtils.equalsIgnoreCase(label, "Part")) {
                    continue;
                }
            }

            // We'll use this number after all
            if (setEpisodeNumber(candidate.token.getContent(), candidate.token, true)) {
                return true;
            }
        }

        return false;
    }
}