com.dgtlrepublic.anitomyj.Parser.java Source code

Java tutorial

Introduction

Here is the source code for com.dgtlrepublic.anitomyj.Parser.java

Source

/*
 * Copyright (c) 2014-2016, Eren Okka
 * Copyright (c) 2016, Paul Miller
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 */

package com.dgtlrepublic.anitomyj;

import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementAnimeSeasonPrefix;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementAnimeTitle;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementAnimeType;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementAnimeYear;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementEpisodeNumber;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementEpisodePrefix;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementEpisodeTitle;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementFileChecksum;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementReleaseGroup;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementReleaseVersion;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementUnknown;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementVideoResolution;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementVolumeNumber;
import static com.dgtlrepublic.anitomyj.Element.ElementCategory.kElementVolumePrefix;
import static com.dgtlrepublic.anitomyj.Token.TokenCategory.kBracket;
import static com.dgtlrepublic.anitomyj.Token.TokenCategory.kIdentifier;
import static com.dgtlrepublic.anitomyj.Token.TokenCategory.kUnknown;
import static com.dgtlrepublic.anitomyj.Token.TokenFlag.kFlagBracket;
import static com.dgtlrepublic.anitomyj.Token.TokenFlag.kFlagEnclosed;
import static com.dgtlrepublic.anitomyj.Token.TokenFlag.kFlagIdentifier;
import static com.dgtlrepublic.anitomyj.Token.TokenFlag.kFlagNone;
import static com.dgtlrepublic.anitomyj.Token.TokenFlag.kFlagNotDelimiter;
import static com.dgtlrepublic.anitomyj.Token.TokenFlag.kFlagNotEnclosed;
import static com.dgtlrepublic.anitomyj.Token.TokenFlag.kFlagUnknown;

import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.commons.lang3.StringUtils;

import com.dgtlrepublic.anitomyj.Element.ElementCategory;
import com.dgtlrepublic.anitomyj.KeywordManager.KeywordOptions;
import com.dgtlrepublic.anitomyj.Token.Result;
import com.dgtlrepublic.anitomyj.Token.TokenFlag;

/**
 * Classifies a list of {@link Token}s and records what it recognizes as {@link Element}s.
 *
 * <p>The parser works directly on the lists handed to the constructor: recognized elements are
 * appended to the element list and the category of consumed tokens is updated in the token list.
 *
 * @author Paul Miller
 * @author Eren Okka
 */
public class Parser {
    /** Whether an episode number element already existed when the episode number search started. */
    private boolean isEpisodeKeywordsFound = false;
    private final ParserHelper parserHelper;
    private final ParserNumber parserNumber;
    private final List<Element> elements;
    private final List<Token> tokens;
    private final Options options;

    /**
     * Constructs a new token parser.
     *
     * @param elements the list where parsed elements will be added
     * @param options  the parser options
     * @param tokens   the list of tokens.
     * @throws NullPointerException if any argument is {@code null}
     */
    public Parser(List<Element> elements, Options options, List<Token> tokens) {
        this.elements = Objects.requireNonNull(elements);
        this.options = Objects.requireNonNull(options);
        this.tokens = Objects.requireNonNull(tokens);
        this.parserHelper = new ParserHelper(this);
        this.parserNumber = new ParserNumber(this);
    }

    /** Returns the list of elements. */
    public List<Element> getElements() {
        return elements;
    }

    /** Returns the list of tokens. */
    public List<Token> getTokens() {
        return tokens;
    }

    /** Returns the parser helper. */
    public ParserHelper getParserHelper() {
        return parserHelper;
    }

    /** Returns the number parser. */
    public ParserNumber getParserNumber() {
        return parserNumber;
    }

    /** Returns whether or not episode keywords were found. */
    public boolean isEpisodeKeywordsFound() {
        return isEpisodeKeywordsFound;
    }

    /**
     * Runs every search pass over the tokens, in order: keywords, isolated numbers, episode number,
     * anime title, release group, episode title, and finally element validation.
     *
     * @return {@code true} if an anime title element is present after parsing, {@code false} otherwise
     */
    public boolean parse() {
        searchForKeywords();
        searchForIsolatedNumbers();

        if (options.parseEpisodeNumber) {
            searchForEpisodeNumber();
        }

        searchForAnimeTitle();

        if (options.parseReleaseGroup && empty(kElementReleaseGroup)) {
            searchForReleaseGroup();
        }

        if (options.parseEpisodeTitle && !empty(kElementEpisodeNumber)) {
            searchForEpisodeTitle();
        }

        validateElements();

        // Parsing succeeded only if a title was found (the previous code returned the inverse).
        return !empty(kElementAnimeTitle);
    }

    /**
     * Search for anime keywords.
     *
     * <p>Every unknown token is looked up in the {@link KeywordManager}; tokens that are not known
     * keywords are additionally tested for being a CRC32 checksum or a video resolution.
     */
    private void searchForKeywords() {
        for (int i = 0; i < tokens.size(); i++) {
            Token token = tokens.get(i);
            if (token.getCategory() != kUnknown) {
                continue;
            }

            String word = StringHelper.trimAny(token.getContent(), " -");
            if (word.isEmpty()) {
                continue;
            }

            // Don't bother if the word is a number that cannot be CRC
            if (word.length() != 8 && StringHelper.isNumericString(word)) {
                continue;
            }

            String keyword = KeywordManager.normalzie(word);
            AtomicReference<ElementCategory> category = new AtomicReference<>(kElementUnknown);
            AtomicReference<KeywordOptions> keywordOptions = new AtomicReference<>(new KeywordOptions());

            if (KeywordManager.getInstance().findAndSet(keyword, category, keywordOptions)) {
                ElementCategory found = category.get();

                if (!options.parseReleaseGroup && found == kElementReleaseGroup) {
                    continue;
                }
                if (!ParserHelper.isElementCategorySearchable(found) || !keywordOptions.get().isSearchable()) {
                    continue;
                }
                if (ParserHelper.isElementCategorySingular(found) && !empty(found)) {
                    continue;
                }

                if (found == kElementAnimeSeasonPrefix) {
                    parserHelper.checkAndSetAnimeSeasonKeyword(token, i);
                    continue;
                } else if (found == kElementEpisodePrefix) {
                    if (keywordOptions.get().isValid()) {
                        parserHelper.checkExtentKeyword(kElementEpisodeNumber, i, token);
                    }
                    // A prefix keyword is never an element of its own, valid or not; this matches
                    // the season and volume prefix branches.
                    continue;
                } else if (found == kElementReleaseVersion) {
                    word = StringUtils.substring(word, 1); // drop the first character of the keyword
                } else if (found == kElementVolumePrefix) {
                    parserHelper.checkExtentKeyword(kElementVolumeNumber, i, token);
                    continue;
                }
            } else {
                if (empty(kElementFileChecksum) && ParserHelper.isCrc32(word)) {
                    category.set(kElementFileChecksum);
                } else if (empty(kElementVideoResolution) && ParserHelper.isResolution(word)) {
                    category.set(kElementVideoResolution);
                }
            }

            if (category.get() != kElementUnknown) {
                elements.add(new Element(category.get(), word));
                if (keywordOptions.get() != null && keywordOptions.get().isIdentifiable()) {
                    token.setCategory(kIdentifier);
                }
            }
        }
    }

    /**
     * Search for episode number.
     *
     * <p>Tries progressively weaker heuristics and stops at the first one that succeeds.
     */
    private void searchForEpisodeNumber() {
        // List all unknown tokens that contain a number
        List<Result> candidates = new ArrayList<>();
        for (int i = 0; i < tokens.size(); i++) {
            Token token = tokens.get(i);
            if (token.getCategory() == kUnknown && ParserHelper.indexOfFirstDigit(token.getContent()) != -1) {
                candidates.add(new Result(token, i));
            }
        }

        if (candidates.isEmpty()) {
            return;
        }

        isEpisodeKeywordsFound = !empty(kElementEpisodeNumber);

        // If a token matches a known episode pattern, it has to be the episode number
        if (parserNumber.searchForEpisodePatterns(candidates)) {
            return;
        }

        // We have previously found an episode number via keywords
        if (!empty(kElementEpisodeNumber)) {
            return;
        }

        // From now on, we're only interested in numeric tokens
        candidates.removeIf(r -> !StringHelper.isNumericString(r.token.getContent()));

        // e.g. "01 (176)", "29 (04)"
        if (parserNumber.searchForEquivalentNumbers(candidates)) {
            return;
        }

        // e.g. " - 08"
        if (parserNumber.searchForSeparatedNumbers(candidates)) {
            return;
        }

        // e.g. "[12]", "(2006)"
        if (parserNumber.searchForIsolatedNumbers(candidates)) {
            return;
        }

        // Consider using the last number as a last resort
        parserNumber.searchForLastNumber(candidates);
    }

    /**
     * Search for anime title.
     *
     * <p>The title is the token range starting at the first non-enclosed unknown token (or, failing
     * that, at an unknown token of a later enclosed group) and ending before the next identifier.
     */
    private void searchForAnimeTitle() {
        boolean enclosedTitle = false;

        // Find the first non-enclosed unknown token
        Result tokenBegin = Token.findToken(tokens, kFlagNotEnclosed, kFlagUnknown);

        // If that doesn't work, find the first unknown token in the second enclosed
        // group, assuming that the first one is the release group
        if (tokenBegin.token == null) {
            tokenBegin = new Result(null, 0);
            enclosedTitle = true;
            boolean skippedPreviousGroup = false;

            do {
                tokenBegin = Token.findToken(tokens, tokenBegin, kFlagUnknown);
                if (tokenBegin.token == null) {
                    break;
                }

                // Ignore groups that are composed of non-Latin characters
                if (StringHelper.isMostlyLatinString(tokenBegin.token.getContent()) && skippedPreviousGroup) {
                    break;
                }

                // Get the first unknown token of the next group
                tokenBegin = Token.findToken(tokens, tokenBegin, kFlagBracket);
                tokenBegin = Token.findToken(tokens, tokenBegin, kFlagUnknown);
                skippedPreviousGroup = true;
            } while (tokenBegin.token != null);
        }

        if (tokenBegin.token == null) {
            return;
        }

        // Continue until an identifier (or a bracket, if the title is enclosed)
        // is found
        Result tokenEnd = Token.findToken(tokens, tokenBegin, kFlagIdentifier,
                enclosedTitle ? kFlagBracket : kFlagNone);

        // If within the interval there's an open bracket without its matching pair,
        // move the upper endpoint back to the bracket
        if (!enclosedTitle) {
            int end = tokenEnd.pos != null ? tokenEnd.pos : tokens.size();
            Result lastBracket = tokenEnd;
            boolean bracketOpen = false;
            for (int i = tokenBegin.pos; i < end; i++) {
                Token token = tokens.get(i);
                if (token.getCategory() == kBracket) {
                    lastBracket = new Result(token, i);
                    bracketOpen = !bracketOpen;
                }
            }
            if (bracketOpen) {
                tokenEnd = lastBracket;
            }
        }

        // If the interval ends with an enclosed group (e.g. "Anime Title [Fansub]"),
        // move the upper endpoint back to the beginning of the group. We ignore
        // parentheses in order to keep certain groups (e.g. "(TV)") intact.
        if (!enclosedTitle) {
            int end = tokenEnd.pos != null ? tokenEnd.pos : tokens.size();
            Result token = Token.findPrevToken(tokens, end, kFlagNotDelimiter);

            while (ParserHelper.isTokenCategory(token.token, kBracket)
                    && token.token.getContent().charAt(0) != ')') {

                token = Token.findPrevToken(tokens, token, kFlagBracket);
                if (token.pos != null) {
                    tokenEnd = token;
                    token = Token.findPrevToken(tokens, tokenEnd, kFlagNotDelimiter);
                }
            }
        }

        // Without an end token the title runs to the end of the token list
        int end = tokens.size();
        if (tokenEnd.token != null) {
            end = Math.min(tokenEnd.pos, end);
        }
        parserHelper.buildElement(kElementAnimeTitle, false, tokens.subList(tokenBegin.pos, end));
    }

    /**
     * Search for release group.
     *
     * <p>Picks the first run of enclosed unknown tokens that starts right after a bracket (or at the
     * very beginning) and is terminated by a bracket.
     */
    private void searchForReleaseGroup() {
        Result tokenBegin = new Result(null, 0);
        Result tokenEnd = tokenBegin;

        while (tokenBegin.pos != null && tokenBegin.pos < tokens.size()) {
            // Find the first enclosed unknown token
            tokenBegin = Token.findToken(tokens, tokenEnd, kFlagEnclosed, kFlagUnknown);
            if (tokenBegin.token == null) {
                return;
            }

            // Continue until a bracket or identifier is found
            tokenEnd = Token.findToken(tokens, tokenBegin, kFlagBracket, kFlagIdentifier);
            if (tokenEnd.token == null) {
                // No bracket or identifier follows, so no later run can be closed by a bracket
                // either. Stop here instead of searching again from a result without a position.
                return;
            }
            if (tokenEnd.token.getCategory() != kBracket) {
                continue;
            }

            // Ignore if it's not the first non-delimiter token in group
            Result prevToken = Token.findPrevToken(tokens, tokenBegin, TokenFlag.kFlagNotDelimiter);
            if (prevToken.token != null && prevToken.token.getCategory() != kBracket) {
                continue;
            }

            int end = Math.min(tokenEnd.pos, tokens.size());
            parserHelper.buildElement(kElementReleaseGroup, true, tokens.subList(tokenBegin.pos, end));
            return;
        }
    }

    /**
     * Search for episode title.
     *
     * <p>The title spans from the first non-enclosed unknown token up to the next bracket or
     * identifier, or to the end of the token list if neither follows.
     */
    private void searchForEpisodeTitle() {
        // Find the first non-enclosed unknown token
        Result tokenBegin = Token.findToken(tokens, kFlagNotEnclosed, kFlagUnknown);
        if (tokenBegin.token == null) {
            return;
        }

        // Continue until a bracket or identifier is found
        Result tokenEnd = Token.findToken(tokens, tokenBegin, kFlagBracket, kFlagIdentifier);

        int end = tokens.size();
        if (tokenEnd.pos != null) {
            end = Math.min(tokenEnd.pos, end);
        }
        parserHelper.buildElement(kElementEpisodeTitle, false, tokens.subList(tokenBegin.pos, end));
    }

    /**
     * Search for isolated numbers.
     *
     * <p>An isolated, purely numeric unknown token is recorded as the anime year when it lies within
     * {@code [kAnimeYearMin, kAnimeYearMax]}, or as the video resolution when it is 480, 720 or 1080.
     * Each of the two is only recorded once.
     */
    private void searchForIsolatedNumbers() {
        for (int i = 0; i < tokens.size(); i++) {
            Token token = tokens.get(i);
            if (token.getCategory() != kUnknown || !StringHelper.isNumericString(token.getContent())
                    || !parserHelper.isTokenIsolated(i)) {
                continue;
            }

            int number = StringHelper.stringToInt(token.getContent());

            // Anime year
            if (number >= ParserNumber.kAnimeYearMin && number <= ParserNumber.kAnimeYearMax) {
                if (empty(kElementAnimeYear)) {
                    elements.add(new Element(kElementAnimeYear, token.getContent()));
                    token.setCategory(kIdentifier);
                    continue;
                }
            }

            // Video resolution
            if (number == 480 || number == 720 || number == 1080) {
                // If these numbers are isolated, it's more likely for them to be the
                // video resolution rather than the episode number. Some fansub groups use these without the "p" suffix.
                if (empty(kElementVideoResolution)) {
                    elements.add(new Element(kElementVideoResolution, token.getContent()));
                    token.setCategory(kIdentifier);
                }
            }
        }
    }

    /**
     * Validate Elements.
     *
     * <p>Checks whether the episode title contains an anime type. If an anime type is the whole
     * episode title, the episode title is dropped; if it is only a part of the episode title and is a
     * known anime type keyword, that anime type element is dropped instead.
     */
    private void validateElements() {
        if (empty(kElementAnimeType) || empty(kElementEpisodeTitle)) {
            return;
        }

        String episodeTitle = get(kElementEpisodeTitle);
        boolean episodeTitleInvalid = false;

        // The iterator removes exactly the element being visited and keeps the traversal
        // position consistent.
        for (ListIterator<Element> itr = elements.listIterator(); itr.hasNext();) {
            Element el = itr.next();
            if (el.getCategory() != kElementAnimeType || !StringUtils.contains(episodeTitle, el.getValue())) {
                continue;
            }

            if (episodeTitle.length() == el.getValue().length()) {
                episodeTitleInvalid = true; // invalid episode title
            } else {
                String keyword = KeywordManager.normalzie(el.getValue());
                if (KeywordManager.getInstance().contains(kElementAnimeType, keyword)) {
                    itr.remove(); // invalid anime type
                }
            }
        }

        // Deferred so that the list is not structurally modified behind the iterator's back.
        if (episodeTitleInvalid) {
            elements.removeIf(element -> element.getCategory() == kElementEpisodeTitle);
        }
    }

    /** Returns {@code true} if no element of the given category has been recorded yet. */
    private boolean empty(ElementCategory category) {
        return elements.stream().noneMatch(element -> element.getCategory() == category);
    }

    /**
     * Returns the value of the first element of the given category.
     *
     * <p>If there is no such element, an element with an empty value is appended to the element
     * list and the empty value is returned.
     */
    private String get(ElementCategory category) {
        Element foundElement = elements.stream()
                .filter(element -> element.getCategory() == category)
                .findFirst()
                .orElse(null);

        if (foundElement == null) {
            foundElement = new Element(category, "");
            elements.add(foundElement);
        }

        return foundElement.getValue();
    }
}