org.yamj.core.tools.LanguageTools.java Source code

Introduction

Here is the source code for org.yamj.core.tools.LanguageTools.java
Source

/*
 *      Copyright (c) 2004-2015 YAMJ Members
 *      https://github.com/organizations/YAMJ/teams
 *
 *      This file is part of the Yet Another Media Jukebox (YAMJ).
 *
 *      YAMJ is free software: you can redistribute it and/or modify
 *      it under the terms of the GNU General Public License as published by
 *      the Free Software Foundation, either version 3 of the License, or
 *      any later version.
 *
 *      YAMJ is distributed in the hope that it will be useful,
 *      but WITHOUT ANY WARRANTY; without even the implied warranty of
 *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *      GNU General Public License for more details.
 *
 *      You should have received a copy of the GNU General Public License
 *      along with YAMJ.  If not, see <http://www.gnu.org/licenses/>.
 *
 *      Web: https://github.com/YAMJ/yamj-v3
 *
 */
package org.yamj.core.tools;

import java.util.Collection;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.PostConstruct;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import org.yamj.common.tools.PropertyTools;
import org.yamj.common.util.KeywordMap;
import org.yamj.common.util.PatternUtils;
import org.yamj.common.util.TokensPatternMap;

/**
 * Holds the language detection patterns and offers static lookups on them.
 *
 * The two pattern maps are static and are filled by {@link #init()} from the
 * "language.detection.keywords" property; until that method has run both maps
 * are empty and the lookups find nothing.
 */
@Component
public class LanguageTools {

    private static final Logger LOG = LoggerFactory.getLogger(LanguageTools.class);

    /**
     * Mapping exact tokens to language.
     *
     * Strict mapping is case sensitive and must be obvious; it must avoid confusing movie name words with language markers.
     *
     * For example the English word "it" and the Italian language marker "it", or "French" as part of the title and "french" as
     * language marker.
     *
     * However, this only matters for file names that use token delimiters (see tokens description constants
     * TOKEN_DELIMITERS*). Language detection in non-token separated titles will be skipped automatically.
     *
     * Language markers found with this pattern are counted as token delimiters (they will cut the movie title).
     */
    private static final TokensPatternMap STRICT_LANGUAGE_MAP = new TokensPatternMap() {
        private static final long serialVersionUID = 3630995345545037071L;

        /**
         * Builds one alternation of the literally quoted tokens (case preserved) and stores the resulting token pattern.
         */
        @Override
        protected void put(String key, Collection<String> tokens) {
            StringBuilder tokenBuilder = new StringBuilder();
            for (String s : tokens) {
                if (tokenBuilder.length() > 0) {
                    tokenBuilder.append('|');
                }
                // Quote so that regex meta characters in a marker are matched literally
                tokenBuilder.append(Pattern.quote(s));
            }
            put(key, PatternUtils.tpatt(tokenBuilder.toString()));
        }
    };

    /**
     * Mapping loose language markers.
     *
     * The second pass of language detection is started after movie title detection. Language markers will be scanned with the
     * loose pattern in order to find more languages without the chance of confusing them with the movie title.
     *
     * Markers in this map are case insensitive.
     */
    private static final TokensPatternMap LOOSE_LANGUAGE_MAP = new TokensPatternMap() {
        private static final long serialVersionUID = 1383819843117148442L;

        /**
         * Builds one alternation of the upper-cased, literally quoted tokens, skipping tokens that are already present,
         * and stores the resulting pattern.
         */
        @Override
        protected void put(String key, Collection<String> tokens) {
            StringBuilder tokenBuilder = new StringBuilder();
            for (String token : tokens) {
                // Locale.ROOT keeps the conversion independent of the JVM default locale
                // (e.g. a Turkish default locale would turn "it" into "\u0130T").
                String quotedToken = Pattern.quote(token.toUpperCase(Locale.ROOT));
                // Only add the token if it's not there already
                if (tokenBuilder.indexOf(quotedToken) < 0) {
                    if (tokenBuilder.length() > 0) {
                        tokenBuilder.append('|');
                    }
                    tokenBuilder.append(quotedToken);
                }
            }
            put(key, PatternUtils.iwpatt(tokenBuilder.toString()));
        }
    };

    /**
     * Loads the keyword map from the "language.detection.keywords" property and registers the values of every keyword
     * in both the strict and the loose language map.
     *
     * Keywords without values are not registered; they are only reported in the log.
     */
    @PostConstruct
    public void init() {
        final KeywordMap languages = PropertyTools.getKeywordMap("language.detection.keywords", null);
        if (!languages.isEmpty()) {
            for (String lang : languages.getKeywords()) {
                String values = languages.get(lang);
                if (values != null) {
                    STRICT_LANGUAGE_MAP.put(lang, values);
                    LOOSE_LANGUAGE_MAP.put(lang, values);
                } else {
                    LOG.info("No values found for language code '{}'", lang);
                }
            }
        }
    }

    /**
     * Decode the language tag passed in, into standard YAMJ language code.
     *
     * The tag is tested against the strict patterns; the key of the first pattern that is found in the tag is returned.
     *
     * @param language the language tag to decode; may be null
     * @return the key of the first matching strict pattern, or null if the tag is null or nothing matches
     */
    public static String determineLanguage(String language) {
        if (language == null) {
            // Pattern.matcher(null) would throw; null already means "not determined" here
            return null;
        }

        for (Map.Entry<String, Pattern> e : STRICT_LANGUAGE_MAP.entrySet()) {
            Matcher matcher = e.getValue().matcher(language);
            if (matcher.find()) {
                return e.getKey();
            }
        }
        return null;
    }

    /**
     * Get the list of loose languages associated with a language.
     *
     * The value returned is the textual form of the loose pattern registered for the language, converted to lower case.
     * NOTE(review): this is the regular expression source rather than a plain marker list, so presumably the quoting
     * markers appear lower-cased in it as well - confirm that callers expect this.
     *
     * @param language the language key to look up
     * @return the lower-cased pattern text, or an empty string if no pattern is registered for the key
     */
    public static String getLanguageList(String language) {
        final Pattern langPatt = LOOSE_LANGUAGE_MAP.get(language);
        if (langPatt == null) {
            return StringUtils.EMPTY;
        }
        // Locale.ROOT keeps the result independent of the JVM default locale
        return langPatt.toString().toLowerCase(Locale.ROOT);
    }

    /**
     * Get the map of strict language patterns.
     *
     * @return the shared strict map itself, not a copy
     */
    public static TokensPatternMap getStrictLanguageMap() {
        return STRICT_LANGUAGE_MAP;
    }

    /**
     * Get the map of loose language patterns.
     *
     * @return the shared loose map itself, not a copy
     */
    public static TokensPatternMap getLooseLanguageMap() {
        return LOOSE_LANGUAGE_MAP;
    }
}