nlp.wikipedia.lang.EnConfig.java Source code

Java tutorial

Introduction

Here is the source code for nlp.wikipedia.lang.EnConfig.java

Source

/**
 * This file is part of Wikiforia.
 *
 * Wikiforia is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * Wikiforia is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Wikiforia. If not, see <http://www.gnu.org/licenses/>.
 */
package nlp.wikipedia.lang;

//Autogenerated from Wikimedia sources at 2015-04-16T13:55:11+00:00

import org.apache.commons.lang.StringUtils;
import nlp.mediawiki.model.Page;
import nlp.wikipedia.WikipediaPageType;

import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Template configuration for the English ("en") Wikipedia.
 *
 * <p>The constructor registers the English namespace names and the English
 * spellings of the wiki keywords (the registration lists were autogenerated from
 * Wikimedia sources). On top of the base classification, pages that the base
 * class considers articles are further split into disambiguation pages, stubs
 * and lists based on their templates and titles.
 */
public class EnConfig extends TemplateConfig {

    /**
     * Matches a <code>{{...}}</code> span on a single line ('.' does not cross
     * line breaks here). Group 1 is the text between the braces and is
     * {@code null} for an empty template (<code>{{}}</code>) because the group
     * is optional.
     */
    private static final Pattern TEMPLATE_CONTENT = Pattern.compile("\\{\\{(.+)?\\}\\}");

    /** Matches titles that start with "list of" (case-insensitive). */
    private static final Pattern LIST_TITLE_PATTERN = Pattern.compile("(^list\\s+of)",
            Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL);

    /** Matches titles containing a "(disambiguation)" suffix marker (case-insensitive). */
    private static final Pattern DISAMBIGUATION_TITLE_PATTERN = Pattern.compile(
            ".*?\\((\\s)*disambiguation(\\s)*\\)", Pattern.CASE_INSENSITIVE);

    /** Lower-case fragments whose presence inside a template marks a disambiguation page. */
    private static final String[] DISAMBIGUATION_PARTS = { "disambiguation", "disambig", "disamb", "geodis", "hndis",
            "dab" };

    /**
     * Registers the English namespace aliases and keyword aliases.
     */
    public EnConfig() {
        // Namespace id -> English namespace name.
        // NOTE(review): no alias is registered for namespace 4 in this list; presumably the
        // base class derives it from getSiteName() -- confirm before adding one here.
        addNamespaceAlias(-2, "Media");
        addNamespaceAlias(-1, "Special");
        addNamespaceAlias(0, "");
        addNamespaceAlias(1, "Talk");
        addNamespaceAlias(2, "User");
        addNamespaceAlias(3, "User_talk");
        addNamespaceAlias(5, "Wikipedia_talk");
        addNamespaceAlias(6, "File");
        addNamespaceAlias(7, "File_talk");
        addNamespaceAlias(8, "MediaWiki");
        addNamespaceAlias(9, "MediaWiki_talk");
        addNamespaceAlias(10, "Template");
        addNamespaceAlias(11, "Template_talk");
        addNamespaceAlias(12, "Help");
        addNamespaceAlias(13, "Help_talk");
        addNamespaceAlias(14, "Category");
        addNamespaceAlias(15, "Category_talk");

        // Keyword id -> English spelling(s). The "CI" variant is presumably the
        // case-insensitive registration -- see TemplateConfig for the exact semantics.
        addI18nCIAlias("redirect", "#REDIRECT");
        addI18nCIAlias("notoc", "__NOTOC__");
        addI18nCIAlias("nogallery", "__NOGALLERY__");
        addI18nCIAlias("forcetoc", "__FORCETOC__");
        addI18nCIAlias("toc", "__TOC__");
        addI18nCIAlias("noeditsection", "__NOEDITSECTION__");
        addI18nAlias("!", "!");
        addI18nAlias("currentmonth", "CURRENTMONTH", "CURRENTMONTH2");
        addI18nAlias("currentmonth1", "CURRENTMONTH1");
        addI18nAlias("currentmonthname", "CURRENTMONTHNAME");
        addI18nAlias("currentmonthnamegen", "CURRENTMONTHNAMEGEN");
        addI18nAlias("currentmonthabbrev", "CURRENTMONTHABBREV");
        addI18nAlias("currentday", "CURRENTDAY");
        addI18nAlias("currentday2", "CURRENTDAY2");
        addI18nAlias("currentdayname", "CURRENTDAYNAME");
        addI18nAlias("currentyear", "CURRENTYEAR");
        addI18nAlias("currenttime", "CURRENTTIME");
        addI18nAlias("currenthour", "CURRENTHOUR");
        addI18nAlias("localmonth", "LOCALMONTH", "LOCALMONTH2");
        addI18nAlias("localmonth1", "LOCALMONTH1");
        addI18nAlias("localmonthname", "LOCALMONTHNAME");
        addI18nAlias("localmonthnamegen", "LOCALMONTHNAMEGEN");
        addI18nAlias("localmonthabbrev", "LOCALMONTHABBREV");
        addI18nAlias("localday", "LOCALDAY");
        addI18nAlias("localday2", "LOCALDAY2");
        addI18nAlias("localdayname", "LOCALDAYNAME");
        addI18nAlias("localyear", "LOCALYEAR");
        addI18nAlias("localtime", "LOCALTIME");
        addI18nAlias("localhour", "LOCALHOUR");
        addI18nAlias("numberofpages", "NUMBEROFPAGES");
        addI18nAlias("numberofarticles", "NUMBEROFARTICLES");
        addI18nAlias("numberoffiles", "NUMBEROFFILES");
        addI18nAlias("numberofusers", "NUMBEROFUSERS");
        addI18nAlias("numberofactiveusers", "NUMBEROFACTIVEUSERS");
        addI18nAlias("numberofedits", "NUMBEROFEDITS");
        addI18nAlias("pagename", "PAGENAME");
        addI18nAlias("pagenamee", "PAGENAMEE");
        addI18nAlias("namespace", "NAMESPACE");
        addI18nAlias("namespacee", "NAMESPACEE");
        addI18nAlias("namespacenumber", "NAMESPACENUMBER");
        addI18nAlias("talkspace", "TALKSPACE");
        addI18nAlias("talkspacee", "TALKSPACEE");
        addI18nAlias("subjectspace", "SUBJECTSPACE", "ARTICLESPACE");
        addI18nAlias("subjectspacee", "SUBJECTSPACEE", "ARTICLESPACEE");
        addI18nAlias("fullpagename", "FULLPAGENAME");
        addI18nAlias("fullpagenamee", "FULLPAGENAMEE");
        addI18nAlias("subpagename", "SUBPAGENAME");
        addI18nAlias("subpagenamee", "SUBPAGENAMEE");
        addI18nAlias("rootpagename", "ROOTPAGENAME");
        addI18nAlias("rootpagenamee", "ROOTPAGENAMEE");
        addI18nAlias("basepagename", "BASEPAGENAME");
        addI18nAlias("basepagenamee", "BASEPAGENAMEE");
        addI18nAlias("talkpagename", "TALKPAGENAME");
        addI18nAlias("talkpagenamee", "TALKPAGENAMEE");
        addI18nAlias("subjectpagename", "SUBJECTPAGENAME", "ARTICLEPAGENAME");
        addI18nAlias("subjectpagenamee", "SUBJECTPAGENAMEE", "ARTICLEPAGENAMEE");
        addI18nCIAlias("msg", "MSG:");
        addI18nCIAlias("subst", "SUBST:");
        addI18nCIAlias("safesubst", "SAFESUBST:");
        addI18nCIAlias("msgnw", "MSGNW:");
        addI18nAlias("img_thumbnail", "thumbnail", "thumb");
        addI18nAlias("img_manualthumb", "thumbnail=$1", "thumb=$1");
        addI18nAlias("img_right", "right");
        addI18nAlias("img_left", "left");
        addI18nAlias("img_none", "none");
        addI18nAlias("img_width", "$1px");
        addI18nAlias("img_center", "center", "centre");
        addI18nAlias("img_framed", "framed", "enframed", "frame");
        addI18nAlias("img_frameless", "frameless");
        addI18nAlias("img_lang", "lang=$1");
        addI18nAlias("img_page", "page=$1", "page $1");
        addI18nAlias("img_upright", "upright", "upright=$1", "upright $1");
        addI18nAlias("img_border", "border");
        addI18nAlias("img_baseline", "baseline");
        addI18nAlias("img_sub", "sub");
        addI18nAlias("img_super", "super", "sup");
        addI18nAlias("img_top", "top");
        addI18nAlias("img_text_top", "text-top");
        addI18nAlias("img_middle", "middle");
        addI18nAlias("img_bottom", "bottom");
        addI18nAlias("img_text_bottom", "text-bottom");
        addI18nAlias("img_link", "link=$1");
        addI18nAlias("img_alt", "alt=$1");
        addI18nAlias("img_class", "class=$1");
        addI18nCIAlias("int", "INT:");
        addI18nAlias("sitename", "SITENAME");
        addI18nCIAlias("ns", "NS:");
        addI18nCIAlias("nse", "NSE:");
        addI18nCIAlias("localurl", "LOCALURL:");
        addI18nCIAlias("localurle", "LOCALURLE:");
        addI18nCIAlias("articlepath", "ARTICLEPATH");
        addI18nCIAlias("pageid", "PAGEID");
        addI18nCIAlias("server", "SERVER");
        addI18nCIAlias("servername", "SERVERNAME");
        addI18nCIAlias("scriptpath", "SCRIPTPATH");
        addI18nCIAlias("stylepath", "STYLEPATH");
        addI18nCIAlias("grammar", "GRAMMAR:");
        addI18nCIAlias("gender", "GENDER:");
        addI18nCIAlias("notitleconvert", "__NOTITLECONVERT__", "__NOTC__");
        addI18nCIAlias("nocontentconvert", "__NOCONTENTCONVERT__", "__NOCC__");
        addI18nAlias("currentweek", "CURRENTWEEK");
        addI18nAlias("currentdow", "CURRENTDOW");
        addI18nAlias("localweek", "LOCALWEEK");
        addI18nAlias("localdow", "LOCALDOW");
        addI18nAlias("revisionid", "REVISIONID");
        addI18nAlias("revisionday", "REVISIONDAY");
        addI18nAlias("revisionday2", "REVISIONDAY2");
        addI18nAlias("revisionmonth", "REVISIONMONTH");
        addI18nAlias("revisionmonth1", "REVISIONMONTH1");
        addI18nAlias("revisionyear", "REVISIONYEAR");
        addI18nAlias("revisiontimestamp", "REVISIONTIMESTAMP");
        addI18nAlias("revisionuser", "REVISIONUSER");
        addI18nAlias("revisionsize", "REVISIONSIZE");
        addI18nCIAlias("plural", "PLURAL:");
        addI18nCIAlias("fullurl", "FULLURL:");
        addI18nCIAlias("fullurle", "FULLURLE:");
        addI18nCIAlias("canonicalurl", "CANONICALURL:");
        addI18nCIAlias("canonicalurle", "CANONICALURLE:");
        addI18nCIAlias("lcfirst", "LCFIRST:");
        addI18nCIAlias("ucfirst", "UCFIRST:");
        addI18nCIAlias("lc", "LC:");
        addI18nCIAlias("uc", "UC:");
        addI18nCIAlias("raw", "RAW:");
        addI18nAlias("displaytitle", "DISPLAYTITLE");
        addI18nAlias("rawsuffix", "R");
        addI18nCIAlias("nocommafysuffix", "NOSEP");
        addI18nAlias("newsectionlink", "__NEWSECTIONLINK__");
        addI18nAlias("nonewsectionlink", "__NONEWSECTIONLINK__");
        addI18nAlias("currentversion", "CURRENTVERSION");
        addI18nCIAlias("urlencode", "URLENCODE:");
        addI18nCIAlias("anchorencode", "ANCHORENCODE");
        addI18nAlias("currenttimestamp", "CURRENTTIMESTAMP");
        addI18nAlias("localtimestamp", "LOCALTIMESTAMP");
        addI18nAlias("directionmark", "DIRECTIONMARK", "DIRMARK");
        addI18nCIAlias("language", "#LANGUAGE:");
        addI18nAlias("contentlanguage", "CONTENTLANGUAGE", "CONTENTLANG");
        addI18nAlias("pagesinnamespace", "PAGESINNAMESPACE:", "PAGESINNS:");
        addI18nAlias("numberofadmins", "NUMBEROFADMINS");
        addI18nCIAlias("formatnum", "FORMATNUM");
        addI18nCIAlias("padleft", "PADLEFT");
        addI18nCIAlias("padright", "PADRIGHT");
        addI18nCIAlias("special", "special");
        addI18nCIAlias("speciale", "speciale");
        addI18nAlias("defaultsort", "DEFAULTSORT:", "DEFAULTSORTKEY:", "DEFAULTCATEGORYSORT:");
        addI18nCIAlias("filepath", "FILEPATH:");
        addI18nCIAlias("tag", "tag");
        addI18nAlias("hiddencat", "__HIDDENCAT__");
        addI18nAlias("pagesincategory", "PAGESINCATEGORY", "PAGESINCAT");
        addI18nAlias("pagesize", "PAGESIZE");
        addI18nAlias("index", "__INDEX__");
        addI18nAlias("noindex", "__NOINDEX__");
        addI18nAlias("numberingroup", "NUMBERINGROUP", "NUMINGROUP");
        addI18nAlias("staticredirect", "__STATICREDIRECT__");
        addI18nAlias("protectionlevel", "PROTECTIONLEVEL");
        addI18nAlias("cascadingsources", "CASCADINGSOURCES");
        addI18nCIAlias("formatdate", "formatdate", "dateformat");
        addI18nCIAlias("url_path", "PATH");
        addI18nCIAlias("url_wiki", "WIKI");
        addI18nCIAlias("url_query", "QUERY");
        addI18nCIAlias("noerror", "defaultsort_noerror", "displaytitle_noerror");
        addI18nCIAlias("noreplace", "defaultsort_noreplace", "displaytitle_noreplace");
        addI18nCIAlias("pagesincategory_all", "all");
        addI18nCIAlias("pagesincategory_pages", "pages");
        addI18nCIAlias("pagesincategory_subcats", "subcats");
        addI18nCIAlias("pagesincategory_files", "files");
    }

    /**
     * @return the site name, {@code "Wikipedia"}
     */
    @Override
    protected String getSiteName() {
        return "Wikipedia";
    }

    /**
     * @return the base URL of the English Wikipedia
     */
    @Override
    protected String getWikiUrl() {
        return "http://en.wikipedia.org/";
    }

    /**
     * @return the language code handled by this configuration, {@code "en"}
     */
    @Override
    public String getIso639() {
        return "en";
    }

    /**
     * Tells whether the text contains a template that marks a disambiguation page.
     *
     * <p>Every single-line <code>{{...}}</code> span is inspected; the page matches
     * as soon as the lower-cased span content contains one of the known
     * disambiguation fragments ("disambiguation", "disambig", "disamb", "geodis",
     * "hndis", "dab"). This is a plain substring test, so it is deliberately loose.
     *
     * @param text raw wiki markup; {@code null} is treated as "no match"
     * @return {@code true} if a disambiguation template fragment was found
     */
    public static boolean matchDisambiguation(String text) {
        if (text == null) {
            return false;
        }

        Matcher matcher = TEMPLATE_CONTENT.matcher(text);
        while (matcher.find()) {
            String content = matcher.group(1);
            if (content == null) {
                // Empty template "{{}}": the optional group did not participate in the match.
                continue;
            }

            // Locale.ROOT keeps the comparison stable regardless of the JVM default
            // locale (e.g. Turkish dotless i would break "DISAMBIGUATION").
            String lowered = content.toLowerCase(Locale.ROOT);
            for (String disambiguationPart : DISAMBIGUATION_PARTS) {
                if (lowered.contains(disambiguationPart)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Tells whether the text contains a template that marks a stub.
     *
     * <p>Every single-line <code>{{...}}</code> span is inspected; the page matches
     * as soon as the span content contains "stub", ignoring case.
     *
     * @param text raw wiki markup; {@code null} is treated as "no match"
     * @return {@code true} if a template mentioning "stub" was found
     */
    public static boolean matchStub(String text) {
        if (text == null) {
            return false;
        }

        Matcher matcher = TEMPLATE_CONTENT.matcher(text);
        while (matcher.find()) {
            // Search for "stub" inside the template content (first argument is the
            // haystack). containsIgnoreCase returns false for a null haystack, which
            // covers the empty template "{{}}" where group 1 is null.
            if (StringUtils.containsIgnoreCase(matcher.group(1), "stub")) {
                return true;
            }
        }

        return false;
    }

    /**
     * Refines the base classification for pages the base class reports as articles.
     *
     * <p>Checks are applied in order and the first hit wins: disambiguation
     * (template or title), then stub (template), then list (title). Pages that
     * the base class does not classify as {@code ARTICLE} are returned unchanged.
     *
     * @param page the page to classify
     * @return the refined page type
     */
    @Override
    public WikipediaPageType classifyPageType(Page page) {
        WikipediaPageType type = super.classifyPageType(page);
        if (type != WikipediaPageType.ARTICLE) {
            return type;
        }

        if (matchDisambiguation(page.getContent())
                || DISAMBIGUATION_TITLE_PATTERN.matcher(page.getTitle()).find()) {
            return WikipediaPageType.DISAMBIGUATION;
        }

        if (matchStub(page.getContent())) {
            return WikipediaPageType.STUB;
        }

        if (LIST_TITLE_PATTERN.matcher(page.getTitle()).find()) {
            return WikipediaPageType.LIST;
        }

        return WikipediaPageType.ARTICLE;
    }
}