org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java Source code

Java tutorial

Introduction

Here is the source code for org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java

Source

/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2015 Jakob Hende
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package org.xlrnet.metadict.engines.woxikon;

import com.google.common.collect.ImmutableMap;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xlrnet.metadict.api.engine.SearchEngine;
import org.xlrnet.metadict.api.language.GrammaticalGender;
import org.xlrnet.metadict.api.language.Language;
import org.xlrnet.metadict.api.query.*;

import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Engine for woxikon.de backend. Internal methods work based on reverse-engineered HTML structures.
 */
public class WoxikonEngine implements SearchEngine {

    /** Logger shared by all instances of this engine. */
    private static final Logger LOGGER = LoggerFactory.getLogger(WoxikonEngine.class);

    /** Root of every request URL; the language pair path and the query are appended to it. */
    private static final String BASE_URL = "http://www.woxikon.de/";

    /**
     * Maps each supported non-German language to the path segment of its German dictionary. A language that is
     * missing from this map is rejected when the request URL is built.
     */
    private static final Map<Language, String> BASE_URL_PER_LANGUAGE = ImmutableMap.<Language, String>builder()
            .put(Language.FRENCH, "deutsch-franzoesisch").put(Language.SPANISH, "deutsch-spanisch")
            .put(Language.SWEDISH, "deutsch-schwedisch").put(Language.RUSSIAN, "deutsch-russisch")
            .put(Language.FINNISH, "deutsch-finnisch").put(Language.TURKISH, "deutsch-tuerkisch")
            .put(Language.ENGLISH, "deutsch-englisch").put(Language.ITALIAN, "deutsch-italienisch")
            .put(Language.DUTCH, "deutsch-niederlaendisch").put(Language.PORTUGUESE, "deutsch-portugiesisch")
            .put(Language.POLISH, "deutsch-polnisch").put(Language.NORWEGIAN, "deutsch-norwegisch").build();

    /** Maps the text of a word-type node to the entry type it stands for. */
    private static final Map<String, EntryType> ENTRY_TYPE_MAP = ImmutableMap.<String, EntryType>builder()
            .put("(a)", EntryType.ADJECTIVE).put("(v)", EntryType.VERB).put("(n)", EntryType.NOUN)
            .put("(o)", EntryType.OTHER_WORD).build();

    /** Maps the text of a gender node to the grammatical gender it stands for. */
    private static final Map<String, GrammaticalGender> GENDER_MAP = ImmutableMap
            .<String, GrammaticalGender>builder().put("{m}", GrammaticalGender.MASCULINE)
            .put("{f}", GrammaticalGender.FEMININE).put("{n}", GrammaticalGender.NEUTER).build();

    /** Timeout in milliseconds handed to Jsoup when the result page is fetched. */
    private static final int TIMEOUT_MILLIS = 1500;

    /** CSS class of the elements that each hold one side of a translation row. */
    private static final String CLASS_TRANSLATION = "dictionary-table-word";

    /** Leading marker that is removed from a description text. */
    private static final String DESCRIPTION_BEGIN = "[";

    /** Trailing marker that is removed from a description text. */
    private static final String DESCRIPTION_END = "]";

    /** CSS class of the element whose text is looked up in {@link #GENDER_MAP}. */
    private static final String CLASS_GENDER = "word-gender";

    /** CSS class of the element whose text is looked up in {@link #ENTRY_TYPE_MAP}. */
    private static final String CLASS_WORDTYPE = "word-type";

    /** CSS class of the element that is tried first when a description is extracted. */
    private static final String CLASS_DESCRIPTION = "word-description";

    /** CSS class used as fallback for the description when {@link #CLASS_DESCRIPTION} is absent. */
    private static final String CLASS_EXTRA_INFO = "word-extra-info";

    /**
     * Runs a bilingual lookup: the non-German language of the requested pair selects the dictionary page, the page
     * is downloaded and parsed with Jsoup, and the parsed document is converted into a query result.
     *
     * @param queryInput
     *         The text to look up.
     * @param inputLanguage
     *         First language of the requested pair.
     * @param outputLanguage
     *         Second language of the requested pair.
     * @param allowBothWay
     *         Not evaluated by this implementation.
     * @return The result assembled from the downloaded page.
     * @throws Exception
     *         If neither language is German, the language is unsupported or the page cannot be fetched.
     */
    @NotNull
    @Override
    public BilingualQueryResult executeBilingualQuery(@NotNull String queryInput, @NotNull Language inputLanguage,
            @NotNull Language outputLanguage, boolean allowBothWay) throws Exception {
        final Language foreignLanguage = findTargetLanguage(inputLanguage, outputLanguage);
        final Document page = Jsoup.parse(buildTargetUrl(queryInput, foreignLanguage), TIMEOUT_MILLIS);
        return processBilingualDocument(queryInput, page, foreignLanguage);
    }

    /**
     * Converts a parsed result page into a query result. The translation table is processed once per direction
     * (German to target language first, then target language to German) before the recommendations are collected.
     *
     * @param queryInput
     *         The text that was looked up.
     * @param doc
     *         The parsed result page.
     * @param targetLanguage
     *         The non-German language of the request.
     * @return The result built from everything that was found on the page.
     */
    private BilingualQueryResult processBilingualDocument(@NotNull String queryInput, @NotNull Document doc,
            @NotNull Language targetLanguage) {
        final BilingualQueryResultBuilder builder = ImmutableBilingualQueryResult.builder();

        // Each pair is {source, target}; the order of the pairs is the order of processing.
        final Language[][] directions = {
                {Language.GERMAN, targetLanguage},
                {targetLanguage, Language.GERMAN}
        };
        for (Language[] direction : directions) {
            processTranslationTable(queryInput, doc, builder, direction[0], direction[1]);
        }

        findRecommendations(doc, builder);
        return builder.build();
    }

    /**
     * Reads the given synonyms table and adds a single synonym entry to the result. Synonyms are grouped by their
     * description; a synonym without description goes into a group named after the query string.
     * <p>
     * Rows that lack a translation cell are skipped and a missing title element falls back to the query string, so
     * that unexpected markup does not abort the whole query.
     *
     * @param queryString
     *         The text that was looked up.
     * @param synonymsTable
     *         The table element that contains the synonym rows.
     * @param resultBuilder
     *         The builder that receives the synonym entry.
     * @param sourceLanguage
     *         The language that is assigned to all extracted synonyms.
     */
    private void extractBilingualSynonyms(@NotNull String queryString, @NotNull Element synonymsTable,
            @NotNull BilingualQueryResultBuilder resultBuilder, @NotNull Language sourceLanguage) {
        // Rows that contain a <th> are header rows and hold no synonym
        List<Element> synonymNodes = synonymsTable.select("tr").stream()
                .filter(e -> e.getElementsByTag("th").isEmpty()).collect(Collectors.toList());

        if (synonymNodes.isEmpty()) {
            LOGGER.debug("No synonym entries found");
            return;
        }

        // first() returns null when nothing matches - use the query itself as title in that case
        Element titleNode = synonymsTable.select("span.hl").first();
        String synonymEntryTitle = titleNode != null ? titleNode.text() : queryString;

        Map<String, SynonymGroupBuilder> synonymGroupMap = new HashMap<>();

        for (Element synonymNode : synonymNodes) {
            // Extract only information from the "from"-node (i.e. source language)
            Element fromNode = synonymNode.getElementsByClass(CLASS_TRANSLATION).first();
            if (fromNode == null) {
                LOGGER.warn("Synonym row without element of class \"{}\" - skipping", CLASS_TRANSLATION);
                continue;
            }

            DictionaryObject newSynonym = processSingleNode(fromNode, sourceLanguage, queryString);
            String groupName = newSynonym.getDescription() != null ? newSynonym.getDescription() : queryString;
            if (groupName != null) {
                synonymGroupMap.computeIfAbsent(groupName,
                        (s) -> ImmutableSynonymGroup.builder()
                                .setBaseMeaning(ImmutableDictionaryObject.createSimpleObject(sourceLanguage, s)))
                        .addSynonym(newSynonym);
            } else {
                LOGGER.warn("Synonym group is null");
            }
        }

        if (synonymGroupMap.isEmpty()) {
            // Every row was unusable - do not add an entry without any group
            LOGGER.debug("No synonym entries found");
            return;
        }

        SynonymEntryBuilder synonymEntryBuilder = ImmutableSynonymEntry.builder()
                .setBaseObject(ImmutableDictionaryObject.createSimpleObject(sourceLanguage, synonymEntryTitle));

        for (SynonymGroupBuilder synonymGroupBuilder : synonymGroupMap.values()) {
            synonymEntryBuilder.addSynonymGroup(synonymGroupBuilder.build());
        }

        resultBuilder.addSynonymEntry(synonymEntryBuilder.build());
    }

    /**
     * Collects the "similar word" recommendations of the page. The candidate nodes are walked in document order:
     * a {@code span} with class {@code flagicon} switches the language for the links that follow it, and every
     * {@code a} element becomes one recommendation in the currently active language.
     * <p>
     * Links that appear before any flag icon could be resolved to a language are skipped, because no language can be
     * assigned to them.
     *
     * @param doc
     *         The parsed result page.
     * @param resultBuilder
     *         The builder that receives the recommendations.
     */
    private void findRecommendations(@NotNull Document doc, @NotNull BilingualQueryResultBuilder resultBuilder) {
        // Determine all candidate nodes:
        Elements alternativeNodes = doc.select("div.cc > p > *");

        Language currentLanguage = null;

        for (Element node : alternativeNodes) {
            String tagName = node.tagName();

            if ("span".equals(tagName) && node.hasClass("flagicon")) {
                // The remaining class names of a flag icon are tried as language identifiers
                Set<String> classNames = node.classNames();
                classNames.remove("flagicon");
                for (String className : classNames) {
                    Language candidate = Language.getExistingLanguageById(className);
                    if (candidate != null) {
                        currentLanguage = candidate;
                        break;
                    }
                }
                // If no class name matched, the previously detected language stays active
            } else if ("a".equals(tagName)) {
                String recommendationText = node.text();

                if (currentLanguage == null) {
                    LOGGER.debug("Skipping recommendation \"{}\" because no language could be determined",
                            recommendationText);
                    continue;
                }

                DictionaryObjectBuilder objectBuilder = ImmutableDictionaryObject.builder();
                objectBuilder.setLanguage(currentLanguage).setGeneralForm(recommendationText);

                resultBuilder.addSimilarRecommendation(objectBuilder.build());
            }
        }
    }

    /**
     * Processes the translation table for one direction. The table is looked up by the element id
     * {@code dictionary-<source>-<target>}, built from the lower-cased language identifiers. Every row without a
     * {@code th} element is turned into a bilingual entry; afterwards the first element with class
     * {@code dictionary-synonyms-table} in the document, if any, is processed as synonyms of the source language.
     * <p>
     * NOTE(review): the synonyms table is taken from the whole document, not from the direction-specific table, so
     * calling this method for both directions may process the same synonyms table twice - confirm against the page
     * markup.
     *
     * @param queryString
     *         The text that was looked up.
     * @param document
     *         The parsed result page.
     * @param resultBuilder
     *         The builder that receives entries and synonyms.
     * @param sourceLanguage
     *         The language of the left-hand side of the table.
     * @param targetLanguage
     *         The language of the right-hand side of the table.
     */
    private void processTranslationTable(@NotNull String queryString, @NotNull Document document,
            @NotNull BilingualQueryResultBuilder resultBuilder, @NotNull Language sourceLanguage,
            @NotNull Language targetLanguage) {
        // Locale.ROOT keeps the element id independent of the JVM default locale (e.g. Turkish dotless i)
        String languageIdentifier = sourceLanguage.getIdentifier().toLowerCase(Locale.ROOT) + "-"
                + targetLanguage.getIdentifier().toLowerCase(Locale.ROOT);

        Element translationTable = document.getElementById("dictionary-" + languageIdentifier);

        if (translationTable == null) {
            LOGGER.debug("Translation table for {} -> {} with query \"{}\" is null", sourceLanguage.getIdentifier(),
                    targetLanguage.getIdentifier(), queryString);
            return;
        }

        // Find all relevant entries (rows without header cells) and process them
        translationTable.getElementsByTag("tr").stream().filter(e -> e.getElementsByTag("th").isEmpty())
                .forEach(e -> processEntry(queryString, e, resultBuilder, sourceLanguage, targetLanguage));

        // Extract synonyms
        Element synonymTable = document.getElementsByClass("dictionary-synonyms-table").first();
        if (synonymTable != null) {
            extractBilingualSynonyms(queryString, synonymTable, resultBuilder, sourceLanguage);
        }
    }

    /**
     * Turns one table row into a bilingual entry and adds it to the result. The row must be a {@code tr} element
     * with exactly two translation cells; anything else is logged as a warning and ignored. The entry type is
     * detected from the first cell.
     *
     * @param queryString
     *         The text that was looked up.
     * @param entryNode
     *         The table row to process.
     * @param resultBuilder
     *         The builder that receives the entry.
     * @param sourceLanguage
     *         The language of the first cell.
     * @param targetLanguage
     *         The language of the second cell.
     */
    private void processEntry(@NotNull String queryString, @NotNull Element entryNode,
            @NotNull BilingualQueryResultBuilder resultBuilder, @NotNull Language sourceLanguage,
            @NotNull Language targetLanguage) {
        final String rowTag = entryNode.tag().getName();
        if (!StringUtils.equals(rowTag, "tr")) {
            LOGGER.warn("Expected <tr> tag - got <{}>", rowTag);
            return;
        }

        final Elements cells = entryNode.getElementsByClass(CLASS_TRANSLATION);
        final int cellCount = cells.size();
        if (cellCount != 2) {
            LOGGER.warn("Expected 2 elements with class \"" + CLASS_TRANSLATION + "\" - got {}", cellCount);
            return;
        }

        final Element sourceCell = cells.get(0);
        final Element targetCell = cells.get(1);

        // Evaluate all three parts first, then hand them to the builder
        final EntryType entryType = detectEntryType(sourceCell);
        final DictionaryObject inputObject = processSingleNode(sourceCell, sourceLanguage, queryString);
        final DictionaryObject outputObject = processSingleNode(targetCell, targetLanguage, queryString);

        final BilingualEntryBuilder entryBuilder = ImmutableBilingualEntry.builder();
        entryBuilder.setEntryType(entryType);
        entryBuilder.setInputObject(inputObject);
        entryBuilder.setOutputObject(outputObject);

        resultBuilder.addBilingualEntry(entryBuilder.build());
    }

    /**
     * Builds a dictionary object from a single translation cell. The general form consists of the text that
     * precedes the first link inside the cell plus the link text itself. If the cell contains no link, the complete
     * cell text is used instead of failing. Description and gender are added when the cell provides them.
     *
     * @param element
     *         The translation cell.
     * @param language
     *         The language to assign to the resulting object.
     * @param queryString
     *         The text that was looked up; a description equal to it is not stored.
     * @return The dictionary object built from the cell.
     */
    @NotNull
    private DictionaryObject processSingleNode(@NotNull Element element, @NotNull Language language,
            String queryString) {
        DictionaryObjectBuilder objectBuilder = ImmutableDictionaryObject.builder();
        objectBuilder.setLanguage(language);

        // Extract entry text:
        String elementText = element.text();
        Element linkNode = element.getElementsByTag("a").first();
        String generalForm;
        if (linkNode != null) {
            // Keep the context in front of the link but drop everything behind it
            String linkText = linkNode.text();
            generalForm = StringUtils.substringBefore(elementText, linkText) + linkText;
        } else {
            // first() returns null when the cell holds no link - fall back to the plain cell text
            LOGGER.debug("No link found in translation node - using full node text \"{}\"", elementText);
            generalForm = elementText;
        }
        objectBuilder.setGeneralForm(StringUtils.strip(generalForm));

        // Extract description:
        extractDescription(element, queryString, objectBuilder);

        // Extract gender:
        extractGender(element, objectBuilder);

        return objectBuilder.build();
    }

    /**
     * Extracts the description of a translation cell and stores it in the builder. The description node is looked
     * up by {@link #CLASS_DESCRIPTION} first and by {@link #CLASS_EXTRA_INFO} as fallback. The enclosing
     * {@link #DESCRIPTION_BEGIN} / {@link #DESCRIPTION_END} markers and surrounding whitespace are removed. Nothing is
     * stored if the remaining text is empty or equals the query string (ignoring case).
     *
     * @param element
     *         The translation cell.
     * @param queryString
     *         The text that was looked up.
     * @param objectBuilder
     *         The builder that receives the description.
     */
    private void extractDescription(@NotNull Element element, String queryString,
            DictionaryObjectBuilder objectBuilder) {
        Element descriptionNode = element.getElementsByClass(CLASS_DESCRIPTION).first();
        if (descriptionNode == null) {
            // Try to detect the description node with an alternative class (necessary for synonyms)
            descriptionNode = element.getElementsByClass(CLASS_EXTRA_INFO).first();
        }
        if (descriptionNode == null) {
            return;
        }

        String description = descriptionNode.text();
        description = StringUtils.removeStart(description, DESCRIPTION_BEGIN);
        description = StringUtils.removeEnd(description, DESCRIPTION_END);
        // Strip before comparing, otherwise "[ word ]" would not be recognized as equal to the query
        description = StringUtils.strip(description);

        // Set description only if it carries information beyond the request string
        if (StringUtils.isNotEmpty(description) && !StringUtils.equalsIgnoreCase(description, queryString)) {
            objectBuilder.setDescription(description);
        }
    }

    /**
     * Stores the grammatical gender of a translation cell in the builder. The text of the first element with class
     * {@link #CLASS_GENDER} is looked up in {@link #GENDER_MAP}; nothing is stored when the element is missing or
     * its text is not a known gender marker.
     *
     * @param element
     *         The translation cell.
     * @param objectBuilder
     *         The builder that receives the gender.
     */
    private void extractGender(@NotNull Element element, DictionaryObjectBuilder objectBuilder) {
        final Element genderNode = element.getElementsByClass(CLASS_GENDER).first();
        if (genderNode == null) {
            return;
        }

        // The map holds no null values, so a null result means "unknown marker"
        final GrammaticalGender resolvedGender = GENDER_MAP.get(genderNode.text());
        if (resolvedGender != null) {
            objectBuilder.setGrammaticalGender(resolvedGender);
        }
    }

    /**
     * Detects the entry type of a translation cell from the text of its first element with class
     * {@link #CLASS_WORDTYPE}, using {@link #ENTRY_TYPE_MAP}.
     *
     * @param element
     *         The translation cell.
     * @return The mapped entry type, or {@link EntryType#UNKNOWN} if the cell has no word-type element or its text
     * is not mapped.
     */
    private EntryType detectEntryType(@NotNull Element element) {
        Element wordTypeNode = element.getElementsByClass(CLASS_WORDTYPE).first();

        if (wordTypeNode == null) {
            LOGGER.debug("No wordType node found - defaulting to {}", EntryType.UNKNOWN);
            return EntryType.UNKNOWN;
        }

        String wordTypeText = wordTypeNode.text();
        EntryType entryType = ENTRY_TYPE_MAP.getOrDefault(wordTypeText, EntryType.UNKNOWN);

        if (entryType == EntryType.UNKNOWN) {
            // Log the raw text, not the resolved type, which is always UNKNOWN at this point
            LOGGER.debug("Unable to resolve entry type \"{}\"", wordTypeText);
        }

        return entryType;
    }

    /**
     * Picks the partner language of GERMAN from a language pair. If the input language is German, the output
     * language is returned; otherwise, if the output language is German, the input language is returned.
     *
     * @param inputLanguage
     *         First language of the pair.
     * @param outputLanguage
     *         Second language of the pair.
     * @return The language that is paired with German.
     * @throws IllegalArgumentException
     *         If neither of the two languages is German.
     */
    @NotNull
    private Language findTargetLanguage(@NotNull Language inputLanguage, @NotNull Language outputLanguage) {
        if (Language.GERMAN.equals(inputLanguage)) {
            return outputLanguage;
        }
        if (Language.GERMAN.equals(outputLanguage)) {
            return inputLanguage;
        }

        final String message = "Expected at least one language to be german - got "
                + inputLanguage.getIdentifier() + " and " + outputLanguage.getDisplayName();
        throw new IllegalArgumentException(message);
    }

    /**
     * Builds the URL of the result page for a query. The URL consists of {@link #BASE_URL}, the path segment
     * registered for the language in {@link #BASE_URL_PER_LANGUAGE}, the URL-encoded query (UTF-8) and the suffix
     * {@code .php}.
     *
     * @param queryString
     *         The text to look up.
     * @param language
     *         The non-German language of the request.
     * @return The URL of the page to fetch.
     * @throws IllegalArgumentException
     *         If no path segment is registered for the language.
     * @throws UnsupportedEncodingException
     *         If the UTF-8 encoding is not available.
     * @throws MalformedURLException
     *         If the assembled string is not a valid URL.
     */
    @NotNull
    private URL buildTargetUrl(@NotNull String queryString, @NotNull Language language)
            throws UnsupportedEncodingException, MalformedURLException {
        // The map holds no null values, so a null result means the language is not registered
        final String languagePath = BASE_URL_PER_LANGUAGE.get(language);
        if (languagePath == null) {
            throw new IllegalArgumentException("Unsupported language request: " + language.toString());
        }

        final String encodedQuery = URLEncoder.encode(queryString, "UTF-8");
        final String address = BASE_URL + languagePath + "/" + encodedQuery + ".php";
        return new URL(address);
    }

}