org.xlrnet.metadict.engines.nobordbok.OrdbokEngine.java Source code

Java tutorial

Introduction

Here is the source code for org.xlrnet.metadict.engines.nobordbok.OrdbokEngine.java

Source

/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2015 Jakob Hende
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package org.xlrnet.metadict.engines.nobordbok;

import com.google.common.collect.ImmutableMap;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xlrnet.metadict.api.engine.SearchEngine;
import org.xlrnet.metadict.api.language.Language;
import org.xlrnet.metadict.api.language.UnsupportedLanguageException;
import org.xlrnet.metadict.api.query.*;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;

/**
 * Engine for nob-ordbok.no backend.
 */
public class OrdbokEngine implements SearchEngine {

    private static final Logger LOGGER = LoggerFactory.getLogger(OrdbokEngine.class);

    private static final Map<String, EntryType> ENTRY_TYPE_MAP = ImmutableMap.<String, EntryType>builder()
            .put("verb", EntryType.VERB).put("adv.", EntryType.ADVERB).put("adj.", EntryType.PREPOSITION)
            .put("prep.", EntryType.PREPOSITION).put("konj.", EntryType.CONJUNCTION).build();

    private static final String SYLLABLE_SEPARATOR_CHAR = "|";

@NotNull
@Override
public MonolingualQueryResult executeMonolingualQuery(@NotNull String queryString, @NotNull Language queryLanguage) throws Exception {
    if (!(Language.NORWEGIAN_BOKML.equals(queryLanguage) || Language.NORWEGIAN_NYNORSK.equals(queryLanguage) || Language.NORWEGIAN.equals(queryLanguage))) {
        throw new UnsupportedLanguageException(queryLanguage);
    }

    Document document = fetchResponse(queryString, queryLanguage);

    return processDocument(document);
}

    @NotNull
    private String buildTargetUrl(@NotNull String searchRequest, boolean queryBokmaal, boolean queryNynorsk)
            throws UnsupportedEncodingException {
        StringBuilder targetUrlBuilder = new StringBuilder("http://www.nob-ordbok.uio.no/perl/ordbok.cgi?OPP=")
                .append(URLEncoder.encode(searchRequest, "UTF-8")).append("&");

        if (queryBokmaal & queryNynorsk) {
            targetUrlBuilder.append("begge=+&ordbok=begge");
        } else if (queryBokmaal) {
            targetUrlBuilder.append("bokmaal=+&ordbok=bokmaal");
        } else if (queryNynorsk) {
            targetUrlBuilder.append("nynorsk=+&ordbok=nynorsk");
        } else {
            throw new IllegalArgumentException("Either nynorsk or bokmaal must be queried");
        }

        return targetUrlBuilder.toString();
    }

    /**
     * Extract both the general form and all syllables from the oppslagsord-node.
     */
    private void extractGeneralForm(DictionaryObjectBuilder objectBuilder, Element oppslagsord) {
        String rawForm = oppslagsord.text();
        String[] syllabification = StringUtils.split(rawForm, SYLLABLE_SEPARATOR_CHAR);

        String generalForm = StringUtils.remove(rawForm, SYLLABLE_SEPARATOR_CHAR);
        objectBuilder.setGeneralForm(generalForm);
        objectBuilder.setSyllabification(syllabification);
    }

private Document fetchResponse(@NotNull String queryString, @NotNull Language queryLanguage) throws IOException {
    boolean queryBokmaal = false, queryNynorsk = false;

    if (queryLanguage.equals(Language.NORWEGIAN_BOKML) || queryLanguage.equals(Language.NORWEGIAN))
        queryBokmaal = true;
    if (queryLanguage.equals(Language.NORWEGIAN_NYNORSK) || queryLanguage.equals(Language.NORWEGIAN))
        queryNynorsk = true;

    String targetUrl = buildTargetUrl(queryString, queryBokmaal, queryNynorsk);
    URL url = new URL(targetUrl);

    return Jsoup.parse(url, 3000);
}

@NotNull
private MonolingualQueryResult processDocument(@NotNull Document document) {
    MonolingualQueryResultBuilder resultBuilder = ImmutableMonolingualQueryResult.builder();

    Element bokmaalTable = document.getElementById("byttutBM");
    Element nynorskTable = document.getElementById("byttutNN");

    if (bokmaalTable != null)
        processTable(bokmaalTable, Language.NORWEGIAN_BOKML, resultBuilder);
    if (nynorskTable != null)
        processTable(nynorskTable, Language.NORWEGIAN_NYNORSK, resultBuilder);

    return resultBuilder.build();
}

    private void processTable(@NotNull Element table, @NotNull Language language,
            @NotNull MonolingualQueryResultBuilder resultBuilder) {
        Elements tableRows = table.getElementsByTag("tr");

        if (tableRows.size() <= 1) {
            LOGGER.warn("Word table has unexpected size {}", tableRows.size());
            return;
        }

        for (int i = 1; i < tableRows.size(); i++) {
            Element tableRow = tableRows.get(i);
            Optional<MonolingualEntry> entry = processTableRow(tableRow, language);
            if (entry.isPresent())
                resultBuilder.addMonolingualEntry(entry.get());
        }
    }

    @NotNull
    private Optional<MonolingualEntry> processTableRow(@NotNull Element tableRow, @NotNull Language language) {
        MonolingualEntryBuilder entryBuilder = ImmutableMonolingualEntry.builder();
        DictionaryObjectBuilder objectBuilder = ImmutableDictionaryObject.builder().setLanguage(language);

        // Extract general form
        Element oppslagsord = tableRow.getElementsByClass("oppslagsord").first();
        if (oppslagsord != null) {
            extractGeneralForm(objectBuilder, oppslagsord);
        } else {
            LOGGER.warn("Unable to find main element - skipping entry.");
            return Optional.empty();
        }

        // Extract wordclass and determine entrytype
        String wordClass = tableRow.getElementsByClass("oppsgramordklasse").first().text();
        entryBuilder.setEntryType(resolveEntryTypeWithWordClass(wordClass));

        // Get meanings
        Elements meaningCandidates = tableRow.select(".artikkelinnhold > .utvidet > .tyding");
        if (meaningCandidates.size() == 0)
            meaningCandidates = tableRow.select(".artikkelinnhold > .utvidet");
        meaningCandidates.forEach(e -> {
            String meaning = e.childNodes().stream()
                    .filter(node -> (node instanceof TextNode) || (!((Element) node).hasClass("doemeliste")
                            && !node.hasAttr("style") && !((Element) node).hasClass("utvidet")
                            && !((Element) node).hasClass("artikkelinnhold")
                            && !((Element) node).hasClass("kompakt")))
                    .map((Node n) -> {
                        if (n instanceof Element)
                            return ((Element) n).text();
                        else
                            return n.toString();
                    }).collect(Collectors.joining());
            meaning = StringEscapeUtils.unescapeHtml4(meaning);
            meaning = StringUtils.strip(meaning);
            if (StringUtils.isNotBlank(meaning))
                objectBuilder.addMeaning(meaning);
        });

        entryBuilder.setContent(objectBuilder.build());

        return Optional.of(entryBuilder.build());
    }

    /**
     * Try to resolve the {@link EntryType} with a given "word class" string from the bokmaalordboka.
     * <p>
     * Supported entries: <ul> <li>mX -> male noun (X is any int)</li> <li>fX -> female noun (X is any int</li> <li>nX
     * -> neuter noun (X is any int)</li> <li>adv. -> adverb</li>> <li>adj. -> adjective</li> <li>aX -> adjective (X is
     * any int)</li> <li>prep. -> preposition</li> <li>konj. -> conjuction</li> <li>vX -> verb (X is any int)</li>
     * <li>verb -> verb</li> </ul>
     *
     * @param wordClass
     *         The word class.
     * @return a valid metadict entry type.
     */
    @NotNull
    private EntryType resolveEntryTypeWithWordClass(@NotNull String wordClass) {
        EntryType entryType = EntryType.UNKNOWN;

        if (ENTRY_TYPE_MAP.containsKey(wordClass))
            entryType = ENTRY_TYPE_MAP.get(wordClass);
        else if (StringUtils.startsWithIgnoreCase(wordClass, "a"))
            entryType = EntryType.ADJECTIVE;
        else if (StringUtils.startsWithIgnoreCase(wordClass, "m"))
            entryType = EntryType.NOUN;
        else if (StringUtils.startsWithIgnoreCase(wordClass, "f"))
            entryType = EntryType.NOUN;
        else if (StringUtils.startsWithIgnoreCase(wordClass, "n"))
            entryType = EntryType.NOUN;
        else if (StringUtils.startsWithIgnoreCase(wordClass, "v"))
            entryType = EntryType.VERB;

        return entryType;
    }
}