de.ingrid.external.gemet.GEMETService.java Source code

Java tutorial

Introduction

Here is the source code for de.ingrid.external.gemet.GEMETService.java

Source

/*-
 * **************************************************-
 * ingrid-external-service-gemet
 * ==================================================
 * Copyright (C) 2014 - 2017 wemove digital solutions GmbH
 * ==================================================
 * Licensed under the EUPL, Version 1.1 or - as soon they will be
 * approved by the European Commission - subsequent versions of the
 * EUPL (the "Licence");
 * 
 * You may not use this work except in compliance with the Licence.
 * You may obtain a copy of the Licence at:
 * 
 * http://ec.europa.eu/idabc/eupl5
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the Licence is distributed on an "AS IS" basis,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Licence for the specific language governing permissions and
 * limitations under the Licence.
 * **************************************************#
 */
package de.ingrid.external.gemet;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.ResourceBundle;
import java.util.Stack;

import org.apache.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;

import com.hp.hpl.jena.rdf.model.Resource;

import de.ingrid.external.ThesaurusService;
import de.ingrid.external.gemet.GEMETClient.ConceptRelation;
import de.ingrid.external.gemet.GEMETClient.ConceptType;
import de.ingrid.external.gemet.GEMETClient.MatchingConceptsSearchMode;
import de.ingrid.external.om.RelatedTerm;
import de.ingrid.external.om.Term;
import de.ingrid.external.om.Term.TermType;
import de.ingrid.external.om.TreeTerm;
import de.ingrid.external.om.impl.TreeTermImpl;

/**
 * {@link ThesaurusService} implementation which answers all requests by
 * querying GEMET through a {@link GEMETClient} and mapping the responses to
 * the <code>de.ingrid.external.om</code> types with a {@link GEMETMapper}.
 * <p>
 * Configuration is read from the <code>gemet</code> resource bundle in
 * {@link #init()}, which also creates client and mapper and therefore has to
 * be called before the service is used.
 */
public class GEMETService implements ThesaurusService {

    private static final Logger log = Logger.getLogger(GEMETService.class);

    /** Language used for requests if no usable locale is passed. */
    private static final String DEFAULT_LANGUAGE = "de";

    GEMETClient gemetClient;
    GEMETMapper gemetMapper;

    /**
     * request RDF format from service where possible (true) or JSON (false)
     * (set from gemet.properties)
     */
    protected boolean doRDF;

    /**
     * Maximum number of keywords to analyze from text (set from
     * gemet.properties)
     */
    protected int analyzeMaxWords;

    /**
     * When looking for terms from a given query ignore the matching type which
     * is passed to the service (CONTAINS, BEGINS_WITH, EXACT) and always use
     * CONTAINS to get maximum number of results (set from gemet.properties)
     */
    protected boolean ignorePassedMatchingType;

    /**
     * Also deliver terms in this language as alternateName. Set to null if no
     * alternateName !
     */
    protected String alternateLanguage = null;

    /**
     * Reads the configuration from the <code>gemet</code> resource bundle and
     * creates client and mapper. Init Method is called by the Spring Framework
     * on initialization.
     * <p>
     * The property <code>service.alternateLanguage</code> is optional: if it
     * is missing or blank, no alternate language is used (field is null).
     * 
     * @throws Exception
     *             if the bundle or one of the mandatory properties cannot be
     *             read or parsed
     */
    public void init() throws Exception {
        ResourceBundle gemetProps = ResourceBundle.getBundle("gemet");

        this.doRDF = Boolean.parseBoolean(gemetProps.getString("service.request.rdf"));
        this.analyzeMaxWords = Integer.parseInt(gemetProps.getString("service.analyzeMaxWords"));
        this.ignorePassedMatchingType = Boolean
                .parseBoolean(gemetProps.getString("service.ignorePassedMatchingType"));

        String configuredAlternateLanguage = null;
        try {
            configuredAlternateLanguage = gemetProps.getString("service.alternateLanguage");
        } catch (Exception ex) {
            // catch missing property etc., we set to null if problems
            if (log.isDebugEnabled()) {
                log.debug("Problems reading 'service.alternateLanguage' from gemet.properties, we set to null.",
                        ex);
            }
        }
        // blank value indicates no alternate language, same as missing property
        this.alternateLanguage = isBlank(configuredAlternateLanguage) ? null
                : configuredAlternateLanguage.trim();

        this.gemetClient = new GEMETClient(gemetProps);
        this.gemetMapper = new GEMETMapper();
    }

    /**
     * request RDF format from service where possible (true) or JSON (false)
     * (set from gemet.properties)
     */
    public boolean isDoRDF() {
        return doRDF;
    }

    /**
     * request RDF format from service where possible (true) or JSON (false)
     * (set from gemet.properties)
     */
    public void setDoRDF(boolean doRDF) {
        this.doRDF = doRDF;
    }

    /**
     * When looking for terms from a given query ignore the matching type which
     * is passed to the service (CONTAINS, BEGINS_WITH, EXACT) and always use
     * CONTAINS to get maximum number of results (set from gemet.properties)
     */
    public boolean isIgnorePassedMatchingType() {
        return ignorePassedMatchingType;
    }

    /**
     * When looking for terms from a given query ignore the matching type which
     * is passed to the service (CONTAINS, BEGINS_WITH, EXACT) and always use
     * CONTAINS to get maximum number of results (set from gemet.properties)
     */
    public void setIgnorePassedMatchingType(boolean ignorePassedMatchingType) {
        this.ignorePassedMatchingType = ignorePassedMatchingType;
    }

    /**
     * If not null name of term is also fetched in this language and set as
     * alternateName !
     */
    public String getAlternateLanguage() {
        return alternateLanguage;
    }

    /**
     * If not null name of term is also fetched in this language and set as
     * alternateName !
     */
    public void setAlternateLanguage(String alternateLanguage) {
        this.alternateLanguage = alternateLanguage;
    }

    /**
     * NOTICE: Parameter "addDescriptors" is irrelevant !
     * 
     * @see de.ingrid.external.ThesaurusService#findTermsFromQueryTerm(java.lang.String,
     *      de.ingrid.external.ThesaurusService.MatchingType, boolean,
     *      java.util.Locale)
     */
    @Override
    public Term[] findTermsFromQueryTerm(String queryTerm, MatchingType matching, boolean addDescriptors,
            Locale locale) {
        return findTermsFromQueryTerm(null, queryTerm, matching, addDescriptors, locale);
    }

    /**
     * Searches terms matching the passed query. The complete (trimmed) query
     * is searched first, then - if the query consists of more than one word -
     * every single word. The found concepts are mapped with the single words
     * as filter.
     * <p>
     * NOTICE: Parameter "url" and "addDescriptors" are irrelevant ! queryTerm
     * is processed only up to analyzeMaxWords from gemet.properties
     * 
     * @return the found terms, empty array if the query is null or blank
     * @see de.ingrid.external.ThesaurusService#findTermsFromQueryTerm(java.lang.String,
     *      java.lang.String, de.ingrid.external.ThesaurusService.MatchingType,
     *      boolean, java.util.Locale)
     */
    @Override
    public Term[] findTermsFromQueryTerm(String url, String queryTerm, MatchingType matching,
            boolean addDescriptors, Locale locale) {
        if (isBlank(queryTerm)) {
            log.warn("Empty queryTerm (" + queryTerm + ") passed, we return empty list !");
            return new Term[] {};
        }

        // surrounding whitespace is never part of the term to search
        String trimmedQuery = queryTerm.trim();
        String language = getGEMETLanguageFilter(locale);

        // if configured we ignore the passed search mode and always search
        // concepts containing the query
        MatchingConceptsSearchMode gemetSearchMode = this.ignorePassedMatchingType
                ? MatchingConceptsSearchMode.CONTAINS : getGEMETSearchMode(matching);

        List<JSONArray> responseList = new ArrayList<JSONArray>();

        // first search exact query term
        JSONArray queryResponse = gemetClient.getConceptsMatchingKeyword(trimmedQuery, language,
                gemetSearchMode);
        if (queryResponse != null) {
            responseList.add(queryResponse);
        }

        // then search single keywords, only necessary if more than one,
        // otherwise the keyword was already covered by the search above
        String[] keywords = processKeywords(splitIntoWords(trimmedQuery), this.analyzeMaxWords);
        if (keywords.length > 1) {
            addAllNonNull(responseList,
                    gemetClient.getConceptsMatchingKeywords(keywords, language, gemetSearchMode));
        }

        List<Term> resultList = new ArrayList<Term>();
        if (!responseList.isEmpty()) {
            resultList = gemetMapper.mapToTermsWithKeywordsFilter(responseList, keywords, locale);
        }

        // NOTICE: result list does NOT contain additional localization of term!
        // We fetch additional localization if wanted (alternateLanguage set)
        // AND EXACT matching was requested, where we have only "one" term !
        // see https://dev.informationgrid.eu/redmine/issues/363
        // We fetch as JSON, fetching the terms as RDF instead was slower.
        String altLanguage = this.alternateLanguage;
        if (altLanguage != null && MatchingConceptsSearchMode.EXACT.equals(gemetSearchMode)) {
            for (Term result : resultList) {
                if (altLanguage.equals(language)) {
                    // same language, no further request necessary
                    result.setAlternateName(result.getName());
                    continue;
                }

                JSONObject localizedConcept = gemetClient.getConceptAsJSON(result.getId(), altLanguage);
                // we check on null, cause some concepts are buggy in service !
                if (localizedConcept != null) {
                    gemetMapper.mapAlternateLanguage(localizedConcept, result);
                } else {
                    log.warn("Problems fetching " + result.getId() + " in language '" + altLanguage
                            + "', we set no alternate name !");
                }
            }
        }

        return resultList.toArray(new Term[resultList.size()]);
    }

    @Override
    public TreeTerm[] getHierarchyNextLevel(String termId, Locale locale) {
        return getHierarchyNextLevel(null, termId, locale);
    }

    /**
     * Delivers the children of the passed term as TreeTerms. Every child has
     * the passed term set as parent and its own children (or a dummy child)
     * set, so the caller can see whether the hierarchy goes on.
     * <p>
     * NOTICE: Parameter "url" is irrelevant !
     * 
     * @param termId
     *            id of the parent term, pass null to get the top level
     * @return the children, empty array if termId is blank or the parent
     *         could not be fetched
     */
    @Override
    public TreeTerm[] getHierarchyNextLevel(String url, String termId, Locale locale) {
        if (termId == null) {
            return getHierarchyTopLevel(locale);
        }

        if (termId.trim().length() == 0) {
            log.warn("No termId passed (" + termId + "), we return empty result !");
            return new TreeTerm[] {};
        }

        String language = getGEMETLanguageFilter(locale);

        // get concept itself, this is the parent
        JSONObject parent = gemetClient.getConceptAsJSON(termId, language);
        // we check on null, cause some concepts are buggy in service !
        // (e.g. concept/15041)
        if (parent == null) {
            log.error("Problems fetching " + termId + " we return empty children list !");
            return new TreeTerm[] {};
        }

        JSONArray parentArray = JSONUtils.toJSONArray(parent);

        // get direct children
        List<JSONArray> childrenList = gemetClient.getChildConcepts(termId, language);

        // get children of children (next hierarchy level) and create TreeTerms
        List<TreeTerm> resultList = new ArrayList<TreeTerm>();
        for (JSONArray children : childrenList) {
            for (Object child : children) {
                // map basic TreeTerm
                TreeTerm resultTreeTerm = gemetMapper.mapToTreeTerm((JSONObject) child, null, null);

                // add parent to TreeTerm
                gemetMapper.addParentsToTreeTerm(resultTreeTerm, parentArray);

                // get next hierarchy level (subchildren) and add to TreeTerm !
                // This is time consuming, we only do this for terms where we do
                // not know whether there are children !
                // For GROUPS OR SOUPERGROUPS we just add DUMMY CHILD to
                // indicate children, so we reduce requests !
                if (TermType.NODE_LABEL.equals(resultTreeTerm.getType())) {
                    resultTreeTerm.addChild(new TreeTermImpl());
                } else {
                    List<JSONArray> subChildrenList = gemetClient.getChildConcepts(resultTreeTerm.getId(),
                            language);
                    for (JSONArray subChildren : subChildrenList) {
                        gemetMapper.addChildrenToTreeTerm(resultTreeTerm, subChildren);
                    }
                }

                resultList.add(resultTreeTerm);
            }
        }

        return resultList.toArray(new TreeTerm[resultList.size()]);
    }

    /**
     * Delivers the topmost concepts of type SOUPERGROUP as TreeTerms without
     * parents.
     * 
     * @param locale
     *            determines the language of the request
     * @return the top terms, each with a dummy child
     */
    private TreeTerm[] getHierarchyTopLevel(Locale locale) {
        String language = getGEMETLanguageFilter(locale);

        // get top supergroups
        JSONArray children = gemetClient.getTopmostConcepts(ConceptType.SOUPERGROUP, language);

        List<TreeTerm> resultList = new ArrayList<TreeTerm>();
        for (Object child : children) {
            // map basic TreeTerm
            // NOTICE: Do not set parents in TreeTerm, stays null cause is top
            // term
            TreeTerm resultTreeTerm = gemetMapper.mapToTreeTerm((JSONObject) child, null, null);

            // For GROUPS OR SOUPERGROUPS we just add DUMMY CHILD to indicate
            // children instead of fetching the real children, so we reduce
            // requests !
            resultTreeTerm.addChild(new TreeTermImpl());

            resultList.add(resultTreeTerm);
        }

        return resultList.toArray(new TreeTerm[resultList.size()]);
    }

    @Override
    public TreeTerm getHierarchyPathToTop(String termId, Locale locale) {
        return getHierarchyPathToTop(null, termId, locale);
    }

    /**
     * Delivers the passed term as TreeTerm with its parents set up to the top
     * of the hierarchy. Only ONE path is produced, so every term in the path
     * has at most one parent.
     * <p>
     * NOTICE: Parameter "url" is irrelevant !
     * 
     * @return the term with its path to top, an empty TreeTerm if termId is
     *         null or blank or the term could not be fetched
     */
    @Override
    public TreeTerm getHierarchyPathToTop(String url, String termId, Locale locale) {
        if (isBlank(termId)) {
            log.warn("No termId passed (" + termId + "), we return empty TreeTerm !");
            return new TreeTermImpl();
        }

        String language = getGEMETLanguageFilter(locale);

        // get concept and map to TreeTerm
        JSONObject inConcept = gemetClient.getConceptAsJSON(termId, language);
        // we check on null, cause some concepts are buggy in service !
        // (e.g. concept/15041)
        if (inConcept == null) {
            log.error("Problems fetching " + termId + " we return empty TreeTerm !");
            return new TreeTermImpl();
        }

        TreeTerm resultTreeTerm = gemetMapper.mapToTreeTerm(inConcept, null, null);

        // set parents up to top. We only produce ONE PATH, so no multiple
        // parents are set !
        // We process "stack" until stack is empty
        Stack<TreeTerm> parentStack = new Stack<TreeTerm>();
        parentStack.add(resultTreeTerm);

        while (!parentStack.empty()) {
            TreeTerm currentTerm = parentStack.pop();
            if (currentTerm.getParents() != null) {
                // parents already processed
                continue;
            }

            // no processed parents yet, add first parent found !
            processParentsOfTerm(currentTerm, language, true);
            // check parents for null, may be top node
            if (currentTerm.getParents() != null) {
                parentStack.addAll(currentTerm.getParents());
            }
        }

        return resultTreeTerm;
    }

    /**
     * Determine parents of passed TreeTerm and set them in TreeTerm. Only first
     * parent can be set.
     * 
     * @param termToProcess
     *            TreeTerm without parents but id
     * @param language
     *            fetch parents in this language
     * @param onlyFirstParent
     *            pass true if only the first parent should be added to the
     *            parents list of the TreeTerm to get only one path to top !
     * @return the passed TreeTerm with parents !
     */
    private TreeTerm processParentsOfTerm(TreeTerm termToProcess, String language, boolean onlyFirstParent) {
        List<JSONArray> parentsList = gemetClient.getParentConcepts(termToProcess.getId(), language);
        for (JSONArray parents : parentsList) {
            if (!onlyFirstParent) {
                gemetMapper.addParentsToTreeTerm(termToProcess, parents);
                continue;
            }

            // only first parent should be set, we are finished as soon as we
            // have found one
            if (parents.size() > 0) {
                gemetMapper.addParentToTreeTerm(termToProcess, (JSONObject) parents.get(0));
                break;
            }
        }

        return termToProcess;
    }

    /**
     * Delivers all terms related to the passed term. All relations of
     * {@link ConceptRelation} are requested from the service.
     * 
     * @return the related terms, empty array if termId is null or blank
     */
    @Override
    public RelatedTerm[] getRelatedTermsFromTerm(String termId, Locale locale) {
        if (isBlank(termId)) {
            log.warn("No termId passed (" + termId + "), we return empty result !");
            return new RelatedTerm[] {};
        }

        String language = getGEMETLanguageFilter(locale);
        List<RelatedTerm> resultList = new ArrayList<RelatedTerm>();

        // we iterate over all relations !
        for (ConceptRelation concRelation : ConceptRelation.values()) {
            JSONArray relatedConcepts = gemetClient.getRelatedConcepts(termId, concRelation, language);
            resultList.addAll(gemetMapper.mapToRelatedTerms(relatedConcepts, concRelation));
        }

        return resultList.toArray(new RelatedTerm[resultList.size()]);
    }

    /**
     * NOTICE: Parameter "ignoreCase" is irrelevant ! Keywords are processed
     * only up to analyzeMaxWords from gemet.properties. The passed array is
     * not modified.
     * 
     * @return the found terms, empty array if no usable keyword was passed
     * @see de.ingrid.external.ThesaurusService#getSimilarTermsFromNames(java.lang.String[],
     *      boolean, java.util.Locale)
     */
    @Override
    public Term[] getSimilarTermsFromNames(String[] keywords, boolean ignoreCase, Locale locale) {
        // We fetch similar terms from every single keyword and then compare
        // with other keywords when mapping to terms. GEMET service always
        // ignores case !
        String[] processedKeywords = processKeywords(keywords, this.analyzeMaxWords);
        if (processedKeywords.length == 0) {
            log.warn("No keywords passed, we return empty list !");
            return new Term[] {};
        }

        String language = getGEMETLanguageFilter(locale);
        List<JSONArray> response = gemetClient.getConceptsMatchingKeywords(processedKeywords, language,
                MatchingConceptsSearchMode.CONTAINS);

        List<Term> resultList = new ArrayList<Term>();
        if (response != null && response.size() > 0) {
            resultList = gemetMapper.mapToTermsWithKeywordsFilter(response, processedKeywords, locale);
        }

        return resultList.toArray(new Term[resultList.size()]);
    }

    /**
     * Fetches a single term. The response format requested from the service
     * (RDF or JSON) is determined by property {@link #doRDF}.
     * 
     * @return the term or null if termId is null or blank or the term could
     *         not be fetched
     */
    @Override
    public Term getTerm(String termId, Locale locale) {
        if (isBlank(termId)) {
            log.warn("No termId passed (" + termId + "), we return null !");
            return null;
        }

        return doRDF ? getTermFromRDF(termId, locale) : getTermFromJSON(termId, locale);
    }

    /**
     * Fetching term as JSON !
     * 
     * @param termId
     *            e.g. http://www.eionet.europa.eu/gemet/concept/6740
     * @param locale
     *            which language to use
     * @return mapped term from JSON, null if the service delivered no concept
     */
    private Term getTermFromJSON(String termId, Locale locale) {
        String language = getGEMETLanguageFilter(locale);

        JSONObject response = gemetClient.getConceptAsJSON(termId, language);
        if (response == null) {
            return null;
        }

        Term result = gemetMapper.mapToTerm(response);

        // handle alternate localization, we have to do another request if the
        // alternate language differs from the requested one !
        String altLanguage = this.alternateLanguage;
        if (altLanguage != null) {
            JSONObject alternateResponse = response;
            if (!altLanguage.equals(language)) {
                alternateResponse = gemetClient.getConceptAsJSON(termId, altLanguage);
            }

            // we check on null, cause some concepts are buggy in service !
            if (alternateResponse != null) {
                gemetMapper.mapAlternateLanguage(alternateResponse, result);
            } else {
                log.warn("Problems fetching " + termId + " in language '" + altLanguage
                        + "', we set no alternate name !");
            }
        }

        return result;
    }

    /**
     * Fetching term as RDF !
     * 
     * @param termId
     *            e.g. http://www.eionet.europa.eu/gemet/concept/6740
     * @param locale
     *            which language to use
     * @return mapped term from RDF, null if the service delivered no concept
     */
    private Term getTermFromRDF(String termId, Locale locale) {
        Resource response = gemetClient.getConceptAsRDF(termId);
        if (response == null) {
            return null;
        }

        return gemetMapper.mapToTerm(response, getGEMETLanguageFilter(locale), this.alternateLanguage);
    }

    /**
     * Splits the passed text into words and searches the terms EXACTLY
     * matching one of the words.
     * <p>
     * NOTICE: Parameter "ignoreCase" is irrelevant ! Parameter
     * "analyzeMaxWords" is overwritten from gemet.properties if larger !
     * 
     * @return the found terms, empty array if the text contains no usable word
     * @see de.ingrid.external.ThesaurusService#getTermsFromText(java.lang.String,
     *      int, boolean, java.util.Locale)
     */
    @Override
    public Term[] getTermsFromText(String text, int analyzeMaxWords, boolean ignoreCase, Locale locale) {
        if (isBlank(text)) {
            log.warn("Empty text (" + text + ") passed, we return empty list !");
            return new Term[] {};
        }

        // the configured maximum is the upper limit
        int maxWords = Math.min(analyzeMaxWords, this.analyzeMaxWords);

        // split text to words, only maximum of words
        String[] keywords = processKeywords(splitIntoWords(text), maxWords);

        List<JSONArray> responseList = new ArrayList<JSONArray>();
        // also search if there is only ONE keyword, there is no other request
        // covering a text consisting of a single word !
        if (keywords.length > 0) {
            String language = getGEMETLanguageFilter(locale);
            addAllNonNull(responseList,
                    gemetClient.getConceptsMatchingKeywords(keywords, language, MatchingConceptsSearchMode.EXACT));
        }

        List<Term> resultList = new ArrayList<Term>();
        if (!responseList.isEmpty()) {
            resultList = gemetMapper.mapToTerms(responseList);
        }

        return resultList.toArray(new Term[resultList.size()]);
    }

    /**
     * Prepare keywords for GEMET request: punctuation is removed, keywords
     * which are null or empty afterwards are dropped and the result is reduced
     * to the maximum number. The passed array is NOT modified.
     * 
     * @param keywords
     *            the raw keywords, may be null
     * @param maxKeywords
     *            maximum number of keywords to deliver, nothing is delivered
     *            if not positive
     * @return new array with the processed keywords, never null
     */
    private String[] processKeywords(String[] keywords, int maxKeywords) {
        if (keywords == null || maxKeywords <= 0) {
            return new String[0];
        }

        List<String> processedKeywords = new ArrayList<String>(Math.min(keywords.length, maxKeywords));
        for (String keyword : keywords) {
            if (processedKeywords.size() >= maxKeywords) {
                break;
            }
            if (keyword == null) {
                continue;
            }

            // remove all punctuation
            String cleanedKeyword = keyword.replaceAll("\\p{P}", "").trim();
            // an empty keyword is useless for a request, e.g. keyword
            // consisted only of punctuation
            if (cleanedKeyword.length() > 0) {
                processedKeywords.add(cleanedKeyword);
            }
        }

        return processedKeywords.toArray(new String[processedKeywords.size()]);
    }

    /**
     * Splits the passed text at every run of whitespace (blanks, tabs, line
     * breaks ...).
     * 
     * @param text
     *            the text to split, not null
     * @return the words of the text
     */
    private static String[] splitIntoWords(String text) {
        return text.trim().split("\\s+");
    }

    /**
     * Adds all elements of the source list which are not null to the target
     * list.
     * 
     * @param target
     *            the list to add to
     * @param source
     *            the elements to add, may be null (then nothing is added)
     */
    private static void addAllNonNull(List<JSONArray> target, List<JSONArray> source) {
        if (source == null) {
            return;
        }
        for (JSONArray element : source) {
            if (element != null) {
                target.add(element);
            }
        }
    }

    /**
     * @return true if the passed value is null or contains only whitespace
     */
    private static boolean isBlank(String value) {
        return value == null || value.trim().length() == 0;
    }

    /**
     * Determine language filter for GEMET dependent from passed locale !
     * 
     * @param locale
     *            the requested locale, may be null
     * @return language used in gemet, this is the language of the locale or
     *         german if no locale or a locale without language was passed
     */
    private String getGEMETLanguageFilter(Locale locale) {
        if (locale == null) {
            return DEFAULT_LANGUAGE;
        }

        // language may be empty, e.g. Locale.ROOT
        String langFilter = locale.getLanguage();
        return langFilter.length() == 0 ? DEFAULT_LANGUAGE : langFilter;
    }

    /**
     * Maps the matching type of the ThesaurusService API to the search mode of
     * the GEMET client.
     * 
     * @param matching
     *            the requested matching type, may be null
     * @return the according search mode, EXACT if the matching type is neither
     *         BEGINS_WITH nor CONTAINS
     */
    private MatchingConceptsSearchMode getGEMETSearchMode(MatchingType matching) {
        if (MatchingType.BEGINS_WITH.equals(matching)) {
            return MatchingConceptsSearchMode.BEGINS_WITH;
        }
        if (MatchingType.CONTAINS.equals(matching)) {
            return MatchingConceptsSearchMode.CONTAINS;
        }

        return MatchingConceptsSearchMode.EXACT;
    }
}