eu.eexcess.federatedrecommender.decomposer.PseudoRelevanceWikipediaDecomposer.java Source code


Introduction

Here is the source code for eu.eexcess.federatedrecommender.decomposer.PseudoRelevanceWikipediaDecomposer.java. The class expands the context keywords of a SecureUserProfile with related terms retrieved from a Wikipedia index (pseudo-relevance feedback).

Source

/* Copyright (C) 2014 
"Kompetenzzentrum fuer wissensbasierte Anwendungen Forschungs- und EntwicklungsgmbH" 
(Know-Center), Graz, Austria, office@know-center.at.
    
Licensees holding valid Know-Center Commercial licenses may use this file in
accordance with the Know-Center Commercial License Agreement provided with 
the Software or, alternatively, in accordance with the terms contained in
a written agreement between Licensees and Know-Center.
    
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
    
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.
    
You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
package eu.eexcess.federatedrecommender.decomposer;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.lang.LocaleUtils;

import at.knowcenter.commons.wikipedia.queryexpansion.WikipediaQueryExpansion;
import at.knowcenter.util.term.TermSet;
import at.knowcenter.util.term.TypedTerm;
import eu.eexcess.dataformats.userprofile.ContextKeyword;
import eu.eexcess.dataformats.userprofile.ExpansionType;
import eu.eexcess.dataformats.userprofile.Language;
import eu.eexcess.dataformats.userprofile.SecureUserProfile;
import eu.eexcess.federatedrecommender.interfaces.SecureUserProfileDecomposer;
import eu.eexcess.utils.LanguageGuesser;

/**
 * Decomposer that expands the context keywords of a {@link SecureUserProfile}
 * with related terms retrieved from a Wikipedia index (pseudo-relevance feedback).
 * 
 * @author hziak
 */
public class PseudoRelevanceWikipediaDecomposer
        implements SecureUserProfileDecomposer<SecureUserProfile, SecureUserProfile> {

    private static final Logger logger = Logger.getLogger(PseudoRelevanceWikipediaDecomposer.class.getName());

    private Map<String, WikipediaQueryExpansion> localeToQueryExpansion;
    private String[] supportedLocales;

    /**
     * @param wikipediaBaseIndexDir the base directory of the Wikipedia indices; it is
     *            expected to contain one sub-directory per locale, e.g. "enwiki" and "dewiki"
     * @param supportedLocales the locale names (e.g. "en", "de") for which an index is available
     * @throws IOException if a Wikipedia index cannot be accessed
     */
    public PseudoRelevanceWikipediaDecomposer(String wikipediaBaseIndexDir, String[] supportedLocales)
            throws IOException {
        this.supportedLocales = supportedLocales;
        localeToQueryExpansion = new HashMap<String, WikipediaQueryExpansion>();

        for (String localeName : supportedLocales) {
            Locale locale = LocaleUtils.toLocale(localeName);
            localeToQueryExpansion.put(localeName,
                    new WikipediaQueryExpansion(new File(wikipediaBaseIndexDir, locale + "wiki"), locale));
        }
    }

    @Override
    public SecureUserProfile decompose(SecureUserProfile inputSecureUserProfile) {

        TermSet<TypedTerm> terms = new TermSet<TypedTerm>(new TypedTerm.AddingWeightTermMerger());
        StringBuilder builder = new StringBuilder();
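        // concatenate the keywords into a single query string for the Wikipedia lookup,
        // while also collecting them as weighted terms to be merged with the expansion terms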
        for (ContextKeyword keyword : inputSecureUserProfile.contextKeywords) {
            if (builder.length() > 0) {
                builder.append(" ");
            }
            builder.append(keyword.text);
            terms.add(new TypedTerm(keyword.text, null, 1));
        }
        String query = builder.toString();

        String localeName = null;
        // first, pick up the language specified by the user
        if (inputSecureUserProfile.languages != null && !inputSecureUserProfile.languages.isEmpty()) {
            Language firstLanguage = inputSecureUserProfile.languages.iterator().next();
            localeName = firstLanguage.iso2;
        } else {
            // then try to detect the language from the query
            String guessedLanguage = LanguageGuesser.getInstance().guessLanguage(query);
            if (guessedLanguage != null) {
                localeName = guessedLanguage;
            }
        }

        WikipediaQueryExpansion wikipediaQueryExpansion = localeToQueryExpansion.get(localeName);
        if (wikipediaQueryExpansion == null) {
            // no query expansion for the current locale, fall back to the first supported locale
            wikipediaQueryExpansion = localeToQueryExpansion.get(supportedLocales[0]);
        }

        try {
            // expand the query against the Wikipedia index and merge the top terms
            TermSet<TypedTerm> queryExpansionTerms = wikipediaQueryExpansion.expandQuery(query);
            terms.addAll(queryExpansionTerms.getTopTerms(5));
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Cannot expand the query using Wikipedia", e);
        }

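        // wrap the highest-weighted terms as expansion keywords and append them to the profile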
        ArrayList<ContextKeyword> newContextKeywords = new ArrayList<ContextKeyword>();
        for (TypedTerm typedTerm : terms.getTopTerms(5)) {
            newContextKeywords.add(new ContextKeyword(typedTerm.getText(), ExpansionType.EXPANSION));
        }
        inputSecureUserProfile.contextKeywords.addAll(newContextKeywords);
        logger.log(Level.INFO, "Wikipedia Expansion: {0}", newContextKeywords);
        return inputSecureUserProfile;
    }
}
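
Below is a minimal, hypothetical usage sketch of the decomposer. It assumes an index directory /data/wikipedia-index containing "enwiki" and "dewiki" sub-directories, and that SecureUserProfile and ContextKeyword expose no-argument constructors and the public fields used in the listing above (contextKeywords, text); these constructors, paths, and sample keywords are illustrative assumptions, not part of the original source.

import java.util.ArrayList;

import eu.eexcess.dataformats.userprofile.ContextKeyword;
import eu.eexcess.dataformats.userprofile.SecureUserProfile;
import eu.eexcess.federatedrecommender.decomposer.PseudoRelevanceWikipediaDecomposer;

public class WikipediaDecomposerExample {

    public static void main(String[] args) throws Exception {
        // Hypothetical index layout: /data/wikipedia-index/enwiki, /data/wikipedia-index/dewiki
        PseudoRelevanceWikipediaDecomposer decomposer = new PseudoRelevanceWikipediaDecomposer(
                "/data/wikipedia-index", new String[] { "en", "de" });

        // Build a profile with a seed keyword (no-arg constructors are assumed here).
        SecureUserProfile profile = new SecureUserProfile();
        profile.contextKeywords = new ArrayList<ContextKeyword>();
        ContextKeyword seed = new ContextKeyword();
        seed.text = "information retrieval";
        profile.contextKeywords.add(seed);

        // decompose() appends up to five Wikipedia expansion terms to the profile's keywords.
        SecureUserProfile expanded = decomposer.decompose(profile);
        for (ContextKeyword keyword : expanded.contextKeywords) {
            System.out.println(keyword.text);
        }
    }
}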