it.acubelab.smaph.SmaphUtils.java Source code

Introduction

Here is the source code for it.acubelab.smaph.SmaphUtils.java
Source

/**
 *  Copyright 2014 Marco Cornolti
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package it.acubelab.smaph;

import it.unipi.di.acube.batframework.utils.Pair;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Vector;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.lang.StringUtils;
import org.tartarus.snowball.ext.EnglishStemmer;

public class SmaphUtils {

    /**
     * Convenience overload of {@link #getMinEditDist(String, String, List)}
     * for callers that are only interested in the score and not in which
     * query words were the closest matches.
     * 
     * @param query
     *            a query.
     * @param bold
     *            a bold.
     * @return the averaged normalized word-by-word edit distance of bold
     *         against query, exactly as computed by the three-argument
     *         overload when no token list is supplied.
     */
    public static double getMinEditDist(String query, String bold) {
        // A typed null selects the three-argument overload and tells it not
        // to collect the matching tokens.
        final List<String> noTokenSink = null;
        return getMinEditDist(query, bold, noTokenSink);
    }

    /**
     * Measures how well the words of bold are covered by the words of query.
     * Both strings are tokenized; for every word of bold the query word with
     * the smallest normalized edit distance is selected (the first one wins
     * on ties), and the selected distances are averaged over the words of
     * bold.
     * 
     * @param query
     *            a query.
     * @param bold
     *            a bold.
     * @param minTokens
     *            if not null, receives one query word per word of bold, in
     *            order: the query word that was closest to it. Left untouched
     *            when either string has no tokens.
     * @return the averaged normalized word-by-word edit distance of bold
     *         against query, or 1 if either string has no tokens.
     */
    public static double getMinEditDist(String query, String bold, List<String> minTokens) {
        final List<String> queryWords = tokenize(query);
        final List<String> boldWords = tokenize(bold);

        // Nothing to compare: treat as maximally distant.
        if (boldWords.isEmpty() || queryWords.isEmpty()) {
            return 1;
        }

        final boolean collectTokens = minTokens != null;
        float distanceSum = 0;
        for (String boldWord : boldWords) {
            String closestQueryWord = null;
            float closestDistance = Float.MAX_VALUE;
            for (String queryWord : queryWords) {
                final float distance = getNormEditDistance(boldWord, queryWord);
                // Strict comparison: an equally distant later word does not
                // replace the one found first.
                if (distance < closestDistance) {
                    closestQueryWord = queryWord;
                    closestDistance = distance;
                }
            }
            if (collectTokens) {
                minTokens.add(closestQueryWord);
            }
            distanceSum += closestDistance;
        }
        return distanceSum / boldWords.size();
    }

    /**
     * Computes the Levenshtein distance between two words, divided by the
     * length of the longer word.
     * 
     * @param tokenB
     *            a word.
     * @param tokenQ
     *            another word.
     * @return the normalized edit distance between tokenB and tokenQ; 1 if
     *         either word is empty (including when both are).
     */
    public static float getNormEditDistance(String tokenB, String tokenQ) {
        final boolean anyEmpty = tokenQ.isEmpty() || tokenB.isEmpty();
        if (anyEmpty) {
            return 1;
        }
        final int longestLength = Math.max(tokenB.length(), tokenQ.length());
        final int edits = StringUtils.getLevenshteinDistance(tokenB, tokenQ);
        // Divide in floating point so the ratio is not truncated.
        return edits / (float) longestLength;
    }

    /**
     * Decides whether a Wikipedia title denotes a regular page. A title is
     * rejected when it starts with one of a fixed set of prefixes (namespace
     * prefixes such as "Talk:" or "Category:", plus "List") or when it
     * contains "(disambiguation)".
     * 
     * Note that the "List" prefix is matched literally, so any title that
     * merely begins with those four letters (e.g. "Listeria") is rejected as
     * well.
     * 
     * @param title
     *            the title of a Wikipedia page.
     * @return true iff the title is that of a regular page.
     */
    public static boolean acceptWikipediaTitle(String title) {
        final String[] rejectedPrefixes = { "Talk:", "Special:", "Portal:", "Wikipedia:", "Wikipedia_talk:",
                "File:", "User:", "Category:", "List" };
        for (String prefix : rejectedPrefixes) {
            if (title.startsWith(prefix)) {
                return false;
            }
        }
        return !title.contains("(disambiguation)");
    }

    /**
     * Builds the list of all feature ids, which are numbered consecutively
     * starting from 1.
     * 
     * @param ftrCount
     *            the number of features.
     * @return a vector containing all feature ids from 1 to ftrCount, in
     *         ascending order; empty if ftrCount is less than 1.
     */
    public static Vector<Integer> getAllFtrVect(int ftrCount) {
        final Vector<Integer> featureIds = new Vector<>();
        final int endExclusive = ftrCount + 1;
        int id = 1;
        while (id < endExclusive) {
            featureIds.add(id);
            id++;
        }
        return featureIds;
    }

    /**
     * Indexes a list of (bold, rank) pairs. Every bold is lowercased; it is
     * then added to the set of all bolds and to the set of bolds associated
     * with its rank. Either output container may be null, in which case that
     * part of the result is simply not collected.
     * 
     * @param boldAndRanks
     *            a list of pairs (b, r), where b is a bold and r is the
     *            position in which the bold occurred.
     * @param positions
     *            where to store the mapping between a position (rank) and all
     *            lowercased bolds that appear in that position; may be null.
     * @param bolds
     *            where to store the lowercased bolds; may be null.
     */
    public static void mapRankToBoldsLC(List<Pair<String, Integer>> boldAndRanks,
            HashMap<Integer, HashSet<String>> positions, HashSet<String> bolds) {
        final boolean collectBolds = bolds != null;
        final boolean collectPositions = positions != null;

        for (Pair<String, Integer> entry : boldAndRanks) {
            final String lowerBold = entry.first.toLowerCase();
            final int rank = entry.second;

            if (collectBolds) {
                bolds.add(lowerBold);
            }
            if (!collectPositions) {
                continue;
            }
            // Create the bucket for this rank the first time it is seen.
            if (!positions.containsKey(rank)) {
                positions.put(rank, new HashSet<String>());
            }
            positions.get(rank).add(lowerBold);
        }
    }

    /**
     * Groups the ranks of a list of (bold, rank) pairs by bold. Bolds are
     * lowercased before being used as keys, so bolds differing only in case
     * share one entry.
     * 
     * @param boldAndRanks
     *            a list of pairs (b, r), where b is a bold and r is the
     *            position in which the bold occurred.
     * @return a newly created mapping from each lowercased bold to the set of
     *         positions in which the bold occurred.
     */
    public static HashMap<String, HashSet<Integer>> findPositionsLC(List<Pair<String, Integer>> boldAndRanks) {
        final HashMap<String, HashSet<Integer>> ranksByBold = new HashMap<>();
        for (Pair<String, Integer> entry : boldAndRanks) {
            final String lowerBold = entry.first.toLowerCase();
            final int rank = entry.second;

            // The map is local and only ever holds non-null sets, so a null
            // lookup means the bold has not been seen yet.
            HashSet<Integer> ranks = ranksByBold.get(lowerBold);
            if (ranks == null) {
                ranks = new HashSet<Integer>();
                ranksByBold.put(lowerBold, ranks);
            }
            ranks.add(rank);
        }
        return ranksByBold;
    }

    /**
     * Given a string, replaces all words with their stemmed version. The
     * string is split on runs of whitespace, each piece is passed through the
     * stemmer, and the stemmed pieces are joined back with single spaces.
     * No separator is appended after the last word.
     * 
     * @param str
     *            a string.
     * @param stemmer
     *            the stemmer. Its current word is overwritten by this call.
     * @return str with all words stemmed, separated by single spaces.
     */
    public static String stemString(String str, EnglishStemmer stemmer) {
        String[] words = str.split("\\s+");
        StringBuilder stemmedString = new StringBuilder();
        for (int i = 0; i < words.length; i++) {
            // Separator goes between words only; the previous check
            // (i != words.length) was always true and left a trailing space.
            if (i > 0)
                stemmedString.append(' ');
            stemmer.setCurrent(words[i]);
            stemmer.stem();
            stemmedString.append(stemmer.getCurrent());
        }
        return stemmedString.toString();
    }

    /**
     * Compress a string with GZip. The string is encoded to bytes with the
     * platform default charset before compression, which is the same charset
     * {@link #decompress(byte[])} uses to decode.
     * 
     * @param str
     *            the string.
     * @return the compressed string, as a complete GZip stream.
     * @throws IOException
     *             if something went wrong during compression.
     */
    public static byte[] compress(String str) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        // try-with-resources closes the GZIP stream (and releases its
        // deflater) even if the write fails.
        try (GZIPOutputStream gzip = new GZIPOutputStream(out)) {
            gzip.write(str.getBytes());
        }
        // Only read the buffer once the stream is closed: closing is what
        // writes the GZip trailer.
        return out.toByteArray();
    }

    /**
     * Decompress a GZipped string. The decompressed bytes are decoded with
     * the platform default charset and returned unchanged, including any
     * line terminators they contain.
     * 
     * @param compressed
     *            the sequence of bytes, a complete GZip stream.
     * @return the decompressed string.
     * @throws IOException
     *             if something went wrong during decompression.
     */
    public static String decompress(byte[] compressed) throws IOException {
        StringBuilder outStr = new StringBuilder();
        try (BufferedReader bf = new BufferedReader(
                new InputStreamReader(new GZIPInputStream(new ByteArrayInputStream(compressed))))) {
            // Copy raw characters rather than using readLine(), which would
            // silently drop every line terminator from the text.
            char[] buffer = new char[4096];
            int read;
            while ((read = bf.read(buffer)) != -1)
                outStr.append(buffer, 0, read);
        }
        return outStr.toString();
    }

    /**
     * Splits a text into lowercase word tokens. Every run of characters that
     * do not match the regex word class (\w, i.e. letters, digits and
     * underscore as defined by java.util.regex) is treated as a separator.
     * 
     * @param text
     *            the text to tokenize.
     * @return the lowercased tokens of text, in order of appearance; empty if
     *         text contains no word characters.
     */
    public static List<String> tokenize(String text) {
        final String normalized = text.replaceAll("\\W+", " ").toLowerCase();
        final Vector<String> tokens = new Vector<>();
        for (String piece : normalized.split("\\s+")) {
            // The split can yield one empty piece (empty input or a leading
            // separator); it is not a token.
            if (!piece.isEmpty()) {
                tokens.add(piece);
            }
        }
        return tokens;
    }
}