org.aksw.sparqlify.qa.metrics.understandability.SoundingUri.java Source code

Introduction

Here is the source code for org.aksw.sparqlify.qa.metrics.understandability.SoundingUri.java
Source

package org.aksw.sparqlify.qa.metrics.understandability;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Set;

import org.aksw.sparqlify.core.algorithms.ViewQuad;
import org.aksw.sparqlify.core.domain.input.ViewDefinition;
import org.aksw.sparqlify.qa.exceptions.NotImplementedException;
import org.aksw.sparqlify.qa.metrics.MetricImpl;
import org.aksw.sparqlify.qa.metrics.NodeMetric;
import org.aksw.sparqlify.qa.pinpointing.Pinpointer;
import org.aksw.sparqlify.qa.sinks.TriplePosition;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import com.hp.hpl.jena.graph.Node;
import com.hp.hpl.jena.graph.Node_URI;
import com.hp.hpl.jena.graph.Triple;

/**
 * The SoundingUri metric checks if a given URL has a sounding name i.e. if it
 * is built up of
 * - a sounding host part
 * - sounding path segments
 * - sounding query parameters and fragment identifier
 * 
 * The first targeted approach was to use phonotactic rules of the English
 * language and check, if the parts of the URL of a given resource conform with
 * these rules and so find out if these parts at least sound like English words.
 * 
 * Looking up a dictionary would probably not make that much sense, since words
 * are often abbreviated in URLs or URL parts are built up by concatenating
 * more than one word.
 * The problem was to find persisted phonotactic rules provided under a free
 * license. Creating these rules via a learning approach was not suitable since
 * using libraries e.g. implementing the Java Speech API ended up in a lot of
 * overhead just for the phonemization of the training data and the URL parts
 * that should be assessed.
 * Other approaches using Bayesian filters, Markov chains or simple probability
 * distributions were tried out but they didn't fit right or were too complex.
 * 
 * So in the end a trigram based approach was applied using trigram statistics
 * of the English language learned from randomly chosen Wikipedia articles,
 * namely:
 * 
 * - http://en.wikipedia.org/wiki/English_language
 * - http://en.wikipedia.org/wiki/Anna_Anderson
 * - http://en.wikipedia.org/wiki/Daylight_saving_time
 * 
 * The words of these articles were extracted using Python:
 * 
 * >>> file = open('/tmp/enwiki_dst.txt')
 * >>> content = file.read()
 * >>> raw_words = content.split()
 * >>> raw_words[:20]
 * ['Daylight', 'saving', 'time', 'From', 'Wikipedia,', 'the', 'free', ...]
 * >>> words = [ word.lower() for word in raw_words if word.isalpha() ]
 * >>> words[:20]
 * ['daylight', 'saving', 'time', 'from', 'the', 'free', 'encyclopedia', ...]
 * >>> out_file = open('words3.txt', 'w')
 * >>> for word in words:
 * ...   foo = out_file.write(word + '\n')
 * ...
 * >>>
 * 
 * After that all the word lists were concatenated and duplicates were
 * removed (sort words.txt | uniq > uwords.txt).
 * This word list file is used to set up a HashMap containing the trigram
 * statistics. So to improve these statistics or adjust them to another language
 * one can use another word list file containing representational words.
 * 
 * @author Patrick Westphal <patrick.westphal@informatik.uni-leipzig.de>
 *
 */
@Component
public class SoundingUri extends MetricImpl implements NodeMetric {

    /** Number of characters per n-gram used for the statistics. */
    private static final int TRIGRAM_LENGTH = 3;

    @Autowired
    private Pinpointer pinpointer;

    /** Path of the word list (one word per line) the statistics are built from. */
    String wordlistFilePath = "src/main/resources/uwords_all.txt";
    /** Maps each trigram seen in the word list to its number of occurrences. */
    HashMap<String, Integer> trigramStats;
    /** Total number of trigram occurrences counted in the word list. */
    int numTrigrams;
    // TODO: factor chosen arbitrarily
    int scoreFactor = 1000;

    /**
     * default constructor used by the assessment framework; builds the
     * trigram statistics from the default word list file
     * 
     * @throws IOException if the word list file cannot be read
     */
    public SoundingUri() throws IOException {
        initTrigramStats(wordlistFilePath);
    }

    /**
     * Switches to another word list file and rebuilds the trigram statistics
     * from it.
     * 
     * @param wordListFilePath path of the new word list file
     * @throws IOException if the word list file cannot be read
     */
    protected void setWordListFilePath(String wordListFilePath) throws IOException {
        this.wordlistFilePath = wordListFilePath;
        initTrigramStats(wordListFilePath);
    }

    /**
     * Resets the trigram statistics and refills them from the given word list
     * file, reading it line by line.
     * 
     * @param wordlistFilePath path of the word list file
     * @throws IOException if the file cannot be opened or read
     */
    private void initTrigramStats(String wordlistFilePath) throws IOException {
        trigramStats = new HashMap<String, Integer>();
        numTrigrams = 0;

        BufferedReader bReader = new BufferedReader(new FileReader(wordlistFilePath));

        try {
            String lineBuff;

            while ((lineBuff = bReader.readLine()) != null) {
                updateTrigramStats(lineBuff);
            }
        } finally {
            // closing the buffered reader also closes the wrapped file reader;
            // doing it in finally releases the file handle on read errors, too
            bReader.close();
        }
    }

    /**
     * Counts every trigram contained in the given line. Lines shorter than
     * three characters contribute nothing.
     * 
     * @param line one line (word) of the word list file
     */
    private void updateTrigramStats(String line) {
        int lastStart = line.length() - TRIGRAM_LENGTH;

        // a line of exactly three characters yields one trigram (the line
        // itself), so no special case is needed for it
        for (int indx = 0; indx <= lastStart; indx++) {
            String trigram = line.substring(indx, indx + TRIGRAM_LENGTH);
            Integer currVal = trigramStats.get(trigram);

            trigramStats.put(trigram, currVal == null ? 1 : currVal + 1);
            numTrigrams++;
        }
    }

    /**
     * Assesses subject, predicate and object of the given triple (in this
     * order). Every URI node scoring below the threshold is written to the
     * sink together with the view candidates of the triple.
     * 
     * @param triple the triple whose nodes are assessed
     */
    @Override
    public void assessNodes(Triple triple) throws NotImplementedException, SQLException {
        assessNode(triple.getSubject(), TriplePosition.SUBJECT, triple);
        assessNode(triple.getPredicate(), TriplePosition.PREDICATE, triple);
        assessNode(triple.getObject(), TriplePosition.OBJECT, triple);
    }

    /**
     * Assesses a single node of a triple and reports it if its score is below
     * the threshold. Nodes that are not URIs are skipped.
     * 
     * @param node the node to assess
     * @param position position of the node within the triple
     * @param triple the triple the node belongs to
     */
    private void assessNode(Node node, TriplePosition position, Triple triple)
            throws NotImplementedException, SQLException {

        // guards the cast below for every position, including the predicate
        if (!node.isURI()) {
            return;
        }

        float qualityVal = assessResource((Node_URI) node);

        if (qualityVal < threshold) {
            Set<ViewQuad<ViewDefinition>> viewQuads = pinpointer.getViewCandidates(triple);

            writeNodeTripleMeasureToSink(qualityVal, position, triple, viewQuads);
        }
    }

    /**
     * Calculates the average trigram score of the given URI. The URI is split
     * at "/" and all trigrams of the parts following the protocol prefix are
     * looked up in the trigram statistics. A known trigram scores its relative
     * frequency multiplied by the score factor, an unknown trigram scores 0.
     * 
     * @param res the URI node to assess
     * @return the average score over all assessed trigrams, or 0 if the URI
     *      has fewer than two "/"-separated parts or contains no part of at
     *      least three characters
     */
    private float assessResource(Node_URI res) {
        float score = 0;
        int numAssessedTrigrams = 0;

        String uri = res.getURI();
        String[] parts = uri.split("/");

        if (parts.length < 2) {
            // broken URI assumed
            return 0;
        }

        /* 
         * e.g.:
         * parts
         * (java.lang.String[]) [http:, , ex.org, employee, Chaplin, Charly]
         */

        // starting with partNum=2 means, that the protocol prefix and the
        // empty string (resulting from "http://ex.org....".split("/")) are
        // not considered here
        for (int partNum = 2; partNum < parts.length; partNum++) {
            String part = parts[partNum];

            // parts shorter than a trigram are skipped implicitly since the
            // loop bound is negative for them
            int lastStart = part.length() - TRIGRAM_LENGTH;

            for (int idx = 0; idx <= lastStart; idx++) {
                // Locale.ROOT keeps the lookup independent of the default
                // locale of the JVM
                String trigram = part.substring(idx, idx + TRIGRAM_LENGTH)
                        .toLowerCase(Locale.ROOT);
                Integer trigramCount = trigramStats.get(trigram);

                if (trigramCount != null) {
                    // multiply as long to rule out int overflow
                    score += trigramCount.longValue() * scoreFactor / (float) numTrigrams;
                }

                numAssessedTrigrams++;
            }
        }

        if (numAssessedTrigrams == 0) {
            // nothing to assess; avoids 0/0 = NaN, which would never compare
            // as being below a numeric threshold
            return 0;
        }

        return score / (float) numAssessedTrigrams;
    }
}