org.apache.nutch.scoring.similarity.cosine.Model.java — source code

Java tutorial

Introduction

Below is the source code for the class `org.apache.nutch.scoring.similarity.cosine.Model`, part of Apache Nutch.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.scoring.similarity.cosine;

import java.lang.invoke.MethodHandles;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.StringUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
import org.apache.nutch.scoring.similarity.util.LuceneTokenizer;
import org.apache.nutch.scoring.similarity.util.LuceneTokenizer.TokenizerType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class creates a model used to store Document vector representation of the corpus. 
 *
 */
/**
 * Builds and holds the document-vector model used for cosine-similarity
 * scoring. The model currently consists of a single gold-standard document
 * (see {@code cosine.goldstandard.file}), but is kept as a list so a full
 * corpus can be supported later.
 *
 * <p>Thread-safety: {@link #createModel(Configuration)} is synchronized and
 * idempotent; the static collections are populated once per JVM.</p>
 */
public class Model {

    //Currently only one file, but in future could accept a corpus hence an ArrayList
    public static ArrayList<DocVector> docVectors = new ArrayList<>();
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    // Guards against rebuilding the model on every parse; set once on success.
    public static boolean isModelCreated = false;
    // Custom stopword list; stays null when the template file is in use.
    private static List<String> stopWords;

    /**
     * Creates the cosine model from the configured gold-standard document.
     * Loads a custom stopword list if one was configured, then tokenizes the
     * gold-standard file into a {@link DocVector}. On any failure the error is
     * logged and the model is simply left uncreated.
     *
     * @param conf Configuration providing {@code scoring.similarity.stopword.file},
     *             {@code cosine.goldstandard.file} and {@code scoring.similarity.ngrams}
     * @throws IOException declared for API compatibility; I/O errors are
     *                     currently caught and logged internally
     */
    public static synchronized void createModel(Configuration conf) throws IOException {
        if (isModelCreated) {
            LOG.info("Model exists, skipping model creation");
            return;
        }
        LOG.info("Creating Cosine model");
        try {
            //If user has specified a stopword file other than the template.
            //Null-safe: an unset property previously caused an NPE here.
            String stopWordFile = conf.get("scoring.similarity.stopword.file");
            if (stopWordFile != null && !stopWordFile.equals("stopwords.txt.template")) {
                stopWords = new ArrayList<String>();
                // try-with-resources: the reader was previously never closed.
                try (BufferedReader br = new BufferedReader(
                        conf.getConfResourceAsReader(stopWordFile))) {
                    String stopWord;
                    while ((stopWord = br.readLine()) != null) {
                        stopWords.add(stopWord);
                    }
                }
                LOG.info("Loaded custom stopwords from {}", stopWordFile);
            }

            int[] ngramArr = retrieveNgrams(conf);
            int mingram = ngramArr[0];
            int maxgram = ngramArr[1];
            LOG.info("Value of mingram: {} maxgram: {}", mingram, maxgram);

            // TODO : Allow for corpus of documents to be provided as gold standard.
            StringBuilder sb = new StringBuilder();
            try (BufferedReader br = new BufferedReader(
                    conf.getConfResourceAsReader(conf.get("cosine.goldstandard.file")))) {
                String line;
                while ((line = br.readLine()) != null) {
                    // Newline separator prevents the last token of one line
                    // from being fused with the first token of the next.
                    sb.append(line).append('\n');
                }
            }
            DocVector goldStandard = createDocVector(sb.toString(), mingram, maxgram);
            if (goldStandard != null) {
                docVectors.add(goldStandard);
            } else {
                // IllegalStateException instead of raw Exception; still handled
                // by the catch below.
                throw new IllegalStateException("Could not create DocVector for goldstandard");
            }
        } catch (Exception e) {
            LOG.warn("Failed to add {} to model : {}",
                    conf.get("cosine.goldstandard.file", "goldstandard.txt.template"),
                    StringUtils.stringifyException(e));
        }
        if (docVectors.size() > 0) {
            LOG.info("Cosine model creation complete");
            isModelCreated = true;
        } else {
            LOG.info("Cosine model creation failed");
        }
    }

    /**
     * Used to create a DocVector from given String text. Used during the parse stage of the crawl
     * cycle to create a DocVector of the currently parsed page from the parseText attribute value.
     *
     * @param content The text to tokenize
     * @param mingram Value of mingram for tokenizing
     * @param maxgram Value of maxgram for tokenizing; when only mingram is
     *                specified (maxgram &lt;= 1) it is raised to mingram
     * @return the term-frequency vector of {@code content}, or {@code null}
     *         if tokenization failed with an {@link IOException}
     */
    public static DocVector createDocVector(String content, int mingram, int maxgram) {
        LuceneTokenizer tokenizer;

        if (mingram > 1) {
            // Ngram model requested. If maxgram was not given (or is smaller
            // than mingram's threshold), use mingram for both bounds — this
            // collapses the two previously duplicated branches.
            if (maxgram <= 1) {
                maxgram = mingram;
            }
            LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}",
                    mingram, maxgram);
            tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER,
                    mingram, maxgram);
        } else if (stopWords != null) {
            // Unigram model with the custom stopword list.
            tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true,
                    StemFilterType.PORTERSTEM_FILTER);
        } else {
            // Unigram model with Lucene's default stopwords.
            tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true,
                    StemFilterType.PORTERSTEM_FILTER);
        }
        HashMap<String, Integer> termVector = new HashMap<>();
        // try-with-resources: the TokenStream was previously never closed.
        try (TokenStream tStream = tokenizer.getTokenStream()) {
            CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class);
            tStream.reset();
            while (tStream.incrementToken()) {
                String term = charTermAttribute.toString();
                LOG.debug(term);
                // merge() replaces the contains/get/put counting dance.
                termVector.merge(term, 1, Integer::sum);
            }
            // Per the TokenStream workflow contract, end() must be called
            // after the last incrementToken() and before close().
            tStream.end();
            DocVector docVector = new DocVector();
            docVector.setTermFreqVector(termVector);
            return docVector;
        } catch (IOException e) {
            LOG.error("Error creating DocVector : {}", StringUtils.stringifyException(e));
        }
        return null;
    }

    /**
     * Computes the cosine similarity of {@code docVector} against every
     * document in the model and returns the best match.
     *
     * @param docVector vector of the document being scored
     * @return the maximum cosine similarity over the corpus, or 0 when the
     *         corpus is empty or all comparisons are degenerate
     */
    public static float computeCosineSimilarity(DocVector docVector) {
        float maxScore = 0;
        for (DocVector corpusDoc : docVectors) {
            float denominator = docVector.getL2Norm() * corpusDoc.getL2Norm();
            // Skip zero-norm vectors: dividing would previously produce NaN,
            // which silently poisoned the max comparison.
            if (denominator == 0) {
                continue;
            }
            float currentScore = docVector.dotProduct(corpusDoc) / denominator;
            maxScore = Math.max(maxScore, currentScore);
        }
        // Returning the max score amongst all documents in the corpus
        return maxScore;
    }

    /**
     * Retrieves mingram and maxgram from configuration.
     *
     * @param conf Configuration to retrieve mingram and maxgram
     *             ({@code scoring.similarity.ngrams}, default {@code "1,1"})
     * @return ngram array as mingram at first index and maxgram at second index;
     *         when a single value is configured it is used for both
     */
    public static int[] retrieveNgrams(Configuration conf) {
        int[] ngramArr = new int[2];
        //Check if user has specified mingram or ngram for ngram cosine model
        String[] ngramStr = conf.getStrings("scoring.similarity.ngrams", "1,1");
        //mingram
        ngramArr[0] = Integer.parseInt(ngramStr[0]);
        if (ngramStr.length > 1) {
            //maxgram
            ngramArr[1] = Integer.parseInt(ngramStr[1]);
        } else {
            //maxgram defaults to mingram when only one value is configured
            ngramArr[1] = ngramArr[0];
        }
        return ngramArr;
    }
}