TREC ad-hoc retrieval example in Java (Apache Lucene 4.x).
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package retriever;

import evaluator.Evaluator;
import indexing.TrecDocIndexer;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Properties;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.LMDirichletSimilarity;
import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import simfunctions.CubicBezierTF;
import simfunctions.FactoryCubicBezierTF;
import simfunctions.GeneralizedTfIdfSimilarity;
import trec.TRECQuery;
import trec.TRECQueryParser;

/**
 *
 * @author Debasis
 */

/**
 * A (term, tf, idf) triple for one document's term vector.
 * Natural ordering is DESCENDING term frequency.
 */
class TermFreq implements Comparable<TermFreq> {
    String term;
    int tf;     // raw term frequency in the document
    float idf;  // collectionSize / docFreq for this term

    public TermFreq(String term, int tf, float idf) {
        this.term = term;
        this.tf = tf;
        this.idf = idf;
    }

    @Override
    public int compareTo(TermFreq t) {
        // Descending by tf. Integer.compare replaces the original's
        // deprecated "new Integer(tf)" boxing (and avoids an allocation).
        return Integer.compare(t.tf, this.tf);
    }
}

/**
 * Orders ScoreDocs by ascending Lucene doc id. Sorting a ranked list this
 * way is used by randomize() as a cheap way to scramble relevance order.
 */
class ScoreDocComparator_DocId implements Comparator<ScoreDoc> {
    @Override
    public int compare(ScoreDoc t, ScoreDoc t1) {
        return Integer.compare(t.doc, t1.doc);
    }
}

/**
 * Batch retriever for TREC ad-hoc runs over a Lucene 4.x index.
 * Configuration (index path, query files, run name, etc.) comes from a
 * properties file passed to the constructor.
 */
public class TrecDocRetriever {
    IndexReader reader;
    IndexSearcher searcher;
    Analyzer analyzer;
    Properties prop;
    int numDocsInCollection;
    String runName;
    int numWanted;      // max documents to write per query
    int trecCode;       // which TREC collection (6, 7 or 8)
    Evaluator evaluator;

    static final public String FIELD_ID = "id";
    static final public String FIELD_ANALYZED_CONTENT = "words"; // Standard analyzer w/o stopwords.

    public int getTrecCode() {
        return trecCode;
    }

    /**
     * Loads one stopword per line from the file named by the given property.
     * Returns an empty list on any I/O problem (best effort — stopping is
     * simply disabled then).
     */
    protected List<String> buildStopwordList(String stopwordFileName) {
        List<String> stopwords = new ArrayList<>();
        String stopFile = prop.getProperty(stopwordFileName);
        // try-with-resources closes the reader even when readLine() throws;
        // the original leaked both readers on exception.
        try (BufferedReader br = new BufferedReader(new FileReader(stopFile))) {
            String line;
            while ((line = br.readLine()) != null) {
                stopwords.add(line.trim());
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return stopwords;
    }

    public IndexReader getReader() {
        return reader;
    }

    public Analyzer getAnalyzer() {
        return analyzer;
    }

    /**
     * Opens the index and initializes the analyzer/evaluator from the given
     * properties file.
     *
     * @param propFile path to the configuration properties file
     * @throws Exception on any configuration or index-opening failure
     */
    public TrecDocRetriever(String propFile) throws Exception {
        this.prop = new Properties();
        // Close the reader after loading; the original leaked it.
        try (FileReader fr = new FileReader(propFile)) {
            prop.load(fr);
        }
        String indexDir = prop.getProperty("index");

        reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));
        searcher = new IndexSearcher(reader);
        analyzer = new EnglishAnalyzer(
                Version.LUCENE_4_9,
                StopFilter.makeStopSet(Version.LUCENE_4_9, buildStopwordList("stopfile"))); // default analyzer
        numDocsInCollection = reader.numDocs();
        runName = prop.getProperty("retrieve.runname", "noname");
        numWanted = Integer.parseInt(prop.getProperty("retrieve.num_wanted", "1000"));
        trecCode = Integer.parseInt(prop.getProperty("trec.code", "6"));
        evaluator = new Evaluator(this.prop, reader);
    }

    public Properties getProperties() {
        return prop;
    }

    public IndexSearcher getSearcher() {
        return searcher;
    }

    /**
     * Parses the topic file for the given TREC collection
     * (property key: trec.&lt;code&gt;.query.file, e.g. trec.6.query.file).
     */
    public List<TRECQuery> constructQueries(int trecCode) throws Exception {
        String key = "trec." + trecCode;
        String queryFilePropName = key + ".query.file"; // trec.6.query.file
        String queryFile = prop.getProperty(queryFilePropName);

        TRECQueryParser parser = new TRECQueryParser(queryFile, analyzer);
        parser.parse();
        return parser.getQueries();
    }

    /**
     * Formats a TopDocs list as trec_eval result tuples:
     * qid Q0 docname rank score runname
     */
    public String getTuples(TRECQuery query, TopDocs topDocs, String runName) throws Exception {
        StringBuilder buff = new StringBuilder();
        ScoreDoc[] hits = topDocs.scoreDocs;
        for (int i = 0; i < hits.length; ++i) {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            buff.append(query.id.trim()).append("\tQ0\t")
                .append(d.get(FIELD_ID)).append("\t")
                .append((i + 1)).append("\t")
                .append(hits[i].score).append("\t")
                .append(runName).append("\n");
        }
        return buff.toString();
    }

    // Batch retrieve with a particular setting of the tf function.
    // Note that there is one for the document and one for the query.
    // trecCode is either 6, 7 or 8

    /**
     * "Randomizes" a ranked list by re-sorting it on doc id, which is
     * uncorrelated with relevance. The input TopDocs is not modified.
     */
    private TopDocs randomize(TopDocs topDocs) {
        ScoreDoc[] rerankedScoreDocs = Arrays.copyOf(topDocs.scoreDocs, topDocs.scoreDocs.length);
        Arrays.sort(rerankedScoreDocs, new ScoreDocComparator_DocId());
        return new TopDocs(topDocs.scoreDocs.length, rerankedScoreDocs, topDocs.getMaxScore());
    }

    /*
    public TopDocs retrieveWithCustomReranker(String query) throws Exception {
        DoclistReranker reranker;
        searcher.setSimilarity(new BM25Similarity());
        // We retrieve 2000 documents so as to give the reranker a chance
        // to retrieve docs beyond 1000 (the standard for TREC)
        TopScoreDocCollector collector = TopScoreDocCollector.create(numWanted<<1, true);
        Query luceneQuery = buildQuery(query);
        searcher.search(luceneQuery, collector);
        TopDocs initialList = collector.topDocs();
        TopDocs randomizedList = randomize(initialList);
        reranker = new DoclistReranker(reader, luceneQuery, randomizedList);
        TopDocs rerankedDocs = reranker.rerank();
        return rerankedDocs;
    }
    */

    /**
     * Runs the query text through this retriever's analyzer and returns the
     * space-separated analyzed terms (lower-cased).
     */
    String analyze(String query) throws Exception {
        StringBuilder buff = new StringBuilder();
        TokenStream stream = analyzer.tokenStream("dummy", new StringReader(query));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            term = term.toLowerCase();
            buff.append(term).append(" ");
        }
        stream.end();
        stream.close();
        return buff.toString();
    }

    /**
     * Builds a disjunctive (SHOULD) BooleanQuery over the analyzed terms of
     * the raw query string, against the analyzed-content field.
     */
    Query buildQuery(String queryStr) throws Exception {
        BooleanQuery q = new BooleanQuery();
        String[] terms = analyze(queryStr).split("\\s+");
        for (String term : terms) {
            Term thisTerm = new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, term);
            q.add(new TermQuery(thisTerm), BooleanClause.Occur.SHOULD);
        }
        return q;
    }

    /**
     * Returns true if the term contains ANY digit (despite the name it does
     * not require the whole term to be numeric). Used to drop such terms
     * from the tf vector.
     */
    boolean isNumber(String term) {
        int len = term.length();
        for (int i = 0; i < len; i++) {
            if (Character.isDigit(term.charAt(i)))
                return true;
        }
        return false;
    }

    /**
     * Renders the stored term vector of a document as
     * "term:tf, idf term:tf, idf ..." sorted by descending tf.
     * Returns "" if the document has no term vector.
     */
    public String getTfVectorString(int docId) throws Exception {
        Terms terms = reader.getTermVector(docId, FIELD_ANALYZED_CONTENT);
        if (terms == null || terms.size() == 0)
            return "";

        TermsEnum termsEnum;
        BytesRef term;
        List<TermFreq> tfvec = new ArrayList<>();

        // Construct the normalized tf vector
        termsEnum = terms.iterator(null); // access the terms for this field
        while ((term = termsEnum.next()) != null) { // explore the terms for this field
            String termStr = term.utf8ToString();
            if (isNumber(termStr))
                continue;
            DocsEnum docsEnum = termsEnum.docs(null, null);
            // enumerate through documents; for a term vector there is only one
            while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                // get the term frequency in the document
                int tf = docsEnum.freq();
                float idf = numDocsInCollection
                        / (float) reader.docFreq(new Term(FIELD_ANALYZED_CONTENT, term));
                tfvec.add(new TermFreq(termStr, tf, idf));
            }
        }

        Collections.sort(tfvec); // descending tf (TermFreq natural order)

        StringBuilder buff = new StringBuilder();
        for (TermFreq tf : tfvec)
            buff.append(tf.term).append(":").append(tf.tf)
                .append(", ").append(tf.idf).append(" ");
        // Strip only the single trailing space. (The original deleted TWO
        // characters, which also chopped the last digit off the final idf.)
        if (buff.length() > 0)
            buff.setLength(buff.length() - 1);
        return buff.toString();
    }

    /**
     * Writes up to numWanted trec_eval result tuples for one query to the
     * given writer (caller owns and closes the writer).
     */
    public void saveRetrievedTuples(FileWriter fw, TRECQuery query, TopDocs topDocs) throws Exception {
        StringBuilder buff = new StringBuilder();
        ScoreDoc[] hits = topDocs.scoreDocs;
        int len = Math.min(numWanted, hits.length);
        for (int i = 0; i < len; ++i) {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            buff.append(query.id.trim()).append("\tQ0\t")
                .append(d.get(TrecDocIndexer.FIELD_ID)).append("\t")
                .append((i + 1)).append("\t")
                .append(hits[i].score).append("\t")
                .append(runName).append("\n");
        }
        fw.write(buff.toString());
    }

    /**
     * Retrieves all queries of this TREC collection with the similarity
     * induced by the given tf function, and returns the MAP (which is also
     * recorded on the function via setMAP).
     */
    public float batchRetrieveTREC(CubicBezierTF dtfFunc) throws Exception {
        System.out.println("Batch retrieving for TREC " + this.getTrecCode());

        Similarity sim = new GeneralizedTfIdfSimilarity(dtfFunc);
        searcher.setSimilarity(sim);

        List<TRECQuery> queries = constructQueries(trecCode);
        float map = 0f;

        for (TRECQuery query : queries) {
            System.out.println("Retrieving for query " + query.id + ": " + query);
            TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
            Query luceneQuery = buildQuery(query.title);
            searcher.search(luceneQuery, collector);
            TopDocs initialList = collector.topDocs();
            map += evaluator.computeAP(query.id, initialList);
            /*
            // Re-rank based on the custom tf functions for doc and qry
            DoclistReranker reranker = new DoclistReranker(reader, dtfFunc, luceneQuery, initialList);
            TopDocs rerankedDocs = reranker.rerank();
            map += evaluator.computeAP(query.id, rerankedDocs);
            */
        }

        // Guard against 0/0 = NaN when the topic file yields no queries.
        map = queries.isEmpty() ? 0f : map / queries.size();
        System.out.println("MAP " + dtfFunc.toString() + ": " + map);

        // Evaluate
        // TODO: Write code here to evaluate and keep track of the
        // function settings which yields the highest MAP till now.
        dtfFunc.setMAP(map);
        return map;
    }

    public static void main(String[] args) {
        if (args.length < 1) {
            args = new String[1];
            args[0] = "init.properties";
        }
        try {
            TrecDocRetriever searcher = new TrecDocRetriever(args[0]);
            FactoryCubicBezierTF bezierFactory = new FactoryCubicBezierTF(searcher);
            bezierFactory.exploreFunctionSpace();
            bezierFactory.showTopFunctions();
            //searcher.batchRetrieveTREC(new CubicBezierTF(.06f, .23f, .4f, .5f, .8f));
            //searcher.batchRetrieveTREC(new CubicBezierTF(.1f, .2f, .4f, .5f, .9f));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}