org.apache.tika.eval.tokens.LuceneTokenCounter.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tika.eval.tokens.LuceneTokenCounter.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.eval.tokens;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.commons.math3.util.FastMath;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BytesRef;

/**
 * Experimental class uses Lucene's MemoryIndex to effectively build the
 * token info.
 */
/**
 * Experimental class that uses Lucene's {@link MemoryIndex} to build per-field
 * token statistics: total/unique token counts, the top-n most frequent terms,
 * token-length summary statistics, and Shannon entropy of the term distribution.
 * <p>
 * Statistics are recomputed for a field on every {@link #add(String, String)}
 * call and cached in {@link #fieldStats}. Not thread-safe.
 */
public class LuceneTokenCounter {
    //reserved for a planned second pass with an alpha/ideograph analyzer
    //(see the commented-out addField call that used to live in add())
    private static final String ALPHA_IDEOGRAPH_SUFFIX = "_a";

    //log base for the entropy calculation (bits)
    private static final double LOG_BASE = 2.0;

    private final LeafReader leafReader;
    private final MemoryIndex memoryIndex;
    private final Analyzer generalAnalyzer;
    private int topN = 10;

    //most recently computed statistics, keyed by field name
    Map<String, TokenStatistics> fieldStats = new HashMap<>();

    /**
     * @param generalAnalyzer analyzer used to tokenize content passed to
     *                        {@link #add(String, String)}
     * @throws IOException if the in-memory searcher/reader cannot be created
     */
    public LuceneTokenCounter(Analyzer generalAnalyzer) throws IOException {
        memoryIndex = new MemoryIndex();
        IndexSearcher searcher = memoryIndex.createSearcher();
        leafReader = (LeafReader) searcher.getIndexReader();
        this.generalAnalyzer = generalAnalyzer;
    }

    /**
     * Analyzes {@code content} into {@code field} and (re)computes that
     * field's statistics.
     *
     * @param field   field name to index the content under
     * @param content text to tokenize with the general analyzer
     * @throws IOException on failure reading back the indexed terms
     */
    public void add(String field, String content) throws IOException {
        memoryIndex.addField(field, content, generalAnalyzer);
        count(field);
    }

    /**
     * Computes token statistics for {@code field} and caches them in
     * {@link #fieldStats}, replacing any previous entry.
     *
     * @throws IOException if the terms cannot be read
     * @throws IllegalArgumentException if a total or per-term frequency
     *         exceeds {@link Integer#MAX_VALUE}
     */
    void count(String field) throws IOException {
        long tokenCount = leafReader.getSumTotalTermFreq(field);
        if (tokenCount > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("can't handle longs");
        }
        int tokenCountInt = (int) tokenCount;
        int uniqueTokenCount = 0;
        SummaryStatistics summStats = new SummaryStatistics();
        double ent = 0.0d;

        Terms terms = leafReader.terms(field);
        if (terms == null) {
            //no terms indexed for this field; record empty statistics
            fieldStats.put(field,
                    new TokenStatistics(uniqueTokenCount, tokenCountInt, new TokenIntPair[0], ent, summStats));
            return;
        }
        TermsEnum termsEnum = terms.iterator();
        BytesRef bytesRef = termsEnum.next();
        TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);

        while (bytesRef != null) {

            long termFreq = termsEnum.totalTermFreq();
            if (termFreq > Integer.MAX_VALUE) {
                throw new IllegalArgumentException("Sorry can't handle longs yet");
            }
            int tf = (int) termFreq;
            //TODO: figure out how to avoid Stringifying this
            //to get codepoint count
            String t = bytesRef.utf8ToString();
            int len = t.codePointCount(0, t.length());
            //weight the term's codepoint length by its frequency
            for (int i = 0; i < tf; i++) {
                summStats.addValue(len);
            }
            double p = (double) tf / (double) tokenCount;
            ent += p * FastMath.log(LOG_BASE, p);

            //only pay for insertWithOverflow when the term can make the top-n
            if (queue.top() == null || queue.size() < topN || tf >= queue.top().getValue()) {
                queue.insertWithOverflow(new TokenIntPair(t, tf));
            }

            uniqueTokenCount++;
            bytesRef = termsEnum.next();
        }
        //Shannon entropy is -sum(p * log2(p)).  The loop already accumulated
        //probabilities, so only negation is needed here; the previous code
        //also divided by the token count, deflating the entropy by a factor
        //of tokenCount.
        ent = -ent;

        fieldStats.put(field,
                new TokenStatistics(uniqueTokenCount, tokenCountInt, queue.getArray(), ent, summStats));
    }

    /**
     * Sets how many of the most frequent terms to retain per field.
     * Affects only fields counted after this call.
     */
    public void setTopN(int topN) {
        this.topN = topN;
    }

    /**
     * @return statistics for the field, or {@code null} if the field was
     *         never added
     */
    public TokenStatistics getTokenStatistics(String field) {
        return fieldStats.get(field);
    }

    /**
     * @return the raw Lucene terms for the field, or {@code null} if none
     */
    public Terms getTerms(String field) throws IOException {
        return leafReader.terms(field);
    }

    /**
     * Resets the in-memory index and discards all cached statistics.
     */
    public void clear() {
        memoryIndex.reset();
        fieldStats.clear();
    }
}