/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.tools;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
//this import was missing in the original; depending on the Lucene version,
//SlowCompositeReaderWrapper ships in lucene-core or the lucene-misc module
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;

import org.apache.tika.eval.tokens.AnalyzerManager;

/**
 * Utility class that reads in a UTF-8 input file with one document per row
 * and outputs the 20,000 tokens with the highest document frequencies.
 *
 * The CommonTokensAnalyzer intentionally drops tokens shorter than 4 characters,
 * but includes bigrams for CJK.
 *
 * It also has a whitelist for ___email___ and ___url___ and a blacklist
 * for common HTML markup terms.
 */
public class TopCommonTokenCounter {

    private static final String FIELD = "f";
    private static int TOP_N = 20000;
    private static int MIN_DOC_FREQ = 10;

    //these should exist in every list
    static Set<String> WHITE_LIST = new HashSet<>(Arrays.asList(
            "___email___",
            "___url___"
    ));

    //words to ignore
    //these are common 4 letter html markup words that we do
    //not want to count in case of failed markup processing.
    //see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055
    static Set<String> BLACK_LIST = new HashSet<>(Arrays.asList(
            "span", "table", "href", "head", "title",
            "body", "html", "tagname", "lang", "style",
            "script", "strong", "blockquote", "form",
            "iframe", "section", "colspan", "rowspan"
    ));

    public static void main(String[] args) throws Exception {
        Path inputFile = Paths.get(args[0]);
        Path commonTokensFile = Paths.get(args[1]);
        TopCommonTokenCounter counter = new TopCommonTokenCounter();
        counter.execute(inputFile, commonTokensFile);
    }

    private void execute(Path inputFile, Path commonTokensFile) throws Exception {
        Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
        AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
        try {
            Directory directory = FSDirectory.open(luceneDir);
            AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);
            Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
            //index one document per input row, flushing in batches of
            //roughly one million characters
            int maxLen = 1000000;
            int len = 0;
            try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
                List<Document> docs = new ArrayList<>();
                try (BufferedReader reader = getReader(inputFile)) {
                    String line = reader.readLine();
                    while (line != null) {
                        len += line.length();
                        Document document = new Document();
                        document.add(new TextField(FIELD, line, Field.Store.NO));
                        docs.add(document);
                        if (len > maxLen) {
                            writer.addDocuments(docs);
                            docs.clear();
                            len = 0;
                        }
                        line = reader.readLine();
                    }
                }
                if (docs.size() > 0) {
                    writer.addDocuments(docs);
                }
                writer.commit();
                writer.flush();
            }
            //now walk the terms dictionary and keep the top TOP_N
            //tokens by document frequency
            try (IndexReader reader = DirectoryReader.open(directory)) {
                LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
                Terms terms = wrappedReader.terms(FIELD);
                TermsEnum termsEnum = terms.iterator();
                BytesRef bytesRef = termsEnum.next();
                //number of docs that have this field (currently unused)
                int docsWThisField = wrappedReader.getDocCount(FIELD);
                while (bytesRef != null) {
                    int df = termsEnum.docFreq();
                    long tf = termsEnum.totalTermFreq();
                    if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
                        bytesRef = termsEnum.next();
                        continue;
                    }
                    if (queue.top() == null || queue.size() < TOP_N || df >= queue.top().df) {
                        String t = bytesRef.utf8ToString();
                        if (!WHITE_LIST.contains(t) && !BLACK_LIST.contains(t)) {
                            queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                        }
                    }
                    bytesRef = termsEnum.next();
                }
            }
        } finally {
            FileUtils.deleteDirectory(luceneDir.toFile());
        }
        writeTopN(commonTokensFile, queue);
    }

    private BufferedReader getReader(Path inputFile) throws IOException {
        InputStream is = Files.newInputStream(inputFile);
        if (inputFile.toString().endsWith(".gz")) {
            is = new GzipCompressorInputStream(is);
        }
        return new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
    }

    private static void writeTopN(Path path, AbstractTokenTFDFPriorityQueue queue) throws IOException {
        if (Files.isRegularFile(path)) {
            System.err.println("File " + path.getFileName() + " already exists. Skipping.");
            return;
        }
        Files.createDirectories(path.getParent());
        try (BufferedWriter writer = Files.newBufferedWriter(path, StandardCharsets.UTF_8)) {
            StringBuilder sb = new StringBuilder();
            //add these tokens no matter what
            for (String t : WHITE_LIST) {
                writer.write(t);
                writer.newLine();
            }
            for (TokenDFTF tp : queue.getArray()) {
                writer.write(getRow(sb, tp) + "\n");
            }
            writer.flush();
        }
    }

    private static String getRow(StringBuilder sb, TokenDFTF tp) {
        sb.setLength(0);
        sb.append(clean(tp.token));
        //sb.append("\t").append(tp.df);
        //sb.append("\t").append(tp.tf);
        return sb.toString();
    }

    private static String clean(String s) {
        if (s == null) {
            return "";
        }
        return s.replaceAll("\\s+", " ").trim();
    }

    private abstract class AbstractTokenTFDFPriorityQueue extends PriorityQueue<TokenDFTF> {

        AbstractTokenTFDFPriorityQueue(int maxSize) {
            super(maxSize);
        }

        public TokenDFTF[] getArray() {
            TokenDFTF[] topN = new TokenDFTF[size()];
            //now we reverse the queue
            TokenDFTF term = pop();
            int i = topN.length - 1;
            while (term != null && i > -1) {
                topN[i--] = term;
                term = pop();
            }
            return topN;
        }
    }

    private class TokenDFTF {

        final String token;
        final int df;
        final long tf;

        public TokenDFTF(String token, int df, long tf) {
            this.token = token;
            this.df = df;
            this.tf = tf;
        }

        public long getTF() {
            return tf;
        }

        public int getDF() {
            return df;
        }

        public String getToken() {
            return token;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;

            TokenDFTF tokenDFTF = (TokenDFTF) o;

            if (df != tokenDFTF.df) return false;
            if (tf != tokenDFTF.tf) return false;
            return token != null ? token.equals(tokenDFTF.token) : tokenDFTF.token == null;
        }

        @Override
        public int hashCode() {
            int result = token != null ? token.hashCode() : 0;
            result = 31 * result + df;
            result = 31 * result + (int) (tf ^ (tf >>> 32));
            return result;
        }

        @Override
        public String toString() {
            return "TokenDFTF{" +
                    "token='" + token + '\'' +
                    ", df=" + df +
                    ", tf=" + tf +
                    '}';
        }
    }

    private class TokenDFPriorityQueue extends AbstractTokenTFDFPriorityQueue {

        TokenDFPriorityQueue(int maxSize) {
            super(maxSize);
        }

        //order primarily by document frequency, breaking ties
        //by reverse lexicographic order of the token
        @Override
        protected boolean lessThan(TokenDFTF arg0, TokenDFTF arg1) {
            if (arg0.df < arg1.df) {
                return true;
            } else if (arg0.df > arg1.df) {
                return false;
            }
            return arg1.token.compareTo(arg0.token) < 0;
        }
    }
}