dkpro.similarity.algorithms.vsm.store.convert.ConvertLuceneToVectorIndex.java Source code

Introduction

Here is the source code for dkpro.similarity.algorithms.vsm.store.convert.ConvertLuceneToVectorIndex.java
Source

/*******************************************************************************
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universitt Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package dkpro.similarity.algorithms.vsm.store.convert;

import static org.apache.commons.io.FileUtils.deleteQuietly;
import static org.apache.commons.lang.StringUtils.isNumericSpace;

import java.io.File;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import no.uib.cipr.matrix.Vector;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.FSDirectory;

import de.tudarmstadt.ukp.dkpro.lab.ProgressMeter;
import dkpro.similarity.algorithms.vsm.store.LuceneVectorReader;
import dkpro.similarity.algorithms.vsm.store.VectorReader;
import dkpro.similarity.algorithms.vsm.store.vectorindex.VectorIndexWriter;

/**
 * Convert an index created using {@link EsaIndex} to a ESA Vector Cache index which intelligently
 * compresses and stores vectors using {@link VectorIndexWriter}.
 *
 * @author Richard Eckart de Castilho
 */
public class ConvertLuceneToVectorIndex {
    private static Matcher monetaryPattern = Pattern.compile("^[$]\\s*[0-9,.]+$").matcher("");
    private static Matcher cardinalNumber = Pattern.compile("^#[0-9]+$").matcher("");

    private static boolean isMonetary(String aTerm) {
        monetaryPattern.reset(aTerm);
        return monetaryPattern.matches();
    }

    private static boolean isCardinal(String aTerm) {
        cardinalNumber.reset(aTerm);
        return cardinalNumber.matches();
    }

    public static void main(String[] args) throws Exception {
        File inputPath = new File(args[0]);
        File outputPath = new File(args[1]);

        deleteQuietly(outputPath);
        outputPath.mkdirs();

        boolean ignoreNumerics = true;
        boolean ignoreCardinal = true;
        boolean ignoreMonetary = true;
        int minTermLength = 3;
        int minDocFreq = 5;

        System.out.println("Quality criteria");
        System.out.println("Minimum term length            : " + minTermLength);
        System.out.println("Minimum document frequency     : " + minDocFreq);
        System.out.println("Ignore numeric tokens          : " + ignoreNumerics);
        System.out.println("Ignore cardinal numeric tokens : " + ignoreNumerics);
        System.out.println("Ignore money values            : " + ignoreMonetary);

        System.out.print("Fetching terms list... ");

        IndexReader reader = IndexReader.open(FSDirectory.open(inputPath));
        TermEnum termEnum = reader.terms();
        Set<String> terms = new HashSet<String>();
        int ignoredTerms = 0;
        while (termEnum.next()) {
            String term = termEnum.term().text();
            if (((minTermLength > 0) && (term.length() < minTermLength)) || (ignoreCardinal && isCardinal(term))
                    || (ignoreMonetary && isMonetary(term)) || (ignoreNumerics && isNumericSpace(term))
                    || ((minDocFreq > 0) && (termEnum.docFreq() < minDocFreq))) {
                ignoredTerms++;
                continue;
            }

            terms.add(term);
        }
        reader.close();

        System.out.println(terms.size() + " terms found. " + ignoredTerms + " terms ignored.");

        System.out.println("Opening source ESA index " + inputPath);
        VectorReader source = new LuceneVectorReader(inputPath);
        System.out.println("Opening destination ESA index " + inputPath);
        VectorIndexWriter esaWriter = new VectorIndexWriter(outputPath, source.getConceptCount());

        ProgressMeter p = new ProgressMeter(terms.size());
        for (String term : terms) {
            Vector vector = source.getVector(term);
            esaWriter.put(term, vector);

            p.next();
            System.out.println("[" + term + "] " + p);
        }

        esaWriter.close();
    }
}