org.dbpedia.spotlight.lucene.index.CandidateIndexer.java Source code

Introduction

Here is the source code for org.dbpedia.spotlight.lucene.index.CandidateIndexer.java
Source

/*
 * Copyright 2011 DBpedia Spotlight Development Team
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 *  Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
 */

package org.dbpedia.spotlight.lucene.index;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.store.FSDirectory;
import org.dbpedia.spotlight.exceptions.IndexException;
import org.dbpedia.spotlight.lucene.LuceneManager;
import org.dbpedia.spotlight.model.Candidate;
import org.dbpedia.spotlight.model.DBpediaResource;
import org.dbpedia.spotlight.model.SpotlightConfiguration;
import org.dbpedia.spotlight.model.SurfaceForm;
import org.semanticweb.yars.nx.Node;
import org.semanticweb.yars.nx.parser.NxParser;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.List;
import java.util.Scanner;

/**
 * Class to index surrogates mapping (surface form -> set of resources) in lucene.
 * This does not index context (paragraphs around an entity mention). For that see @link{org.dbpedia.spotlight.index.OccurrenceContextIndexer}
 *
 * Documents are handed to the writer inherited from {@link BaseIndexer} without committing after each
 * addition; callers are expected to call {@link #optimize()} and {@code close()} when done (see {@link #main}).
 *
 * @author pablomendes
 * @author maxjakob
 */
public class CandidateIndexer extends BaseIndexer<Candidate> {

    // Logger is bound to this class so that log output is attributed to the candidate indexer.
    final static Log LOG = LogFactory.getLog(CandidateIndexer.class);

    /**
     * Constructs a surrogate indexer that follows the policy specified by the {@link org.dbpedia.spotlight.lucene.LuceneManager} implementation used.
     * @param indexManager For a caseInsensitive behavior, use {@link org.dbpedia.spotlight.lucene.LuceneManager.CaseInsensitiveSurfaceForms}.
     * @throws java.io.IOException propagated from the {@link BaseIndexer} constructor
     */
    public CandidateIndexer(LuceneManager indexManager) throws IOException {
        super(indexManager);
    }

    /**
     * Adds one document mapping a single surface form to a resource.
     *
     * @param surfaceForm the surface form to index
     * @param resource the resource the surface form refers to
     * @throws IndexException if the underlying writer fails to add the document
     */
    public void add(SurfaceForm surfaceForm, DBpediaResource resource) throws IndexException {
        Document newDoc = mLucene.createDocument(surfaceForm, resource);
        try {
            mWriter.addDocument(newDoc); // do not commit for faster indexing.
        } catch (IOException e) {
            throw new IndexException("Error adding candidate map to the index. ", e);
        }

        LOG.trace("Added to " + mLucene.directory().toString() + ": " + surfaceForm.toString() + " -> "
                + resource.toString());
    }

    /**
     * Adds one document mapping several surface forms to the same resource. The document is created
     * from the first surface form; each remaining surface form is appended as an additional field.
     *
     * @param surfaceForms surface forms to index; must contain at least one element
     *                     (an empty list fails on {@code surfaceForms.get(0)})
     * @param resource the resource the surface forms refer to
     * @throws IndexException if the underlying writer fails to add the document
     */
    public void add(List<SurfaceForm> surfaceForms, DBpediaResource resource) throws IndexException {
        Document newDoc = mLucene.createDocument(surfaceForms.get(0), resource);
        for (int i = 1; i < surfaceForms.size(); i++) {
            newDoc.add(mLucene.getField(surfaceForms.get(i)));
        }
        try {
            mWriter.addDocument(newDoc); // do not commit for faster indexing.
        } catch (IOException e) {
            throw new IndexException("Error adding candidate map to the index. ", e);
        }

        LOG.trace("Added to " + mLucene.directory().toString() + ": " + surfaceForms.toString() + " -> "
                + resource.toString());
    }

    /**
     * Adds one document mapping a surface form to a resource, passing {@code nTimes} through to
     * {@code LuceneManager.createDocument(surfaceForm, resource, nTimes)}.
     *
     * @param surfaceForm the surface form to index
     * @param resource the resource the surface form refers to
     * @param nTimes count forwarded to the document factory (presumably the number of times the
     *               pair was observed; verify against LuceneManager)
     * @throws IndexException if the underlying writer fails to add the document
     */
    public void add(SurfaceForm surfaceForm, DBpediaResource resource, int nTimes) throws IndexException {
        Document newDoc = mLucene.createDocument(surfaceForm, resource, nTimes);
        try {
            mWriter.addDocument(newDoc); //TODO ATTENTION need to merge with existing doc if URI is already in index
        } catch (IOException e) {
            throw new IndexException("Error adding candidate map to the index. ", e);
        }

        LOG.trace("Added to " + mLucene.directory().toString() + ": " + surfaceForm.toString() + " -> "
                + resource.toString());
    }

    /**
     * Adds the (surface form, resource) pair held by a candidate.
     *
     * @param candidate the candidate to index
     * @throws IndexException if the underlying writer fails to add the document
     */
    public void add(Candidate candidate) throws IndexException {
        add(candidate.surfaceForm(), candidate.resource());
    }

    /**
     * Index surrogates mapping from a triples file.
     *
     * For each triple, the first node (with {@link SpotlightConfiguration#DEFAULT_NAMESPACE} removed) is
     * used as the resource and the third node as the surface form. The surface form is expanded with
     * {@code AddSurfaceFormsToIndex.fromTitlesToAlternativesJ} and all alternatives are stored in one document.
     *
     * @param surfaceFormsDataSet the triples file to read
     * @throws IOException if the file cannot be opened or closed
     * @throws IndexException if a document cannot be added to the index
     */
    public void addFromNTfile(File surfaceFormsDataSet) throws IOException, IndexException {
        LOG.info("Indexing candidate map from " + surfaceFormsDataSet.getName() + " to " + mLucene.directory()
                + "...");

        FileInputStream input = new FileInputStream(surfaceFormsDataSet);
        try {
            NxParser nxParser = new NxParser(input, false);
            while (nxParser.hasNext()) {
                Node[] nodes = nxParser.next();
                String resourceString = nodes[0].toString().replace(SpotlightConfiguration.DEFAULT_NAMESPACE, "");
                String surfaceFormString = nodes[2].toString();
                List<SurfaceForm> surfaceForms = AddSurfaceFormsToIndex
                        .fromTitlesToAlternativesJ(new SurfaceForm(surfaceFormString));
                add(surfaceForms, new DBpediaResource(resourceString));
            }
        } finally {
            // Release the file handle even when parsing or indexing fails.
            input.close();
        }

        LOG.info("Done.");
    }

    /**
     * Index surrogates mapping from a tab separated file.
     *
     * Each line is expected to hold the surface form in the first column and the resource in the
     * second column. Lines with fewer than two columns (e.g. blank lines) are logged and skipped.
     *
     * @param surfaceFormsDataSet the UTF-8 encoded TSV file to read
     * @throws IOException if the file cannot be opened or an I/O error occurs while reading it
     * @throws IndexException if a document cannot be added to the index
     */
    public void addFromTSVfile(File surfaceFormsDataSet) throws IOException, IndexException {
        LOG.info("Indexing candidate map from " + surfaceFormsDataSet.getName() + " to " + mLucene.directory()
                + "...");

        String separator = "\t";
        Scanner tsvScanner = new Scanner(new FileInputStream(surfaceFormsDataSet), "UTF-8");
        try {
            while (tsvScanner.hasNextLine()) {
                String rawLine = tsvScanner.nextLine();
                String[] line = rawLine.split(separator);
                if (line.length < 2) {
                    // Same skip-and-log policy as addFromCounts: one bad line must not abort the whole run.
                    LOG.error("Error parsing line: " + rawLine);
                    continue;
                }
                String surfaceFormString = line[0];
                String resourceString = line[1];
                //TODO read counts and set DBpediaResource.support
                DBpediaResource res = new DBpediaResource(resourceString);
                List<SurfaceForm> surfaceForms = AddSurfaceFormsToIndex
                        .fromTitlesToAlternativesJ(new SurfaceForm(surfaceFormString));
                add(surfaceForms, res);
            }
            // Scanner swallows read errors and simply stops; surface them instead of silently truncating.
            IOException readError = tsvScanner.ioException();
            if (readError != null) {
                throw readError;
            }
        } finally {
            // Closing the scanner also closes the underlying file stream.
            tsvScanner.close();
        }

        LOG.info("Done.");
    }

    /**
     * Index surrogates mapping from a tab separated counts file.
     *
     * The first column is expected to hold a count in its first 7 characters, one separator character,
     * and then the resource; the second column holds the surface form (this looks like the output of
     * {@code uniq -c} -- confirm against the file producer). Lines that do not match this layout
     * are logged and skipped.
     *
     * @param surfaceFormsDataSet the UTF-8 encoded counts file to read
     * @param minCount pairs are indexed with their count only when the count is strictly greater than this value
     * @throws IOException if the file cannot be opened or an I/O error occurs while reading it
     * @throws IndexException if a document cannot be added to the index
     */
    public void addFromCounts(File surfaceFormsDataSet, int minCount) throws IOException, IndexException {
        LOG.info("Indexing candidate map from " + surfaceFormsDataSet.getName() + " to " + mLucene.directory()
                + "...");

        String separator = "\t";
        Scanner tsvScanner = new Scanner(new FileInputStream(surfaceFormsDataSet), "UTF-8");
        try {
            while (tsvScanner.hasNextLine()) {
                String rawLine = tsvScanner.nextLine();
                String[] line = rawLine.split(separator);

                int count;
                String resourceString;
                String surfaceFormString;
                try {
                    String countAndSf = line[0];
                    count = Integer.parseInt(countAndSf.substring(0, 7).trim());
                    resourceString = countAndSf.substring(8);
                    surfaceFormString = line[1];
                } catch (IndexOutOfBoundsException e) {
                    // Covers both a missing second column and a first column too short to hold count + resource.
                    LOG.error("Error parsing line: " + rawLine, e);
                    continue;
                } catch (NumberFormatException e) {
                    LOG.error("Error parsing line: " + rawLine, e);
                    continue;
                }

                if (count > minCount) {
                    add(new SurfaceForm(surfaceFormString), new DBpediaResource(resourceString), count);
                }
                // NOTE(review): the alternatives below are added regardless of minCount, exactly as in the
                // original code. Confirm whether this was meant to be guarded by the count check as well.
                List<SurfaceForm> surfaceForms = AddSurfaceFormsToIndex
                        .fromTitlesToAlternativesJ(new SurfaceForm(surfaceFormString));
                add(surfaceForms, new DBpediaResource(resourceString));
            }
            // Scanner swallows read errors and simply stops; surface them instead of silently truncating.
            IOException readError = tsvScanner.ioException();
            if (readError != null) {
                throw readError;
            }
        } finally {
            // Closing the scanner also closes the underlying file stream.
            tsvScanner.close();
        }

        LOG.info("Done.");
    }

    /**
     * Optimize the index to speed up queries.
     *
     * @throws java.io.IOException if the underlying writer fails
     */
    public void optimize() throws IOException {
        LOG.info("Optimizing candidate map index in " + mLucene.directory() + " ...");
        mWriter.optimize();
        LOG.info("Done.");
    }

    /**
     * Example:
     * java CandidateIndexer candidateMap.count candidateMapCI 2 --case-sensitive --overwrite
     *
     * @param args input file, output index directory, and optionally: minimum count (default 3),
     *             lucene manager type (default case-insensitive), and an overwrite flag
     * @throws IOException if the input cannot be read or the index cannot be written
     * @throws IndexException if a document cannot be added to the index
     * @throws IllegalArgumentException if fewer than two arguments are given
     */
    public static void main(String[] args) throws IOException, IndexException {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: java CandidateIndexer <inputFile (.nt|.tsv|.count)> <outputIndexDir> "
                            + "[minCount] [luceneManagerType] [--overwrite]");
        }

        String inputFileName = args[0]; // DBpedia surface forms mapping
        String outputDirName = args[1]; // target Lucene mContextIndexDir

        // Optional arguments fall back to their defaults when absent.
        int minCount = args.length > 2 ? Integer.parseInt(args[2]) : 3;
        String luceneManagerType = args.length > 3 ? args[3] : "case-insensitive";
        boolean shouldOverwrite = args.length > 4 && args[4].contains("overwrite");

        File outputDir = new File(outputDirName);
        LuceneManager mLucene;
        if (luceneManagerType.contains("case-sensitive")) {
            mLucene = new LuceneManager.CaseSensitiveSurfaceForms(FSDirectory.open(outputDir));
        } else if (luceneManagerType.contains("buffered")) {
            mLucene = new LuceneManager.BufferedMerging(FSDirectory.open(outputDir));
        } else if (luceneManagerType.contains("phonetic")) {
            mLucene = new LuceneManager.PhoneticSurfaceForms(FSDirectory.open(outputDir));
        } else {
            mLucene = new LuceneManager.CaseInsensitiveSurfaceForms(FSDirectory.open(outputDir));
        }
        mLucene.shouldOverwrite = shouldOverwrite;

        CandidateIndexer si = new CandidateIndexer(mLucene);

        // The input format is selected by file extension.
        File inputFile = new File(inputFileName);
        String lowerCaseName = inputFileName.toLowerCase();
        if (lowerCaseName.endsWith(".nt")) {
            si.addFromNTfile(inputFile);
        } else if (lowerCaseName.endsWith(".tsv")) {
            si.addFromTSVfile(inputFile);
        } else if (lowerCaseName.endsWith(".count")) {
            si.addFromCounts(inputFile, minCount);
        } else {
            // Previously this case was silent; nothing is indexed for unknown extensions.
            LOG.error("Unrecognized input file extension (expected .nt, .tsv or .count): " + inputFileName);
        }
        si.optimize();
        si.close();
    }
}