org.apache.tika.parser.geo.topic.GeoNameResolver.java Source code

Introduction

Here is the source code for org.apache.tika.parser.geo.topic.GeoNameResolver.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.parser.geo.topic;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.logging.*;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

//import sun.util.logging.resources.logging;

public class GeoNameResolver {
    private static final String INDEXDIR_PATH = "src/main/java/org/apache/tika/parser/geo/topic/model/indexDirectory";
    private static final Double OUT_OF_BOUNDS = 999999.0;
    private static Analyzer analyzer = new StandardAnalyzer();
    private static IndexWriter indexWriter;
    private static Directory indexDir;
    private static int hitsPerPage = 8;

    /**
     * Search corresponding GeoName for each location entity
     * 
     * @param querystr
     *            it's the NER actually
     * @return HashMap each name has a list of resolved entities
     * @throws IOException
     * @throws RuntimeException
     */

    public HashMap<String, ArrayList<String>> searchGeoName(ArrayList<String> locationNameEntities)
            throws IOException {

        if (locationNameEntities.size() == 0 || locationNameEntities.get(0).length() == 0)
            return new HashMap<String, ArrayList<String>>();

        Logger logger = Logger.getLogger(this.getClass().getName());

        if (!DirectoryReader.indexExists(indexDir)) {
            logger.log(Level.SEVERE, "No Lucene Index Dierctory Found, Invoke indexBuild() First !");
            System.exit(1);
        }

        IndexReader reader = DirectoryReader.open(indexDir);

        if (locationNameEntities.size() >= 200)
            hitsPerPage = 5; // avoid heavy computation
        IndexSearcher searcher = new IndexSearcher(reader);

        Query q = null;

        HashMap<String, ArrayList<ArrayList<String>>> allCandidates = new HashMap<String, ArrayList<ArrayList<String>>>();

        for (String name : locationNameEntities) {

            if (!allCandidates.containsKey(name)) {
                try {
                    // q = new QueryParser("name", analyzer).parse(name);
                    q = new MultiFieldQueryParser(new String[] { "name", "alternatenames" }, analyzer).parse(name);
                    TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage);
                    searcher.search(q, collector);
                    ScoreDoc[] hits = collector.topDocs().scoreDocs;
                    ArrayList<ArrayList<String>> topHits = new ArrayList<ArrayList<String>>();

                    for (int i = 0; i < hits.length; ++i) {
                        ArrayList<String> tmp1 = new ArrayList<String>();
                        ArrayList<String> tmp2 = new ArrayList<String>();
                        int docId = hits[i].doc;
                        Document d;
                        try {
                            d = searcher.doc(docId);
                            tmp1.add(d.get("name"));
                            tmp1.add(d.get("longitude"));
                            tmp1.add(d.get("latitude"));
                            if (!d.get("alternatenames").equalsIgnoreCase(d.get("name"))) {
                                tmp2.add(d.get("alternatenames"));
                                tmp2.add(d.get("longitude"));
                                tmp2.add(d.get("latitude"));
                            }
                        } catch (IOException e) {
                            e.printStackTrace();
                        }
                        topHits.add(tmp1);
                        if (tmp2.size() != 0)
                            topHits.add(tmp2);
                    }
                    allCandidates.put(name, topHits);
                } catch (org.apache.lucene.queryparser.classic.ParseException e) {
                    e.printStackTrace();
                }
            }
        }

        HashMap<String, ArrayList<String>> resolvedEntities = new HashMap<String, ArrayList<String>>();
        pickBestCandidates(resolvedEntities, allCandidates);
        reader.close();

        return resolvedEntities;

    }

    /**
     * Select the best match for each location name extracted from a document,
     * choosing from among a list of lists of candidate matches. Filter uses the
     * following features: 1) edit distance between name and the resolved name,
     * choose smallest one 2) content (haven't implemented)
     * 
     * @param resolvedEntities
     *            final result for the input stream
     * @param allCandidates
     *            each location name may hits several documents, this is the
     *            collection for all hitted documents
     * @throws IOException
     * @throws RuntimeException
     */

    private void pickBestCandidates(HashMap<String, ArrayList<String>> resolvedEntities,
            HashMap<String, ArrayList<ArrayList<String>>> allCandidates) {

        for (String extractedName : allCandidates.keySet()) {
            ArrayList<ArrayList<String>> cur = allCandidates.get(extractedName);
            int minDistance = Integer.MAX_VALUE, minIndex = -1;
            for (int i = 0; i < cur.size(); ++i) {
                String resolvedName = cur.get(i).get(0);// get cur's ith
                // resolved entry's name
                int distance = StringUtils.getLevenshteinDistance(extractedName, resolvedName);
                if (distance < minDistance) {
                    minDistance = distance;
                    minIndex = i;
                }
            }
            if (minIndex == -1)
                continue;
            resolvedEntities.put(extractedName, cur.get(minIndex));
        }
    }

    /**
     * Build the gazetteer index line by line
     * 
     * @param GAZETTEER_PATH
     *            path of the gazetter file
     * @throws IOException
     * @throws RuntimeException
     */
    public void buildIndex(String GAZETTEER_PATH) throws IOException {
        File indexfile = new File(INDEXDIR_PATH);
        indexDir = FSDirectory.open(indexfile.toPath());
        if (!DirectoryReader.indexExists(indexDir)) {
            IndexWriterConfig config = new IndexWriterConfig(analyzer);
            indexWriter = new IndexWriter(indexDir, config);
            Logger logger = Logger.getLogger(this.getClass().getName());
            logger.log(Level.WARNING, "Start Building Index for Gazatteer");
            BufferedReader filereader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(GAZETTEER_PATH), "UTF-8"));
            String line;
            int count = 0;
            while ((line = filereader.readLine()) != null) {
                try {
                    count += 1;
                    if (count % 100000 == 0) {
                        logger.log(Level.INFO, "Indexed Row Count: " + count);
                    }
                    addDoc(indexWriter, line);

                } catch (RuntimeException re) {
                    logger.log(Level.WARNING, "Skipping... Error on line: {}", line);
                }
            }
            logger.log(Level.WARNING, "Building Finished");
            filereader.close();
            indexWriter.close();
        }
    }

    /**
     * Index gazetteer's one line data by built-in Lucene Index functions
     * 
     * @param indexWriter
     *            Lucene indexWriter to be loaded
     * @param line
     *            a line from the gazetteer file
     * @throws IOException
     * @throws NumberFormatException
     */
    private static void addDoc(IndexWriter indexWriter, final String line) {
        String[] tokens = line.split("\t");

        int ID = Integer.parseInt(tokens[0]);
        String name = tokens[1];
        String alternatenames = tokens[3];

        Double latitude = -999999.0;
        try {
            latitude = Double.parseDouble(tokens[4]);
        } catch (NumberFormatException e) {
            latitude = OUT_OF_BOUNDS;
        }
        Double longitude = -999999.0;
        try {
            longitude = Double.parseDouble(tokens[5]);
        } catch (NumberFormatException e) {
            longitude = OUT_OF_BOUNDS;
        }

        Document doc = new Document();
        doc.add(new IntField("ID", ID, Field.Store.YES));
        doc.add(new TextField("name", name, Field.Store.YES));
        doc.add(new DoubleField("longitude", longitude, Field.Store.YES));
        doc.add(new DoubleField("latitude", latitude, Field.Store.YES));
        doc.add(new TextField("alternatenames", alternatenames, Field.Store.YES));
        try {
            indexWriter.addDocument(doc);
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

}