org.folg.places.tools.AnalyzePlaces.java Source code

Introduction

Here is the source code for org.folg.places.tools.AnalyzePlaces.java
Source

/*
 * Copyright 2012 Foundation for On-Line Genealogy, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.folg.places.tools;

import org.apache.commons.lang.math.NumberUtils;
import org.folg.places.standardize.Normalizer;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.xml.sax.SAXParseException;

import java.io.*;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * Command-line tool that reads a places file line by line and collects frequency counts of:
 * whole (trimmed, lower-cased) lines, the individual words and numbers on each line, the text
 * after the last comma of each line, and (optionally) the tokens produced by the
 * {@link Normalizer}. Every Nth line is also written out reversed.
 *
 * Results go to files inside the directory given with {@code -o}, or to standard output when
 * {@code -o} is omitted.
 *
 * User: RyanK
 * Date: 1/1/12
 */
public class AnalyzePlaces {

    @Option(name = "-i", required = true, usage = "places file in")
    private File placesIn;

    @Option(name = "-o", required = false, usage = "directory for analysis file output")
    private File analysisPlacesOut;

    // break apart words, so North Grinston is split into separate words
    private static final Pattern SPLIT_PATTERN = Pattern.compile("[, ]+");

    // every Nth non-empty line is written, character-reversed, to the reversed-words output
    private static final int REVERSE_EVERY_N = 10;

    // a progress message is printed every this many non-empty lines
    private static final int PROGRESS_EVERY_N = 5000;

    // Only every Nth non-empty line is run through the tokenizer. When the tokenizer is turned
    // on things get significantly slower, so raise this value to sample fewer lines.
    private static final int TOKENIZE_EVERY_N = 1;

    private final CountsCollector placesCountCC;
    private int totalPlacesCount;

    private final CountsCollector wordsCountCC;
    private int totalWordsCount;

    private final CountsCollector numbersCountCC;
    private int totalNumbersCount;

    private final CountsCollector endingsOfPlacesCC;
    private int endingsOfPlacesTotalCount;

    /**
     * Controls whether lines are additionally run through the Normalizer tokenizer.
     */
    private final boolean useTokenizer = true;

    // null when the tokenizer is disabled
    private final CountsCollector tokenizerPlacesCountCC;
    private int totalTokenizerPlacesCount;

    /**
     * Creates the analyzer with empty collectors; all running totals start at zero.
     */
    public AnalyzePlaces() {
        placesCountCC = new CountsCollector();
        wordsCountCC = new CountsCollector();
        numbersCountCC = new CountsCollector();
        endingsOfPlacesCC = new CountsCollector();
        tokenizerPlacesCountCC = useTokenizer ? new CountsCollector() : null;
    }

    /**
     * Reads the places file, accumulates all counts, then prints the totals and writes each
     * sorted count listing.
     *
     * @throws IOException if the input file cannot be read or an output file cannot be created
     */
    private void doMain() throws SAXParseException, IOException {

        Normalizer normalizer = useTokenizer ? Normalizer.getInstance() : null;

        int lineCount = 0;
        PrintWriter reversedWordsWriter = openWriter("reversedWords.txt");
        try {
            BufferedReader bufferedReader = new BufferedReader(new FileReader(placesIn));
            try {
                String nextLine;
                // readLine() returning null is the reliable end-of-file signal
                while ((nextLine = bufferedReader.readLine()) != null) {
                    // Locale.ROOT keeps lower-casing independent of the machine's default locale
                    nextLine = nextLine.trim().toLowerCase(Locale.ROOT);
                    if (nextLine.length() == 0) {
                        continue;
                    }

                    lineCount++;
                    if (lineCount % PROGRESS_EVERY_N == 0) {
                        System.out.println("indexing line " + lineCount);
                    }

                    placesCountCC.add(nextLine);
                    totalPlacesCount++;

                    for (String word : SPLIT_PATTERN.split(nextLine)) {
                        word = word.trim();
                        if (word.length() == 0) {
                            continue;
                        }

                        if (NumberUtils.isNumber(word)) {
                            numbersCountCC.add(word);
                            totalNumbersCount++;
                        } else {
                            wordsCountCC.add(word);
                            totalWordsCount++;
                        }
                    }

                    // text after the last comma; with no comma (index -1) this is the whole line
                    String lastWord = nextLine.substring(nextLine.lastIndexOf(',') + 1).trim();
                    if (lastWord.length() > 0) {
                        endingsOfPlacesCC.add(lastWord);
                        endingsOfPlacesTotalCount++;
                    }

                    if (lineCount % REVERSE_EVERY_N == 0) {
                        reversedWordsWriter.println(new StringBuilder(nextLine).reverse());
                    }

                    if (useTokenizer && lineCount % TOKENIZE_EVERY_N == 0) {
                        List<List<String>> levels = normalizer.tokenize(nextLine);
                        for (List<String> levelWords : levels) {
                            tokenizerPlacesCountCC.addAll(levelWords);
                            totalTokenizerPlacesCount += levelWords.size();
                        }
                    }
                }
            } finally {
                bufferedReader.close();
            }
        } finally {
            // without this the buffered reversed lines may never reach the file / console
            finishWriter(reversedWordsWriter);
        }

        System.out.println("total number of lines in files " + lineCount);

        writeReport(getPlacesCountCC(), totalPlacesCount, "places", "unique places", "placesCount.txt");
        writeReport(getWordsCountCC(), totalWordsCount, "words", "unique words", "wordsCount.txt");
        writeReport(getNumbersCountCC(), totalNumbersCount, "numbers", "unique numbers", "numbersCount.txt");
        writeReport(getEndingsOfPlacesCC(), endingsOfPlacesTotalCount, "endings", "unique endings",
                "endingsCount.txt");

        if (useTokenizer) {
            writeReport(getTokenizerPlacesCountCC(), totalTokenizerPlacesCount, "normalized words",
                    "normalized words", "normalizedWordsCount.txt");
        }
    }

    /**
     * Prints the two summary lines for one collector and writes its sorted listing.
     *
     * @param counts     collector to report on
     * @param total      running total of items added to the collector
     * @param totalNoun  noun used in the "Indexed a total of ..." line
     * @param uniqueNoun noun used in the "Found a total of ..." line
     * @param fileName   output file name, used only when an output directory was given
     */
    private void writeReport(CountsCollector counts, int total, String totalNoun, String uniqueNoun,
                             String fileName) throws SAXParseException, IOException {
        // throws clause mirrors doMain's: writeSorted's declared exceptions are not visible here
        System.out.println("Indexed a total of " + total + " " + totalNoun + ".");
        System.out.println("Found a total of " + counts.size() + " " + uniqueNoun + ".");

        PrintWriter out = openWriter(fileName);
        try {
            counts.writeSorted(false, 1, out);
        } finally {
            finishWriter(out);
        }
    }

    /**
     * Opens a writer on {@code fileName} inside the output directory, or a writer wrapping
     * standard output when no output directory was given.
     */
    private PrintWriter openWriter(String fileName) throws IOException {
        return analysisPlacesOut != null
                ? new PrintWriter(new File(analysisPlacesOut, fileName))
                : new PrintWriter(System.out);
    }

    /**
     * Flushes the writer and, if it is file-backed, closes it. A writer wrapping standard
     * output is only flushed so that System.out stays usable. Flushing or closing a
     * PrintWriter that is already closed is harmless.
     */
    private void finishWriter(PrintWriter writer) {
        writer.flush();
        if (analysisPlacesOut != null) {
            writer.close();
        }
    }

    /** @return collector of whole (trimmed, lower-cased) lines */
    public CountsCollector getPlacesCountCC() {
        return placesCountCC;
    }

    /** @return collector of the non-numeric words found on each line */
    public CountsCollector getWordsCountCC() {
        return wordsCountCC;
    }

    /** @return collector of the words that {@code NumberUtils.isNumber} accepted */
    public CountsCollector getNumbersCountCC() {
        return numbersCountCC;
    }

    /** @return collector of the text following the last comma of each line */
    public CountsCollector getEndingsOfPlacesCC() {
        return endingsOfPlacesCC;
    }

    /** @return collector of tokenizer output, or {@code null} when the tokenizer is disabled */
    public CountsCollector getTokenizerPlacesCountCC() {
        return tokenizerPlacesCountCC;
    }

    /**
     * Entry point: parses the command-line options and runs the analysis. On an argument
     * error the message and usage are printed to standard error.
     */
    public static void main(String[] args) throws SAXParseException, IOException {
        AnalyzePlaces self = new AnalyzePlaces();
        CmdLineParser parser = new CmdLineParser(self);
        try {
            parser.parseArgument(args);
            self.doMain();
        } catch (CmdLineException e) {
            System.err.println(e.getMessage());
            parser.printUsage(System.err);
        }
    }
}