org.trnltk.apps.tokenizer.UniqueWordFinderApp.java Source code

Introduction

Here is the source code for org.trnltk.apps.tokenizer.UniqueWordFinderApp.java
Source

/*
 * Copyright  2013  Ali Ok (aliokATapacheDOTorg)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.trnltk.apps.tokenizer;

import com.google.common.base.Charsets;
import com.google.common.base.Splitter;
import com.google.common.io.Files;
import org.apache.commons.lang3.Validate;
import org.apache.commons.lang3.time.StopWatch;
import org.junit.runner.RunWith;
import org.trnltk.apps.commons.App;
import org.trnltk.apps.commons.AppRunner;

import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

/**
 * Finds unique words in the files of a folder and writes a word histogram.
 * <p>
 * It is not very efficient and requires a very large amount of memory if the files are big,
 * since every file is read completely and all partial histograms are kept in memory.
 * <p>
 * Use WordCountFinderApp instead, as it does multiple passes and then merges.
 *
 * @deprecated Use WordCountFinderApp
 */
@RunWith(AppRunner.class)
@Deprecated
public class UniqueWordFinderApp {

    /**
     * Number of worker threads; also the number of independent partial histograms.
     */
    private static final int NUMBER_OF_THREADS = 8;

    /**
     * Counts the whitespace separated words of every non-directory file in the source folder,
     * writes them to {@code wordHistogram.txt} (most frequent first) and prints some statistics.
     *
     * @throws InterruptedException if the calling thread is interrupted while waiting for the workers
     */
    @App("Goes thru tokenized files, finds unique words")
    public void findWordHistogram() throws InterruptedException {
        final StopWatch taskStopWatch = new StopWatch();
        taskStopWatch.start();

        final File parentFolder = new File("D:\\devl\\data\\aakindan");
        final File sourceFolder = new File(parentFolder, "src_split_tokenized");
        final List<File> filesToRead = listNonDirectoryFiles(sourceFolder);

        final List<Map<String, Integer>> countMaps = countWordsInParallel(filesToRead);

        System.out.println("Merging countMaps");
        final Map<String, Integer> mergeMap = mergeCounts(countMaps);

        System.out.println("Sorting mergeMaps");
        final List<Map.Entry<String, Integer>> sortedEntries = sortByCountDescending(mergeMap);

        System.out.println("Writing to file");
        final File outputFile = new File(parentFolder, "wordHistogram.txt");
        final long numberOfTokens = writeHistogram(sortedEntries, outputFile);

        taskStopWatch.stop();

        System.out.println("Total time :" + taskStopWatch.toString());
        System.out.println("Nr of tokens : " + numberOfTokens);
        System.out.println("Nr of unique tokens : " + sortedEntries.size());
    }

    /**
     * Returns the direct children of the given folder which are not directories.
     */
    private static List<File> listNonDirectoryFiles(File sourceFolder) {
        final File[] files = sourceFolder.listFiles();
        // listFiles() returns null when the folder does not exist or cannot be read
        Validate.notNull(files, "Unable to list files of folder %s", sourceFolder);

        final List<File> filesToRead = new ArrayList<File>();
        for (File file : files) {
            if (!file.isDirectory()) {
                filesToRead.add(file);
            }
        }
        return filesToRead;
    }

    /**
     * Builds one partial histogram per worker and blocks until all workers are done.
     * <p>
     * Files are distributed round-robin. Each partial map is handed to exactly one task, which
     * reads its files one after another; a HashMap is not safe for concurrent modification, so
     * no map may ever be shared between two tasks that can run at the same time.
     */
    private static List<Map<String, Integer>> countWordsInParallel(List<File> filesToRead)
            throws InterruptedException {
        final List<Map<String, Integer>> countMaps = new ArrayList<Map<String, Integer>>(NUMBER_OF_THREADS);
        final List<List<File>> fileGroups = new ArrayList<List<File>>(NUMBER_OF_THREADS);
        for (int i = 0; i < NUMBER_OF_THREADS; i++) {
            countMaps.add(new HashMap<String, Integer>(1000000));
            fileGroups.add(new ArrayList<File>());
        }
        for (int i = 0; i < filesToRead.size(); i++) {
            fileGroups.get(i % NUMBER_OF_THREADS).add(filesToRead.get(i));
        }

        final ThreadPoolExecutor pool = (ThreadPoolExecutor) Executors.newFixedThreadPool(NUMBER_OF_THREADS);
        for (int i = 0; i < NUMBER_OF_THREADS; i++) {
            final List<File> fileGroup = fileGroups.get(i);
            if (!fileGroup.isEmpty()) {
                pool.execute(new HistogramCommand(countMaps.get(i), fileGroup));
            }
        }

        pool.shutdown();
        // Termination of the pool also makes the workers' writes visible to this thread.
        while (!pool.awaitTermination(3000, TimeUnit.MILLISECONDS)) {
            // keep waiting until every worker has finished
        }
        return countMaps;
    }

    /**
     * Sums the partial histograms into a single word to count map.
     */
    private static Map<String, Integer> mergeCounts(List<Map<String, Integer>> countMaps) {
        // The merged map has at least as many keys as the biggest partial map.
        int initialCapacity = 16;
        for (Map<String, Integer> countMap : countMaps) {
            initialCapacity = Math.max(initialCapacity, countMap.size());
        }

        final Map<String, Integer> mergeMap = new HashMap<String, Integer>(initialCapacity);
        for (Map<String, Integer> countMap : countMaps) {
            for (Map.Entry<String, Integer> entry : countMap.entrySet()) {
                final String surface = entry.getKey();
                final Integer existingCount = mergeMap.get(surface);
                if (existingCount == null) {
                    mergeMap.put(surface, entry.getValue());
                } else {
                    mergeMap.put(surface, existingCount + entry.getValue());
                }
            }
        }
        return mergeMap;
    }

    /**
     * Returns the entries of the given map ordered by count (highest first); words with the
     * same count are ordered by their natural String order.
     */
    private static List<Map.Entry<String, Integer>> sortByCountDescending(Map<String, Integer> mergeMap) {
        final List<Map.Entry<String, Integer>> entries =
                new ArrayList<Map.Entry<String, Integer>>(mergeMap.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> a, Map.Entry<String, Integer> b) {
                final int byCountDescending = b.getValue().compareTo(a.getValue());
                if (byCountDescending != 0) {
                    return byCountDescending;
                }
                return a.getKey().compareTo(b.getKey());
            }
        });
        return entries;
    }

    /**
     * Writes one "word count" line per entry to the given file, encoded as UTF-8.
     * I/O problems are reported on stderr and do not abort the run.
     *
     * @return sum of the counts of the entries that were processed; a long, because the total
     *         number of tokens of a big corpus can exceed the int range
     */
    private static long writeHistogram(List<Map.Entry<String, Integer>> sortedEntries, File outputFile) {
        long numberOfTokens = 0;
        BufferedWriter bufferedWriter = null;
        try {
            bufferedWriter = Files.newWriter(outputFile, Charsets.UTF_8);
            for (Map.Entry<String, Integer> entry : sortedEntries) {
                numberOfTokens += entry.getValue();
                bufferedWriter.write(entry.getKey() + " " + entry.getValue() + "\n");
            }
        } catch (IOException e) {
            System.err.println("Unable to write file " + outputFile);
            e.printStackTrace();
        } finally {
            if (bufferedWriter != null) {
                try {
                    bufferedWriter.close();
                } catch (IOException e) {
                    System.err.println("Unable to close file ");
                    e.printStackTrace();
                }
            }
        }
        return numberOfTokens;
    }

    /**
     * Worker which counts the words of a group of files into a map that it owns exclusively
     * while it is running.
     */
    private static class HistogramCommand implements Runnable {
        private final Map<String, Integer> countMap;
        private final List<File> sourceFiles;

        /**
         * @param countMap    histogram to update; must not be used by any other running task
         * @param sourceFiles files to read, one after another
         */
        private HistogramCommand(Map<String, Integer> countMap, List<File> sourceFiles) {
            this.countMap = countMap;
            this.sourceFiles = sourceFiles;
        }

        @Override
        public void run() {
            final Splitter splitter = Splitter.on(' ').omitEmptyStrings().trimResults();
            for (File sourceFile : sourceFiles) {
                countWordsOfFile(splitter, sourceFile);
            }
        }

        /**
         * Adds the words of one file to the histogram. A file which cannot be read is reported
         * on stderr and skipped, so that the remaining files are still processed.
         */
        private void countWordsOfFile(Splitter splitter, File sourceFile) {
            System.out.println("Reading sourceFile " + sourceFile);
            try {
                final List<String> lines = Files.readLines(sourceFile, Charsets.UTF_8);
                for (int i = 0; i < lines.size(); i++) {
                    if (i % 10000 == 0) {
                        System.out.println("Source file " + sourceFile + "  line " + i);
                    }
                    for (String word : splitter.split(lines.get(i))) {
                        final Integer count = countMap.get(word);
                        if (count == null) {
                            countMap.put(word, 1);
                        } else {
                            countMap.put(word, count + 1);
                        }
                    }
                }
            } catch (IOException e) {
                System.err.println("Error reading file " + sourceFile);
                e.printStackTrace();
            }
        }
    }

}