di.uniba.it.tee2.text.TextDirIndex.java Source code

Java tutorial

Introduction

Here is the source code for di.uniba.it.tee2.text.TextDirIndex.java

Source

/**
 * Copyright (c) 2014, the TEE2 AUTHORS.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * Neither the name of the University of Bari nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007
 *
 */
package di.uniba.it.tee2.text;

import di.uniba.it.tee2.wiki.*;
import di.uniba.it.tee2.index.TemporalEventIndexingTS;
import di.uniba.it.tee2.util.Counter;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;

/**
 *
 * @author pierpaolo
 */
public class TextDirIndex {

    private int minTextLegth = 4000;

    private static final Logger logger = Logger.getLogger(TextDirIndex.class.getName());

    private TemporalEventIndexingTS tee;

    private int numberOfThreads = 4;

    public static BlockingQueue<TxtDoc> pages = new ArrayBlockingQueue<>(1000);

    public int getMinTextLegth() {
        return minTextLegth;
    }

    public void setMinTextLegth(int minTextLegth) {
        this.minTextLegth = minTextLegth;
    }

    public void init(String lang, String mainDir, int nt) throws Exception {
        tee = new TemporalEventIndexingTS();
        tee.init(lang, mainDir);
        this.numberOfThreads = nt;
    }

    public void build(String xmlDumpFilename, String language) throws Exception {
        build(new File(xmlDumpFilename), language);
    }

    private void processFile(File inFile) throws IOException, InterruptedException {
        if (inFile.isDirectory()) {
            File[] files = inFile.listFiles();
            for (File file : files) {
                processFile(file);
            }
        } else {
            BufferedReader reader = new BufferedReader(new FileReader(inFile));
            String title = "";
            StringBuilder content = new StringBuilder();
            if (reader.ready()) {
                title = reader.readLine();
            }
            while (reader.ready()) {
                content.append(reader.readLine()).append("\n");
            }
            pages.put(new TxtDoc(title, content.toString(), inFile.getName()));
        }
    }

    private void build(File startDir, String language) throws Exception {
        try {
            Counter.init();
            TxtDoc poisonPage = new TxtDoc("***POISON_PAGE***", "");
            List<Thread> threads = new ArrayList<>();
            for (int i = 0; i < numberOfThreads; i++) {
                Thread thread = new IndexThread(tee, language, minTextLegth);
                threads.add(thread);
                thread.start();
            }
            processFile(startDir);

            for (int i = 0; i < numberOfThreads; i++) {
                pages.put(poisonPage);
            }

            for (int i = 0; i < numberOfThreads; i++) {
                threads.get(i).join();
            }

            logger.log(Level.INFO, "Indexed pages: {0}", Counter.get());
            tee.close();

        } catch (IOException | InterruptedException ex) {
            logger.log(Level.SEVERE, "Error to build index", ex);
        }

    }

    static final Options options;

    static CommandLineParser cmdParser = new BasicParser();

    static {
        options = new Options();
        options.addOption("l", true, "language (italian, english)")
                .addOption("i", true, "the input directory (files to index)")
                .addOption("o", true, "the output directory")
                .addOption("n", true, "number of threads (optional, default 2)");
    }

    /**
     * language_0 starting_dir_1 output_dir_2 n_thread_3
     *
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        try {
            CommandLine cmd = cmdParser.parse(options, args);
            if (cmd.hasOption("l") && cmd.hasOption("i") && cmd.hasOption("o")) {
                int nt = Integer.parseInt(cmd.getOptionValue("n", "2"));
                TextDirIndex builder = new TextDirIndex();
                builder.init(cmd.getOptionValue("l"), cmd.getOptionValue("o"), nt);
                builder.build(cmd.getOptionValue("l"), cmd.getOptionValue("i"));
            } else {
                HelpFormatter helpFormatter = new HelpFormatter();
                helpFormatter.printHelp("Index a directory", options, true);
            }
        } catch (Exception ex) {
            Logger.getLogger(TextDirIndex.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

}