ctrus.pa.bow.DefaultBagOfWords.java Source code

Java tutorial

Introduction

Here is the source code for ctrus.pa.bow.DefaultBagOfWords.java

Source

/*******************************************************************************
 * Copyright (c) 2015, 2016  Naveen Kulkarni
 *
 * This file is part of Bag of Words program. 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Contributors:
 *     Naveen Kulkarni (naveen.kulkarni@research.iiit.ac.in)
 *     
 *******************************************************************************/

package ctrus.pa.bow;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Collection;

import org.apache.commons.cli.MissingOptionException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.filefilter.DirectoryFileFilter;
import org.apache.commons.io.filefilter.WildcardFileFilter;

import ctrus.pa.bow.core.BOWOptions;
import ctrus.pa.bow.core.UnWeightedBagOfWords;
import ctrus.pa.bow.core.Vocabulary;
import ctrus.pa.util.CtrusHelper;

/**
 * Base class for bag-of-words generators that write their results below an
 * output directory, either as one file per document or appended to a single
 * combined file.
 *
 * <p>Behaviour is driven by the {@link BOWOptions} handed to
 * {@link #setup(BOWOptions)}; subclasses add their own initialisation by
 * implementing {@link #setup()}.
 *
 * <p>NOTE(review): in single-file mode the combined output stream is opened in
 * {@link #setup(BOWOptions)} and is never explicitly closed by this class.
 */
public abstract class DefaultBagOfWords extends UnWeightedBagOfWords {

    /** Output directory used when no output directory option is supplied. */
    protected static final String DEFAULT_OUTPUT_DIR = "output";
    /** Name of the combined output file used when the single-file option carries no value. */
    protected static final String DEFAULT_OUTPUT_FILE = "bow.txt";
    /** File name the term vocabulary is written to by {@link #printVocabulary()}. */
    protected static final String DEFAULT_TERM_VOCAB_FILE = "term_vocab.txt";
    /** File name the term frequencies are written to by {@link #printVocabulary()}. */
    protected static final String DEFAULT_TERM_FREQ_FILE = "vocab_freq.txt";
    /** File name the document vocabulary is written to by {@link #printVocabulary()}. */
    protected static final String DEFAULT_DOC_VOCAB_FILE = "doc.txt";

    /** Options in effect; assigned by {@link #setup(BOWOptions)}. */
    protected BOWOptions _options = null;
    /** Directory all output is written below; assigned by {@link #setupOutputDir()}. */
    protected File _outputDir = null;

    /** Combined output stream; only opened when single-file output is requested. */
    private OutputStream _out = null;
    /** Whether every document is appended to one combined file. */
    private boolean _singleFileOut = false;

    protected DefaultBagOfWords() {
    }

    /**
     * Subclass hook for additional initialisation. Invoked as the last step of
     * {@link #setup(BOWOptions)}, after the options and output location are in place.
     */
    protected abstract void setup();

    /**
     * Stores the options, prepares the output directory, opens the combined
     * output file when single-file output is requested, and finally runs the
     * subclass hook {@link #setup()}.
     *
     * @param options options controlling this bag-of-words run
     * @throws IllegalStateException if the combined output file cannot be opened
     */
    public void setup(BOWOptions options) {
        this._options = options;

        // Prepare the output directory
        setupOutputDir();
        CtrusHelper.printToConsole("Output folder - " + _outputDir.getAbsolutePath());

        // Prepare the single output file
        _singleFileOut = hasOption(DefaultOptions.OUTPUT_SINGLE_FILE);
        if (_singleFileOut) {
            String singleOutputFileName;
            try {
                singleOutputFileName = getOption(DefaultOptions.OUTPUT_SINGLE_FILE);
            } catch (MissingOptionException e) {
                singleOutputFileName = DEFAULT_OUTPUT_FILE;
            }
            File singleOutputFile = new File(_outputDir, singleOutputFileName);
            try {
                _out = new FileOutputStream(singleOutputFile);
            } catch (FileNotFoundException e) {
                // Fail here with the real cause; continuing would leave _out null and
                // surface later as an unrelated NullPointerException in writeToOutput.
                throw new IllegalStateException(
                        "Unable to open output file " + singleOutputFile.getAbsolutePath(), e);
            }
        }

        // Perform additional, subclass-specific setup
        setup();
    }

    /**
     * Resolves the output directory from the options (falling back to
     * {@link #DEFAULT_OUTPUT_DIR}) and creates it if it does not exist yet.
     */
    protected void setupOutputDir() {
        _outputDir = new File(resolveOutputDirName());
        if (!_outputDir.exists()) {
            _outputDir.mkdirs();
        }
    }

    /**
     * Returns the configured output directory name, or {@link #DEFAULT_OUTPUT_DIR}
     * when the option lookup reports it as missing.
     */
    private String resolveOutputDirName() {
        try {
            return getOption(DefaultOptions.OUTPUT_DIR);
        } catch (MissingOptionException ex) {
            return DEFAULT_OUTPUT_DIR;
        }
    }

    /**
     * Looks up the value of an option.
     *
     * @param key option name
     * @return the value reported by the underlying options object
     * @throws MissingOptionException propagated from the underlying options object
     */
    protected String getOption(String key) throws MissingOptionException {
        return _options.getOption(key);
    }

    /**
     * Tells whether an option is present.
     *
     * @param key option name
     * @return {@code true} if the underlying options object reports the option
     */
    protected boolean hasOption(String key) {
        return _options.hasOption(key);
    }

    /**
     * Writes the collected terms for one document via {@code writeTo}.
     *
     * <p>In single-file mode the document id followed by {@code =} is appended
     * to the combined stream before the terms. Otherwise a file named after the
     * document (with {@code <} and {@code >} replaced by {@code [} and
     * {@code ]}) is created in the output directory and closed again once
     * written, even if writing fails.
     *
     * @param doc document id; doubles as the file name in per-document mode
     * @throws IOException if the output cannot be opened or written
     */
    protected void writeToOutput(String doc) throws IOException {
        if (_singleFileOut) {
            IOUtils.write(doc + "=", _out);
            writeTo(_out);
        } else {
            // replace special characters in the file name
            String fileName = doc.replace('<', '[').replace('>', ']');

            OutputStream docOut = new FileOutputStream(new File(_outputDir, fileName));
            try {
                writeTo(docOut);
            } finally {
                // Release the file handle even when writeTo throws
                docOut.close();
            }
        }
    }

    /**
     * Derives a document id that is not yet known to the vocabulary.
     *
     * <p>The id is {@code name} itself when the preserve-doc-id option is set,
     * otherwise the string form of {@code CtrusHelper.uniqueId(name)}. While the
     * vocabulary already has a document with that id, the id is recomputed from
     * {@code name_1}, {@code name_2}, ... until an unused one is found.
     *
     * @param name document name
     * @return an id for which the vocabulary reports no existing document
     */
    protected String getDocumentId(String name) {
        boolean preserveName = _options.hasOption(DefaultOptions.PRESERVE_DOC_ID);
        String did = preserveName ? name : CtrusHelper.uniqueId(name).toString();

        // Handle duplicate document names: duplicates are appended with a count
        Vocabulary voc = Vocabulary.getInstance(_options);
        int count = 1;
        while (voc.hasDocument(did)) {
            String candidate = name + "_" + count;
            did = preserveName ? candidate : CtrusHelper.uniqueId(candidate).toString();
            count++;
        }
        return did;
    }

    /**
     * Lists the files below the configured source directory, descending into
     * sub-directories, whose names match the given wildcard.
     *
     * @param wildCard file name wildcard handed to {@link WildcardFileFilter}
     * @return the matching files
     * @throws MissingOptionException if the source directory option is missing or
     *         does not point to an existing directory
     */
    protected Collection<File> getSourceDocuments(String wildCard) throws MissingOptionException {
        File sourceDir = new File(_options.getOption(DefaultOptions.SOURCE_DIR));
        CtrusHelper.printToConsole("Choosen source folder - " + sourceDir.getAbsolutePath());
        // isDirectory() also rejects a path that exists but is a regular file,
        // which FileUtils.listFiles would otherwise refuse with an unchecked exception.
        if (!sourceDir.isDirectory()) {
            throw new MissingOptionException("Unable to find source directory!");
        }
        return FileUtils.listFiles(sourceDir, new WildcardFileFilter(wildCard), DirectoryFileFilter.DIRECTORY);
    }

    /**
     * Writes the term vocabulary, term frequencies and document vocabulary to
     * their default file names in the output directory. Does nothing unless the
     * print-vocabulary option is set.
     *
     * <p>The directory prepared by {@link #setupOutputDir()} is used when
     * available, so the vocabulary ends up next to the document output;
     * otherwise the directory is resolved from the options with the same default.
     *
     * @throws IOException if any of the files cannot be opened or written
     */
    public void printVocabulary() throws IOException {
        if (!_options.hasOption(DefaultOptions.PRINT_VOCABULARY)) {
            return;
        }

        File outputDir = (_outputDir != null) ? _outputDir : new File(resolveOutputDirName());

        // The console message shows the directory with a trailing separator
        String outputDirLabel = outputDir.getPath();
        if (!outputDirLabel.endsWith(File.separator)) {
            outputDirLabel = outputDirLabel + File.separator;
        }
        CtrusHelper.printToConsole("Writing vocabulary to " + outputDirLabel + " ...");

        // Write terms
        FileOutputStream termOut = FileUtils.openOutputStream(new File(outputDir, DEFAULT_TERM_VOCAB_FILE));
        try {
            Vocabulary.getInstance(_options).writeTermVocabularyTo(termOut);
        } finally {
            termOut.close();
        }

        // Write frequency
        FileOutputStream freqOut = FileUtils.openOutputStream(new File(outputDir, DEFAULT_TERM_FREQ_FILE));
        try {
            Vocabulary.getInstance(_options).writeTermFrequencyTo(freqOut);
        } finally {
            freqOut.close();
        }

        // Write doc
        FileOutputStream docOut = FileUtils.openOutputStream(new File(outputDir, DEFAULT_DOC_VOCAB_FILE));
        try {
            Vocabulary.getInstance(_options).writeDocVocabularyTo(docOut);
        } finally {
            docOut.close();
        }
    }
}