ctrus.pa.bow.core.UnWeightedBagOfWords.java Source code

Java tutorial

Introduction

Here is the source code for ctrus.pa.bow.core.UnWeightedBagOfWords.java

Source

/*******************************************************************************
 * Copyright (c) 2015, 2016  Naveen Kulkarni
 *
 * This file is part of Bag of Words program. 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Contributors:
 *     Naveen Kulkarni (naveen.kulkarni@research.iiit.ac.in)
 *     
 *******************************************************************************/

package ctrus.pa.bow.core;

import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.NotImplementedException;

import ctrus.pa.bow.term.TermFilteration;
import ctrus.pa.bow.term.TermTransformation;

public abstract class UnWeightedBagOfWords implements BagOfWords {

    private List<String> _terms = new ArrayList<String>();
    private TermTransformation _transformations = null;
    private TermFilteration _filterations = null;

    protected final UnWeightedBagOfWords setTransformations(TermTransformation transformations) {
        _transformations = transformations;
        return this;
    }

    public double getTermCount() {
        return _terms.size();
    }

    protected final UnWeightedBagOfWords setFilterations(TermFilteration filterations) {
        _filterations = filterations;
        return this;
    }

    public final <E extends Enum<E>> void writeTo(OutputStream out, E identifier) throws IOException {
        throw new NotImplementedException("WriteTo with identifier is not supported in UnWeightedBagOfWords");
    }

    public final void writeTo(OutputStream out) throws IOException {

        Iterator<String> terms = _terms.iterator();
        while (terms.hasNext())
            IOUtils.write(terms.next() + " ", out);
        IOUtils.write("\n", out);
        out.flush();
    }

    public final void addTerm(String word, double weight) {
        throw new NotImplementedException("Term weight is not supported in UnWeightedBagOfWords");
    }

    public final <E extends Enum<E>> void addTerm(String term, String doc, E identifier) {
        throw new NotImplementedException("Term with identifier is not supported in UnWeightedBagOfWords");
    }

    public final void addTerms(String[] terms, String doc) {
        for (String term : terms)
            addTerm(term, doc);
    }

    public final void addTerm(String term, String doc) {
        // Check if term is null or empty
        if (term == null || term.length() == 0)
            return;

        // Is it required to be added?   // First filtration
        if (_filterations.filter(term))
            return;

        // Transform the term
        String transformedTerm = _transformations.transform(term);

        // Add transformed term(s) to the list
        if (transformedTerm != null && transformedTerm.length() != 0) {
            if (transformedTerm.indexOf(" ") == -1) { // Not a compound term            
                if (!_filterations.filter(transformedTerm)) { // Second filtration                
                    _terms.add(transformedTerm);
                    // Add to vocabulary
                    Vocabulary.getInstance().addTerm(transformedTerm, doc);
                }
            } else { // compound term, split and add            
                String[] terms = transformedTerm.split(" ");
                for (String eachTerm : terms) {
                    addTerm(eachTerm, doc);
                }
            }
        }

    }

    public final void reset() {
        // Clean up all terms for recycling bag of words
        _terms.clear();
    }
}