net.sf.classifier4J.bayesian.WordProbability.java Source code

Java tutorial

Introduction

Here is the source code for net.sf.classifier4J.bayesian.WordProbability.java

Source

/*
 * ====================================================================
 * 
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2003 Nick Lothian. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer. 
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution, if
 *    any, must include the following acknowlegement:  
 *       "This product includes software developed by the 
 *        developers of Classifier4J (http://classifier4j.sf.net/)."
 *    Alternately, this acknowlegement may appear in the software itself,
 *    if and wherever such third-party acknowlegements normally appear.
 *
 * 4. The name "Classifier4J" must not be used to endorse or promote 
 *    products derived from this software without prior written 
 *    permission. For written permission, please contact   
 *    http://sourceforge.net/users/nicklothian/.
 *
 * 5. Products derived from this software may not be called 
 *    "Classifier4J", nor may "Classifier4J" appear in their names 
 *    without prior written permission. For written permission, please 
 *    contact http://sourceforge.net/users/nicklothian/.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 */

package net.sf.classifier4J.bayesian;

import net.sf.classifier4J.ICategorisedClassifier;
import net.sf.classifier4J.IClassifier;
import net.sf.classifier4J.util.*;
import net.sf.classifier4J.util.CompareToBuilder;
import net.sf.classifier4J.util.EqualsBuilder;
import net.sf.classifier4J.util.ToStringBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.Serializable;

/**
 * Represents the probability of a particular word within a category. The user
 * of this object can either:
 * <ol>
 *       <li>Set a specific probability for a particular word <I>or</I></li>
 *       <li>Define the matching and non-matching counts for the particular word. 
 *        This class then calculates the probability for you.</li>
 * </ol>
 * Setting an explicit probability marks both counts as undefined; setting or
 * registering a count recalculates the probability from the counts, and a
 * count that is still undefined is treated as zero observations.
 * <p>
 * Equality, ordering and hashing are based on the word and category only.
 * This class performs no synchronization of its own.
 * 
 * @author Nick Lothian
 * @author Peter Leschev
 */
public class WordProbability implements Comparable, Serializable {

    /** Sentinel stored in a count field when no count has been supplied. */
    private static final int UNDEFINED = -1;

    private String word = "";
    private String category = ICategorisedClassifier.DEFAULT_CATEGORY;

    private long matchingCount = UNDEFINED;
    private long nonMatchingCount = UNDEFINED;

    private double probability = IClassifier.NEUTRAL_PROBABILITY;

    /**
     * Creates an instance with an empty word, the default category and both
     * counts set to zero.
     */
    public WordProbability() {
        setMatchingCount(0);
        setNonMatchingCount(0);
    }

    /**
     * Creates an instance for the given word in the default category, with
     * both counts set to zero.
     * 
     * @param w the word
     */
    public WordProbability(String w) {
        setWord(w);
        setMatchingCount(0);
        setNonMatchingCount(0);
    }

    /**
     * Creates an instance for the given word in the given category, with both
     * counts set to zero. Note the argument order: category first, then word.
     * 
     * @param c the category
     * @param w the word
     */
    public WordProbability(String c, String w) {
        setCategory(c);
        setWord(w);
        setMatchingCount(0);
        setNonMatchingCount(0);
    }

    /**
     * Creates an instance for the given word with an explicit probability.
     * The counts are left undefined.
     * 
     * @param w the word
     * @param probability the probability to store as-is
     */
    public WordProbability(String w, double probability) {
        setWord(w);
        setProbability(probability);
    }

    /**
     * Creates an instance for the given word whose probability is calculated
     * from the supplied counts.
     * 
     * @param w the word
     * @param matchingCount the number of matches, must not be negative
     * @param nonMatchingCount the number of non-matches, must not be negative
     * @throws IllegalArgumentException if either count is negative
     */
    public WordProbability(String w, long matchingCount, long nonMatchingCount) {
        setWord(w);
        setMatchingCount(matchingCount);
        setNonMatchingCount(nonMatchingCount);
    }

    /**
     * @param w the word this probability refers to
     */
    public void setWord(String w) {
        this.word = w;
    }

    /**
     * @param category the category this probability refers to
     */
    public void setCategory(String category) {
        this.category = category;
    }

    /**
     * Stores an explicit probability and marks both counts as undefined, so
     * that {@link #getMatchingCount()} and {@link #getNonMatchingCount()}
     * throw until a count is supplied again.
     * 
     * @param probability the probability to store as-is
     */
    public void setProbability(double probability) {
        this.probability = probability;
        this.matchingCount = UNDEFINED;
        this.nonMatchingCount = UNDEFINED;
    }

    /**
     * Sets the matching count and recalculates the probability.
     * 
     * @param matchingCount the number of matches; zero is allowed
     * @throws IllegalArgumentException if matchingCount is negative
     */
    public void setMatchingCount(long matchingCount) {
        if (matchingCount < 0) {
            throw new IllegalArgumentException("matchingCount must not be negative: " + matchingCount);
        }
        this.matchingCount = matchingCount;
        calculateProbability();
    }

    /**
     * Sets the non-matching count and recalculates the probability.
     * 
     * @param nonMatchingCount the number of non-matches; zero is allowed
     * @throws IllegalArgumentException if nonMatchingCount is negative
     */
    public void setNonMatchingCount(long nonMatchingCount) {
        if (nonMatchingCount < 0) {
            throw new IllegalArgumentException("nonMatchingCount must not be negative: " + nonMatchingCount);
        }
        this.nonMatchingCount = nonMatchingCount;
        calculateProbability();
    }

    /**
     * Records one more match and recalculates the probability. If the
     * matching count is currently undefined (for example after
     * {@link #setProbability(double)}) counting starts from zero, so the
     * count becomes one.
     * 
     * @throws UnsupportedOperationException if the matching count is already
     *         Long.MAX_VALUE
     */
    public void registerMatch() {
        if (matchingCount == Long.MAX_VALUE) {
            throw new UnsupportedOperationException("Long.MAX_VALUE reached, can't register more matches");
        }
        // Never increment the UNDEFINED sentinel itself: -1 + 1 would silently
        // drop the observation being registered.
        matchingCount = countOrZero(matchingCount) + 1;
        calculateProbability();
    }

    /**
     * Records one more non-match and recalculates the probability. If the
     * non-matching count is currently undefined (for example after
     * {@link #setProbability(double)}) counting starts from zero, so the
     * count becomes one.
     * 
     * @throws UnsupportedOperationException if the non-matching count is
     *         already Long.MAX_VALUE
     */
    public void registerNonMatch() {
        if (nonMatchingCount == Long.MAX_VALUE) {
            throw new UnsupportedOperationException("Long.MAX_VALUE reached, can't register more non-matches");
        }
        // See registerMatch(): start from zero rather than from the sentinel.
        nonMatchingCount = countOrZero(nonMatchingCount) + 1;
        calculateProbability();
    }

    /**
     * Maps the UNDEFINED sentinel to zero observations and returns any other
     * count unchanged.
     */
    private static long countOrZero(long count) {
        return (count == UNDEFINED) ? 0 : count;
    }

    /**
     * Recalculates the probability from the counts: neutral when there are no
     * observations at all, the lower bound when there are only non-matches,
     * otherwise the matching ratio passed through
     * BayesianClassifier.normaliseSignificance.
     */
    private void calculateProbability() {
        // the logger can't be a field because this class might be serialized 
        Log log = LogFactory.getLog(this.getClass());

        String method = "calculateProbability() ";

        if (log.isDebugEnabled()) {
            log.debug(method + "START");

            log.debug(method + "matchingCount = " + matchingCount);
            log.debug(method + "nonMatchingCount = " + nonMatchingCount);
        }

        // A count that has not been supplied yet contributes no observations;
        // the raw sentinel value must never enter the arithmetic.
        long matches = countOrZero(matchingCount);
        long nonMatches = countOrZero(nonMatchingCount);

        double result;

        if (matches == 0) {
            if (nonMatches == 0) {
                result = IClassifier.NEUTRAL_PROBABILITY;
            } else {
                result = IClassifier.LOWER_BOUND;
            }
        } else {
            // Add in double arithmetic: a long sum of two large counts could
            // overflow and produce a negative denominator.
            double total = (double) matches + (double) nonMatches;
            result = BayesianClassifier.normaliseSignificance((double) matches / total);
        }

        probability = result;

        if (log.isDebugEnabled()) {
            log.debug(method + "END Calculated [" + probability + "]");
        }
    }

    /**
     * @return the explicitly set probability, or the probability last
     *         calculated from the counts
     */
    public double getProbability() {
        return probability;
    }

    /**
     * @return the matching count
     * @throws UnsupportedOperationException if the matching count is undefined
     */
    public long getMatchingCount() {

        if (matchingCount == UNDEFINED) {
            throw new UnsupportedOperationException("MatchingCount has not been defined");
        }

        return matchingCount;
    }

    /**
     * @return the non-matching count
     * @throws UnsupportedOperationException if the non-matching count is
     *         undefined
     */
    public long getNonMatchingCount() {

        if (nonMatchingCount == UNDEFINED) {
            throw new UnsupportedOperationException("nonMatchingCount has not been defined");
        }

        return nonMatchingCount;
    }

    /**
     * @return the word this probability refers to
     */
    public String getWord() {
        return word;
    }

    /**
     * @return the category this probability refers to
     */
    public String getCategory() {
        return category;
    }

    /**
     * Two instances are equal when their word and category are equal; counts
     * and probability are not compared.
     * 
     * @param o the object to compare with
     * @return true if o is a WordProbability with the same word and category
     */
    public boolean equals(Object o) {
        if (!(o instanceof WordProbability)) {
            return false;
        }
        WordProbability rhs = (WordProbability) o;
        return new EqualsBuilder().append(getWord(), rhs.getWord()).append(getCategory(), rhs.getCategory())
                .isEquals();
    }

    /**
     * Orders by category first, then by word.
     * 
     * @param o the object to compare with
     * @return the comparison result produced by CompareToBuilder
     * @throws ClassCastException if o is not a WordProbability
     * @throws NullPointerException if o is null
     */
    public int compareTo(Object o) {
        if (!(o instanceof WordProbability)) {
            throw new ClassCastException(o.getClass() + " is not a " + this.getClass());
        }
        WordProbability rhs = (WordProbability) o;
        return new CompareToBuilder().append(this.getCategory(), rhs.getCategory())
                .append(this.getWord(), rhs.getWord()).toComparison();
    }

    /**
     * @return a string listing word, category, probability and the raw count
     *         fields (an undefined count is shown as its sentinel value)
     */
    public String toString() {
        return new ToStringBuilder(this).append("word", word).append("category", category)
                .append("probability", probability).append("matchingCount", matchingCount)
                .append("nonMatchingCount", nonMatchingCount).toString();
    }

    /**
     * @return a hash built from the word and category, the same fields that
     *         {@link #equals(Object)} compares
     */
    public int hashCode() {
        // you pick a hard-coded, randomly chosen, non-zero, odd number
        // ideally different for each class
        return new HashCodeBuilder(17, 37).append(word).append(category).toHashCode();
    }
}