// Java tutorial
/*
 * (c) 2005 David B. Bracewell
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.davidbracewell.ml.indexing.transform;

import com.davidbracewell.collection.CollectionUtils;
import com.davidbracewell.collection.Counter;
import com.davidbracewell.math.CompactCounter;
import com.davidbracewell.math.DoubleEntry;
import com.davidbracewell.ml.Feature;
import com.davidbracewell.ml.FeatureSet;
import com.davidbracewell.ml.Instance;
import com.google.common.base.Function;
import org.apache.commons.math3.util.FastMath;

import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.List;

/**
 * <p>A <code>Transform</code> that converts the values of a given set of features to TFIDF using the formula:
 * <pre>(0.5 + (0.5*TF)/maxTF) * log((N+1.0)/(DF+1.0))</pre> where TF is the value of a given feature in the
 * instance, maxTF is the largest value among the transformed features of that instance, N is the collection size
 * reported by <code>getCollectionSize()</code>, and DF is the number of collected instances in which the feature
 * had a non-zero value.</p>
 *
 * <p>Only features whose name is accepted by <code>shouldTransformFeature</code> and whose type is real are
 * counted and transformed.</p>
 *
 * @author David B. Bracewell
 */
public class TfIdfTransform extends RestrictedFeatureTransform {

  private static final long serialVersionUID = 1L;

  /**
   * Per-feature statistic. While collecting it holds document frequencies; {@link #finishImpl()} replaces each
   * count with its IDF weight.
   */
  Counter<Feature> docFrequency = new CompactCounter<>();

  /**
   * Maximum value among the transformed features of the instance most recently passed to
   * {@link #transformImpl(Instance)}.
   */
  double maxTF = 0;

  /**
   * Creates the transform.
   *
   * @param featurePrefix passed unchanged to the superclass constructor
   */
  public TfIdfTransform(String featurePrefix) {
    super(featurePrefix);
  }

  /**
   * Increments the document frequency of every eligible feature that has a non-zero value in the instance.
   *
   * @param instance the instance to gather statistics from
   */
  @Override
  public void collectImpl(Instance instance) {
    FeatureSet features = instance.getFeatures();
    for (DoubleEntry entry : CollectionUtils.asIterable(instance.nonZeroIterator())) {
      Feature feature = features.get(entry.index);
      if (isRealTfIdfFeature(feature)) {
        docFrequency.increment(feature);
      }
    }
  }

  /**
   * Rewrites, in place, the value of every eligible non-zero feature of the instance as
   * <code>(0.5 + 0.5 * value / maxTF) * docFrequency.get(feature)</code>.
   *
   * @param input the instance whose feature values are rewritten
   */
  @Override
  protected void transformImpl(Instance input) {
    FeatureSet features = input.getFeatures();
    double localMax = Double.NEGATIVE_INFINITY;

    // First pass: gather the eligible entries and their maximum. The entries are buffered so that the instance
    // is not modified while its non-zero iterator is still being consumed.
    List<DoubleEntry> validEntries = new ArrayList<>();
    for (DoubleEntry entry : CollectionUtils.asIterable(input.nonZeroIterator())) {
      // Same eligibility test as collectImpl, so only features that were counted are re-weighted.
      if (isRealTfIdfFeature(features.get(entry.index))) {
        localMax = Math.max(localMax, entry.value);
        validEntries.add(entry);
      }
    }

    // The field is still updated so that code reading it observes the same value as before.
    maxTF = localMax;

    // Second pass: apply the weighting.
    // NOTE(review): a feature never seen during collection presumably yields 0 from docFrequency.get and is
    // therefore zeroed out -- confirm against Counter's behaviour for missing keys.
    for (DoubleEntry entry : validEntries) {
      Feature feature = features.get(entry.index);
      input.set(entry.index, (0.5 + (0.5 * entry.value) / localMax) * docFrequency.get(feature));
    }
  }

  /**
   * Replaces every stored document frequency <code>df</code> with <code>log((N + 1) / (df + 1))</code>, where
   * <code>N</code> is <code>getCollectionSize()</code>.
   *
   * <p>NOTE(review): this appears to assume a single invocation after collection; a second call would apply the
   * logarithm to values that are already IDF weights -- confirm the lifecycle in the superclass.</p>
   */
  @Override
  protected void finishImpl() {
    docFrequency = docFrequency.adjustValues(new Function<Double, Double>() {
      @Nullable
      @Override
      public Double apply(@Nullable Double input) {
        return FastMath.log((getCollectionSize() + 1.0) / (input + 1.0));
      }
    });
  }

  /**
   * Tells whether a feature takes part in the TF-IDF computation: its name must be accepted by
   * <code>shouldTransformFeature</code> and its type must be real.
   */
  private boolean isRealTfIdfFeature(Feature feature) {
    return shouldTransformFeature(feature.getName()) && feature.getType().isReal();
  }

}//END OF TfIdfTransform