// Java examples for java.lang.Math algorithms:
// computing information gain.
//package com.java2s;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Static helpers that compute entropy and information gain over rows of
 * delimited text.
 *
 * <p>Each row is an un-split string; columns are obtained by splitting the row
 * on a separator. Row ids are 0-based indexes into the row list, while column
 * indexes are 1-based.
 *
 * <p>The separator is handed to {@link String#split(String, int)} and is
 * therefore interpreted as a regular expression; callers that use a regex
 * metacharacter (for example {@code |} or {@code .}) as a literal delimiter
 * must escape it themselves.
 */
public class Main {

    /**
     * Computes the information gain obtained by partitioning the given rows on
     * one feature column.
     *
     * <p>The rows are grouped by their value in the feature column; the entropy
     * of the target column is computed for every group, weighted by the group's
     * share of {@code rowIds}, and the weighted sum is subtracted from
     * {@code entropy}.
     *
     * @param entropy
     *            entropy of the target column before the split, as supplied by
     *            the caller
     * @param inputData
     *            list of un-split rows, each separated by {@code separator}
     * @param rowIds
     *            0-based indexes into {@code inputData} of the rows to consider
     * @param currentFeatureIndex
     *            1-based index of the feature column to partition on
     * @param targetColumnIndex
     *            1-based index of the class-label column
     * @param separator
     *            column delimiter, interpreted as a regular expression
     * @return {@code entropy} minus the weighted entropy of the partitions
     */
    public static double computeInformationGain(double entropy,
            List<String> inputData, List<Integer> rowIds,
            int currentFeatureIndex, int targetColumnIndex, String separator) {
        Map<String, List<Integer>> featureRowDistribution = getFetureRowIdsDistribution(
                inputData, rowIds, currentFeatureIndex, separator);

        int totalRows = rowIds.size();
        double weightedEntropy = 0.0;
        for (List<Integer> rowIdsForValue : featureRowDistribution.values()) {
            double weightFactor = ((double) rowIdsForValue.size()) / totalRows;
            weightedEntropy = weightedEntropy
                    + (weightFactor * computeEntropy(inputData, rowIdsForValue,
                            targetColumnIndex, separator));
        }
        return entropy - weightedEntropy;
    }

    /**
     * Groups row ids by the value each row holds in the given feature column.
     *
     * @param inputData
     *            list of un-split rows, each separated by {@code separator}
     * @param rowIds
     *            0-based indexes into {@code inputData} of the rows to group
     * @param currentFeatureIndex
     *            1-based index of the feature column
     * @param separator
     *            column delimiter, interpreted as a regular expression
     * @return a map from each distinct feature value to the row ids (in the
     *         order they appear in {@code rowIds}) that carry that value; an
     *         empty column is represented by the empty string
     */
    public static Map<String, List<Integer>> getFetureRowIdsDistribution(
            List<String> inputData, List<Integer> rowIds,
            int currentFeatureIndex, String separator) {
        Map<String, List<Integer>> classRowDistribution = new HashMap<>();
        for (int rowId : rowIds) {
            String featureValue = columnValue(inputData.get(rowId), separator,
                    currentFeatureIndex);
            List<Integer> idsForValue = classRowDistribution.get(featureValue);
            if (idsForValue == null) {
                idsForValue = new ArrayList<Integer>();
                classRowDistribution.put(featureValue, idsForValue);
            }
            idsForValue.add(rowId);
        }
        return classRowDistribution;
    }

    /**
     * Computes the entropy of the class-label column over the selected rows.
     *
     * <p>The natural logarithm is used, so the result is expressed in nats.
     *
     * @param inputData
     *            list of un-split rows, each separated by {@code separator}
     * @param rowIds
     *            0-based indexes into {@code inputData} of the rows to consider
     * @param targetColumnIndex
     *            1-based index of the class-label column
     * @param separator
     *            column delimiter, interpreted as a regular expression
     * @return the entropy of the label distribution; {@code 0.0} when
     *         {@code rowIds} is empty or all rows share one label
     */
    public static double computeEntropy(List<String> inputData,
            List<Integer> rowIds, int targetColumnIndex, String separator) {
        Map<String, Integer> classDistribution = new HashMap<>();
        // Every call re-splits each row; caching the split rows would avoid
        // the redundant work if this ever becomes a bottleneck.
        for (int rowId : rowIds) {
            String classLabel = columnValue(inputData.get(rowId), separator,
                    targetColumnIndex);
            Integer seen = classDistribution.get(classLabel);
            classDistribution.put(classLabel, seen == null ? 1 : seen + 1);
        }

        int noOfDataPoints = rowIds.size();
        double entropy = 0.0;
        for (int countOfClass : classDistribution.values()) {
            double probability = ((double) countOfClass) / noOfDataPoints;
            entropy = entropy - probability * Math.log(probability);
        }
        return entropy;
    }

    /**
     * Returns the value of one column of a delimited row.
     *
     * <p>A negative split limit is used so that trailing empty columns are
     * kept; without it a row such as {@code "a,b,"} would yield only two
     * columns and reading the third would fail.
     */
    private static String columnValue(String row, String separator,
            int oneBasedColumnIndex) {
        return row.split(separator, -1)[oneBasedColumnIndex - 1];
    }
}