Java tutorial
/******************************************************************************* * Copyright 2012 University of Southern California * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * This code was developed by the Information Integration Group as part * of the Karma project at the Information Sciences Institute of the * University of Southern California. For more information, publications, * and related projects, please see: http://www.isi.edu/integration ******************************************************************************/ package edu.isi.karma.modeling.alignment.learner; import java.math.BigDecimal; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.Vector; import com.google.common.base.Function; import com.google.common.collect.Multimap; import com.google.common.collect.Multimaps; import edu.isi.karma.modeling.ModelingConfiguration; import edu.isi.karma.rep.alignment.ColumnNode; import edu.isi.karma.rep.alignment.InternalNode; import edu.isi.karma.rep.alignment.Node; import edu.isi.karma.util.RandomGUID; public class SteinerNodes implements Comparable<SteinerNodes> { private static final double MIN_CONFIDENCE = 1E-6; private Set<Node> nodes; private Map<ColumnNode, ColumnNode> mappingToSourceColumns; private List<Double> confidenceList; private List<CoherenceItem> coherenceList; private double confidence; private double coherence; private int frequency; private double score; private int semanticTypeCount; // class ValueComparator implements Comparator<String> { // // Map<String, Set<Node>> base; // public ValueComparator(Map<String, Set<Node>> base) { // this.base = base; // } // // public int compare(String a, String b) { // if (base.get(a).size() > base.get(b).size()) // return 1; // else if (base.get(a).size() < base.get(b).size()) // return -1; // else // return 0; // } // } public SteinerNodes() { this.nodes = new HashSet<Node>(); this.mappingToSourceColumns = new HashMap<ColumnNode, ColumnNode>(); this.semanticTypeCount = 0; this.confidenceList = new Vector<Double>(); this.coherenceList = new ArrayList<CoherenceItem>(); this.frequency = 0; // this.confidence = 1.0; this.confidence = 0.0; this.coherence = 0.0; this.score = 0.0; } public SteinerNodes(SteinerNodes steinerNodes) { this.nodes = new HashSet<Node>(steinerNodes.getNodes()); this.mappingToSourceColumns = new HashMap<ColumnNode, ColumnNode>(steinerNodes.getMappingToSourceColumns()); this.confidenceList = new Vector<Double>(steinerNodes.getConfidenceVector()); this.coherenceList = new ArrayList<CoherenceItem>(steinerNodes.getCoherenceList()); this.frequency = steinerNodes.getFrequency(); this.confidence = steinerNodes.getConfidence(); this.coherence = steinerNodes.getCoherence(); this.semanticTypeCount = steinerNodes.getSemanticTypeCount(); this.score = steinerNodes.getScore(); } public Set<Node> getNodes() { return Collections.unmodifiableSet(this.nodes); } public Map<ColumnNode, ColumnNode> getMappingToSourceColumns() { return mappingToSourceColumns; } public int getSemanticTypeCount() { return semanticTypeCount; } public boolean addNodes(ColumnNode sourceColumn, InternalNode n1, ColumnNode n2, double confidence) { if (this.nodes.contains(n1) && this.nodes.contains(n2)) return false; this.semanticTypeCount++; this.nodes.add(n1); this.nodes.add(n2); this.mappingToSourceColumns.put(n2, sourceColumn); if (confidence <= 0 || confidence > 1) confidence = MIN_CONFIDENCE; this.confidenceList.add(confidence); this.frequency += n1.getModelIds() == null ? 0 : n1.getModelIds().size(); this.frequency += n2.getModelIds() == null ? 0 : n2.getModelIds().size(); this.computeCoherenceList(); this.computeCoherenceValue(); this.computeConfidenceValue(); this.computeScore(); return true; } public List<Double> getConfidenceVector() { return Collections.unmodifiableList(this.confidenceList); } public int getNodeCount() { return this.nodes.size(); } public double getScore() { return this.score; } public List<CoherenceItem> getCoherenceList() { return Collections.unmodifiableList(this.coherenceList); } public int getFrequency() { return frequency; } public double getConfidence() { return confidence; } public double getCoherence() { return coherence; } // private int computeFrequency() { // int frequency = 0; // for (Node n : this.nodes) // frequency += n.getPatternIds().size(); // return frequency; // } // private double computeConfidenceValue() { // // if (this.confidenceList.size() == 1) // return 1e-10; // // double confidence = 1.0; // // for (double d : this.confidenceList) { // if (d == 0) // confidence *= 1e-10; // else // confidence *= d; // } // // return confidence; // } private void computeConfidenceValue() { double sum = 0.0; double mult = 1.0; int count = 0; for (Double d : this.confidenceList) { if (d != null) { count++; sum += d.doubleValue(); mult *= d == 0.0 ? 0.1 : d.doubleValue(); } } this.confidence = mult; this.confidence = sum / (double) count; } private void computeCoherenceList() { if (nodes == null || nodes.size() == 0) return; Map<String, Integer> patternSize = new HashMap<String, Integer>(); Map<String, String> patternGuid = new HashMap<String, String>(); int guidSize = new RandomGUID().toString().length(); for (Node n : nodes) { for (String p : n.getModelIds()) { Integer size = patternSize.get(p); if (size == null) patternSize.put(p, 1); else patternSize.put(p, ++size); if (!patternGuid.containsKey(p)) { String guid = new RandomGUID().toString(); patternGuid.put(p, guid); } } } // find the maximum pattern size int maxPatternSize = 0; for (Entry<String, Integer> entry : patternSize.entrySet()) { if (entry.getValue().intValue() > maxPatternSize) maxPatternSize = entry.getValue().intValue(); } List<String> listOfNodesLargestPatterns = new ArrayList<String>(); for (Node n : nodes) { List<String> patternIds = new ArrayList<String>(n.getModelIds()); Collections.sort(patternIds); String[] nodeMaxPatterns = new String[maxPatternSize]; Arrays.fill(nodeMaxPatterns, ""); for (String p : patternIds) { int size = patternSize.get(p).intValue(); nodeMaxPatterns[size - 1] += patternGuid.get(p); } for (int i = maxPatternSize - 1; i >= 0; i--) { if (nodeMaxPatterns[i] != null && nodeMaxPatterns[i].trim().length() > 0) { listOfNodesLargestPatterns.add(nodeMaxPatterns[i]); break; } } } Function<String, String> stringEqualiy = new Function<String, String>() { @Override public String apply(final String s) { return s; } }; Multimap<String, String> index = Multimaps.index(listOfNodesLargestPatterns, stringEqualiy); this.coherenceList.clear(); int x, y; for (String s : index.keySet()) { if (s.trim().length() == 0) continue; x = index.get(s).size(); y = x > 0 ? index.get(s).iterator().next().length() / guidSize : 0; CoherenceItem ci = new CoherenceItem(x, y); this.coherenceList.add(ci); } Collections.sort(this.coherenceList); } private void computeCoherenceValue() { BigDecimal value = BigDecimal.ZERO; BigDecimal denominator = BigDecimal.ONE; BigDecimal factor = new BigDecimal(100); BigDecimal b; double normalizedCoherence; for (CoherenceItem ci : this.coherenceList) { normalizedCoherence = (double) ci.getX() / (double) (this.getNodeCount()); normalizedCoherence *= 100; normalizedCoherence = (double) ((int) normalizedCoherence); denominator = denominator.multiply(factor); b = new BigDecimal(normalizedCoherence); // b = new BigDecimal(ci.getDouble()); b = b.divide(denominator); value = value.add(b); } this.coherence = value.doubleValue(); } private double getNormalizedSizeReduction() { int minSize = this.semanticTypeCount; int maxSize = this.semanticTypeCount * 2; //feature scaling: (x - min) / (max - min) // here: x: reduction in size --- min reduction: 0 --- max reduction: maxSize - minSize return (double) (maxSize - this.getNodeCount()) / (double) (maxSize - minSize); } private double getNormalizedConfidence() { return getConfidence(); } private double getHarmonicMean(double[] input) { double result = 0.0; if (input == null) return result; double min = 1E-6; double sum = 0.0; for (double d : input) { if (d <= 0.0) d = min; sum += 1.0 / d; } if (sum == 0.0) return result; result = (double) input.length / sum; return result; } private double getArithmeticMean(double[] input) { double result = 0.0; if (input == null) return 0.0; double sum = 0.0; for (double d : input) { if (d < 0.0) d = 0.0; sum += d; } result = sum / (double) input.length; return result; } private void computeScore() { double confidence = this.getNormalizedConfidence(); double sizeReduction = this.getNormalizedSizeReduction(); double coherence = this.getCoherence(); //int frequency = this.getFrequency(); double alpha = ModelingConfiguration.getScoringConfidenceCoefficient(); double beta = ModelingConfiguration.getScoringCoherenceSCoefficient(); double gamma = ModelingConfiguration.getScoringSizeCoefficient(); // // this.score = alpha * coherence + // beta * distanceToMaxSize + // gamma * confidence; double[] measures = new double[3]; measures[0] = alpha * confidence; measures[1] = beta * coherence; measures[2] = gamma * sizeReduction; // this.score = sizeReduction; // this.score = coherence; // this.score = confidence; this.score = getHarmonicMean(measures); this.score = getArithmeticMean(measures); } @Override public int compareTo(SteinerNodes target) { double score1 = this.getScore(); double score2 = target.getScore(); if (score1 < score2) return 1; else if (score1 > score2) return -1; else return 0; } private static double roundTwoDecimals(double d) { DecimalFormat twoDForm = new DecimalFormat("#.##"); return Double.valueOf(twoDForm.format(d)); } public String getScoreDetailsString() { // this.computeCoherenceList(); StringBuffer sb = new StringBuffer(); // if (this.nodes != null) // for (Node n : this.nodes) { // if (n instanceof InternalNode) // sb.append(n.getLocalId()); // else { // if (mappingToSourceColumns.containsKey((ColumnNode)n)) // sb.append(mappingToSourceColumns.get((ColumnNode)n).getColumnName() ); // else // sb.append( ((ColumnNode)n).getColumnName() ); // } // sb.append("|"); // } // sb.append("\n"); sb.append("\n"); sb.append("coherence list: "); for (CoherenceItem ci : this.coherenceList) { sb.append("(" + ci.getX() + "," + ci.getY() + ")"); } // sb.append("\n"); sb.append("--- coherence value: " + this.coherence); sb.append("\n"); sb.append("size: " + this.getNodeCount() + ", max size: " + (this.semanticTypeCount * 2) + "---" + "normalized size reduction: " + roundTwoDecimals(this.getNormalizedSizeReduction())); sb.append("\n"); sb.append("confidence list: ("); for (Double cf : this.confidenceList) { if (cf != null) sb.append(roundTwoDecimals(cf.doubleValue()) + ","); } sb.append(") --- "); sb.append("normalized confidence: " + roundTwoDecimals(this.getNormalizedConfidence())); sb.append("\n"); // sb.append("total number of patterns: " + this.frequency); // sb.append("\n"); sb.append("final score: " + roundTwoDecimals(this.getScore()) + " - [arithmetic mean]"); sb.append("\n"); return sb.toString(); } }