Java tutorial: ProcessInstances, a custom data container extending WEKA's Instances (PromniCAT clustering module)
/**
 * PromniCAT - Collection and Analysis of Business Process Models
 * Copyright (C) 2012 Cindy Fähnrich, Tobias Hoppe, Andrina Mascher
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package de.uni_potsdam.hpi.bpt.promnicat.analysisModules.clustering;

import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;

import weka.core.Attribute;
import weka.core.Debug.Random;
import weka.core.FastVector;
import weka.core.Instances;
import weka.core.UnassignedClassException;
import weka.core.Utils;
import weka.core.converters.ArffLoader.ArffReader;
import weka.core.converters.ConverterUtils.DataSource;

/**
 * Custom data element container for the {@link HierarchicalProcessClusterer}.
 * Extends WEKA's {@link Instances} and serves as the container for the
 * customized data elements of type {@link ProcessInstance}.
 *
 * @author Cindy Fähnrich
 */
public class ProcessInstances extends Instances {

	/** for serialization */
	static final long serialVersionUID = -19412345060742748L;

	/** stores the string attributes (and their weights) separately */
	private FastVector m_String_Attributes;

	/**
	 * Reads an ARFF file from a reader, and assigns a weight of one to each
	 * instance. Lets the index of the class attribute be undefined (negative).
	 *
	 * @param reader the reader
	 * @throws IOException if the ARFF file is not read successfully
	 */
	public ProcessInstances(/* @non_null@ */Reader reader) throws IOException {
		super("", new FastVector(), 0);
		ArffReader arff = new ArffReader(reader);
		ProcessInstances dataset = (ProcessInstances) arff.getData();
		initialize(dataset, dataset.numInstances());
		dataset.copyInstances(0, this, dataset.numInstances());
		compactify();
	}

	/**
	 * Reads the header of an ARFF file from a reader and reserves space for the
	 * given number of instances. Lets the class index be undefined (negative).
	 *
	 * @param reader the reader
	 * @param capacity the capacity
	 * @throws IllegalArgumentException if the header is not read successfully
	 *           or the capacity is negative.
	 * @throws IOException if there is a problem with the reader.
	 * @deprecated instead of using this method in conjunction with the
	 *             <code>readInstance(Reader)</code> method, one should use the
	 *             <code>ArffLoader</code> or <code>DataSource</code> class
	 *             instead.
	 * @see weka.core.converters.ArffLoader
	 * @see weka.core.converters.ConverterUtils.DataSource
	 */
	// @ requires capacity >= 0;
	// @ ensures classIndex() == -1;
	@Deprecated
	public ProcessInstances(/* @non_null@ */Reader reader, int capacity) throws IOException {
		super("", new FastVector(), 0);
		ArffReader arff = new ArffReader(reader, 0);
		ProcessInstances header = (ProcessInstances) arff.getStructure();
		initialize(header, capacity);
		m_Lines = arff.getLineNo();
	}

	/**
	 * Constructor copying all instances and references to the header
	 * information from the given set of instances.
	 *
	 * @param dataset the set to be copied
	 */
	public ProcessInstances(/* @non_null@ */ProcessInstances dataset) {
		this(dataset, dataset.numInstances());
		dataset.copyInstances(0, this, dataset.numInstances());
	}

	/**
	 * Constructor creating an empty set of instances. Copies references to the
	 * header information from the given set of instances. Sets the capacity of
	 * the set of instances to 0 if it is negative.
	 *
	 * @param dataset the instances from which the header information is to be taken
	 * @param capacity the capacity of the new dataset
	 */
	public ProcessInstances(/* @non_null@ */ProcessInstances dataset, int capacity) {
		super("", new FastVector(), 0);
		initialize(dataset, capacity);
	}

	/**
	 * Constructor creating a set of instances that contains the given instance
	 * and uses the given numeric attributes as header information.
	 *
	 * @param data the instance to be contained in the new dataset
	 * @param atts the numeric attributes of the new dataset
	 * @param capacity the capacity of the new dataset
	 */
	public ProcessInstances(/* @non_null@ */ProcessInstance data, FastVector atts, int capacity) {
		super("", atts, 0);
		m_Attributes = atts;
		m_String_Attributes = null;
		m_Instances = new FastVector(0);
		m_Instances.addElement(data);
	}

	/**
	 * Constructor creating a set of instances that contains the given instance
	 * and uses the given numeric and string attributes as header information.
	 *
	 * @param data the instance to be contained in the new dataset
	 * @param atts the numeric attributes of the new dataset
	 * @param strAtts the string attributes of the new dataset
	 * @param capacity the capacity of the new dataset
	 */
	public ProcessInstances(/* @non_null@ */ProcessInstance data, FastVector atts, FastVector strAtts, int capacity) {
		super("", atts, 0);
		m_Attributes = atts;
		m_String_Attributes = strAtts;
		m_Instances = new FastVector(0);
		m_Instances.addElement(data);
	}

	/**
	 * Initializes with the header information of the given dataset and sets the
	 * capacity, numeric, and string attributes of the set of instances.
	 *
	 * @param dataset the dataset to use as template
	 * @param capacity the number of rows to reserve
	 */
	protected void initialize(ProcessInstances dataset, int capacity) {
		if (capacity < 0)
			capacity = 0;

		// Strings only have to be "shallow" copied because
		// they can't be modified.
		m_ClassIndex = dataset.m_ClassIndex;
		m_RelationName = dataset.m_RelationName;
		m_Attributes = dataset.m_Attributes;
		m_String_Attributes = dataset.m_String_Attributes;
		m_Instances = new FastVector(capacity);
	}

	/**
	 * Creates a new set of instances by copying a subset of another set.
	 *
	 * @param source the set of instances from which a subset is to be created
	 * @param first the index of the first instance to be copied
	 * @param toCopy the number of instances to be copied
	 * @throws IllegalArgumentException if first and toCopy are out of range
	 */
	// @ requires 0 <= first;
	// @ requires 0 <= toCopy;
	// @ requires first + toCopy <= source.numInstances();
	public ProcessInstances(/* @non_null@ */ProcessInstances source, int first, int toCopy) {
		this(source, toCopy);

		if ((first < 0) || ((first + toCopy) > source.numInstances())) {
			throw new IllegalArgumentException("Parameters first and/or toCopy out " + "of range");
		}
		source.copyInstances(first, this, toCopy);
	}

	/**
	 * Creates an empty set of instances. Uses the given numeric and string
	 * attribute information.
	 * Sets the capacity of the set of instances to 0 if it is negative. Given
	 * attribute information must not be changed after this constructor has
	 * been used.
	 *
	 * @param name the name of the relation
	 * @param attInfo the numeric attribute information
	 * @param strAttInfo the string attribute information
	 * @param capacity the capacity of the set
	 */
	public ProcessInstances(/* @non_null@ */String name, /* @non_null@ */FastVector attInfo, FastVector strAttInfo,
			int capacity) {
		super("", new FastVector(), 0);

		// check whether the attribute names are unique
		HashSet<String> names = new HashSet<String>();
		StringBuffer nonUniqueNames = new StringBuffer();
		int max = 0;
		if (attInfo != null) {
			max = attInfo.size();
		}
		for (int i = 0; i < max; i++) {
			if (names.contains(((Attribute) attInfo.elementAt(i)).name())) {
				nonUniqueNames.append("'" + ((Attribute) attInfo.elementAt(i)).name() + "' ");
			}
			names.add(((Attribute) attInfo.elementAt(i)).name());
		}
		// compare against max so that a null attInfo does not cause a NullPointerException
		if (names.size() != max)
			throw new IllegalArgumentException(
					"Attribute names are not unique!" + " Causes: " + nonUniqueNames.toString());
		names.clear();

		m_RelationName = name;
		m_ClassIndex = -1;
		m_Attributes = attInfo;
		m_String_Attributes = strAttInfo;
		m_Instances = new FastVector(capacity);
	}

	/**
	 * Create a copy of the structure if the data has string or relational
	 * attributes, "cleanses" string types (i.e. doesn't contain references to
	 * the strings seen in the past) and all relational attributes.
	 *
	 * @return a copy of the instance structure.
	 */
	public ProcessInstances stringFreeStructure() {
		FastVector newAtts = new FastVector();
		for (int i = 0; i < m_Attributes.size(); i++) {
			Attribute att = (Attribute) m_Attributes.elementAt(i);
			if (att.type() == Attribute.STRING) {
				newAtts.addElement(new Attribute(att.name(), (FastVector) null, i));
			} else if (att.type() == Attribute.RELATIONAL) {
				newAtts.addElement(
						new Attribute(att.name(), new ProcessInstances((ProcessInstances) att.relation(), 0), i));
			}
		}
		if (newAtts.size() == 0) {
			return new ProcessInstances(this, 0);
		}
		FastVector atts = (FastVector) m_Attributes.copy();
		for (int i = 0; i < newAtts.size(); i++) {
			atts.setElementAt(newAtts.elementAt(i), ((Attribute) newAtts.elementAt(i)).index());
		}
		ProcessInstances result = new ProcessInstances(this, 0);
		result.m_Attributes = atts;
		return result;
	}

	/**
	 * Adds one instance to the end of the set. Shallow copies the instance
	 * before it is added. Increases the size of the dataset if it is not large
	 * enough. Does not check if the instance is compatible with the dataset.
	 * Note: String or relational values are not transferred.
	 *
	 * @param instance the instance to be added
	 */
	public void addInstance(/* @non_null@ */ProcessInstance instance) {
		ProcessInstance newInstance = (ProcessInstance) instance.copy();

		newInstance.setDataset(this);
		m_Instances.addElement(newInstance);
	}

	/**
	 * Checks if the given instance is compatible with this dataset. Only looks
	 * at the size of the instance and the ranges of the values for nominal and
	 * string attributes.
	 *
	 * @param instance the instance to check
	 * @return true if the instance is compatible with the dataset
	 */
	public /* @pure@ */boolean checkInstance(ProcessInstance instance) {
		if (instance.numAttributes() != numAttributes()) {
			return false;
		}
		if (instance.numStrAttributes() != numStrAttributes()) {
			return false;
		}
		for (int i = 0; i < numAttributes(); i++) {
			if (instance.isMissing(i)) {
				continue;
			} else if (attribute(i).isNominal() || attribute(i).isString()) {
				if (!(Utils.eq(instance.value(i), (double) (int) instance.value(i)))) {
					return false;
				} else if (Utils.sm(instance.value(i), 0) || Utils.gr(instance.value(i), attribute(i).numValues())) {
					return false;
				}
			}
		}
		return true;
	}

	/**
	 * Returns the number of string attributes.
	 *
	 * @return the number of string attributes as an integer
	 */
	// @ ensures \result == m_String_Attributes.size();
	public /* @pure@ */int numStrAttributes() {
		return m_String_Attributes.size();
	}

	/**
	 * Checks if two headers are equivalent.
	 *
	 * @param dataset another dataset
	 * @return true if the header of the given dataset is equivalent to this header
	 */
	public /* @pure@ */boolean equalHeaders(ProcessInstances dataset) {
		// Check class and all attributes
		if (m_ClassIndex != dataset.m_ClassIndex) {
			return false;
		}
		if (m_Attributes.size() != dataset.m_Attributes.size()) {
			return false;
		}
		for (int i = 0; i < m_Attributes.size(); i++) {
			if (!(attribute(i).equals(dataset.attribute(i)))) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Returns the first instance in the set.
	 *
	 * @return the first instance in the set
	 */
	// @ requires numInstances() > 0;
	public ProcessInstance getFirstInstance() {
		return (ProcessInstance) m_Instances.firstElement();
	}

	/**
	 * Returns the instance at the given position.
	 *
	 * @param index the instance's index (index starts with 0)
	 * @return the instance at the given position
	 */
	// @ requires 0 <= index;
	// @ requires index < numInstances();
	public ProcessInstance getInstance(int index) {
		return (ProcessInstance) m_Instances.elementAt(index);
	}

	/**
	 * Returns the last instance in the set.
	 *
	 * @return the last instance in the set
	 */
	// @ requires numInstances() > 0;
	public /* @non_null pure@ */ProcessInstance getLastInstance() {
		return (ProcessInstance) m_Instances.lastElement();
	}

	/**
	 * Returns the number of distinct values of a given attribute. The value
	 * 'missing' is not counted.
	 *
	 * @param attIndex the attribute (index starts with 0)
	 * @return the number of distinct values of a given attribute
	 */
	// @ requires 0 <= attIndex;
	// @ requires attIndex < numAttributes();
	public /* @pure@ */int numDistinctValues(int attIndex) {
		if (attribute(attIndex).isNumeric()) {
			double[] attVals = attributeToDoubleArray(attIndex);
			int[] sorted = Utils.sort(attVals);
			double prev = 0;
			int counter = 0;
			for (int i = 0; i < sorted.length; i++) {
				ProcessInstance current = getInstance(sorted[i]);
				if (current.isMissing(attIndex)) {
					break;
				}
				if ((i == 0) || (current.value(attIndex) > prev)) {
					prev = current.value(attIndex);
					counter++;
				}
			}
			return counter;
		}
		// same for string values: since string values cannot be sorted with
		// Utils.sort, collect the distinct, non-missing values in a set
		HashSet<String> distinctValues = new HashSet<String>();
		for (int i = 0; i < numInstances(); i++) {
			ProcessInstance current = getInstance(i);
			if (!current.isMissing(attIndex)) {
				distinctValues.add(current.stringValue(attIndex));
			}
		}
		return distinctValues.size();
	}

	/**
	 * Gets the value of all instances in this dataset for a particular string
	 * attribute. Useful in conjunction with Utils.sort to allow iterating
	 * through the dataset in sorted order for some attribute.
	 *
	 * @param index the index of the attribute.
	 * @return an array containing the value of the desired attribute for each
	 *         instance in the dataset.
	 */
	// @ requires 0 <= index && index < numAttributes();
	public /* @pure@ */String[] attributeToStringArray(int index) {
		String[] result = new String[numInstances()];
		for (int i = 0; i < result.length; i++) {
			result[i] = instance(i).stringValue(index);
		}
		return result;
	}

	/**
	 * Reads a single instance from the reader and appends it to the dataset.
	 * Automatically expands the dataset if it is not large enough to hold the
	 * instance. This method does not check for carriage return at the end of
	 * the line.
	 *
	 * @param reader the reader
	 * @return false if end of file has been reached
	 * @throws IOException if the information is not read successfully
	 * @deprecated instead of using this method in conjunction with the
	 *             <code>ProcessInstances(Reader, int)</code> constructor, one
	 *             should use the <code>ArffLoader</code> or
	 *             <code>DataSource</code> class instead.
	 * @see weka.core.converters.ArffLoader
	 * @see weka.core.converters.ConverterUtils.DataSource
	 */
	@Deprecated
	public boolean readInstance(Reader reader) throws IOException {
		ArffReader arff = new ArffReader(reader, this, m_Lines, 1);
		ProcessInstance inst = (ProcessInstance) arff.readInstance(arff.getData(), false);
		m_Lines = arff.getLineNo();
		if (inst != null) {
			addInstance(inst);
			return true;
		} else {
			return false;
		}
	}

	/**
	 * Creates a new dataset of the same size using random sampling with
	 * replacement.
	 *
	 * @param random a random number generator
	 * @return the new dataset
	 */
	public ProcessInstances resample(Random random) {
		ProcessInstances newData = new ProcessInstances(this, numInstances());
		while (newData.numInstances() < numInstances()) {
			newData.addInstance(getInstance(random.nextInt(numInstances())));
		}
		return newData;
	}

	/**
	 * Creates a new dataset of the same size using random sampling with
	 * replacement according to the current instance weights. The weights of the
	 * instances in the new dataset are set to one.
	 *
	 * @param random a random number generator
	 * @return the new dataset
	 */
	public ProcessInstances resampleWithWeights(Random random) {
		double[] weights = new double[numInstances()];
		for (int i = 0; i < weights.length; i++) {
			weights[i] = getInstance(i).weight();
		}
		return resampleWithWeights(random, weights);
	}

	/**
	 * Creates a new dataset of the same size using random sampling with
	 * replacement according to the given weight vector. The weights of the
	 * instances in the new dataset are set to one. The length of the weight
	 * vector has to be the same as the number of instances in the dataset, and
	 * all weights have to be positive.
	 *
	 * @param random a random number generator
	 * @param weights the weight vector
	 * @return the new dataset
	 * @throws IllegalArgumentException if the weights array is of the wrong
	 *           length or contains negative weights.
	 */
	public ProcessInstances resampleWithWeights(Random random, double[] weights) {
		if (weights.length != numInstances()) {
			throw new IllegalArgumentException("weights.length != numInstances.");
		}
		ProcessInstances newData = new ProcessInstances(this, numInstances());
		if (numInstances() == 0) {
			return newData;
		}
		double[] probabilities = new double[numInstances()];
		double sumProbs = 0, sumOfWeights = Utils.sum(weights);
		for (int i = 0; i < numInstances(); i++) {
			sumProbs += random.nextDouble();
			probabilities[i] = sumProbs;
		}
		Utils.normalize(probabilities, sumProbs / sumOfWeights);

		// Make sure that rounding errors don't mess things up
		probabilities[numInstances() - 1] = sumOfWeights;
		int k = 0;
		int l = 0;
		sumProbs = 0;
		while ((k < numInstances() && (l < numInstances()))) {
			if (weights[l] < 0) {
				throw new IllegalArgumentException("Weights have to be positive.");
			}
			sumProbs += weights[l];
			while ((k < numInstances()) && (probabilities[k] <= sumProbs)) {
				newData.addInstance(getInstance(l));
				newData.getInstance(k).setWeight(1);
				k++;
			}
			l++;
		}
		return newData;
	}

	/**
	 * Stratifies a set of instances according to its class values if the class
	 * attribute is nominal (so that afterwards a stratified cross-validation
	 * can be performed).
	 *
	 * @param numFolds the number of folds in the cross-validation
	 * @throws UnassignedClassException if the class is not set
	 */
	public void stratify(int numFolds) {
		if (numFolds <= 1) {
			throw new IllegalArgumentException("Number of folds must be greater than 1");
		}
		if (m_ClassIndex < 0) {
			throw new UnassignedClassException("Class index is negative (not set)!");
		}
		if (classAttribute().isNominal()) {
			// sort by class
			int index = 1;
			while (index < numInstances()) {
				ProcessInstance instance1 = getInstance(index - 1);
				for (int j = index; j < numInstances(); j++) {
					ProcessInstance instance2 = getInstance(j);
					if ((instance1.classValue() == instance2.classValue())
							|| (instance1.classIsMissing() && instance2.classIsMissing())) {
						swap(index, j);
						index++;
					}
				}
				index++;
			}
			stratStep(numFolds);
		}
	}

	/**
	 * Creates the test set for one fold of a cross-validation on the dataset.
	 *
	 * @param numFolds the number of folds in the cross-validation. Must be
	 *          greater than 1.
	 * @param numFold 0 for the first fold, 1 for the second, ...
	 * @return the test set as a set of weighted instances
	 * @throws IllegalArgumentException if the number of folds is less than 2 or
	 *           greater than the number of instances.
	 */
	// @ requires 2 <= numFolds && numFolds < numInstances();
	// @ requires 0 <= numFold && numFold < numFolds;
	public ProcessInstances testCV(int numFolds, int numFold) {
		int numInstForFold, first, offset;
		ProcessInstances test;

		if (numFolds < 2) {
			throw new IllegalArgumentException("Number of folds must be at least 2!");
		}
		if (numFolds > numInstances()) {
			throw new IllegalArgumentException("Can't have more folds than instances!");
		}
		numInstForFold = numInstances() / numFolds;
		if (numFold < numInstances() % numFolds) {
			numInstForFold++;
			offset = numFold;
		} else
			offset = numInstances() % numFolds;
		test = new ProcessInstances(this, numInstForFold);
		first = numFold * (numInstances() / numFolds) + offset;
		copyInstances(first, test, numInstForFold);
		return test;
	}

	/**
	 * Creates the training set for one fold of a cross-validation on the
	 * dataset.
	 *
	 * @param numFolds the number of folds in the cross-validation. Must be
	 *          greater than 1.
	 * @param numFold 0 for the first fold, 1 for the second, ...
	 * @return the training set
	 * @throws IllegalArgumentException if the number of folds is less than 2 or
	 *           greater than the number of instances.
	 */
	// @ requires 2 <= numFolds && numFolds < numInstances();
	// @ requires 0 <= numFold && numFold < numFolds;
	public ProcessInstances trainCV(int numFolds, int numFold) {
		int numInstForFold, first, offset;
		ProcessInstances train;

		if (numFolds < 2) {
			throw new IllegalArgumentException("Number of folds must be at least 2!");
		}
		if (numFolds > numInstances()) {
			throw new IllegalArgumentException("Can't have more folds than instances!");
		}
		numInstForFold = numInstances() / numFolds;
		if (numFold < numInstances() % numFolds) {
			numInstForFold++;
			offset = numFold;
		} else
			offset = numInstances() % numFolds;
		train = new ProcessInstances(this, numInstances() - numInstForFold);
		first = numFold * (numInstances() / numFolds) + offset;
		copyInstances(0, train, first);
		copyInstances(first + numInstForFold, train, numInstances() - first - numInstForFold);
		return train;
	}

	/**
	 * Creates the training set for one fold of a cross-validation on the
	 * dataset. The data is subsequently randomized based on the given random
	 * number generator.
	 *
	 * @param numFolds the number of folds in the cross-validation. Must be
	 *          greater than 1.
	 * @param numFold 0 for the first fold, 1 for the second, ...
	 * @param random the random number generator
	 * @return the training set
	 * @throws IllegalArgumentException if the number of folds is less than 2 or
	 *           greater than the number of instances.
	 */
	// @ requires 2 <= numFolds && numFolds < numInstances();
	// @ requires 0 <= numFold && numFold < numFolds;
	public ProcessInstances trainCV(int numFolds, int numFold, Random random) {
		ProcessInstances train = trainCV(numFolds, numFold);
		train.randomize(random);
		return train;
	}

	/**
	 * Copies instances from one set to the end of another one.
	 *
	 * @param from the position of the first instance to be copied
	 * @param dest the destination for the instances
	 * @param num the number of instances to be copied
	 */
	// @ requires 0 <= from && from <= numInstances() - num;
	// @ requires 0 <= num;
	protected void copyInstances(int from, /* @non_null@ */ProcessInstances dest, int num) {
		for (int i = 0; i < num; i++) {
			dest.addInstance(getInstance(from + i));
		}
	}

	/**
	 * Merges two sets of ProcessInstances together. The resulting set will have
	 * all the attributes of the first set plus all the attributes of the second
	 * set. The number of instances in both sets must be the same.
	 *
	 * @param first the first set of ProcessInstances
	 * @param second the second set of ProcessInstances
	 * @return the merged set of ProcessInstances
	 * @throws IllegalArgumentException if the datasets are not the same size
	 */
	public static ProcessInstances mergeInstances(ProcessInstances first, ProcessInstances second) {
		if (first.numInstances() != second.numInstances()) {
			throw new IllegalArgumentException("Instance sets must be of the same size");
		}

		// Create the vector of merged attributes
		FastVector newAttributes = new FastVector();
		for (int i = 0; i < first.numAttributes(); i++) {
			newAttributes.addElement(first.attribute(i));
		}
		for (int i = 0; i < second.numAttributes(); i++) {
			newAttributes.addElement(second.attribute(i));
		}
		FastVector newStrAttributes = new FastVector();
		for (int i = 0; i < first.numStrAttributes(); i++) {
			newStrAttributes.addElement(first.strAttribute(i));
		}
		for (int i = 0; i < second.numStrAttributes(); i++) {
			newStrAttributes.addElement(second.strAttribute(i));
		}

		// Create the set of ProcessInstances
		ProcessInstances merged = new ProcessInstances(first.relationName() + '_' + second.relationName(),
				newAttributes, newStrAttributes, first.numInstances());

		// Merge each instance
		for (int i = 0; i < first.numInstances(); i++) {
			merged.addInstance(first.getInstance(i).mergeInstance(second.getInstance(i)));
		}
		return merged;
	}

	/**
	 * Returns the string attribute at a given index.
	 *
	 * @param index the string attribute's index (index starts with 0)
	 * @return the string attribute at the given position
	 */
	// @ requires 0 <= index;
	// @ requires index < m_String_Attributes.size();
	// @ ensures \result != null;
	public /* @pure@ */Attribute strAttribute(int index) {
		return (Attribute) m_String_Attributes.elementAt(index);
	}

	/**
	 * Returns the instances.
	 *
	 * @return the instances
	 */
	public FastVector getInstances() {
		return m_Instances;
	}

	/**
	 * Returns the numeric attributes.
	 *
	 * @return the numeric attributes
	 */
	public FastVector getAttributes() {
		return m_Attributes;
	}

	/**
	 * Returns the string attributes.
	 *
	 * @return the string attributes
	 */
	public FastVector getStringAttributes() {
		return m_String_Attributes;
	}

	/**
	 * Method for testing this class.
	 *
	 * @param argv should contain one element: the name of an ARFF file
	 */
	// @ requires argv != null;
	// @ requires argv.length == 1;
	// @ requires argv[0] != null;
	public static void test(String[] argv) {
		ProcessInstances instances, secondInstances, train, test, empty;
		Random random = new Random(2);
		Reader reader;
		int start, num;
		FastVector testAtts, testVals;
		int i, j;

		try {
			if (argv.length > 1) {
				throw (new Exception("Usage: ProcessInstances [<filename>]"));
			}

			// Creating set of instances from scratch
			testVals = new FastVector(2);
			testVals.addElement("first_value");
			testVals.addElement("second_value");
			testAtts = new FastVector(2);
			testAtts.addElement(new Attribute("nominal_attribute", testVals));
			testAtts.addElement(new Attribute("numeric_attribute"));
			instances = new ProcessInstances("test_set", testAtts, new FastVector(), 10);
			instances.addInstance(new ProcessInstance(instances.numAttributes()));
			instances.addInstance(new ProcessInstance(instances.numAttributes()));
			instances.addInstance(new ProcessInstance(instances.numAttributes()));
			instances.setClassIndex(0);
			System.out.println("\nSet of instances created from scratch:\n");
			System.out.println(instances);

			if (argv.length == 1) {
				String filename = argv[0];
				reader = new FileReader(filename);

				// Read first five instances and print them
				System.out.println("\nFirst five instances from file:\n");
				instances = new ProcessInstances(reader, 1);
				instances.setClassIndex(instances.numAttributes() - 1);
				i = 0;
				while ((i < 5) && (instances.readInstance(reader))) {
					i++;
				}
				System.out.println(instances);

				// Read all the instances in the file
				reader = new FileReader(filename);
				instances = new ProcessInstances(reader);

				// Make the last attribute be the class
				instances.setClassIndex(instances.numAttributes() - 1);

				// Print header and instances.
				System.out.println("\nDataset:\n");
				System.out.println(instances);
				System.out.println("\nClass index: " + instances.classIndex());
			}

			// Test basic methods based on class index.
			System.out.println("\nClass name: " + instances.classAttribute().name());
			System.out.println("\nClass index: " + instances.classIndex());
			System.out.println("\nClass is nominal: " + instances.classAttribute().isNominal());
			System.out.println("\nClass is numeric: " + instances.classAttribute().isNumeric());
			System.out.println("\nClasses:\n");
			for (i = 0; i < instances.numClasses(); i++) {
				System.out.println(instances.classAttribute().value(i));
			}
			System.out.println("\nClass values and labels of instances:\n");
			for (i = 0; i < instances.numInstances(); i++) {
				ProcessInstance inst = instances.getInstance(i);
				System.out.print(inst.classValue() + "\t");
				System.out.print(inst.toString(inst.classIndex()));
				if (instances.getInstance(i).classIsMissing()) {
					System.out.println("\tis missing");
				} else {
					System.out.println();
				}
			}

			// Create random weights.
			System.out.println("\nCreating random weights for instances.");
			for (i = 0; i < instances.numInstances(); i++) {
				instances.getInstance(i).setWeight(random.nextDouble());
			}

			// Print all instances and their weights (and the sum of weights).
System.out.println("\nInstances and their weights:\n"); System.out.println(instances.instancesAndWeights()); System.out.print("\nSum of weights: "); System.out.println(instances.sumOfWeights()); // Insert an attribute secondInstances = new ProcessInstances(instances); Attribute testAtt = new Attribute("Inserted"); secondInstances.insertAttributeAt(testAtt, 0); System.out.println("\nSet with inserted attribute:\n"); System.out.println(secondInstances); System.out.println("\nClass name: " + secondInstances.classAttribute().name()); // Delete the attribute secondInstances.deleteAttributeAt(0); System.out.println("\nSet with attribute deleted:\n"); System.out.println(secondInstances); System.out.println("\nClass name: " + secondInstances.classAttribute().name()); // Test if headers are equal System.out.println("\nHeaders equal: " + instances.equalHeaders(secondInstances) + "\n"); // Print data in internal format. System.out.println("\nData (internal values):\n"); for (i = 0; i < instances.numInstances(); i++) { for (j = 0; j < instances.numAttributes(); j++) { if (instances.getInstance(i).isMissing(j)) { System.out.print("? "); } else { System.out.print(instances.getInstance(i).value(j) + " "); } } System.out.println(); } // Just print header System.out.println("\nEmpty dataset:\n"); empty = new ProcessInstances(instances, 0); System.out.println(empty); System.out.println("\nClass name: " + empty.classAttribute().name()); // Create copy and rename an attribute and a value (if possible) if (empty.classAttribute().isNominal()) { Instances copy = new ProcessInstances(empty, 0); copy.renameAttribute(copy.classAttribute(), "new_name"); copy.renameAttributeValue(copy.classAttribute(), copy.classAttribute().value(0), "new_val_name"); System.out.println("\nDataset with names changed:\n" + copy); System.out.println("\nOriginal dataset:\n" + empty); } // Create and prints subset of instances. start = instances.numInstances() / 4; num = instances.numInstances() / 2; System.out.print("\nSubset of dataset: "); System.out.println(num + " instances from " + (start + 1) + ". instance"); secondInstances = new ProcessInstances(instances, start, num); System.out.println("\nClass name: " + secondInstances.classAttribute().name()); // Print all instances and their weights (and the sum of weights). System.out.println("\nInstances and their weights:\n"); System.out.println(secondInstances.instancesAndWeights()); System.out.print("\nSum of weights: "); System.out.println(secondInstances.sumOfWeights()); // Create and print training and test sets for 3-fold // cross-validation. System.out.println("\nTrain and test folds for 3-fold CV:"); if (instances.classAttribute().isNominal()) { instances.stratify(3); } for (j = 0; j < 3; j++) { train = instances.trainCV(3, j, new Random(1)); test = instances.testCV(3, j); // Print all instances and their weights (and the sum of // weights). System.out.println("\nTrain: "); System.out.println("\nInstances and their weights:\n"); System.out.println(train.instancesAndWeights()); System.out.print("\nSum of weights: "); System.out.println(train.sumOfWeights()); System.out.println("\nClass name: " + train.classAttribute().name()); System.out.println("\nTest: "); System.out.println("\nInstances and their weights:\n"); System.out.println(test.instancesAndWeights()); System.out.print("\nSum of weights: "); System.out.println(test.sumOfWeights()); System.out.println("\nClass name: " + test.classAttribute().name()); } // Randomize instances and print them. 
System.out.println("\nRandomized dataset:"); instances.randomize(random); // Print all instances and their weights (and the sum of weights). System.out.println("\nInstances and their weights:\n"); System.out.println(instances.instancesAndWeights()); System.out.print("\nSum of weights: "); System.out.println(instances.sumOfWeights()); // Sort instances according to first attribute and // print them. System.out.print("\nInstances sorted according to first attribute:\n "); instances.sort(0); // Print all instances and their weights (and the sum of weights). System.out.println("\nInstances and their weights:\n"); System.out.println(instances.instancesAndWeights()); System.out.print("\nSum of weights: "); System.out.println(instances.sumOfWeights()); } catch (Exception e) { e.printStackTrace(); } } /** * Main method for this class. The following calls are possible: * <ul> * <li> * <code>weka.core.Instances</code> help<br/> * prints a short list of possible commands.</li> * <li> * <code>weka.core.Instances</code> <filename><br/> * prints a summary of a set of instances.</li> * <li> * <code>weka.core.Instances</code> merge <filename1> * <filename2><br/> * merges the two datasets (must have same number of instances) and outputs * the results on stdout.</li> * <li> * <code>weka.core.Instances</code> append <filename1> * <filename2><br/> * appends the second dataset to the first one (must have same headers) and * outputs the results on stdout.</li> * <li> * <code>weka.core.Instances</code> headers <filename1> * <filename2><br/> * Compares the headers of the two datasets and prints whether they match or * not.</li> * <li> * <code>weka.core.Instances</code> randomize <seed> <filename><br/> * randomizes the dataset with the given seed and outputs the result on * stdout.</li> * </ul> * * @param args * the commandline parameters */ public static void main(String[] args) { try { ProcessInstances i; // read from stdin and print statistics if (args.length == 0) { DataSource source = new DataSource(System.in); i = (ProcessInstances) source.getDataSet(); System.out.println(i.toSummaryString()); } // read file and print statistics else if ((args.length == 1) && (!args[0].equals("-h")) && (!args[0].equals("help"))) { DataSource source = new DataSource(args[0]); i = (ProcessInstances) source.getDataSet(); System.out.println(i.toSummaryString()); } // read two files, merge them and print result to stdout else if ((args.length == 3) && (args[0].toLowerCase().equals("merge"))) { DataSource source1 = new DataSource(args[1]); DataSource source2 = new DataSource(args[2]); i = ProcessInstances.mergeInstances((ProcessInstances) source1.getDataSet(), (ProcessInstances) source2.getDataSet()); System.out.println(i); } // read two files, append them and print result to stdout else if ((args.length == 3) && (args[0].toLowerCase().equals("append"))) { DataSource source1 = new DataSource(args[1]); DataSource source2 = new DataSource(args[2]); if (!source1.getStructure().equalHeaders(source2.getStructure())) throw new Exception("The two datasets have different headers!"); Instances structure = source1.getStructure(); System.out.println(source1.getStructure()); while (source1.hasMoreElements(structure)) System.out.println(source1.nextElement(structure)); structure = source2.getStructure(); while (source2.hasMoreElements(structure)) System.out.println(source2.nextElement(structure)); } // read two files and compare their headers else if ((args.length == 3) && (args[0].toLowerCase().equals("headers"))) { DataSource source1 = new 
				DataSource source2 = new DataSource(args[2]);
				if (source1.getStructure().equalHeaders(source2.getStructure()))
					System.out.println("Headers match");
				else
					System.out.println("Headers don't match");
			}
			// read file and seed value, randomize data and print result to stdout
			else if ((args.length == 3) && (args[0].toLowerCase().equals("randomize"))) {
				DataSource source = new DataSource(args[2]);
				i = (ProcessInstances) source.getDataSet();
				i.randomize(new Random(Integer.parseInt(args[1])));
				System.out.println(i);
			}
			// wrong parameters
			else {
				System.err.println("\nUsage:\n" + "\tweka.core.Instances help\n" + "\tweka.core.Instances <filename>\n"
						+ "\tweka.core.Instances merge <filename1> <filename2>\n"
						+ "\tweka.core.Instances append <filename1> <filename2>\n"
						+ "\tweka.core.Instances headers <filename1> <filename2>\n"
						+ "\tweka.core.Instances randomize <seed> <filename>\n");
			}
		} catch (Exception ex) {
			ex.printStackTrace();
			System.err.println(ex.getMessage());
		}
	}
}
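
To round off the tutorial, here is a minimal usage sketch that builds a small ProcessInstances set from scratch, mirroring what the test() method above does. The example class name (ProcessInstancesExample), the attribute names, and the ProcessInstance(int) constructor are assumptions taken from that test code rather than a documented API; adapt them to the actual ProcessInstance implementation, and place the class in the same package as ProcessInstances.

import weka.core.Attribute;
import weka.core.FastVector;

/** Minimal usage sketch for ProcessInstances (hypothetical example, not part of PromniCAT). */
public class ProcessInstancesExample {

	public static void main(String[] args) {
		// Two numeric attributes describing a process model (names are illustrative only).
		FastVector atts = new FastVector(2);
		atts.addElement(new Attribute("numberOfNodes"));
		atts.addElement(new Attribute("numberOfEdges"));

		// No string attributes in this example; reserve space for 10 instances.
		ProcessInstances dataset = new ProcessInstances("process_models", atts, new FastVector(), 10);

		// Add two process instances; ProcessInstance(int) is assumed to create an
		// instance with the given number of attributes, as in ProcessInstances.test().
		dataset.addInstance(new ProcessInstance(dataset.numAttributes()));
		dataset.addInstance(new ProcessInstance(dataset.numAttributes()));

		System.out.println("Instances: " + dataset.numInstances());
		System.out.println("Numeric attributes: " + dataset.numAttributes());
		System.out.println("String attributes: " + dataset.numStrAttributes());
	}
}

From here, the cross-validation helpers (stratify, trainCV, testCV) and the static mergeInstances method can be used exactly as demonstrated in the test() and main() methods of the listing above.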