List of usage examples for weka.core.Instances.add()
@Override public boolean add(Instance instance)
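Before the project-specific examples below, here is a minimal, self-contained sketch of the basic pattern (attribute names and values are invented for illustration; this assumes the Weka 3.7+ API, where Instances implements java.util.List&lt;Instance&gt; and add has the boolean signature shown above):

    import java.util.ArrayList;

    import weka.core.Attribute;
    import weka.core.DenseInstance;
    import weka.core.Instance;
    import weka.core.Instances;

    public class InstancesAddSketch {
        public static void main(String[] args) {
            // Declare two numeric attributes (names are illustrative)
            ArrayList<Attribute> atts = new ArrayList<Attribute>();
            atts.add(new Attribute("width"));
            atts.add(new Attribute("height"));

            // Empty dataset with initial capacity 0
            Instances data = new Instances("demo", atts, 0);

            // Build one instance (weight 1.0) and append it to the dataset
            Instance inst = new DenseInstance(1.0, new double[] { 3.5, 2.0 });
            boolean changed = data.add(inst); // returns true in Weka 3.7+
            System.out.println(data);
        }
    }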
From source file:jwebminer2.FeatureValueFileSaver.java
/**
 * Save the given text to the given location in the given format or
 * save the stored feature values, depending on the chosen_file_extension.
 * A progress bar is displayed (although not incremented).
 *
 * @param chosen_file_extension The file extension (corresponding to one
 *                              of the extensions published by the
 *                              getFileFormatExtension method) to use when
 *                              saving data_to_save, and the corresponding
 *                              file format.
 * @param data_to_save          The HTML code displayed on-screen. May be
 *                              null for non-HTML saving.
 * @param save_location         The file to save data_to_save to.
 * @throws Exception            Throws an Exception if the file cannot be
 *                              saved.
 */
public void saveContents(String chosen_file_extension, String data_to_save, File save_location)
        throws Exception {
    // Prepare the progress bar
    SimpleProgressBarDialog progress_bar = new SimpleProgressBarDialog(1, results_panel);

    // Write the whole contents of data_to_save verbatim as an HTML file
    // if an HTML file is to be saved
    if (chosen_file_extension.equals("HTML")) {
        DataOutputStream writer = mckay.utilities.staticlibraries.FileMethods
                .getDataOutputStream(save_location);
        writer.writeBytes(data_to_save);
        writer.close();
    }

    // Only save the table of final feature values itself if a non-HTML
    // file format is to be saved
    else {
        // Access information to store
        double[][] feature_table = results_panel.feature_values;
        String[] column_labels = results_panel.column_labels;
        String[] row_labels = results_panel.row_labels;
        String[] orig_column_labels = column_labels;
        if (AnalysisProcessor.lastfm_enabled && AnalysisProcessor.is_cross_tabulation
                && (AnalysisProcessor.yahoo_application_id != null
                        || AnalysisProcessor.google_license_key != null)) {
            String[] column_labels_lastfm_websearch = new String[2 * column_labels.length];
            for (int i = 0; i < column_labels.length; i++) {
                column_labels_lastfm_websearch[i] = column_labels[i] + "_WS";
                column_labels_lastfm_websearch[i + column_labels.length] = column_labels[i] + "_LastFM";
            }
            column_labels = column_labels_lastfm_websearch;
        } else {
            column_labels = orig_column_labels;
        }

        // Save as tab delimited text file
        if (chosen_file_extension.equals("TXT")) {
            // Calculate the table to save
            String[][] results_table = new String[row_labels.length + 1][column_labels.length + 1];
            results_table[0][0] = "";
            for (int i = 0; i < results_table.length; i++) {
                for (int j = 0; j < results_table[i].length; j++) {
                    if (i == 0) {
                        if (j != 0)
                            results_table[i][j] = column_labels[j - 1];
                    } else {
                        if (j == 0)
                            results_table[i][j] = row_labels[i - 1];
                        else
                            results_table[i][j] = String.valueOf(feature_table[i - 1][j - 1]);
                    }
                }
            }

            // Save the table
            DataOutputStream writer = mckay.utilities.staticlibraries.FileMethods
                    .getDataOutputStream(save_location);
            for (int i = 0; i < results_table.length; i++) {
                for (int j = 0; j < results_table[i].length; j++) {
                    // Write the table entry
                    writer.writeBytes(results_table[i][j]);

                    // Add a tab or a line break
                    if (j == results_table[i].length - 1)
                        writer.writeBytes("\n");
                    else
                        writer.writeBytes("\t");
                }
            }

            // Close the writing stream
            writer.close();
        }

        // Save as ACE XML file
        else if (chosen_file_extension.equals("ACE XML")) {
            // Set the name of the dataset to the name of the file
            // that is to be saved
            String data_set_name = mckay.utilities.staticlibraries.StringMethods
                    .removeExtension(save_location.getName());

            // Prepare feature definitions and store feature names to
            // put in DataSets
            FeatureDefinition[] feature_definitions = new FeatureDefinition[column_labels.length];
            String[] feature_names = new String[column_labels.length];
            for (int feat = 0; feat < feature_definitions.length; feat++) {
                feature_definitions[feat] = new FeatureDefinition(column_labels[feat], "", false, 1);
                feature_names[feat] = column_labels[feat];
            }

            // Prepare the DataSets to write
            DataSet[] data_sets = new DataSet[row_labels.length];
            for (int instance = 0; instance < data_sets.length; instance++) {
                // Instantiate the DataSet
                data_sets[instance] = new DataSet();

                // Store the instance names
                data_sets[instance].identifier = row_labels[instance];

                // Store the names of the features
                data_sets[instance].feature_names = feature_names;

                // Store the features for this DataSet as well as the
                // feature names
                double[][] these_feature_values = new double[feature_table[instance].length][1];
                for (int feat = 0; feat < these_feature_values.length; feat++)
                    these_feature_values[feat][0] = feature_table[instance][feat];
                data_sets[instance].feature_values = these_feature_values;

                // Validate, order and compact the DataSet
                data_sets[instance].orderAndCompactFeatures(feature_definitions, true);
            }

            // Save the feature values
            DataSet.saveDataSets(data_sets, feature_definitions, save_location,
                    "Features extracted with jWebMiner 2.0");
        }

        // Save as Weka ARFF file
        else if (chosen_file_extension.equals("Weka ARFF")) {
            // Set the name of the dataset to the name of the file
            // that is to be saved
            String data_set_name = mckay.utilities.staticlibraries.StringMethods
                    .removeExtension(save_location.getName());

            // Set the Attributes (feature names and class names)
            FastVector attributes_vector = new FastVector(column_labels.length + 1); // extra 1 is for class name
            for (int feat = 0; feat < column_labels.length; feat++)
                attributes_vector.addElement(new Attribute(column_labels[feat]));
            FastVector class_names_vector = new FastVector(column_labels.length);
            for (int cat = 0; cat < orig_column_labels.length; cat++)
                class_names_vector.addElement(orig_column_labels[cat]);
            attributes_vector.addElement(new Attribute("Class", class_names_vector));

            // Store attributes in an Instances object
            Instances instances = new Instances(data_set_name, attributes_vector, row_labels.length);
            instances.setClassIndex(instances.numAttributes() - 1);

            // Store the feature values and model classifications
            for (int inst = 0; inst < row_labels.length; inst++) {
                // Initialize an instance
                Instance this_instance = new Instance(instances.numAttributes());
                this_instance.setDataset(instances);
                int current_attribute = 0;

                // Set feature values for the instance
                for (int feat = 0; feat < column_labels.length; feat++)
                    this_instance.setValue(feat, feature_table[inst][feat]);

                // Set the class value for the instance
                // this_instance.setClassValue("a");
                instances.setRelationName("jWebMiner2");

                // Add this instance to instances
                instances.add(this_instance);
            }

            // Prepare the buffer to save to and add comments indicating
            // the names of the rows
            DataOutputStream writer = mckay.utilities.staticlibraries.FileMethods
                    .getDataOutputStream(save_location);
            writer.writeBytes("% INSTANCES (DATA ROWS) BELOW CORRESPOND TO:\n%\n");
            for (int inst = 0; inst < row_labels.length; inst++)
                writer.writeBytes("% " + (inst + 1) + ") " + row_labels[inst] + "\n");
            writer.writeBytes("%\n");

            // Save the ARFF file
            ArffSaver arff_saver = new ArffSaver();
            arff_saver.setInstances(instances);
            arff_saver.setFile(save_location);
            arff_saver.setDestination(writer);
            try {
                arff_saver.writeBatch();
            } catch (Exception e) {
                throw new Exception(
                        "File only partially saved.\n\nTry resaving the file with a .arff extension.");
            }

            // Close the writer
            writer.close();
        }
    }

    // Terminate the progress bar
    progress_bar.done();
}
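This example is written against the older Weka 3.6 API (FastVector, and weka.core.Instance as a concrete class). Under the Weka 3.7+ API that matches the boolean add signature at the top of this page, the ARFF dataset-construction step would look roughly like the following sketch (an untested adaptation; the data is passed in as parameters mirroring the variables above):

    import java.util.ArrayList;
    import java.util.Arrays;

    import weka.core.Attribute;
    import weka.core.DenseInstance;
    import weka.core.Instance;
    import weka.core.Instances;

    public class ArffBuildSketch {
        // Same dataset construction as above, adapted to the Weka 3.7+ API.
        static Instances build(String dataSetName, String[] columnLabels,
                String[] origColumnLabels, String[] rowLabels, double[][] featureTable) {
            ArrayList<Attribute> attributes = new ArrayList<Attribute>();
            for (String label : columnLabels)
                attributes.add(new Attribute(label));
            // Nominal class attribute built from the original column labels
            attributes.add(new Attribute("Class",
                    new ArrayList<String>(Arrays.asList(origColumnLabels))));

            Instances instances = new Instances(dataSetName, attributes, rowLabels.length);
            instances.setClassIndex(instances.numAttributes() - 1);

            for (int inst = 0; inst < rowLabels.length; inst++) {
                // All values start missing; the class value stays missing,
                // as in the original example
                Instance thisInstance = new DenseInstance(instances.numAttributes());
                thisInstance.setDataset(instances);
                for (int feat = 0; feat < columnLabels.length; feat++)
                    thisInstance.setValue(feat, featureTable[inst][feat]);
                instances.add(thisInstance); // the boolean add of Weka 3.7+
            }
            return instances;
        }
    }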
From source file:kea.KEAKeyphraseExtractor.java
License:Open Source License
/**
 * Extracts keyphrases from the files
 */
public void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Extract keyphrases
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1)
                        == topRankedInstances[i].attribute(topRankedInstances[i].numAttributes() - 1)
                                .indexOfValue("True")) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.err.println("Avg. number of correct keyphrases: " + Utils.doubleToString(avg, 2) + " +/- "
            + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    m_KEAFilter.batchFinished();
}
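The dataset-building step in the KEA examples follows one recurring pattern: two string attributes, addStringValue for the text, then add of a single-row Instance that is immediately piped into the filter. A condensed, self-contained sketch of just that step (Weka 3.6-era API, as KEA uses; literal strings stand in for the file contents):

    import weka.core.Attribute;
    import weka.core.FastVector;
    import weka.core.Instance;
    import weka.core.Instances;

    public class StringPairSketch {
        public static void main(String[] args) {
            // Two string attributes: a null FastVector marks them string-typed
            FastVector atts = new FastVector(2);
            atts.addElement(new Attribute("doc", (FastVector) null));
            atts.addElement(new Attribute("keyphrases", (FastVector) null));
            Instances data = new Instances("keyphrase_training_data", atts, 0);

            // addStringValue stores the text and returns its index
            // within the attribute
            double[] newInst = new double[2];
            newInst[0] = data.attribute(0).addStringValue("full document text ...");
            newInst[1] = data.attribute(1).addStringValue("keyphrase one\nkeyphrase two");
            data.add(new Instance(1.0, newInst)); // weight 1.0

            // stringFreeStructure() yields an empty header-only copy so
            // string values do not accumulate across documents
            data = data.stringFreeStructure();
        }
    }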
From source file:kea.KEAModelBuilder.java
License:Open Source License
/**
 * Builds the model from the files
 */
public void buildModel(Hashtable stems) throws Exception {

    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build model
    m_KEAFilter = new KEAFilter();
    m_KEAFilter.setDebug(m_debug);
    m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
    m_KEAFilter.setKFused(getUseKFrequency());
    m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
    m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
    m_KEAFilter.setMinNumOccur(getMinNumOccur());
    m_KEAFilter.setInputFormat(data);
    m_KEAFilter.setStemmer(getStemmer());
    m_KEAFilter.setStopwords(getStopwords());
    m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());

    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            BufferedReader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't find document for stem " + str + ".");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            BufferedReader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't find keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    m_KEAFilter.batchFinished();

    // Get rid of instances in filter
    while (m_KEAFilter.output() != null) {
    }
}
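The input/batchFinished/output calls in buildModel follow the standard incremental protocol of weka.filters.Filter. A generic, self-contained sketch of that protocol using weka.filters.AllFilter (a simple pass-through filter) in place of KEA's own KEAFilter:

    import weka.core.Attribute;
    import weka.core.FastVector;
    import weka.core.Instance;
    import weka.core.Instances;
    import weka.filters.AllFilter;

    public class FilterProtocolSketch {
        public static void main(String[] args) throws Exception {
            FastVector atts = new FastVector(1);
            atts.addElement(new Attribute("x"));
            Instances data = new Instances("demo", atts, 0);

            AllFilter filter = new AllFilter();
            filter.setInputFormat(data);

            // Add one instance, push it through the filter
            data.add(new Instance(1.0, new double[] { 42.0 }));
            filter.input(data.instance(0));
            filter.batchFinished();

            // Drain the filter's output queue, as buildModel does at the end
            Instance out;
            while ((out = filter.output()) != null) {
                System.out.println(out);
            }
        }
    }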
From source file:kea.main.KEAKeyphraseExtractor.java
License:Open Source License
/**
 * Extracts keyphrases from the files
 */
public synchronized void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    // = if there are any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    this.m_KEAFilter.setNumPhrases(m_numPhrases);
    this.m_KEAFilter.setVocabulary(m_vocabulary);
    this.m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    this.m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    this.m_KEAFilter.setStemmer(m_Stemmer);
    this.m_KEAFilter.setStopwords(m_Stopwords);

    if (getVocabulary().equals("none")) {
        this.m_KEAFilter.m_NODEfeature = false;
    } else {
        // The thesaurus is now loaded in the constructor
        // m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords, vocabularyDir, manager);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (this.m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    System.out.println("-- Extracting Keyphrases... ");

    // Extract keyphrases
    Enumeration elem = stems.keys();

    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            // keyStr = keyphrases in the str.key file.
            // KEA assumes that these keyphrases were assigned by the author
            // and evaluates extracted keyphrases against these.
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            is.close();
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        this.m_KEAFilter.input(data.instance(0), vocabulary);
        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = this.m_KEAFilter.output()) != null) {
            int index = (int) inst.value(this.m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i]
                            .stringValue(this.m_KEAFilter.getUnstemmedPhraseIndex()));
                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i]
                                .stringValue(this.m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(this.m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.out.println("Avg. number of matching keyphrases compared to existing ones : "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    System.out.println("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}
From source file:learn.Classification.Chinese.TextDirectoryLoader.java
License:Open Source License
/**
 * Return the full data set. If the structure hasn't yet been determined by
 * a call to getStructure then this method should do so before processing the
 * rest of the data set.
 *
 * @return the structure of the data set as an empty set of Instances
 * @throws IOException if there is no source or parsing fails
 */
public Instances getDataSet() throws IOException {
    if (getDirectory() == null)
        throw new IOException("No directory/source has been specified");

    String directoryPath = getDirectory().getAbsolutePath();
    ArrayList<String> classes = new ArrayList<String>();
    Enumeration enm = getStructure().classAttribute().enumerateValues();
    while (enm.hasMoreElements())
        classes.add((String) enm.nextElement());

    Instances data = getStructure();
    int fileCount = 0;
    for (int k = 0; k < classes.size(); k++) {
        String subdirPath = (String) classes.get(k);
        File subdir = new File(directoryPath + File.separator + subdirPath);
        String[] files = subdir.list();
        for (int j = 0; j < files.length; j++) {
            try {
                fileCount++;
                if (getDebug())
                    System.err.println("processing " + fileCount + " : " + subdirPath + " : " + files[j]);

                double[] newInst = null;
                if (m_OutputFilename)
                    newInst = new double[3];
                else
                    newInst = new double[2];
                File txt = new File(directoryPath + File.separator + subdirPath + File.separator + files[j]);
                BufferedInputStream is;
                is = new BufferedInputStream(new FileInputStream(txt));
                StringBuffer txtStr = new StringBuffer();
                int c;
                /*
                 * while ((c = is.read()) != -1) { txtStr.append((char) c); }
                 */
                // FileReader fr = new FileReader(txt);
                BufferedReader br = new BufferedReader(
                        new InputStreamReader(new FileInputStream(txt), "UTF-8"));
                String line;
                while ((line = br.readLine()) != null) {
                    txtStr.append(line + "\n");
                }

                newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
                if (m_OutputFilename)
                    newInst[1] = (double) data.attribute(1)
                            .addStringValue(subdirPath + File.separator + files[j]);
                newInst[data.classIndex()] = (double) k;
                data.add(new DenseInstance(1.0, newInst));
                is.close();
            } catch (Exception e) {
                System.err.println("failed to convert file: " + directoryPath + File.separator
                        + subdirPath + File.separator + files[j]);
            }
        }
    }
    return data;
}
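A hypothetical driver for this loader, assuming it keeps the setDirectory/getDataSet surface of Weka's stock weka.core.converters.TextDirectoryLoader (only getDirectory appears in the excerpt above, so setDirectory is an assumption here):

    // Hypothetical usage; setDirectory(File) is assumed to exist alongside
    // the getDirectory() seen in the excerpt, as in Weka's stock loader.
    TextDirectoryLoader loader = new TextDirectoryLoader();
    loader.setDirectory(new java.io.File("corpus")); // one subdirectory per class
    Instances data = loader.getDataSet();
    System.out.println(data.numInstances() + " documents loaded");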
From source file:lineage.AAFClusterer.java
License:Open Source License
public Instances convertMatrixToWeka(double[][] data, int numObs, int numFeatures) {
    // convert the data to WEKA format
    FastVector atts = new FastVector();
    for (int i = 0; i < numFeatures; i++) {
        atts.addElement(new Attribute("Feature" + i, i));
    }

    Instances ds = new Instances("AAF Data", atts, numObs);
    for (int i = 0; i < numObs; i++) {
        ds.add(new Instance(numFeatures));
    }
    for (int i = 0; i < numFeatures; i++) {
        for (int j = 0; j < numObs; j++) {
            ds.instance(j).setValue(i, data[j][i]);
        }
    }
    return ds;
}
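A hypothetical call to the method above, with a small 3-observation, 2-feature matrix (the no-argument AAFClusterer constructor is an assumption made for illustration):

    // Hypothetical usage of convertMatrixToWeka; the constructor is assumed.
    double[][] aaf = {
            { 0.10, 0.45 },
            { 0.20, 0.40 },
            { 0.80, 0.05 }
    };
    Instances ds = new AAFClusterer().convertMatrixToWeka(aaf, 3, 2);
    System.out.println(ds.numInstances()); // prints 3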
From source file:linqs.gaia.model.oc.ncc.WekaClassifier.java
License:Open Source License
/**
 * Create Weka instance
 *
 * @param instances Weka instances
 * @param di        Decorable item to convert
 * @param ispredict Is this item created for training or testing
 */
private void createInstance(Instances instances, Decorable di, boolean ispredict) {
    double[] instvalues = new double[attinfosize];
    int attindex = 0;
    Schema schema = di.getSchema();
    for (String fid : featureids) {
        FeatureValue fvalue = di.getFeatureValue(fid);
        Attribute a = instances.attribute(attindex);
        Feature f = schema.getFeature(fid);

        if (!(f instanceof CompositeFeature)) {
            // Handle non multi-valued feature
            instvalues[attindex] = this.gaiavalues2weka(f, fid, fvalue, a, ispredict);
            attindex++;
        } else {
            // Handle multi-valued feature
            CompositeFeature mv = (CompositeFeature) f;
            UnmodifiableList<SimplePair<String, CVFeature>> mvfeatures = mv.getFeatures();
            CompositeValue mvvalue = (CompositeValue) di.getFeatureValue(fid);
            UnmodifiableList<FeatureValue> mvfvalues = mvvalue.getFeatureValues();
            int num = mvfvalues.size();
            for (int j = 0; j < num; j++) {
                if (fvalue.equals(FeatureValue.UNKNOWN_VALUE)) {
                    attindex++;
                    continue;
                }
                a = instances.attribute(attindex);
                f = mvfeatures.get(j).getSecond();
                fvalue = mvfvalues.get(j);
                instvalues[attindex] = this.gaiavalues2weka(f, fid, fvalue, a, ispredict);
                attindex++;
            }
        }
    }

    // Create instance of weight 1 and the specified values
    Instance inst = new SparseInstance(1, instvalues);
    inst.setDataset(instances);
    instances.add(inst);
}
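This example stores its values through a SparseInstance, which keeps only non-zero entries. A minimal, self-contained sketch of that add pattern (Weka 3.6-era API to match the style here; attribute names invented):

    import weka.core.Attribute;
    import weka.core.FastVector;
    import weka.core.Instance;
    import weka.core.Instances;
    import weka.core.SparseInstance;

    public class SparseAddSketch {
        public static void main(String[] args) {
            FastVector atts = new FastVector(3);
            atts.addElement(new Attribute("f0"));
            atts.addElement(new Attribute("f1"));
            atts.addElement(new Attribute("f2"));
            Instances data = new Instances("sparse_demo", atts, 0);

            // Zero entries are not stored explicitly by SparseInstance
            Instance inst = new SparseInstance(1, new double[] { 0.0, 5.0, 0.0 });
            inst.setDataset(data);
            data.add(inst);

            System.out.println(data.instance(0)); // prints the sparse form {1 5}
        }
    }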
From source file:lu.lippmann.cdb.datasetview.tabs.TimeSeriesSimilarityPanel.java
License:Open Source License
private Instances buildFeatureDS(final Instances dataSet) throws Exception {
    final int numAttributes = dataSet.numAttributes();

    final java.util.List<String> namesOfFeaturesToConsider = new ArrayList<String>();
    namesOfFeaturesToConsider.addAll(WekaDataStatsUtil.getAttributeNames(dataSet));
    namesOfFeaturesToConsider.removeAll(WekaDataStatsUtil.getDateAttributeNames(dataSet));

    final double[][] simMatrix = new double[numAttributes][numAttributes];
    for (int i = 0; i < numAttributes; i++) {
        final double[] arrayI = dataSet.attributeToDoubleArray(i);
        if (this.withNormalization)
            MathsUtil.normalize(arrayI);
        simMatrix[i][i] = 0d;
        for (int j = i + 1; j < numAttributes; j++) {
            final double[] arrayJ = dataSet.attributeToDoubleArray(j);
            if (this.withNormalization)
                MathsUtil.normalize(arrayJ);
            simMatrix[i][j] = new DynamicTimeWarping(arrayI, arrayJ).getDistance();
        }
    }
    // Mirror the upper triangle into the lower triangle
    for (int i = 0; i < numAttributes; i++) {
        for (int j = 0; j < i + 1; j++) {
            simMatrix[i][j] = simMatrix[j][i];
        }
    }

    final ArrayList<Attribute> attrs = new ArrayList<Attribute>(numAttributes + 1);
    for (int i = 0; i < numAttributes; i++) {
        attrs.add(new Attribute(dataSet.attribute(i).name() + "-feat"));
    }
    attrs.add(new Attribute(FEATUREDESC_ATTRNAME, namesOfFeaturesToConsider));

    final Instances ds = new Instances("featuresComparisonDs", attrs, 0);
    ds.setClassIndex(attrs.size() - 1);
    for (int i = 0; i < simMatrix.length; i++) {
        final DenseInstance di = new DenseInstance(1.0d,
                ArraysUtil.concat(simMatrix[i], new double[] { 0d }));
        di.setDataset(ds);
        di.setValue(simMatrix.length, dataSet.attribute(i).name());
        ds.add(di);
    }
    return ds;
}
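One detail of this example worth isolating: setValue(int, String) for the nominal class attribute only works after setDataset has been called, because the string label must be resolved against the attribute's value list. A minimal sketch of just that pattern (Weka 3.7+ API, matching the DenseInstance used here; names invented):

    import java.util.ArrayList;
    import java.util.Arrays;

    import weka.core.Attribute;
    import weka.core.DenseInstance;
    import weka.core.Instances;

    public class NominalByLabelSketch {
        public static void main(String[] args) {
            ArrayList<Attribute> attrs = new ArrayList<Attribute>();
            attrs.add(new Attribute("dist"));
            attrs.add(new Attribute("label", Arrays.asList("a", "b")));
            Instances ds = new Instances("demo", attrs, 0);
            ds.setClassIndex(1);

            DenseInstance di = new DenseInstance(1.0, new double[] { 0.5, 0.0 });
            di.setDataset(ds);   // required before setValue(int, String)
            di.setValue(1, "b"); // resolves "b" against the attribute's labels
            ds.add(di);
        }
    }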
From source file:lu.lippmann.cdb.datasetview.tabs.UnsupervisedFeatureEvaluationTabView.java
License:Open Source License
private static Instances buildDerivatedDataset(final Instances dataSet, final List<String> possibleValues,
        final List<Integer> valueForEachFeature) throws Exception {
    final int numInstances = dataSet.numInstances();

    final ArrayList<Attribute> attrs = new ArrayList<Attribute>(numInstances + 2);
    attrs.add(new Attribute(FEATUREDESC_ATTRNAME, (java.util.List<String>) null));
    for (int i = 0; i < numInstances; i++) {
        attrs.add(new Attribute(i + "_eval"));
    }
    attrs.add(new Attribute("__", possibleValues));

    final Instances newds = new Instances("unsupervisedFeaturesEval", attrs, 0);

    final int numAttributes = dataSet.numAttributes();
    for (int j = 0; j < numAttributes; j++) {
        double[] val = ArraysUtil.concat(dataSet.attributeToDoubleArray(j), new double[] { 0.0d });
        val = ArraysUtil.concat(new double[] { 0.0d }, val);
        newds.add(new DenseInstance(1.0d, val));
    }
    for (int j = 0; j < numAttributes; j++) {
        newds.instance(j).setValue(0, dataSet.attribute(j).name());
        newds.instance(j).setValue(numInstances + 1, possibleValues.get(valueForEachFeature.get(j)));
    }
    newds.setClassIndex(numInstances + 1);
    return newds;
}
From source file:lu.lippmann.cdb.ext.hydviga.cbr.GapFillingKnowledgeDB.java
License:Open Source License
public static Instances getKnowledgeDBWithBestCasesOnly() throws Exception {
    final Instances newds = new Instances(DATABASE, 0);
    final int numInstances = DATABASE.numInstances();
    final int numAttributes = DATABASE.numAttributes();
    for (int i = 0; i < numInstances; i++) {
        if (DATABASE.instance(i).stringValue(numAttributes - 1).equals("true"))
            newds.add(DATABASE.instance(i));
    }
    return newds;
}
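The new Instances(DATABASE, 0) idiom above copies only the header (relation name and attributes) without any rows; add then appends the selected instances. Generalized into a small sketch (method and parameter names invented for illustration):

    import weka.core.Instances;

    public class BestCasesSketch {
        // Header-only copy via new Instances(src, 0), then selective add.
        static Instances keepTrueRows(Instances src) {
            Instances out = new Instances(src, 0); // copies structure, no rows
            int last = src.numAttributes() - 1;
            for (int i = 0; i < src.numInstances(); i++) {
                if (src.instance(i).stringValue(last).equals("true"))
                    out.add(src.instance(i));
            }
            return out;
        }
    }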