List of usage examples for weka.core.Instances.add()
@Override public boolean add(Instance instance)
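Before the project-specific examples below, here is a minimal, self-contained sketch of the basic pattern (attribute names and values are invented for illustration; this assumes the Weka 3.7+ API, where Instances implements java.util.List&lt;Instance&gt; and add has the boolean signature shown above):

    import java.util.ArrayList;

    import weka.core.Attribute;
    import weka.core.DenseInstance;
    import weka.core.Instance;
    import weka.core.Instances;

    public class InstancesAddSketch {
        public static void main(String[] args) {
            // Declare two numeric attributes (names are illustrative)
            ArrayList<Attribute> atts = new ArrayList<Attribute>();
            atts.add(new Attribute("width"));
            atts.add(new Attribute("height"));

            // Empty dataset with initial capacity 0
            Instances data = new Instances("demo", atts, 0);

            // Build one instance (weight 1.0) and append it to the dataset
            Instance inst = new DenseInstance(1.0, new double[] { 3.5, 2.0 });
            boolean changed = data.add(inst); // returns true in Weka 3.7+
            System.out.println(data);
        }
    }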
From source file:jwebminer2.FeatureValueFileSaver.java
/**
 * Save the given text to the given location in the given format or
 * save the stored feature values, depending on the chosen_file_extension.
 * A progress bar is displayed (although not incremented).
 *
 * @param chosen_file_extension The file extension (corresponding to one
 *                              of the extensions published by the
 *                              getFileFormatExtension method) to use when
 *                              saving data_to_save, and the corresponding
 *                              file format.
 * @param data_to_save          The HTML code displayed on-screen. May be
 *                              null for non-HTML saving.
 * @param save_location         The file to save data_to_save to.
 * @throws Exception            Throws an Exception if the file cannot be
 *                              saved.
 */
public void saveContents(String chosen_file_extension, String data_to_save, File save_location)
        throws Exception {
    // Prepare the progress bar
    SimpleProgressBarDialog progress_bar = new SimpleProgressBarDialog(1, results_panel);

    // Write the whole contents of data_to_save verbatim as an HTML file
    // if an HTML file is to be saved
    if (chosen_file_extension.equals("HTML")) {
        DataOutputStream writer = mckay.utilities.staticlibraries.FileMethods
                .getDataOutputStream(save_location);
        writer.writeBytes(data_to_save);
        writer.close();
    }

    // Only save the table of final feature values itself if a non-HTML
    // file format is to be saved
    else {
        // Access information to store
        double[][] feature_table = results_panel.feature_values;
        String[] column_labels = results_panel.column_labels;
        String[] row_labels = results_panel.row_labels;
        String[] orig_column_labels = column_labels;
        if (AnalysisProcessor.lastfm_enabled && AnalysisProcessor.is_cross_tabulation
                && (AnalysisProcessor.yahoo_application_id != null
                        || AnalysisProcessor.google_license_key != null)) {
            String[] column_labels_lastfm_websearch = new String[2 * column_labels.length];
            for (int i = 0; i < column_labels.length; i++) {
                column_labels_lastfm_websearch[i] = column_labels[i] + "_WS";
                column_labels_lastfm_websearch[i + column_labels.length] = column_labels[i] + "_LastFM";
            }
            column_labels = column_labels_lastfm_websearch;
        } else {
            column_labels = orig_column_labels;
        }

        // Save as tab delimited text file
        if (chosen_file_extension.equals("TXT")) {
            // Calculate the table to save
            String[][] results_table = new String[row_labels.length + 1][column_labels.length + 1];
            results_table[0][0] = "";
            for (int i = 0; i < results_table.length; i++) {
                for (int j = 0; j < results_table[i].length; j++) {
                    if (i == 0) {
                        if (j != 0)
                            results_table[i][j] = column_labels[j - 1];
                    } else {
                        if (j == 0)
                            results_table[i][j] = row_labels[i - 1];
                        else
                            results_table[i][j] = String.valueOf(feature_table[i - 1][j - 1]);
                    }
                }
            }

            // Save the table
            DataOutputStream writer = mckay.utilities.staticlibraries.FileMethods
                    .getDataOutputStream(save_location);
            for (int i = 0; i < results_table.length; i++) {
                for (int j = 0; j < results_table[i].length; j++) {
                    // Write the table entry
                    writer.writeBytes(results_table[i][j]);

                    // Add a tab or a line break
                    if (j == results_table[i].length - 1)
                        writer.writeBytes("\n");
                    else
                        writer.writeBytes("\t");
                }
            }

            // Close the writing stream
            writer.close();
        }

        // Save as ACE XML file
        else if (chosen_file_extension.equals("ACE XML")) {
            // Set the name of the dataset to the name of the file
            // that is to be saved
            String data_set_name = mckay.utilities.staticlibraries.StringMethods
                    .removeExtension(save_location.getName());

            // Prepare feature definitions and store feature names to
            // put in DataSets
            FeatureDefinition[] feature_definitions = new FeatureDefinition[column_labels.length];
            String[] feature_names = new String[column_labels.length];
            for (int feat = 0; feat < feature_definitions.length; feat++) {
                feature_definitions[feat] = new FeatureDefinition(column_labels[feat], "", false, 1);
                feature_names[feat] = column_labels[feat];
            }

            // Prepare the DataSets to write
            DataSet[] data_sets = new DataSet[row_labels.length];
            for (int instance = 0; instance < data_sets.length; instance++) {
                // Instantiate the DataSet
                data_sets[instance] = new DataSet();

                // Store the instance names
                data_sets[instance].identifier = row_labels[instance];

                // Store the names of the features
                data_sets[instance].feature_names = feature_names;

                // Store the features for this DataSet as well as the
                // feature names
                double[][] these_feature_values = new double[feature_table[instance].length][1];
                for (int feat = 0; feat < these_feature_values.length; feat++)
                    these_feature_values[feat][0] = feature_table[instance][feat];
                data_sets[instance].feature_values = these_feature_values;

                // Validate, order and compact the DataSet
                data_sets[instance].orderAndCompactFeatures(feature_definitions, true);
            }

            // Save the feature values
            DataSet.saveDataSets(data_sets, feature_definitions, save_location,
                    "Features extracted with jWebMiner 2.0");
        }

        // Save as Weka ARFF file
        else if (chosen_file_extension.equals("Weka ARFF")) {
            // Set the name of the dataset to the name of the file
            // that is to be saved
            String data_set_name = mckay.utilities.staticlibraries.StringMethods
                    .removeExtension(save_location.getName());

            // Set the Attributes (feature names and class names)
            FastVector attributes_vector = new FastVector(column_labels.length + 1); // extra 1 is for class name
            for (int feat = 0; feat < column_labels.length; feat++)
                attributes_vector.addElement(new Attribute(column_labels[feat]));
            FastVector class_names_vector = new FastVector(column_labels.length);
            for (int cat = 0; cat < orig_column_labels.length; cat++)
                class_names_vector.addElement(orig_column_labels[cat]);
            attributes_vector.addElement(new Attribute("Class", class_names_vector));

            // Store attributes in an Instances object
            Instances instances = new Instances(data_set_name, attributes_vector, row_labels.length);
            instances.setClassIndex(instances.numAttributes() - 1);

            // Store the feature values and model classifications
            for (int inst = 0; inst < row_labels.length; inst++) {
                // Initialize an instance
                Instance this_instance = new Instance(instances.numAttributes());
                this_instance.setDataset(instances);
                int current_attribute = 0;

                // Set feature values for the instance
                for (int feat = 0; feat < column_labels.length; feat++)
                    this_instance.setValue(feat, feature_table[inst][feat]);

                // Set the class value for the instance
                // this_instance.setClassValue("a");
                instances.setRelationName("jWebMiner2");

                // Add this instance to instances
                instances.add(this_instance);
            }

            // Prepare the buffer to save to and add comments indicating
            // the names of the rows
            DataOutputStream writer = mckay.utilities.staticlibraries.FileMethods
                    .getDataOutputStream(save_location);
            writer.writeBytes("% INSTANCES (DATA ROWS) BELOW CORRESPOND TO:\n%\n");
            for (int inst = 0; inst < row_labels.length; inst++)
                writer.writeBytes("% " + (inst + 1) + ") " + row_labels[inst] + "\n");
            writer.writeBytes("%\n");

            // Save the ARFF file
            ArffSaver arff_saver = new ArffSaver();
            arff_saver.setInstances(instances);
            arff_saver.setFile(save_location);
            arff_saver.setDestination(writer);
            try {
                arff_saver.writeBatch();
            } catch (Exception e) {
                throw new Exception(
                        "File only partially saved.\n\nTry resaving the file with a .arff extension.");
            }

            // Close the writer
            writer.close();
        }
    }

    // Terminate the progress bar
    progress_bar.done();
}
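This example is written against the older Weka 3.6 API (FastVector, and weka.core.Instance as a concrete class). Under the Weka 3.7+ API that matches the boolean add signature at the top of this page, the ARFF dataset-construction step would look roughly like the following sketch (an untested adaptation; the data is passed in as parameters mirroring the variables above):

    import java.util.ArrayList;
    import java.util.Arrays;

    import weka.core.Attribute;
    import weka.core.DenseInstance;
    import weka.core.Instance;
    import weka.core.Instances;

    public class ArffBuildSketch {
        // Same dataset construction as above, adapted to the Weka 3.7+ API.
        static Instances build(String dataSetName, String[] columnLabels,
                String[] origColumnLabels, String[] rowLabels, double[][] featureTable) {
            ArrayList<Attribute> attributes = new ArrayList<Attribute>();
            for (String label : columnLabels)
                attributes.add(new Attribute(label));
            // Nominal class attribute built from the original column labels
            attributes.add(new Attribute("Class",
                    new ArrayList<String>(Arrays.asList(origColumnLabels))));

            Instances instances = new Instances(dataSetName, attributes, rowLabels.length);
            instances.setClassIndex(instances.numAttributes() - 1);

            for (int inst = 0; inst < rowLabels.length; inst++) {
                // All values start missing; the class value stays missing,
                // as in the original example
                Instance thisInstance = new DenseInstance(instances.numAttributes());
                thisInstance.setDataset(instances);
                for (int feat = 0; feat < columnLabels.length; feat++)
                    thisInstance.setValue(feat, featureTable[inst][feat]);
                instances.add(thisInstance); // the boolean add of Weka 3.7+
            }
            return instances;
        }
    }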
From source file:kea.KEAKeyphraseExtractor.java
License:Open Source License
/**
 * Extracts keyphrases from the files
 */
public void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Extract keyphrases
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1)
                        == topRankedInstances[i].attribute(topRankedInstances[i].numAttributes() - 1)
                                .indexOfValue("True")) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.err.println("Avg. number of correct keyphrases: " + Utils.doubleToString(avg, 2) + " +/- "
            + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    m_KEAFilter.batchFinished();
}
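The dataset-building step in the KEA examples follows one recurring pattern: two string attributes, addStringValue for the text, then add of a single-row Instance that is immediately piped into the filter. A condensed, self-contained sketch of just that step (Weka 3.6-era API, as KEA uses; literal strings stand in for the file contents):

    import weka.core.Attribute;
    import weka.core.FastVector;
    import weka.core.Instance;
    import weka.core.Instances;

    public class StringPairSketch {
        public static void main(String[] args) {
            // Two string attributes: a null FastVector marks them string-typed
            FastVector atts = new FastVector(2);
            atts.addElement(new Attribute("doc", (FastVector) null));
            atts.addElement(new Attribute("keyphrases", (FastVector) null));
            Instances data = new Instances("keyphrase_training_data", atts, 0);

            // addStringValue stores the text and returns its index
            // within the attribute
            double[] newInst = new double[2];
            newInst[0] = data.attribute(0).addStringValue("full document text ...");
            newInst[1] = data.attribute(1).addStringValue("keyphrase one\nkeyphrase two");
            data.add(new Instance(1.0, newInst)); // weight 1.0

            // stringFreeStructure() yields an empty header-only copy so
            // string values do not accumulate across documents
            data = data.stringFreeStructure();
        }
    }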
From source file:kea.KEAModelBuilder.java
License:Open Source License
/**
 * Builds the model from the files
 */
public void buildModel(Hashtable stems) throws Exception {

    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build model
    m_KEAFilter = new KEAFilter();
    m_KEAFilter.setDebug(m_debug);
    m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
    m_KEAFilter.setKFused(getUseKFrequency());
    m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
    m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
    m_KEAFilter.setMinNumOccur(getMinNumOccur());
    m_KEAFilter.setInputFormat(data);
    m_KEAFilter.setStemmer(getStemmer());
    m_KEAFilter.setStopwords(getStopwords());
    m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());

    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            BufferedReader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't find document for stem " + str + ".");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            BufferedReader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't find keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    m_KEAFilter.batchFinished();

    // Get rid of instances in filter
    while (m_KEAFilter.output() != null) {
    }
}
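The input/batchFinished/output calls in buildModel follow the standard incremental protocol of weka.filters.Filter. A generic, self-contained sketch of that protocol using weka.filters.AllFilter (a simple pass-through filter) in place of KEA's own KEAFilter:

    import weka.core.Attribute;
    import weka.core.FastVector;
    import weka.core.Instance;
    import weka.core.Instances;
    import weka.filters.AllFilter;

    public class FilterProtocolSketch {
        public static void main(String[] args) throws Exception {
            FastVector atts = new FastVector(1);
            atts.addElement(new Attribute("x"));
            Instances data = new Instances("demo", atts, 0);

            AllFilter filter = new AllFilter();
            filter.setInputFormat(data);

            // Add one instance, push it through the filter
            data.add(new Instance(1.0, new double[] { 42.0 }));
            filter.input(data.instance(0));
            filter.batchFinished();

            // Drain the filter's output queue, as buildModel does at the end
            Instance out;
            while ((out = filter.output()) != null) {
                System.out.println(out);
            }
        }
    }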
From source file:kea.main.KEAKeyphraseExtractor.java
License:Open Source License
/**
 * Extracts keyphrases from the files
 */
public synchronized void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    // = if there are any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    this.m_KEAFilter.setNumPhrases(m_numPhrases);
    this.m_KEAFilter.setVocabulary(m_vocabulary);
    this.m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    this.m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    this.m_KEAFilter.setStemmer(m_Stemmer);
    this.m_KEAFilter.setStopwords(m_Stopwords);

    if (getVocabulary().equals("none")) {
        this.m_KEAFilter.m_NODEfeature = false;
    } else {
        // The thesaurus is now loaded in the constructor
        // m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords, vocabularyDir, manager);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (this.m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    System.out.println("-- Extracting Keyphrases... ");

    // Extract keyphrases
    Enumeration elem = stems.keys();

    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            // keyStr = keyphrases in the str.key file.
            // KEA assumes that these keyphrases were assigned by the author
            // and evaluates extracted keyphrases against these.
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            is.close();
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        this.m_KEAFilter.input(data.instance(0), vocabulary);
        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = this.m_KEAFilter.output()) != null) {
            int index = (int) inst.value(this.m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i]
                            .stringValue(this.m_KEAFilter.getUnstemmedPhraseIndex()));
                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i]
                                .stringValue(this.m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(this.m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.out.println("Avg. number of matching keyphrases compared to existing ones : "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    System.out.println("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}
From source file:learn.Classification.Chinese.TextDirectoryLoader.java
License:Open Source License
/**
 * Return the full data set. If the structure hasn't yet been determined by
 * a call to getStructure then this method should do so before processing the
 * rest of the data set.
 *
 * @return the structure of the data set as an empty set of Instances
 * @throws IOException if there is no source or parsing fails
 */
public Instances getDataSet() throws IOException {
    if (getDirectory() == null)
        throw new IOException("No directory/source has been specified");

    String directoryPath = getDirectory().getAbsolutePath();
    ArrayList<String> classes = new ArrayList<String>();
    Enumeration enm = getStructure().classAttribute().enumerateValues();
    while (enm.hasMoreElements())
        classes.add((String) enm.nextElement());

    Instances data = getStructure();
    int fileCount = 0;
    for (int k = 0; k < classes.size(); k++) {
        String subdirPath = (String) classes.get(k);
        File subdir = new File(directoryPath + File.separator + subdirPath);
        String[] files = subdir.list();
        for (int j = 0; j < files.length; j++) {
            try {
                fileCount++;
                if (getDebug())
                    System.err.println("processing " + fileCount + " : " + subdirPath + " : " + files[j]);

                double[] newInst = null;
                if (m_OutputFilename)
                    newInst = new double[3];
                else
                    newInst = new double[2];
                File txt = new File(directoryPath + File.separator + subdirPath + File.separator + files[j]);
                BufferedInputStream is;
                is = new BufferedInputStream(new FileInputStream(txt));
                StringBuffer txtStr = new StringBuffer();
                int c;
                /*
                 * while ((c = is.read()) != -1) { txtStr.append((char) c); }
                 */
                // FileReader fr = new FileReader(txt);
                BufferedReader br = new BufferedReader(
                        new InputStreamReader(new FileInputStream(txt), "UTF-8"));
                String line;
                while ((line = br.readLine()) != null) {
                    txtStr.append(line + "\n");
                }

                newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
                if (m_OutputFilename)
                    newInst[1] = (double) data.attribute(1)
                            .addStringValue(subdirPath + File.separator + files[j]);
                newInst[data.classIndex()] = (double) k;
                data.add(new DenseInstance(1.0, newInst));
                is.close();
            } catch (Exception e) {
                System.err.println("failed to convert file: " + directoryPath + File.separator
                        + subdirPath + File.separator + files[j]);
            }
        }
    }
    return data;
}
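A hypothetical driver for this loader, assuming it keeps the setDirectory/getDataSet surface of Weka's stock weka.core.converters.TextDirectoryLoader (only getDirectory appears in the excerpt above, so setDirectory is an assumption here):

    // Hypothetical usage; setDirectory(File) is assumed to exist alongside
    // the getDirectory() seen in the excerpt, as in Weka's stock loader.
    TextDirectoryLoader loader = new TextDirectoryLoader();
    loader.setDirectory(new java.io.File("corpus")); // one subdirectory per class
    Instances data = loader.getDataSet();
    System.out.println(data.numInstances() + " documents loaded");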
From source file:lineage.AAFClusterer.java
License:Open Source License
public Instances convertMatrixToWeka(double[][] data, int numObs, int numFeatures) {
    // convert the data to WEKA format
    FastVector atts = new FastVector();
    for (int i = 0; i < numFeatures; i++) {
        atts.addElement(new Attribute("Feature" + i, i));
    }

    Instances ds = new Instances("AAF Data", atts, numObs);
    for (int i = 0; i < numObs; i++) {
        ds.add(new Instance(numFeatures));
    }
    for (int i = 0; i < numFeatures; i++) {
        for (int j = 0; j < numObs; j++) {
            ds.instance(j).setValue(i, data[j][i]);
        }
    }
    return ds;
}
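A hypothetical call to the method above, with a small 3-observation, 2-feature matrix (the no-argument AAFClusterer constructor is an assumption made for illustration):

    // Hypothetical usage of convertMatrixToWeka; the constructor is assumed.
    double[][] aaf = {
            { 0.10, 0.45 },
            { 0.20, 0.40 },
            { 0.80, 0.05 }
    };
    Instances ds = new AAFClusterer().convertMatrixToWeka(aaf, 3, 2);
    System.out.println(ds.numInstances()); // prints 3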
From source file:linqs.gaia.model.oc.ncc.WekaClassifier.java
License:Open Source License
/**
 * Create Weka instance
 *
 * @param instances Weka instances
 * @param di        Decorable item to convert
 * @param ispredict Is this item created for training or testing
 */
private void createInstance(Instances instances, Decorable di, boolean ispredict) {
    double[] instvalues = new double[attinfosize];
    int attindex = 0;
    Schema schema = di.getSchema();
    for (String fid : featureids) {
        FeatureValue fvalue = di.getFeatureValue(fid);
        Attribute a = instances.attribute(attindex);
        Feature f = schema.getFeature(fid);

        if (!(f instanceof CompositeFeature)) {
            // Handle non multi-valued feature
            instvalues[attindex] = this.gaiavalues2weka(f, fid, fvalue, a, ispredict);
            attindex++;
        } else {
            // Handle multi-valued feature
            CompositeFeature mv = (CompositeFeature) f;
            UnmodifiableList<SimplePair<String, CVFeature>> mvfeatures = mv.getFeatures();
            CompositeValue mvvalue = (CompositeValue) di.getFeatureValue(fid);
            UnmodifiableList<FeatureValue> mvfvalues = mvvalue.getFeatureValues();
            int num = mvfvalues.size();
            for (int j = 0; j < num; j++) {
                if (fvalue.equals(FeatureValue.UNKNOWN_VALUE)) {
                    attindex++;
                    continue;
                }
                a = instances.attribute(attindex);
                f = mvfeatures.get(j).getSecond();
                fvalue = mvfvalues.get(j);
                instvalues[attindex] = this.gaiavalues2weka(f, fid, fvalue, a, ispredict);
                attindex++;
            }
        }
    }

    // Create instance of weight 1 and the specified values
    Instance inst = new SparseInstance(1, instvalues);
    inst.setDataset(instances);
    instances.add(inst);
}
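This example stores its values through a SparseInstance, which keeps only non-zero entries. A minimal, self-contained sketch of that add pattern (Weka 3.6-era API to match the style here; attribute names invented):

    import weka.core.Attribute;
    import weka.core.FastVector;
    import weka.core.Instance;
    import weka.core.Instances;
    import weka.core.SparseInstance;

    public class SparseAddSketch {
        public static void main(String[] args) {
            FastVector atts = new FastVector(3);
            atts.addElement(new Attribute("f0"));
            atts.addElement(new Attribute("f1"));
            atts.addElement(new Attribute("f2"));
            Instances data = new Instances("sparse_demo", atts, 0);

            // Zero entries are not stored explicitly by SparseInstance
            Instance inst = new SparseInstance(1, new double[] { 0.0, 5.0, 0.0 });
            inst.setDataset(data);
            data.add(inst);

            System.out.println(data.instance(0)); // prints the sparse form {1 5}
        }
    }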
From source file:lu.lippmann.cdb.datasetview.tabs.TimeSeriesSimilarityPanel.java
License:Open Source License
private Instances buildFeatureDS(final Instances dataSet) throws Exception {
    final int numAttributes = dataSet.numAttributes();

    final java.util.List<String> namesOfFeaturesToConsider = new ArrayList<String>();
    namesOfFeaturesToConsider.addAll(WekaDataStatsUtil.getAttributeNames(dataSet));
    namesOfFeaturesToConsider.removeAll(WekaDataStatsUtil.getDateAttributeNames(dataSet));

    final double[][] simMatrix = new double[numAttributes][numAttributes];
    for (int i = 0; i < numAttributes; i++) {
        final double[] arrayI = dataSet.attributeToDoubleArray(i);
        if (this.withNormalization)
            MathsUtil.normalize(arrayI);
        simMatrix[i][i] = 0d;
        for (int j = i + 1; j < numAttributes; j++) {
            final double[] arrayJ = dataSet.attributeToDoubleArray(j);
            if (this.withNormalization)
                MathsUtil.normalize(arrayJ);
            simMatrix[i][j] = new DynamicTimeWarping(arrayI, arrayJ).getDistance();
        }
    }
    // Mirror the upper triangle into the lower triangle
    for (int i = 0; i < numAttributes; i++) {
        for (int j = 0; j < i + 1; j++) {
            simMatrix[i][j] = simMatrix[j][i];
        }
    }

    final ArrayList<Attribute> attrs = new ArrayList<Attribute>(numAttributes + 1);
    for (int i = 0; i < numAttributes; i++) {
        attrs.add(new Attribute(dataSet.attribute(i).name() + "-feat"));
    }
    attrs.add(new Attribute(FEATUREDESC_ATTRNAME, namesOfFeaturesToConsider));

    final Instances ds = new Instances("featuresComparisonDs", attrs, 0);
    ds.setClassIndex(attrs.size() - 1);
    for (int i = 0; i < simMatrix.length; i++) {
        final DenseInstance di = new DenseInstance(1.0d,
                ArraysUtil.concat(simMatrix[i], new double[] { 0d }));
        di.setDataset(ds);
        di.setValue(simMatrix.length, dataSet.attribute(i).name());
        ds.add(di);
    }
    return ds;
}
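One detail of this example worth isolating: setValue(int, String) for the nominal class attribute only works after setDataset has been called, because the string label must be resolved against the attribute's value list. A minimal sketch of just that pattern (Weka 3.7+ API, matching the DenseInstance used here; names invented):

    import java.util.ArrayList;
    import java.util.Arrays;

    import weka.core.Attribute;
    import weka.core.DenseInstance;
    import weka.core.Instances;

    public class NominalByLabelSketch {
        public static void main(String[] args) {
            ArrayList<Attribute> attrs = new ArrayList<Attribute>();
            attrs.add(new Attribute("dist"));
            attrs.add(new Attribute("label", Arrays.asList("a", "b")));
            Instances ds = new Instances("demo", attrs, 0);
            ds.setClassIndex(1);

            DenseInstance di = new DenseInstance(1.0, new double[] { 0.5, 0.0 });
            di.setDataset(ds);   // required before setValue(int, String)
            di.setValue(1, "b"); // resolves "b" against the attribute's labels
            ds.add(di);
        }
    }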
From source file:lu.lippmann.cdb.datasetview.tabs.UnsupervisedFeatureEvaluationTabView.java
License:Open Source License
private static Instances buildDerivatedDataset(final Instances dataSet, final List<String> possibleValues,
        final List<Integer> valueForEachFeature) throws Exception {
    final int numInstances = dataSet.numInstances();

    final ArrayList<Attribute> attrs = new ArrayList<Attribute>(numInstances + 2);
    attrs.add(new Attribute(FEATUREDESC_ATTRNAME, (java.util.List<String>) null));
    for (int i = 0; i < numInstances; i++) {
        attrs.add(new Attribute(i + "_eval"));
    }
    attrs.add(new Attribute("__", possibleValues));

    final Instances newds = new Instances("unsupervisedFeaturesEval", attrs, 0);

    final int numAttributes = dataSet.numAttributes();
    for (int j = 0; j < numAttributes; j++) {
        double[] val = ArraysUtil.concat(dataSet.attributeToDoubleArray(j), new double[] { 0.0d });
        val = ArraysUtil.concat(new double[] { 0.0d }, val);
        newds.add(new DenseInstance(1.0d, val));
    }
    for (int j = 0; j < numAttributes; j++) {
        newds.instance(j).setValue(0, dataSet.attribute(j).name());
        newds.instance(j).setValue(numInstances + 1, possibleValues.get(valueForEachFeature.get(j)));
    }
    newds.setClassIndex(numInstances + 1);
    return newds;
}
From source file:lu.lippmann.cdb.ext.hydviga.cbr.GapFillingKnowledgeDB.java
License:Open Source License
public static Instances getKnowledgeDBWithBestCasesOnly() throws Exception {
    final Instances newds = new Instances(DATABASE, 0);
    final int numInstances = DATABASE.numInstances();
    final int numAttributes = DATABASE.numAttributes();
    for (int i = 0; i < numInstances; i++) {
        if (DATABASE.instance(i).stringValue(numAttributes - 1).equals("true"))
            newds.add(DATABASE.instance(i));
    }
    return newds;
}
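The new Instances(DATABASE, 0) idiom above copies only the header (relation name and attributes) without any rows; add then appends the selected instances. Generalized into a small sketch (method and parameter names invented for illustration):

    import weka.core.Instances;

    public class BestCasesSketch {
        // Header-only copy via new Instances(src, 0), then selective add.
        static Instances keepTrueRows(Instances src) {
            Instances out = new Instances(src, 0); // copies structure, no rows
            int last = src.numAttributes() - 1;
            for (int i = 0; i < src.numInstances(); i++) {
                if (src.instance(i).stringValue(last).equals("true"))
                    out.add(src.instance(i));
            }
            return out;
        }
    }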