Example usage for weka.core Instances instance

List of usage examples for weka.core Instances instance

Introduction

On this page you can find example usages for the weka.core Instances method instance(int index).

Prototype



public Instance instance(int index)

Document

Returns the instance at the given position.
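
Before the longer examples, here is a minimal, self-contained sketch of the call. It assumes the Weka 3.6-era API (FastVector and the concrete Instance class) that the snippets below use; the class, dataset, and attribute names are illustrative only.

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

public class InstanceAccessDemo {
    public static void main(String[] args) {
        // Build a tiny dataset with one numeric attribute
        FastVector atts = new FastVector(1);
        atts.addElement(new Attribute("value"));
        Instances data = new Instances("demo", atts, 0);
        data.add(new Instance(1.0, new double[] { 42.0 }));

        // instance(int) returns the row at the given 0-based position;
        // an index outside [0, numInstances() - 1] fails at runtime
        Instance first = data.instance(0);
        System.out.println(first.value(0)); // prints 42.0
    }
}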

Usage

From source file:com.openkm.kea.metadata.SubjectExtractor.java

License:Open Source License

/**
 * extractSuggestedSubjects
 *
 * @param documentText the text of the document to analyse
 * @return a list of suggested subject phrases, best-ranked first
 */
public List<String> extractSuggestedSubjects(String documentText) {

    Date start, stop;

    start = new Date();
    List<String> subjects = new ArrayList<String>();
    // Attribute structure KEA expects: document text, keyphrases, filename
    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances unknownDataStructure = new Instances("keyphrase_training_data", atts, 0);

    try {
        // this is the extraction process part - not too well understood yet
        // "unknownDataStructure" is called "instances" in the original KEA code
        double[] unknownStructure = new double[2];
        unknownStructure[0] = (double) unknownDataStructure.attribute(0).addStringValue(documentText);
        unknownStructure[1] = Instance.missingValue(); // this part used for existing subjects - we have none
        unknownDataStructure.add(new Instance(1.0, unknownStructure));
        filter.input(unknownDataStructure.instance(0));
        unknownDataStructure = unknownDataStructure.stringFreeStructure(); // reset to an empty, string-free copy

        // this is getting the results out - better understood
        Instance[] rankedSubjects = new Instance[this.subjectNumLimit];
        Instance subject;
        while ((subject = filter.output()) != null) {
            int index = (int) subject.value(filter.getRankIndex()) - 1;
            if (index < subjectNumLimit) {
                rankedSubjects[index] = subject;
            }
        }
        for (int i = 0; i < subjectNumLimit; i++) {
            if (rankedSubjects[i] != null) {
                subjects.add(rankedSubjects[i].stringValue(filter.getUnstemmedPhraseIndex()));
            }
        }

    } catch (Exception e) {
        log.error("problem in subject extraction: ", e);
    } finally {
        stop = new Date();
        long time = (stop.getTime() - start.getTime());
        log.info("Subject extraction completed in " + time + "ms");
    }

    return subjects;
}

From source file:com.openkm.kea.modelcreator.KEAKeyphraseExtractor.java

License:Open Source License

/**
 * Extracts keyphrases from the documents referenced by the given stems.
 */
public void extractKeyphrases(Hashtable<String, Double> stems) throws Exception {
    Vector<Double> stats = new Vector<Double>();

    // Check whether there is actually any data,
    // i.e. whether there were any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    m_KEAFilter.setNumPhrases(m_numPhrases);
    m_KEAFilter.setVocabulary(m_vocabulary);
    m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setStemmer(m_Stemmer);
    m_KEAFilter.setStopwords(m_Stopwords);

    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    log.info("-- Extracting Keyphrases... ");
    // Extract keyphrases
    Enumeration<String> elem = stems.keys();
    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = elem.nextElement();

        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }

            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());

        } catch (Exception e) {
            if (m_debug) {
                log.debug("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;

            // keyStr = keyphrases in the str.key file.
            // KEA assumes that these keyphrases were assigned by the
            // author and evaluates extracted keyphrases against them.

            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }

            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                log.debug("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));

        m_KEAFilter.input(data.instance(0));

        data = data.stringFreeStructure();
        if (m_debug) {
            log.debug("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = m_KEAFilter.output()) != null) {

            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;

            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;

            }
        }

        if (m_debug) {
            log.debug("-- Keyphrases and feature values:");
        }

        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));

            } else {
                printer = new PrintWriter(out);
            }
        }

        double numExtracted = 0, numCorrect = 0;

        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                // My addition: to exclude low ranking phrases
                double rank = topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex());

                if (rank >= 0.00) {
                    if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                        numExtracted += 1.0;
                    }
                    if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                        numCorrect += 1.0;
                    }
                    if (printer != null) {
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));

                        if (m_AdditionalInfo) {
                            printer.print("\t");
                            printer.print(
                                    topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                            printer.print("\t");
                            printer.print(Utils.doubleToString(
                                    topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                        }
                        printer.println();
                    }
                    if (m_debug) {
                        log.debug("" + topRankedInstances[i]);
                    }
                }
            }
        }

        if (numExtracted > 0) {
            if (m_debug) {
                log.debug("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }

        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }

    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));

    log.info("Avg. number of matching keyphrases compared to existing ones : " + Utils.doubleToString(avg, 2)
            + " +/- " + Utils.doubleToString(stdDev, 2));
    log.info("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}

From source file:com.openkm.kea.modelcreator.KEAModelBuilder.java

License:Open Source License

/**
 * Builds the model from the files.
 */
public void buildModel(Hashtable<String, Double> stems, Stopwords stopwords) throws Exception {
    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build model
    m_KEAFilter = new KEAFilter(stopwords);

    m_KEAFilter.setDebug(m_debug);
    m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
    m_KEAFilter.setKFused(getUseKFrequency());

    m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
    m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
    m_KEAFilter.setMinNumOccur(getMinNumOccur());
    m_KEAFilter.setStemmer(getStemmer());
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setVocabulary(getVocabulary());
    m_KEAFilter.setVocabularyFormat(getVocabularyFormat());
    m_KEAFilter.setStopwords(getStopwords());
    m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());
    m_KEAFilter.setInputFormat(data);

    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(getStemmer(), getStopwords());
    }
    m_KEAFilter.setNumFeature();

    log.info("-- Reading the Documents... ");

    Enumeration<String> elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = elem.nextElement();

        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            log.error("Can't find document for stem " + str + ".");
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            log.error("Can't find keyphrases for stem " + str + ".");
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    m_KEAFilter.batchFinished();

    // Drain and discard any remaining output from the filter
    while (m_KEAFilter.output() != null) {
    }
}

From source file:com.rapidminer.operator.learner.clustering.clusterer.WekaCluster.java

License:Open Source License

public ExampleSet apply(ExampleSet exampleSet) throws OperatorException {
    log("Converting to Weka instances.");
    Instances instances = WekaTools.toWekaInstances(exampleSet, "ClusterInstances",
            WekaInstancesAdaptor.CLUSTERING);
    log("Applying Weka clusterer.");
    int i = 0;
    Attribute clusterAtt = exampleSet.getAttributes().getCluster();
    if (clusterAtt == null)
        clusterAtt = Tools.createSpecialAttribute(exampleSet, Attributes.CLUSTER_NAME, Ontology.NOMINAL);
    Iterator<Example> r = exampleSet.iterator();
    while (r.hasNext()) {
        Example e = r.next();
        Instance instance = instances.instance(i++);
        applyModelForInstance(instance, e, clusterAtt);
    }

    return exampleSet;
}

From source file:com.rapidminer.tools.WekaTools.java

License:Open Source License

/**
 * Creates a RapidMiner example set from Weka instances. Only a label can be used
 * as special attributes, other types of special attributes are not
 * supported. If <code>attributeNamePrefix</code> is not null, the given
 * string prefix plus a number is used as attribute names.
 */
public static ExampleSet toRapidMinerExampleSet(Instances instances, String attributeNamePrefix,
        int datamanagement) {
    int classIndex = instances.classIndex();

    // create example table

    // 1. Extract attributes
    List<Attribute> attributes = new ArrayList<Attribute>();
    int number = 1; // used for attribute names
    for (int i = 0; i < instances.numAttributes(); i++) {
        weka.core.Attribute wekaAttribute = instances.attribute(i);
        int rapidMinerAttributeValueType = Ontology.REAL;
        if (wekaAttribute.isNominal())
            rapidMinerAttributeValueType = Ontology.NOMINAL;
        else if (wekaAttribute.isString())
            rapidMinerAttributeValueType = Ontology.STRING;
        Attribute attribute = AttributeFactory.createAttribute(wekaAttribute.name(),
                rapidMinerAttributeValueType);
        if ((i != classIndex) && (attributeNamePrefix != null) && (attributeNamePrefix.length() > 0)) {
            attribute.setName(attributeNamePrefix + "_" + (number++));
        }
        if (wekaAttribute.isNominal()) {
            for (int a = 0; a < wekaAttribute.numValues(); a++) {
                String nominalValue = wekaAttribute.value(a);
                attribute.getMapping().mapString(nominalValue);
            }
        }
        attributes.add(attribute);
    }

    Attribute label = null;
    if (classIndex >= 0) {
        label = attributes.get(classIndex);
        label.setName("label");
    }

    // 2. Guarantee alphabetical mapping to numbers
    for (int j = 0; j < attributes.size(); j++) {
        Attribute attribute = attributes.get(j);
        if (attribute.isNominal())
            attribute.getMapping().sortMappings();
    }

    // 3. Read data
    MemoryExampleTable table = new MemoryExampleTable(attributes);
    DataRowFactory factory = new DataRowFactory(datamanagement, '.');
    // create data
    List<DataRow> dataList = new LinkedList<DataRow>();
    int numberOfRapidMinerAttributes = instances.numAttributes();
    for (int i = 0; i < instances.numInstances(); i++) {
        Instance instance = instances.instance(i);
        DataRow dataRow = factory.create(numberOfRapidMinerAttributes);
        for (int a = 0; a < instances.numAttributes(); a++) {
            Attribute attribute = table.getAttribute(a);
            double wekaValue = instance.value(a);
            if (attribute.isNominal()) {
                String nominalValue = instances.attribute(a).value((int) wekaValue);
                dataRow.set(attribute, attribute.getMapping().mapString(nominalValue));
            } else {
                dataRow.set(attribute, wekaValue);
            }
        }
        dataRow.trim();
        dataList.add(dataRow);
    }

    // read the rows; the label attribute is handled separately below
    table.readExamples(new ListDataRowReader(dataList.iterator()));

    // create and return example set
    return table.createExampleSet(label);
}

From source file:com.reactivetechnologies.analytics.core.eval.AdaBoostM1WithBuiltClassifiers.java

License:Open Source License

@Override
protected void buildClassifierUsingResampling(Instances data) throws Exception {

    Instances trainData, training;
    double epsilon, reweight, sumProbs;
    Evaluation evaluation;
    int numInstances = data.numInstances();
    int resamplingIterations = 0;

    // Initialize data
    m_Betas = new double[m_Classifiers.length];
    m_NumIterationsPerformed = 0;
    // Create a copy of the data so that reweighting the instances here
    // doesn't affect the weights seen by anyone else
    training = new Instances(data, 0, numInstances);
    sumProbs = training.sumOfWeights();
    for (int i = 0; i < training.numInstances(); i++) {
        training.instance(i).setWeight(training.instance(i).weight() / sumProbs);
    }

    // Do boosting iterations (with resampling)
    for (m_NumIterationsPerformed = 0; m_NumIterationsPerformed < m_Classifiers.length; m_NumIterationsPerformed++) {
        if (m_Debug) {
            System.err.println("Training classifier " + (m_NumIterationsPerformed + 1));
        }

        // Select instances to train the classifier on
        if (m_WeightThreshold < 100) {
            trainData = selectWeightQuantile(training, (double) m_WeightThreshold / 100);
        } else {
            trainData = new Instances(training);
        }

        // Resample
        resamplingIterations = 0;
        double[] weights = new double[trainData.numInstances()];
        for (int i = 0; i < weights.length; i++) {
            weights[i] = trainData.instance(i).weight();
        }
        do {

            /** Changed here: DO NOT build classifier*/
            // Build and evaluate classifier
            //m_Classifiers[m_NumIterationsPerformed].buildClassifier(sample);
            /** End change */

            evaluation = new Evaluation(data);
            evaluation.evaluateModel(m_Classifiers[m_NumIterationsPerformed], training);
            epsilon = evaluation.errorRate();
            resamplingIterations++;
        } while (Utils.eq(epsilon, 0) && (resamplingIterations < 10));

        // Stop if error too big or 0
        if (Utils.grOrEq(epsilon, 0.5) || Utils.eq(epsilon, 0)) {
            if (m_NumIterationsPerformed == 0) {
                m_NumIterationsPerformed = 1; // If we're the first we have to use it
            }
            break;
        }

        // Determine the weight to assign to this model
        m_Betas[m_NumIterationsPerformed] = Math.log((1 - epsilon) / epsilon);
        reweight = (1 - epsilon) / epsilon;
        if (m_Debug) {
            System.err.println("\terror rate = " + epsilon + "  beta = " + m_Betas[m_NumIterationsPerformed]);
        }

        // Update instance weights
        setWeights(training, reweight);
    }
}

From source file:com.reactivetechnologies.analytics.core.eval.BaggingWithBuiltClassifiers.java

License:Open Source License

@Override
public void buildClassifier(Instances data) throws Exception {

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class
    data = new Instances(data);
    data.deleteWithMissingClass();

    /** Changed here: Use supplied classifier */
    //super.buildClassifier(data);
    /** End change */

    if (m_CalcOutOfBag && (m_BagSizePercent != 100)) {
        throw new IllegalArgumentException(
                "Bag size needs to be 100% if " + "out-of-bag error is to be calculated!");
    }

    int bagSize = (int) (data.numInstances() * (m_BagSizePercent / 100.0));
    Random random = new Random(m_Seed);

    boolean[][] inBag = null;
    if (m_CalcOutOfBag)
        inBag = new boolean[m_Classifiers.length][];

    for (int j = 0; j < m_Classifiers.length; j++) {
        Instances bagData = null;

        // create the in-bag dataset
        if (m_CalcOutOfBag) {
            inBag[j] = new boolean[data.numInstances()];
            bagData = data.resampleWithWeights(random, inBag[j]);
        } else {
            bagData = data.resampleWithWeights(random);
            if (bagSize < data.numInstances()) {
                bagData.randomize(random);
                Instances newBagData = new Instances(bagData, 0, bagSize);
                bagData = newBagData;
            }
        }

        /** Changed here: Use supplied classifier */
        /*if (m_Classifier instanceof Randomizable) {
          ((Randomizable) m_Classifiers[j]).setSeed(random.nextInt());
        }
                
        // build the classifier
        m_Classifiers[j].buildClassifier(bagData);*/
        /** End change */
    }

    // calc OOB error?
    if (getCalcOutOfBag()) {
        double outOfBagCount = 0.0;
        double errorSum = 0.0;
        boolean numeric = data.classAttribute().isNumeric();

        for (int i = 0; i < data.numInstances(); i++) {
            double vote;
            double[] votes;
            if (numeric)
                votes = new double[1];
            else
                votes = new double[data.numClasses()];

            // determine predictions for instance
            int voteCount = 0;
            for (int j = 0; j < m_Classifiers.length; j++) {
                if (inBag[j][i])
                    continue;

                voteCount++;
                // double pred = m_Classifiers[j].classifyInstance(data.instance(i));
                if (numeric) {
                    // votes[0] += pred;
                    votes[0] += m_Classifiers[j].classifyInstance(data.instance(i));
                } else {
                    // votes[(int) pred]++;
                    double[] newProbs = m_Classifiers[j].distributionForInstance(data.instance(i));
                    // average the probability estimates
                    for (int k = 0; k < newProbs.length; k++) {
                        votes[k] += newProbs[k];
                    }
                }
            }

            // "vote"
            if (numeric) {
                vote = votes[0];
                if (voteCount > 0) {
                    vote /= voteCount; // average
                }
            } else {
                if (!Utils.eq(Utils.sum(votes), 0)) {
                    Utils.normalize(votes);
                }
                vote = Utils.maxIndex(votes); // predicted class
            }

            // error for instance
            outOfBagCount += data.instance(i).weight();
            if (numeric) {
                errorSum += StrictMath.abs(vote - data.instance(i).classValue()) * data.instance(i).weight();
            } else {
                if (vote != data.instance(i).classValue())
                    errorSum += data.instance(i).weight();
            }
        }

        m_OutOfBagError = errorSum / outOfBagCount;
    } else {
        m_OutOfBagError = 0;
    }
}

From source file:com.reactivetechnologies.analytics.core.eval.StackingWithBuiltClassifiers.java

License:Open Source License

/**
 * Generates the meta data.
 *
 * @param newData the data to work on
 * @param random the random number generator to use for cross-validation
 * @throws Exception if generation fails
 */
@Override
protected void generateMetaLevel(Instances newData, Random random) throws Exception {

    Instances metaData = metaFormat(newData);
    m_MetaFormat = new Instances(metaData, 0);
    for (int j = 0; j < m_NumFolds; j++) {

        /** Changed here */
        //Instances train = newData.trainCV(m_NumFolds, j, random);
        // DO NOT Build base classifiers
        /*for (int i = 0; i < m_Classifiers.length; i++) {
            getClassifier(i).buildClassifier(train);
        }*/
        /** End change */

        // Classify test instances and add to meta data
        Instances test = newData.testCV(m_NumFolds, j);
        for (int i = 0; i < test.numInstances(); i++) {
            metaData.add(metaInstance(test.instance(i)));
        }
    }

    m_MetaClassifier.buildClassifier(metaData);
}

From source file:com.sliit.normalize.NormalizeDataset.java

public String normalizeDataset() {
    System.out.println("start normalizing data");

    String filePathOut = "";
    try {

        CSVLoader loader = new CSVLoader();
        if (reducedDiemensionFile != null) {

            loader.setSource(reducedDiemensionFile);
        } else {
            if (tempFIle != null && tempFIle.exists()) {

                loader.setSource(tempFIle);
            } else {

                loader.setSource(csvFile);
            }
        }
        Instances dataInstance = loader.getDataSet();
        Normalize normalize = new Normalize();
        dataInstance.setClassIndex(dataInstance.numAttributes() - 1);
        normalize.setInputFormat(dataInstance);
        String directory = csvFile.getParent();
        outputFile = new File(directory + "/" + "normalized" + csvFile.getName());
        if (!outputFile.exists()) {

            outputFile.createNewFile();
        }
        CSVSaver saver = new CSVSaver();
        saver.setFile(outputFile);
        for (int i = 0; i < dataInstance.numInstances(); i++) { // start at 0 so the first row is included

            normalize.input(dataInstance.instance(i));
        }
        normalize.batchFinished();
        Instances outPut = new Instances(dataInstance, 0);
        for (int i = 0; i < dataInstance.numInstances(); i++) { // keep in step with the input loop above

            outPut.add(normalize.output());
        }
        Attribute attribute = dataInstance.attribute(outPut.numAttributes() - 1);
        for (int j = 0; j < attribute.numValues(); j++) {

            if (attribute.value(j).equals("normal.")) {
                outPut.renameAttributeValue(attribute, attribute.value(j), "0");
            } else {
                outPut.renameAttributeValue(attribute, attribute.value(j), "1");
            }
        }
        saver.setInstances(outPut);
        saver.writeBatch();
        writeToNewFile(directory);
        filePathOut = directory + "norm" + csvFile.getName();
        if (tempFIle != null) {

            tempFIle.delete();
        }
        if (reducedDiemensionFile != null) {

            reducedDiemensionFile.delete();
        }
        outputFile.delete();
    } catch (Exception e) {
        log.error("Error occurred: " + e.getMessage());
    }
    return filePathOut;
}

From source file:com.sliit.normalize.NormalizeDataset.java

public int whiteningData() {
    System.out.println("whiteningData");

    int nums = 0;
    try {

        if (tempFIle != null && tempFIle.exists()) {

            csv.setSource(tempFIle);
        } else {

            csv.setSource(csvFile);
        }
        Instances instances = csv.getDataSet();
        if (instances.numAttributes() > 10) {
            instances.setClassIndex(instances.numAttributes() - 1);
            RandomProjection random = new RandomProjection();
            random.setDistribution(
                    new SelectedTag(RandomProjection.GAUSSIAN, RandomProjection.TAGS_DSTRS_TYPE));
            reducedDiemensionFile = new File(csvFile.getParent() + "/tempwhite.csv");
            if (!reducedDiemensionFile.exists()) {

                reducedDiemensionFile.createNewFile();
            }
            // CSVSaver saver = new CSVSaver();
            /// saver.setFile(reducedDiemensionFile);
            // Configure the projection once, before setInputFormat,
            // rather than per instance inside the loop
            random.setNumberOfAttributes(10);
            random.setReplaceMissingValues(true);
            random.setInputFormat(instances);
            //saver.setRetrieval(AbstractSaver.INCREMENTAL);
            BufferedWriter writer = new BufferedWriter(new FileWriter(reducedDiemensionFile));
            for (int i = 0; i < instances.numInstances(); i++) {

                random.input(instances.instance(i));
                writer.write(random.output().toString());
                writer.newLine();
                //saver.writeIncremental(random.output());
            }
            writer.flush();
            writer.close();
            nums = random.getNumberOfAttributes();
        } else {

            nums = instances.numAttributes();
        }
    } catch (Exception e) {
        log.error("Error occurred: " + e.getMessage());
    }
    return nums;
}