List of usage examples for weka.core.Instances.add
@Override public boolean add(Instance instance)
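Before the examples from real projects, here is a minimal, self-contained sketch of the call using the Weka 3.7+ API. The attribute names and values are illustrative only, not taken from any source file below. Note that add() appends a shallow copy of the instance, so the caller's object can be reused afterwards.

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class InstancesAddExample {
    public static void main(String[] args) {
        // two numeric attributes plus a nominal class (names are illustrative)
        ArrayList<String> classValues = new ArrayList<String>();
        classValues.add("yes");
        classValues.add("no");
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("width"));
        atts.add(new Attribute("height"));
        atts.add(new Attribute("class", classValues));

        Instances data = new Instances("example", atts, 0);
        data.setClassIndex(data.numAttributes() - 1);

        // add() appends a shallow copy of the instance to the dataset
        double[] vals = { 1.5, 2.0, classValues.indexOf("yes") };
        data.add(new DenseInstance(1.0, vals));
        System.out.println(data);
    }
}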
From source file: cn.ict.zyq.bestConf.COMT2.COMT2.java
License: Open Source License
private void train() throws Exception {
    models = new M5P[ModelNum];
    for (int i = 0; i < ModelNum; i++) {
        models[i] = buildModel(labeledInstances, M[i]);
    }
    for (int i = 0; i < this.comtIterations; i++) {
        ArrayList<Instance>[] InstancePiSet = new ArrayList[ModelNum];
        for (int j = 0; j < ModelNum; j++)
            InstancePiSet[j] = new ArrayList<Instance>();
        for (int m = 0; m < ModelNum; m++) {
            double maxDelta = 0;
            Instance maxDeltaXY = null;
            Enumeration<Instance> enu = this.unlabeledInstances.enumerateInstances();
            while (enu.hasMoreElements()) {
                Instance ulIns = enu.nextElement();
                Instances omega = getSiblings(models[m], ulIns);
                double y = models[m].classifyInstance(ulIns);
                if (indexOfClass == -1)
                    indexOfClass = labeledInstances.classIndex();
                ulIns.setValue(indexOfClass, y);
                Instances instancesPi = new Instances(models[m].getM5RootNode().zyqGetTrainingSet());
                instancesPi.add(ulIns);
                M5P modelPi = buildModel(instancesPi, M[m]);
                double delta = computeOmegaDelta(models[m], modelPi, omega);
                if (maxDelta < delta) {
                    maxDelta = delta;
                    maxDeltaXY = ulIns;
                }
            }
            // now check facts about delta
            if (maxDelta > 0) {
                InstancePiSet[m].add(maxDeltaXY);
                this.unlabeledInstances.delete(this.unlabeledInstances.indexOf(maxDeltaXY));
            }
        }
        // check for both models
        boolean toExit = true;
        for (int m = 0; m < ModelNum; m++) {
            if (InstancePiSet[m].size() > 0) {
                toExit = false;
                break;
            }
        }
        if (toExit)
            break;
        else {
            // update the models
            int toGen = 0;
            for (int m = 0; m < ModelNum; m++) {
                Instances set = models[m].getM5RootNode().zyqGetTrainingSet();
                toGen += InstancePiSet[m].size();
                for (Instance ins : InstancePiSet[m])
                    set.add(ins);
                models[m] = buildModel(set, M[m]);
            }
            // replenish pool U' to size p
            Instances toAdd = retrieveMore(toGen);
            unlabeledInstances.addAll(toAdd);
        }
        // we will go to another round of iteration
    } // iterate for a number of rounds or break out on empty InstancePiSet
    // now we have the model as y = 0.5 * sum(models[m].predict(x))
}
From source file: cn.ict.zyq.bestConf.COMT2.COMT2.java
License: Open Source License
private Instances retrieveMore(int toGen) {
    Instances retval = new Instances(this.unlabeldPool, toGen);
    for (int i = 0; i < toGen; i++) {
        retval.add(this.unlabeldPool.remove(rand.nextInt(this.unlabeldPool.size())));
    }
    return retval;
}
From source file: cn.ict.zyq.bestConf.util.LHSInitializer.java
License: Open Source License
/**
 * Assumptions: (1) numeric attributes are continuous and have lower/upper bounds;
 * (2) nominal attributes have permutable domains.
 *
 * @param useMid true to use the middle point of a subdomain, false to use a random point within a subdomain
 */
public static Instances getMultiDim(ArrayList<Attribute> atts, int sampleSetSize, boolean useMid) {
    int L = Math.min(7, Math.max(sampleSetSize, atts.size())); // 7 is chosen for no special reason
    double maxMinDist = 0, crntMinDist; // works as the threshold to select the sample set
    ArrayList<Integer>[] setWithMaxMinDist = null;
    // generate L sets of sampleSetSize points
    for (int i = 0; i < L; i++) {
        ArrayList<Integer>[] setPerm = generateOneSampleSet(sampleSetSize, atts.size());
        // compute the minimum distance minDist between any sample pair for each set
        crntMinDist = minDistForSet(setPerm);
        // select the set with the maximum minDist
        if (crntMinDist > maxMinDist) {
            setWithMaxMinDist = setPerm;
            maxMinDist = crntMinDist;
        }
    }
    // generate and output the set with the maximum minDist as the result
    // first, divide the domain of each attribute into sampleSetSize equal subdomains
    double[][] bounds = new double[atts.size()][sampleSetSize + 1]; // sampleSetSize+1 to include the lower and upper bounds
    Iterator<Attribute> itr = atts.iterator();
    Attribute crntAttr;
    double pace;
    for (int i = 0; i < bounds.length; i++) {
        crntAttr = itr.next();
        if (crntAttr.isNumeric()) {
            bounds[i][0] = crntAttr.getLowerNumericBound();
            bounds[i][sampleSetSize] = crntAttr.getUpperNumericBound();
            pace = (crntAttr.getUpperNumericBound() - crntAttr.getLowerNumericBound()) / sampleSetSize;
            for (int j = 1; j < sampleSetSize; j++) {
                bounds[i][j] = bounds[i][j - 1] + pace;
            }
        } else { // crntAttr.isNominal()
            if (crntAttr.numValues() >= sampleSetSize) {
                // randomly select among the set
                for (int j = 0; j <= sampleSetSize; j++)
                    bounds[i][j] = uniRand.nextInt(crntAttr.numValues()); // the position of one of the nominal values
            } else {
                // first round-robin
                int lastPart = sampleSetSize % crntAttr.numValues();
                for (int j = 0; j < sampleSetSize - lastPart; j++)
                    bounds[i][j] = j % crntAttr.numValues();
                // then randomly select
                for (int j = sampleSetSize - lastPart; j <= sampleSetSize; j++)
                    bounds[i][j] = uniRand.nextInt(crntAttr.numValues());
            }
        } // nominal attribute
    } // get all subdomains
    // second, generate the set according to setWithMaxMinDist
    Instances data = new Instances("InitialSetByLHS", atts, sampleSetSize);
    for (int i = 0; i < sampleSetSize; i++) {
        double[] vals = new double[atts.size()];
        for (int j = 0; j < vals.length; j++) {
            if (atts.get(j).isNumeric()) {
                vals[j] = useMid
                        ? (bounds[j][setWithMaxMinDist[j].get(i)] + bounds[j][setWithMaxMinDist[j].get(i) + 1]) / 2
                        : bounds[j][setWithMaxMinDist[j].get(i)]
                                + ((bounds[j][setWithMaxMinDist[j].get(i) + 1] - bounds[j][setWithMaxMinDist[j].get(i)])
                                        * uniRand.nextDouble());
            } else { // isNominal()
                vals[j] = bounds[j][setWithMaxMinDist[j].get(i)];
            }
        }
        data.add(new DenseInstance(1.0, vals));
    }
    // third, return the generated points
    return data;
}
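The numeric branch relies on getLowerNumericBound()/getUpperNumericBound(), which Weka fills in from attribute metadata. Below is a hedged sketch of a possible caller — the attribute names and ranges are invented, and it assumes the bounds are supplied through a ProtectedProperties "range" entry, which Weka's Attribute parses into numeric bounds.

// Hypothetical caller: two bounded numeric attributes, 10 LHS samples at bin midpoints.
java.util.Properties propsMem = new java.util.Properties();
propsMem.setProperty("range", "[64,4096]");
Attribute memoryMB = new Attribute("memoryMB", new weka.core.ProtectedProperties(propsMem));

java.util.Properties propsThr = new java.util.Properties();
propsThr.setProperty("range", "[1,64]");
Attribute threads = new Attribute("threads", new weka.core.ProtectedProperties(propsThr));

ArrayList<Attribute> atts = new ArrayList<Attribute>();
atts.add(memoryMB);
atts.add(threads);
Instances samples = LHSInitializer.getMultiDim(atts, 10, true);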
From source file: cn.ict.zyq.bestConf.util.LHSInitializer.java
License: Open Source License
/**
 * In the current version, we assume all attributes are numeric and have bounds.
 *
 * Let PACE be log10(upper/lower).
 *
 * @param useMid true to use the middle point of a subdomain, false to use a random point within a subdomain
 */
public static Instances getMultiDimContinuousLog(ArrayList<Attribute> atts, int sampleSetSize, boolean useMid) {
    int L = Math.min(7, Math.max(sampleSetSize, atts.size())); // 7 is chosen for no special reason
    double maxMinDist = 0, crntMinDist; // works as the threshold to select the sample set
    ArrayList<Integer>[] setWithMaxMinDist = null;
    // generate L sets of sampleSetSize points
    for (int i = 0; i < L; i++) {
        ArrayList<Integer>[] setPerm = generateOneSampleSet(sampleSetSize, atts.size());
        // compute the minimum distance minDist between any sample pair for each set
        crntMinDist = minDistForSet(setPerm);
        // select the set with the maximum minDist
        if (crntMinDist > maxMinDist) {
            setWithMaxMinDist = setPerm;
            maxMinDist = crntMinDist;
        }
    }
    // generate and output the set with the maximum minDist as the result
    // first, divide the domain of each attribute into sampleSetSize equal subdomains
    double[][] bounds = new double[atts.size()][sampleSetSize + 1]; // sampleSetSize+1 to include the lower and upper bounds
    Iterator<Attribute> itr = atts.iterator();
    Attribute crntAttr;
    int step, crntStep;
    for (int i = 0; i < bounds.length; i++) {
        crntAttr = itr.next();
        bounds[i][0] = crntAttr.getLowerNumericBound();
        bounds[i][sampleSetSize] = crntAttr.getUpperNumericBound();
        crntStep = (int) Math.log10(bounds[i][sampleSetSize] - bounds[i][0]);
        step = sampleSetSize / crntStep; // number of points drawn after each multiplication by 10
        int left = sampleSetSize % crntStep;
        if (bounds[i][0] == 0)
            bounds[i][0] = uniRand.nextInt(10);
        crntStep = 1;
        double theBound = bounds[i][sampleSetSize] / 10;
        for (int j = 1; j < sampleSetSize; j++) {
            if (crntStep >= step && bounds[i][j - 1] <= theBound)
                crntStep = 0;
            if (crntStep == 0)
                bounds[i][j] = bounds[i][j - step] * 10;
            else if (crntStep < step)
                bounds[i][j] = bounds[i][j - crntStep] * ((double) crntStep * 10. / ((double) step + 1.));
            else if (crntStep >= step)
                bounds[i][j] = bounds[i][j - crntStep] * ((double) crntStep * 10. / (double) (left + step + 1));
            if (bounds[i][j] >= bounds[i][sampleSetSize])
                System.err.println("be careful!!!!");
            crntStep++;
        }
    }
    // second, generate the set according to setWithMaxMinDist
    Instances data = new Instances("InitialSetByLHS", atts, sampleSetSize);
    for (int i = 0; i < sampleSetSize; i++) {
        double[] vals = new double[atts.size()];
        for (int j = 0; j < vals.length; j++) {
            vals[j] = useMid
                    ? (bounds[j][setWithMaxMinDist[j].get(i)] + bounds[j][setWithMaxMinDist[j].get(i) + 1]) / 2
                    : bounds[j][setWithMaxMinDist[j].get(i)]
                            + ((bounds[j][setWithMaxMinDist[j].get(i) + 1] - bounds[j][setWithMaxMinDist[j].get(i)])
                                    * uniRand.nextDouble());
        }
        data.add(new DenseInstance(1.0, vals));
    }
    // third, return the generated points
    return data;
}
From source file: cn.ict.zyq.bestConf.util.LHSInitializer.java
License: Open Source License
/**
 * In the current version, we assume all attributes are numeric and have bounds.
 *
 * Let PACE be log10(upper/lower).
 *
 * @param useMid true to use the middle point of a subdomain, false to use a random point within a subdomain
 */
public static Instances getMultiDimContinuous(ArrayList<Attribute> atts, int sampleSetSize, boolean useMid) {
    int L = Math.min(7, Math.max(sampleSetSize, atts.size())); // 7 is chosen for no special reason
    double maxMinDist = 0, crntMinDist; // works as the threshold to select the sample set
    ArrayList<Integer>[] setWithMaxMinDist = null;
    // generate L sets of sampleSetSize points
    for (int i = 0; i < L; i++) {
        ArrayList<Integer>[] setPerm = generateOneSampleSet(sampleSetSize, atts.size());
        // compute the minimum distance minDist between any sample pair for each set
        crntMinDist = minDistForSet(setPerm);
        // select the set with the maximum minDist
        if (crntMinDist > maxMinDist) {
            setWithMaxMinDist = setPerm;
            maxMinDist = crntMinDist;
        }
    }
    // generate and output the set with the maximum minDist as the result
    // first, divide the domain of each attribute into sampleSetSize equal subdomains
    double[][] bounds = new double[atts.size()][sampleSetSize + 1]; // sampleSetSize+1 to include the lower and upper bounds
    Iterator<Attribute> itr = atts.iterator();
    Attribute crntAttr;
    boolean[] roundToInt = new boolean[atts.size()];
    for (int i = 0; i < bounds.length; i++) {
        crntAttr = itr.next();
        uniBoundsGeneration(bounds[i], crntAttr, sampleSetSize);
        //flexibleBoundsGeneration(bounds[i], crntAttr, sampleSetSize);
        if (bounds[i][sampleSetSize] - bounds[i][0] > sampleSetSize)
            roundToInt[i] = true;
    }
    // second, generate the set according to setWithMaxMinDist
    Instances data = new Instances("InitialSetByLHS", atts, sampleSetSize);
    for (int i = 0; i < sampleSetSize; i++) {
        double[] vals = new double[atts.size()];
        for (int j = 0; j < vals.length; j++) {
            vals[j] = useMid
                    ? (bounds[j][setWithMaxMinDist[j].get(i)] + bounds[j][setWithMaxMinDist[j].get(i) + 1]) / 2
                    : bounds[j][setWithMaxMinDist[j].get(i)]
                            + ((bounds[j][setWithMaxMinDist[j].get(i) + 1] - bounds[j][setWithMaxMinDist[j].get(i)])
                                    * uniRand.nextDouble());
            if (roundToInt[j])
                vals[j] = (int) vals[j];
        }
        data.add(new DenseInstance(1.0, vals));
    }
    // third, return the generated points
    return data;
}
From source file: cn.ict.zyq.bestConf.util.LHSInitializer.java
License: Open Source License
/**
 * In the current version, we assume all attributes are numeric and have bounds.
 *
 * Let PACE be (upper - lower) divided by sampleSetSize.
 *
 * @param useMid true to use the middle point of a subdomain, false to use a random point within a subdomain
 */
public static Instances getMultiDimContinuousDiv(ArrayList<Attribute> atts, int sampleSetSize, boolean useMid) {
    int L = Math.min(7, Math.max(sampleSetSize, atts.size())); // 7 is chosen for no special reason
    double maxMinDist = 0, crntMinDist; // works as the threshold to select the sample set
    ArrayList<Integer>[] setWithMaxMinDist = null;
    // generate L sets of sampleSetSize points
    for (int i = 0; i < L; i++) {
        ArrayList<Integer>[] setPerm = generateOneSampleSet(sampleSetSize, atts.size());
        // compute the minimum distance minDist between any sample pair for each set
        crntMinDist = minDistForSet(setPerm);
        // select the set with the maximum minDist
        if (crntMinDist > maxMinDist) {
            setWithMaxMinDist = setPerm;
            maxMinDist = crntMinDist;
        }
    }
    // generate and output the set with the maximum minDist as the result
    // first, divide the domain of each attribute into sampleSetSize equal subdomains
    double[][] bounds = new double[atts.size()][sampleSetSize + 1]; // sampleSetSize+1 to include the lower and upper bounds
    Iterator<Attribute> itr = atts.iterator();
    Attribute crntAttr;
    double pace;
    for (int i = 0; i < bounds.length; i++) {
        crntAttr = itr.next();
        bounds[i][0] = crntAttr.getLowerNumericBound();
        bounds[i][sampleSetSize] = crntAttr.getUpperNumericBound();
        pace = (bounds[i][sampleSetSize] - bounds[i][0]) / sampleSetSize;
        for (int j = 1; j < sampleSetSize; j++) {
            bounds[i][j] = bounds[i][j - 1] + pace;
        }
    }
    // second, generate the set according to setWithMaxMinDist
    Instances data = new Instances("InitialSetByLHS", atts, sampleSetSize);
    for (int i = 0; i < sampleSetSize; i++) {
        double[] vals = new double[atts.size()];
        for (int j = 0; j < vals.length; j++) {
            vals[j] = useMid
                    ? (bounds[j][setWithMaxMinDist[j].get(i)] + bounds[j][setWithMaxMinDist[j].get(i) + 1]) / 2
                    : bounds[j][setWithMaxMinDist[j].get(i)]
                            + ((bounds[j][setWithMaxMinDist[j].get(i) + 1] - bounds[j][setWithMaxMinDist[j].get(i)])
                                    * uniRand.nextDouble());
        }
        data.add(new DenseInstance(1.0, vals));
    }
    // third, return the generated points
    return data;
}
From source file: com.actelion.research.orbit.imageAnalysis.imaging.TMAPoints.java
License: Open Source License
/**
 * Returns x/y pairs for each input point.
 *
 * @param pList
 * @return
 */
private HashMap<Point, Point> clusterLines(List<Point> pList) {
    ArrayList<Attribute> attrListX = new ArrayList<Attribute>(2);
    attrListX.add(new Attribute("xvalue"));
    ArrayList<Attribute> attrListY = new ArrayList<Attribute>(2);
    attrListY.add(new Attribute("yvalue"));
    //attrList.add(new Attribute("class"));
    Instances xInst = new Instances("xlines", attrListX, pList.size());
    Instances yInst = new Instances("ylines", attrListY, pList.size());
    //instances.setClassIndex(1);
    for (Point p : pList) {
        //Instance inst = new DenseInstance(1d, new double[]{p.x, Double.NaN});
        Instance instX = new DenseInstance(1d, new double[] { p.x });
        instX.setDataset(xInst);
        //inst.setClassMissing();
        xInst.add(instX);
        Instance instY = new DenseInstance(1d, new double[] { p.y });
        instY.setDataset(yInst);
        yInst.add(instY);
    }
    try {
        EM colClusterer = new EM();
        int numCols = guessNumClusters(colClusterer, xInst, 1, 20);
        colClusterer.setNumClusters(numCols);
        colClusterer.buildClusterer(xInst);
        logger.debug("NumCols: " + colClusterer.getNumClusters());

        EM rowClusterer = new EM();
        int numRows = guessNumClusters(rowClusterer, yInst, 1, 20);
        rowClusterer.setNumClusters(numRows);
        rowClusterer.buildClusterer(yInst);
        logger.debug("NumRows: " + rowClusterer.getNumClusters());

        logger.trace("ColClusterer:");
        HashMap<Integer, Integer> colHash = sortAndpPrintCluster(colClusterer);
        logger.trace("RowClusterer:");
        HashMap<Integer, Integer> rowHash = sortAndpPrintCluster(rowClusterer);
        if (logger.isTraceEnabled()) {
            logger.trace("ColHash:");
            for (Integer i : colHash.keySet()) {
                logger.trace("cluster " + i + ": " + colHash.get(i));
            }
            logger.trace("RowHash:");
            for (Integer i : rowHash.keySet()) {
                logger.trace("cluster " + i + ": " + rowHash.get(i));
            }
        }
        // classify points
        HashMap<Point, Point> pMap = new HashMap<Point, Point>();
        for (Point p : pList) {
            Instance instX = new DenseInstance(1d, new double[] { p.x });
            instX.setDataset(xInst);
            Instance instY = new DenseInstance(1d, new double[] { p.y });
            instY.setDataset(yInst);
            int x = colClusterer.clusterInstance(instX);
            int y = rowClusterer.clusterInstance(instY);
            x = colHash.get(x);
            y = rowHash.get(y);
            logger.trace(p + ": " + x + "/" + y);
            pMap.put(p, new Point(x, y));
        }
        return pMap;
    } catch (Exception e) {
        e.printStackTrace();
        logger.error("error while clustering points", e);
        return null;
    }
}
From source file: com.daniel.convert.IncrementalClassifier.java
License: Open Source License
/**
 * Expects an ARFF file as the first argument (the class attribute is assumed to be
 * the last attribute).
 *
 * @param args the command-line arguments
 * @throws Exception if something goes wrong
 */
public static BayesNet treinar(String[] args) throws Exception {
    // load data
    ArffLoader loader = new ArffLoader();
    loader.setFile(new File(args[0]));
    Instances structure = loader.getStructure();
    structure.setClassIndex(structure.numAttributes() - 1);

    // train a BayesNet, accumulating the instances one at a time
    BayesNet BayesNet = new BayesNet();
    Instance current;
    while ((current = loader.getNextInstance(structure)) != null) {
        structure.add(current);
    }
    BayesNet.buildClassifier(structure);
    // output generated model
    // System.out.println(nb);

    // test-set BayesNet
    BayesNet BayesNetTest = new BayesNet();
    // test the model
    Evaluation eTest = new Evaluation(structure);
    // eTest.evaluateModel(nb, structure);
    eTest.crossValidateModel(BayesNetTest, structure, 15, new Random(1));

    // print the result as in the Weka Explorer
    String strSummary = eTest.toSummaryString();
    System.out.println(strSummary);

    return BayesNet;
}
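Despite the class name, the method above accumulates every row into structure via Instances.add and then trains in one batch. For genuinely incremental training, Weka's UpdateableClassifier implementations (e.g. NaiveBayesUpdateable) can consume instances one at a time without retaining them. A minimal sketch, assuming a similar ARFF input (the file path is a placeholder):

import java.io.File;
import weka.classifiers.bayes.NaiveBayesUpdateable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

// ...
ArffLoader loader = new ArffLoader();
loader.setFile(new File("train.arff")); // placeholder path
Instances structure = loader.getStructure();
structure.setClassIndex(structure.numAttributes() - 1);

NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
nb.buildClassifier(structure); // initialize from the header only
Instance current;
while ((current = loader.getNextInstance(structure)) != null) {
    nb.updateClassifier(current); // no structure.add(current) needed
}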
From source file: com.dhamacher.sentimentanalysis4tweets.preprocessing.TweetFeatureExtractor.java
License: Apache License
/**
 * Method which constructs the ARFF file for Weka with the training data.
 */
public static void constructModel() {
    Instances instdata = null;
    try {
        FastVector atts = new FastVector();
        atts.addElement(new Attribute("content", (FastVector) null));
        FastVector fvClassVal = new FastVector(4);
        fvClassVal.addElement("");
        fvClassVal.addElement("neutral");
        fvClassVal.addElement("negative");
        fvClassVal.addElement("positive");
        Attribute ClassAttribute = new Attribute("Class", fvClassVal);
        atts.addElement(ClassAttribute);
        instdata = new Instances("tweetData", atts, 0);

        CsvReader data = new CsvReader("../classified data/traindata.csv");
        int i = 0;
        while (data.readRecord()) {
            double[] vals = new double[instdata.numAttributes()];
            String class_id = data.get(0);
            switch (Integer.parseInt(class_id)) {
            case 0:
                class_id = "negative";
                break;
            case 2:
                class_id = "neutral";
                break;
            case 4:
                class_id = "positive";
                break;
            }
            String tweet_content = data.get(5);
            Instance iInst = new Instance(2);
            iInst.setValue((Attribute) atts.elementAt(0), tweet_content);
            iInst.setValue((Attribute) atts.elementAt(1), class_id);
            instdata.add(iInst);
            System.out.println("[" + i + "] " + class_id + ":" + tweet_content);
            i++;
        }
        data.close();

        StringToWordVector filter = new StringToWordVector();
        instdata.setClassIndex(instdata.numAttributes() - 1);
        filter.setInputFormat(instdata);
        Instances newdata = Filter.useFilter(instdata, filter);

        ArffSaver saver = new ArffSaver();
        saver.setInstances(newdata);
        saver.setFile(new File("./data/train2data.arff"));
        saver.writeBatch();
    } catch (Exception ex) {
        Logger.getLogger(TweetFeatureExtractor.class.getName()).log(Level.SEVERE, null, ex);
    }
}
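The snippet above targets the pre-3.7 Weka API (FastVector, the concrete Instance class). In Weka 3.7+, Instance became an interface and FastVector was deprecated, so an equivalent construction would use ArrayList and DenseInstance. A hedged sketch with the same attribute layout (the tweet text here is a placeholder):

// Weka 3.7+ equivalent of the header construction and one add() call above
ArrayList<String> classValues = new ArrayList<String>();
classValues.add("");
classValues.add("neutral");
classValues.add("negative");
classValues.add("positive");
ArrayList<Attribute> atts = new ArrayList<Attribute>();
atts.add(new Attribute("content", (ArrayList<String>) null)); // null value list => string attribute
atts.add(new Attribute("Class", classValues));
Instances instdata = new Instances("tweetData", atts, 0);
instdata.setClassIndex(instdata.numAttributes() - 1);

double[] vals = new double[instdata.numAttributes()];
// use the dataset's own attribute object, in case the constructor copied the header
vals[0] = instdata.attribute(0).addStringValue("placeholder tweet text");
vals[1] = classValues.indexOf("positive");
instdata.add(new DenseInstance(1.0, vals));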
From source file: com.entopix.maui.main.MauiModelBuilder.java
License: Open Source License
/**
 * Builds the model from the training data.
 * @throws MauiFilterException
 */
public MauiFilter buildModel(List<MauiDocument> documents) throws MauiFilterException {
    log.info("-- Building the model... ");
    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("document", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    mauiFilter = new MauiFilter();
    mauiFilter.setMaxPhraseLength(maxPhraseLength);
    mauiFilter.setMinPhraseLength(minPhraseLength);
    mauiFilter.setMinNumOccur(minNumOccur);
    mauiFilter.setStemmer(stemmer);
    mauiFilter.setDocumentLanguage(documentLanguage);
    mauiFilter.setVocabularyName(vocabularyName);
    mauiFilter.setVocabularyFormat(vocabularyFormat);
    mauiFilter.setStopwords(stopwords);
    mauiFilter.setVocabulary(vocabulary);
    if (classifier != null) {
        mauiFilter.setClassifier(classifier);
    }
    mauiFilter.setInputFormat(data);

    // set the feature configuration
    mauiFilter.setBasicFeatures(useBasicFeatures);
    mauiFilter.setKeyphrasenessFeature(useKeyphrasenessFeature);
    mauiFilter.setFrequencyFeatures(useFrequencyFeatures);
    mauiFilter.setPositionsFeatures(usePositionsFeatures);
    mauiFilter.setLengthFeature(useLengthFeature);
    mauiFilter.setThesaurusFeatures(useThesaurusFeatures);
    mauiFilter.setWikipediaFeatures(useWikipediaFeatures, wikiFeatures);
    mauiFilter.setClassifier(classifier);

    if (!vocabularyName.equals("none")) {
        loadVocabulary();
        mauiFilter.setVocabulary(vocabulary);
    }

    log.info("-- Adding documents as instances... ");
    for (MauiDocument document : documents) {
        double[] newInst = new double[3];
        newInst[0] = data.attribute(0).addStringValue(document.getFileName());
        // add the text and the topics of the document to the instance
        if (document.getTextContent().length() > 0) {
            newInst[1] = data.attribute(1).addStringValue(document.getTextContent());
        } else {
            newInst[1] = Instance.missingValue();
        }
        if (document.getTopicsString().length() > 0) {
            newInst[2] = data.attribute(2).addStringValue(document.getTopicsString());
        } else {
            newInst[2] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        // push the single instance through the filter, then reset to a
        // header-only copy so string values don't accumulate in memory
        mauiFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    log.info("-- Building the model... ");
    mauiFilter.batchFinished();

    // drain the filter's output queue
    while ((mauiFilter.output()) != null) {
    }
    return mauiFilter;
}