Example usage for weka.core Instances add

Introduction

In this page you can find the example usage for weka.core Instances add.

Prototype

@Override
public boolean add(Instance instance)

Source Link

Document

Adds one instance to the end of the set.

Usage

From source file:com.gamerecommendation.Weatherconditions.Clasificacion.java

public String clasificar(String[] testCases) throws Exception {
    String ruta = "model.model";

    InputStream classModelStream;
    classModelStream = getClass().getResourceAsStream(ruta);
    Classifier clasify = (Classifier) SerializationHelper.read(classModelStream);
    FastVector condition = new FastVector();
    condition.addElement("Cloudy");
    condition.addElement("Clear");
    condition.addElement("Sunny");
    condition.addElement("Fair");
    condition.addElement("Partly_Cloudy");
    condition.addElement("Mostly_Cloudy");
    condition.addElement("Showers");
    condition.addElement("Haze");
    condition.addElement("Dust");
    condition.addElement("Other");
    Attribute _condition = new Attribute("contition", condition);

    FastVector temperature = new FastVector();
    temperature.addElement("Hot");
    temperature.addElement("Mild");
    temperature.addElement("Cool");
    Attribute _temperature = new Attribute("temperature", temperature);

    FastVector chill = new FastVector();
    chill.addElement("Regrettable");
    chill.addElement("Mint");
    Attribute _chill = new Attribute("chill", chill);

    FastVector direction = new FastVector();
    direction.addElement("Mint");
    direction.addElement("Fair");
    direction.addElement("Regular");
    Attribute _direction = new Attribute("direction", direction);

    FastVector speed = new FastVector();
    speed.addElement("Mint");
    speed.addElement("Fair");
    speed.addElement("Regular");
    Attribute _speed = new Attribute("speed", speed);

    FastVector humidity = new FastVector();
    humidity.addElement("High");
    humidity.addElement("Normal");
    humidity.addElement("Low");
    Attribute _humidity = new Attribute("humidity", humidity);

    FastVector visibility = new FastVector();
    visibility.addElement("Recommended");
    visibility.addElement("Not_Recommended");
    Attribute _visibility = new Attribute("visibility", visibility);

    FastVector preassure = new FastVector();
    preassure.addElement("Fair");
    preassure.addElement("Mint");
    Attribute _preassure = new Attribute("preassure", preassure);

    FastVector Class = new FastVector();
    Class.addElement("Recommended");
    Class.addElement("Not_Recommended");
    Attribute _Class = new Attribute("class", Class);

    FastVector atributos = new FastVector(9);
    atributos.addElement(_condition);//from w w  w  .  jav a 2  s  .  c  o  m
    atributos.addElement(_temperature);
    atributos.addElement(_chill);
    atributos.addElement(_direction);
    atributos.addElement(_speed);
    atributos.addElement(_humidity);
    atributos.addElement(_visibility);
    atributos.addElement(_preassure);
    atributos.addElement(_Class);

    ArrayList<Attribute> atributs = new ArrayList<>();
    atributs.add(_condition);
    atributs.add(_temperature);
    atributs.add(_chill);
    atributs.add(_direction);
    atributs.add(_speed);
    atributs.add(_humidity);
    atributs.add(_visibility);
    atributs.add(_preassure);
    atributs.add(_Class);

    //Aqu se crea la instacia, que tiene todos los atributos del modelo
    Instances dataTest = new Instances("TestCases", atributos, 1);
    dataTest.setClassIndex(8);

    Instance setPrueba = new Instance(9);

    int index = -1;
    for (int i = 0; i < 8; i++) {
        index = atributs.get(i).indexOfValue(testCases[i]);
        //System.out.println(i + " " + atributs.get(i)  + " " + index + " " + testCases[i]);
        setPrueba.setValue(atributs.get(i), index);
    }

    //Agregando el set que se desea evaluar.
    dataTest.add(setPrueba);

    //Realizando la Prediccin
    //La instancia es la 0 debido a que es la unica que se encuentra.
    double valorP = clasify.classifyInstance(dataTest.instance(0));
    //get the name of the class value
    String prediccion = dataTest.classAttribute().value((int) valorP);

    return prediccion;
}

From source file:com.github.polarisation.kea.main.KEAKeyphraseExtractor.java

License:Open Source License

/**
 * Builds the model from the files//w  ww .  j ava  2 s  .  c o m
 */
public void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    // = if there any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    m_KEAFilter.setNumPhrases(m_numPhrases);
    m_KEAFilter.setVocabulary(m_vocabulary);
    m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setStemmer(m_Stemmer);
    m_KEAFilter.setStopwords(m_Stopwords);

    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    System.err.println("-- Extracting Keyphrases... ");
    // Extract keyphrases
    Enumeration elem = stems.keys();
    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();

        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }

            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());

        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;

            // keyStr = keyphrases in the str.key file
            // Kea assumes, that these keyphrases were assigned by the author
            // and evaluates extracted keyphrases againse these

            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }

            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));

        m_KEAFilter.input(data.instance(0));

        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = m_KEAFilter.output()) != null) {

            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;

            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;

            }
        }

        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));

            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;

        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));

                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));

    System.err.println("Avg. number of matching keyphrases compared to existing ones : "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}

From source file:com.hack23.cia.service.impl.action.user.wordcount.WordCounterImpl.java

License:Apache License

@Override
public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData,
        final int maxResult) {

    final String html = documentContentData.getContent();

    final Attribute input = new Attribute("html", (ArrayList<String>) null);

    final ArrayList<Attribute> inputVec = new ArrayList<>();
    inputVec.add(input);//from w  w w  . ja v  a 2s.  co m

    final Instances htmlInst = new Instances("html", inputVec, 1);

    htmlInst.add(new DenseInstance(1));
    htmlInst.instance(0).setValue(0, html);

    final StopwordsHandler StopwordsHandler = new StopwordsHandler() {

        @Override
        public boolean isStopword(final String word) {

            return word.length() < 5;
        }
    };

    final NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMinSize(1);
    tokenizer.setNGramMaxSize(1);
    tokenizer.setDelimiters(" \r\n\t.,;:'\"()?!'");

    final StringToWordVector filter = new StringToWordVector();
    filter.setTokenizer(tokenizer);
    filter.setStopwordsHandler(StopwordsHandler);
    filter.setLowerCaseTokens(true);
    filter.setOutputWordCounts(true);
    filter.setWordsToKeep(maxResult);

    final Map<String, Integer> result = new HashMap<>();

    try {
        filter.setInputFormat(htmlInst);
        final Instances dataFiltered = Filter.useFilter(htmlInst, filter);

        final Instance last = dataFiltered.lastInstance();

        final int numAttributes = last.numAttributes();

        for (int i = 0; i < numAttributes; i++) {
            result.put(last.attribute(i).name(), Integer.valueOf(last.toString(i)));
        }
    } catch (final Exception e) {
        LOGGER.warn("Problem calculating wordcount for : {} , exception:{}", documentContentData.getId(), e);
    }

    return result;
}

From source file:com.ivanrf.smsspam.SpamClassifier.java

License:Apache License

public static String classify(String model, String text, JTextArea log) {
    FilteredClassifier classifier = loadModel(model, log);

    //Create the instance
    ArrayList<String> fvNominalVal = new ArrayList<String>();
    fvNominalVal.add("ham");
    fvNominalVal.add("spam");

    Attribute attribute1 = new Attribute("spam_class", fvNominalVal);
    Attribute attribute2 = new Attribute("text", (List<String>) null);
    ArrayList<Attribute> fvWekaAttributes = new ArrayList<Attribute>();
    fvWekaAttributes.add(attribute1);//from w  w w .j  a  va 2 s. c  om
    fvWekaAttributes.add(attribute2);

    Instances instances = new Instances("Test relation", fvWekaAttributes, 1);
    instances.setClassIndex(0);

    DenseInstance instance = new DenseInstance(2);
    instance.setValue(attribute2, text);
    instances.add(instance);

    publishEstado("=== Instance created ===", log);
    publishEstado(instances.toString(), log);

    //Classify the instance
    try {
        publishEstado("=== Classifying instance ===", log);

        double pred = classifier.classifyInstance(instances.instance(0));

        publishEstado("=== Instance classified  ===", log);

        String classPredicted = instances.classAttribute().value((int) pred);
        publishEstado("Class predicted: " + classPredicted, log);

        return classPredicted;
    } catch (Exception e) {
        publishEstado("Error found when classifying the text", log);
        return null;
    }
}

From source file:com.kdcloud.lib.client.StubClient.java

License:Open Source License

@Override
public Instances getData() {
    double[] values = readData("ecg_small.csv");
    log("data length: " + values.length);
    Instances data = modality.getInputSpecification().newInstances("test");
    for (int i = 0; i < values.length; i++) {
        double[] cells = { values[i] };
        data.add(new DenseInstance(1, cells));
    }/*  ww w.  java2s. c  o  m*/
    return data;
}

From source file:com.mechaglot_Alpha2.controller.Calculate.java

License:Creative Commons License

/**
 * /*from  w  ww .  j  a  va2s. co  m*/
 * @param in
 *            String representing the calculated String-metric distances,
 *            comma separated.
 * @return Instance The inputted series of numbers (comma separated) as
 *         Instance.
 */

private Instance instanceMaker(String in) {

    String[] s = in.split(",");
    double[] r = new double[s.length];
    for (int t = 0; t < r.length; t++) {
        r[t] = Double.parseDouble(s[t]);
    }

    int sz = r.length - 1;

    ArrayList<Attribute> atts = new ArrayList<Attribute>(sz);

    for (int t = 0; t < sz + 1; t++) {
        atts.add(new Attribute("number" + t, t));
    }

    Instances dataRaw = new Instances("TestInstances", atts, sz);
    dataRaw.add(new DenseInstance(1.0, r));
    Instance first = dataRaw.firstInstance(); //
    int cIdx = dataRaw.numAttributes() - 1;
    dataRaw.setClassIndex(cIdx);

    return first;

}

From source file:com.openkm.kea.metadata.SubjectExtractor.java

License:Open Source License

/**
 * extractSuggestedSubjects//  w  w w .j a v  a2  s  . c om
 * 
 * @param documentText
 * @return
 */
public List<String> extractSuggestedSubjects(String documentText) {

    Date start, stop;

    start = new Date();
    List<String> subjects = new ArrayList<String>();
    // no idea what this is ....
    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances unknownDataStructure = new Instances("keyphrase_training_data", atts, 0);

    try {
        // this is the exrtraction process part - not too well understood yet
        // "unkowndatastructure" is called instances in original KEA code
        double[] unknownStructure = new double[2];
        unknownStructure[0] = (double) unknownDataStructure.attribute(0).addStringValue(documentText);
        unknownStructure[1] = Instance.missingValue(); // this part used for existing subjects - we have none
        unknownDataStructure.add(new Instance(1.0, unknownStructure));
        filter.input(unknownDataStructure.instance(0));
        unknownDataStructure.stringFreeStructure(); //??**&%%!!!??

        // this is getting the results out - better understood
        Instance[] rankedSubjects = new Instance[this.subjectNumLimit];
        Instance subject;
        while ((subject = filter.output()) != null) {
            int index = (int) subject.value(filter.getRankIndex()) - 1;
            if (index < subjectNumLimit) {
                rankedSubjects[index] = subject;
            }
        }
        for (int i = 0; i < subjectNumLimit; i++) {
            if (rankedSubjects[i] != null) {
                subjects.add(rankedSubjects[i].stringValue(filter.getUnstemmedPhraseIndex()));
            }
        }

    } catch (Exception e) {
        log.error("problem in subject extraction: ", e);
    } finally {
        stop = new Date();
        long time = (stop.getTime() - start.getTime());
        log.info("Subject extraction completed in " + time + "ms");
    }

    return subjects;
}

From source file:com.openkm.kea.modelcreator.KEAKeyphraseExtractor.java

License:Open Source License

/**
 * Builds the model from the files//from w  w  w . j  a  va  2 s  .  c om
 */
public void extractKeyphrases(Hashtable<String, Double> stems) throws Exception {
    Vector<Double> stats = new Vector<Double>();

    // Check whether there is actually any data
    // = if there any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    m_KEAFilter.setNumPhrases(m_numPhrases);
    m_KEAFilter.setVocabulary(m_vocabulary);
    m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setStemmer(m_Stemmer);
    m_KEAFilter.setStopwords(m_Stopwords);

    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    log.info("-- Extracting Keyphrases... ");
    // Extract keyphrases
    Enumeration<String> elem = stems.keys();
    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = elem.nextElement();

        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }

            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());

        } catch (Exception e) {
            if (m_debug) {
                log.debug("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;

            // keyStr = keyphrases in the str.key file
            // Kea assumes, that these keyphrases were assigned by the
            // author
            // and evaluates extracted keyphrases againse these

            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }

            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                log.debug("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));

        m_KEAFilter.input(data.instance(0));

        data = data.stringFreeStructure();
        if (m_debug) {
            log.debug("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = m_KEAFilter.output()) != null) {

            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;

            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;

            }
        }

        if (m_debug) {
            log.debug("-- Keyphrases and feature values:");
        }

        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));

            } else {
                printer = new PrintWriter(out);
            }
        }

        double numExtracted = 0, numCorrect = 0;

        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                // My addition: to exclude low ranking phrases
                double rank = topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex());

                if (rank >= 0.00) {
                    if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                        numExtracted += 1.0;
                    }
                    if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                        numCorrect += 1.0;
                    }
                    if (printer != null) {
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));

                        if (m_AdditionalInfo) {
                            printer.print("\t");
                            printer.print(
                                    topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                            printer.print("\t");
                            printer.print(Utils.doubleToString(
                                    topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                        }
                        printer.println();
                    }
                    if (m_debug) {
                        log.debug("" + topRankedInstances[i]);
                    }
                }
            }
        }

        if (numExtracted > 0) {
            if (m_debug) {
                log.debug("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }

        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }

    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));

    log.info("Avg. number of matching keyphrases compared to existing ones : " + Utils.doubleToString(avg, 2)
            + " +/- " + Utils.doubleToString(stdDev, 2));
    log.info("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}

From source file:com.openkm.kea.modelcreator.KEAModelBuilder.java

License:Open Source License

/**
 * Builds the model from the files/*w w w.j  ava  2  s  . c  om*/
 */
public void buildModel(Hashtable<String, Double> stems, Stopwords stopwords) throws Exception {
    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build model
    m_KEAFilter = new KEAFilter(stopwords);

    m_KEAFilter.setDebug(m_debug);
    m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
    m_KEAFilter.setKFused(getUseKFrequency());

    m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
    m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
    m_KEAFilter.setMinNumOccur(getMinNumOccur());
    m_KEAFilter.setStemmer(getStemmer());
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setVocabulary(getVocabulary());
    m_KEAFilter.setVocabularyFormat(getVocabularyFormat());
    m_KEAFilter.setStopwords(getStopwords());
    m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());
    m_KEAFilter.setInputFormat(data);

    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(getStemmer(), getStopwords());
    }
    m_KEAFilter.setNumFeature();

    log.info("-- Reading the Documents... ");

    Enumeration<String> elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = elem.nextElement();

        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            log.error("Can't find document for stem " + str + ".");
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            log.error("Can't find keyphrases for stem " + str + ".");
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    m_KEAFilter.batchFinished();

    while ((m_KEAFilter.output()) != null) {
    }
    ;
}

From source file:com.reactivetechnologies.analytics.core.eval.StackingWithBuiltClassifiers.java

License:Open Source License

/**
 * Generates the meta data//from   w w w . j a v  a 2s.c o m
 *
 * @param newData the data to work on
 * @param random the random number generator to use for cross-validation
 * @throws Exception if generation fails
 */
@Override
protected void generateMetaLevel(Instances newData, Random random) throws Exception {

    Instances metaData = metaFormat(newData);
    m_MetaFormat = new Instances(metaData, 0);
    for (int j = 0; j < m_NumFolds; j++) {

        /** Changed here */
        //Instances train = newData.trainCV(m_NumFolds, j, random);
        // DO NOT Build base classifiers
        /*for (int i = 0; i < m_Classifiers.length; i++) {
            getClassifier(i).buildClassifier(train);
        }*/
        /** End change */

        // Classify test instances and add to meta data
        Instances test = newData.testCV(m_NumFolds, j);
        for (int i = 0; i < test.numInstances(); i++) {
            metaData.add(metaInstance(test.instance(i)));
        }
    }

    m_MetaClassifier.buildClassifier(metaData);
}