Example usage for weka.core Instances add

Introduction

This page collects example usages of the add method of weka.core.Instances.

Prototype

@Override
public boolean add(Instance instance)

Source Link

Document

Adds one instance to the end of the set.

Usage

From source file:Prediccion.PrecidePasoNodo.java

License:Open Source License

/**
 * Loads the rows stored for the current node in Fusion Tables into a Weka data set.
 *
 * <p>The data set has two attributes: the date attribute {@code Intervalo}
 * (format {@code yyyy-MM-dd HH:mm:ss}) and the numeric attribute {@code Total}.
 * Only rows whose {@code Intervalo} ends with {@code 00:00:00} are requested.
 * The returned instances are sorted by the first attribute (the date).
 *
 * @param hora not used by this method
 * @return the loaded instances; empty when the query returns no rows
 * @throws ParseException if a date value cannot be parsed with the attribute's format
 */
Instances cargarDatos(int hora) throws ParseException {
    // Declare the attributes of the instances
    Attribute a0 = new Attribute("Intervalo", "yyyy-MM-dd HH:mm:ss");
    Attribute a1 = new Attribute("Total");

    ArrayList<Attribute> c = new ArrayList<>();
    c.add(a0);
    c.add(a1);

    // Create the set of instances
    Instances instances = new Instances(nodo, c, 1000);

    // Open the connection to Fusion Tables and run the query
    cFT = new conectarFusionTables();
    Sqlresponse r = cFT.select(TABLAID, "Intervalo, Total",
            "idNodo = " + nodo + " and Intervalo ENDS WITH '00:00:00'",
            "ORDER BY \'Intervalo\' DESC LIMIT 10000");

    // A response without rows may expose null instead of an empty list;
    // in that case there is nothing to add (and nothing to sort).
    if (r.getRows() == null) {
        return instances;
    }

    for (List<Object> a : r.getRows()) {
        Instance i = new DenseInstance(2);

        String s0 = (String) a.get(0);
        String s1 = (String) a.get(1);

        System.err.println(s0 + " ->" + s1);

        i.setValue(instances.attribute(0), instances.attribute(0).parseDate(s0));
        i.setValue(instances.attribute(1), Integer.parseInt(s1));

        instances.add(i);
    }

    // The query returns rows in descending order; sort on the date attribute.
    instances.sort(0);

    return instances;
}

From source file:preprocess.TextDirectoryLoader.java

License:Open Source License

/**
 * Return the full data set. If the structure hasn't yet been determined
 * by a call to getStructure then method should do so before processing
 * the rest of the data set.
 *
 * <p>Every value of the class attribute is treated as the name of a
 * sub-directory of {@link #getDirectory()}; each file in such a sub-directory
 * becomes one instance whose first attribute holds the file's text. Files
 * that cannot be read are reported on stderr and skipped.
 *
 * @return the data set, one instance per successfully read file
 * @throws IOException if no directory/source has been specified
 */
public Instances getDataSet() throws IOException {
    if (getDirectory() == null) {
        throw new IOException("No directory/source has been specified");
    }

    String directoryPath = getDirectory().getAbsolutePath();
    FastVector classes = new FastVector();
    Enumeration enm = getStructure().classAttribute().enumerateValues();
    while (enm.hasMoreElements()) {
        classes.addElement(enm.nextElement());
    }

    Instances data = getStructure();
    int fileCount = 0;
    for (int k = 0; k < classes.size(); k++) {
        String subdirPath = (String) classes.elementAt(k);
        File subdir = new File(directoryPath + File.separator + subdirPath);
        String[] files = subdir.list();
        if (files == null) {
            // File.list() returns null when the path is not a directory or cannot be read.
            System.err.println("failed to list directory: " + subdir);
            continue;
        }
        for (int j = 0; j < files.length; j++) {
            try {
                fileCount++;
                if (getDebug()) {
                    System.err.println("processing " + fileCount + " : " + subdirPath + " : " + files[j]);
                }

                // One extra slot when the file name is stored as an attribute.
                double[] newInst = new double[m_OutputFilename ? 3 : 2];
                File txt = new File(directoryPath + File.separator + subdirPath + File.separator + files[j]);

                // Read the file line by line; every line is terminated with '\n'.
                StringBuilder txtStr = new StringBuilder();
                BufferedReader br = new BufferedReader(new FileReader(txt));
                try {
                    String line;
                    while ((line = br.readLine()) != null) {
                        txtStr.append(line).append("\n");
                    }
                } finally {
                    // Closing the BufferedReader also closes the wrapped FileReader.
                    br.close();
                }

                newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
                if (m_OutputFilename) {
                    newInst[1] = (double) data.attribute(1)
                            .addStringValue(subdirPath + File.separator + files[j]);
                }
                newInst[data.classIndex()] = (double) k;
                data.add(new Instance(1.0, newInst));
            } catch (Exception e) {
                // Best effort: report the file together with the cause and continue.
                System.err.println("failed to convert file: " + directoryPath + File.separator + subdirPath
                        + File.separator + files[j] + " (" + e + ")");
            }
        }
    }

    return data;
}

From source file:preprocess.TextDirectoryLoaderEX.java

License:Open Source License

/**
 * Return the full data set. If the structure hasn't yet been determined
 * by a call to getStructure then method should do so before processing
 * the rest of the data set.
 *
 * <p>Every value of the class attribute is treated as the name of a
 * sub-directory of {@link #getDirectory()}; each file in such a sub-directory
 * becomes one instance whose second attribute holds the file's text. When
 * file names are output, the file name itself must be an integer and is
 * stored as the first attribute. Files that cannot be converted are
 * reported on stderr and skipped.
 *
 * @return the data set, one instance per successfully converted file
 * @throws IOException if no directory/source has been specified
 */
public Instances getDataSet() throws IOException {
    if (getDirectory() == null) {
        throw new IOException("No directory/source has been specified");
    }

    String directoryPath = getDirectory().getAbsolutePath();
    FastVector classes = new FastVector();
    Enumeration enm = getStructure().classAttribute().enumerateValues();
    while (enm.hasMoreElements()) {
        classes.addElement(enm.nextElement());
    }

    Instances data = getStructure();
    int fileCount = 0;
    for (int k = 0; k < classes.size(); k++) {
        String subdirPath = (String) classes.elementAt(k);
        File subdir = new File(directoryPath + File.separator + subdirPath);
        String[] files = subdir.list();
        if (files == null) {
            // File.list() returns null when the path is not a directory or cannot be read.
            System.err.println("failed to list directory: " + subdir);
            continue;
        }
        for (int j = 0; j < files.length; j++) {
            try {
                fileCount++;
                if (getDebug()) {
                    System.err.println("processing " + fileCount + " : " + subdirPath + " : " + files[j]);
                }

                // One extra slot when the file name is stored as an attribute.
                double[] newInst = new double[m_OutputFilename ? 3 : 2];
                File txt = new File(directoryPath + File.separator + subdirPath + File.separator + files[j]);

                // Read the file line by line; every line is terminated with '\n'.
                StringBuilder txtStr = new StringBuilder();
                BufferedReader br = new BufferedReader(new FileReader(txt));
                try {
                    String line;
                    while ((line = br.readLine()) != null) {
                        txtStr.append(line).append("\n");
                    }
                } finally {
                    // Closing the BufferedReader also closes the wrapped FileReader.
                    br.close();
                }

                newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString());
                if (m_OutputFilename) {
                    // A non-numeric file name raises NumberFormatException, which is
                    // handled below like any other conversion failure.
                    newInst[0] = Integer.parseInt(files[j]);
                }
                newInst[data.classIndex()] = (double) k;
                data.add(new Instance(1.0, newInst));
            } catch (Exception e) {
                // Best effort: report the file together with the cause and continue.
                System.err.println("failed to convert file: " + directoryPath + File.separator + subdirPath
                        + File.separator + files[j] + " (" + e + ")");
            }
        }
    }
    return data;
}

From source file:probcog.J48Reader.java

License:Open Source License

/**
 * Reads a database file and converts its objects into a set of nominal instances.
 *
 * <p>One nominal attribute is created for every attribute of the data
 * dictionary entry {@code "object"} that is not discarded; {@code objectT}
 * is always kept and is set as the class attribute. Every database object
 * that has an {@code objectT} attribute becomes one instance.
 *
 * @param dbname path of the database file to read
 * @return the instances built from the database objects
 * @throws IOException if the file cannot be opened or closed
 * @throws Exception if loading or converting the database fails
 */
public static Instances readDB(String dbname)
        throws IOException, ClassNotFoundException, DDException, FileNotFoundException, Exception {
    Database db;
    FileInputStream in = new FileInputStream(dbname);
    try {
        db = Database.fromFile(in);
    } finally {
        // NOTE(review): assumes Database.fromFile consumes the stream before returning — confirm.
        in.close();
    }
    probcog.srldb.datadict.DataDictionary dd = db.getDataDictionary();

    // the vector of attributes
    FastVector fvAttribs = new FastVector();
    HashMap<String, Attribute> mapAttrs = new HashMap<String, Attribute>();
    for (DDAttribute attribute : dd.getObject("object").getAttributes().values()) {
        // skip discarded attributes, but always keep the class attribute
        if (attribute.isDiscarded() && !attribute.getName().equals("objectT")) {
            continue;
        }
        FastVector attValues = new FastVector();
        Domain dom = attribute.getDomain();
        for (String s : dom.getValues()) {
            attValues.addElement(s);
        }
        Attribute attr = new Attribute(attribute.getName(), attValues);
        fvAttribs.addElement(attr);
        mapAttrs.put(attribute.getName(), attr);
    }

    Instances instances = new Instances("name", fvAttribs, 10000);
    instances.setClass(mapAttrs.get("objectT"));

    // for each object add an instance
    for (Object o : db.getObjects()) {
        if (o.hasAttribute("objectT")) {
            Instance instance = new Instance(fvAttribs.size());
            for (Entry<String, String> e : o.getAttributes().entrySet()) {
                if (!dd.getAttribute(e.getKey()).isDiscarded()) {
                    instance.setValue(mapAttrs.get(e.getKey()), e.getValue());
                }
            }
            instances.add(instance);
        }
    }
    return instances;
}

From source file:project.MauiModelBuilder.java

License:Open Source License

/**
 * Builds the model from the training data.
 *
 * <p>For every name in {@code fileNames} the files
 * {@code <inputDirectoryName>/<name>.txt} (document text) and
 * {@code <inputDirectoryName>/<name>.key} (keyphrases) are read and passed
 * to the filter as one instance. A file that cannot be read is reported on
 * stderr and its value is stored as missing.
 *
 * @param fileNames names (without extension) of the training documents
 * @throws Exception if {@code fileNames} is empty or the filter fails
 */
public void buildModel(HashSet<String> fileNames) throws Exception {

    // Check whether there is actually any data
    if (fileNames.isEmpty()) {
        throw new Exception("Couldn't find any data in " + inputDirectoryName);
    }

    System.err.println("-- Building the model... ");

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("document", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build model
    mauiFilter = new MauiFilter();

    mauiFilter.setDebug(getDebug());
    mauiFilter.setMaxPhraseLength(getMaxPhraseLength());
    mauiFilter.setMinPhraseLength(getMinPhraseLength());
    mauiFilter.setMinNumOccur(getMinNumOccur());
    mauiFilter.setStemmer(getStemmer());
    mauiFilter.setDocumentLanguage(getDocumentLanguage());
    mauiFilter.setVocabularyName(getVocabularyName());
    mauiFilter.setVocabularyFormat(getVocabularyFormat());
    mauiFilter.setStopwords(getStopwords());

    if (wikipedia != null) {
        mauiFilter.setWikipedia(wikipedia);
    } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) {
        mauiFilter.setWikipedia(wikipedia);
    } else {
        mauiFilter.setWikipedia(wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory);
    }

    if (classifier != null) {
        mauiFilter.setClassifier(classifier);
    }

    mauiFilter.setInputFormat(data);

    // set features configurations
    mauiFilter.setBasicFeatures(useBasicFeatures);
    mauiFilter.setKeyphrasenessFeature(useKeyphrasenessFeature);
    mauiFilter.setFrequencyFeatures(useFrequencyFeatures);
    mauiFilter.setPositionsFeatures(usePositionsFeatures);
    mauiFilter.setLengthFeature(useLengthFeature);
    mauiFilter.setThesaurusFeatures(useNodeDegreeFeature);
    mauiFilter.setBasicWikipediaFeatures(useBasicWikipediaFeatures);
    mauiFilter.setAllWikipediaFeatures(useAllWikipediaFeatures);
    // NOTE(review): setThesaurusFeatures is called a second time with the same argument.
    mauiFilter.setThesaurusFeatures(useNodeDegreeFeature);

    // NOTE(review): unconditional, unlike the null-guarded call above — confirm intent.
    mauiFilter.setClassifier(classifier);

    mauiFilter.setContextSize(contextSize);
    mauiFilter.setMinKeyphraseness(minKeyphraseness);
    mauiFilter.setMinSenseProbability(minSenseProbability);

    if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) {
        mauiFilter.loadThesaurus(getStemmer(), getStopwords());
    }

    System.err.println("-- Reading the input documents... ");

    for (String fileName : fileNames) {

        double[] newInst = new double[3];

        newInst[0] = (double) data.attribute(0).addStringValue(fileName);

        File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt");
        File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key");

        try {

            // Reading the file content
            StringBuffer txtStr = new StringBuffer();
            FileInputStream raw = new FileInputStream(documentTextFile);
            try {
                InputStreamReader is = documentEncoding.equals("default")
                        ? new InputStreamReader(raw)
                        : new InputStreamReader(raw, documentEncoding);
                int c;
                while ((c = is.read()) != -1) {
                    txtStr.append((char) c);
                }
            } finally {
                // Release the file handle even when decoding or reading fails.
                raw.close();
            }

            // Adding the text of the document to the instance
            newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString());

        } catch (Exception e) {

            System.err.println("Problem with reading " + documentTextFile);
            e.printStackTrace();
            newInst[1] = Instance.missingValue();
        }

        try {

            // Reading the content of the keyphrase file
            StringBuffer keyStr = new StringBuffer();
            FileInputStream raw = new FileInputStream(documentTopicsFile);
            try {
                InputStreamReader is = documentEncoding.equals("default")
                        ? new InputStreamReader(raw)
                        : new InputStreamReader(raw, documentEncoding);
                int c;
                while ((c = is.read()) != -1) {
                    keyStr.append((char) c);
                }
            } finally {
                // Release the file handle even when decoding or reading fails.
                raw.close();
            }

            // Adding the topics to the instance
            newInst[2] = (double) data.attribute(2).addStringValue(keyStr.toString());

        } catch (Exception e) {

            System.err.println("Problem with reading " + documentTopicsFile);
            e.printStackTrace();
            newInst[2] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));

        // Hand the single instance to the filter, then drop the string values
        // so that the next document starts from an empty structure.
        mauiFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    mauiFilter.batchFinished();

    // Drain the filter's output queue; the instances themselves are not needed here.
    while ((mauiFilter.output()) != null) {
    }
}

From source file:project.MauiTopicExtractor.java

License:Open Source License

/**
 * Builds the model from the files/*from   www  .  j a  v a2  s .c o  m*/
 */
public void extractKeyphrases(HashSet<String> fileNames) throws Exception {

    // Check whether there is actually any data
    if (fileNames.size() == 0) {
        throw new Exception("Couldn't find any data in " + inputDirectoryName);
    }

    mauiFilter.setVocabularyName(getVocabularyName());
    mauiFilter.setVocabularyFormat(getVocabularyFormat());
    mauiFilter.setDocumentLanguage(getDocumentLanguage());
    mauiFilter.setStemmer(getStemmer());
    mauiFilter.setStopwords(getStopwords());
    if (wikipedia != null) {
        mauiFilter.setWikipedia(wikipedia);
    } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) {
        mauiFilter.setWikipedia(wikipedia);
    } else {
        mauiFilter.setWikipedia(wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory);
    }
    if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) {
        mauiFilter.loadThesaurus(getStemmer(), getStopwords());
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    System.err.println("-- Extracting keyphrases... ");

    Vector<Double> correctStatistics = new Vector<Double>();
    Vector<Double> precisionStatistics = new Vector<Double>();
    Vector<Double> recallStatistics = new Vector<Double>();

    for (String fileName : fileNames) {

        double[] newInst = new double[3];
        System.out.println("fileName print out: " + fileName);
        newInst[0] = (double) data.attribute(0).addStringValue(fileName);
        ;

        File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt");
        File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key");

        try {

            InputStreamReader is;
            if (!documentEncoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(documentTextFile), documentEncoding);
            } else {
                is = new InputStreamReader(new FileInputStream(documentTextFile));
            }

            // Reading the file content
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();

            // Adding the text of the document to the instance
            newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString());

        } catch (Exception e) {
            System.err.println("Problem with reading " + documentTextFile);
            e.printStackTrace();
            newInst[1] = Instance.missingValue();
        }

        try {

            InputStreamReader is;
            if (!documentEncoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile), documentEncoding);
            } else {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile));
            }

            // Reading the content of the keyphrase file
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }

            // Adding the topics to the file
            newInst[2] = (double) data.attribute(2).addStringValue(keyStr.toString());

        } catch (Exception e) {
            if (debugMode) {
                System.err.println("No existing topics for " + documentTextFile);
            }
            newInst[2] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));

        mauiFilter.input(data.instance(0));

        data = data.stringFreeStructure();
        if (debugMode) {
            System.err.println("-- Processing document: " + fileName);
        }
        Instance[] topRankedInstances = new Instance[topicsPerDocument];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = mauiFilter.output()) != null) {

            int index = (int) inst.value(mauiFilter.getRankIndex()) - 1;

            if (index < topicsPerDocument) {
                topRankedInstances[index] = inst;
            }
        }

        if (debugMode) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;

        if (!documentTopicsFile.exists()) {
            out = new FileOutputStream(documentTopicsFile);
            if (!documentEncoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, documentEncoding));
            } else {
                printer = new PrintWriter(out);
            }
        }

        double numExtracted = 0, numCorrect = 0;
        wikipedia = mauiFilter.getWikipedia();

        HashMap<Article, Integer> topics = null;

        if (printGraph) {
            topics = new HashMap<Article, Integer>();
        }

        int p = 0;
        String root = "";
        for (int i = 0; i < topicsPerDocument; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    String topic = topRankedInstances[i].stringValue(mauiFilter.getOutputFormIndex());
                    printer.print(topic);

                    if (printGraph) {

                        Article article = wikipedia.getArticleByTitle(topic);
                        if (article == null) {
                            article = wikipedia.getMostLikelyArticle(topic, new CaseFolder());
                        }
                        if (article != null) {
                            if (root == "") {
                                root = article.getTitle();
                            }
                            topics.put(article, new Integer(p));
                        } else {
                            if (debugMode) {
                                System.err.println(
                                        "Couldn't find article for " + topic + " in " + documentTopicsFile);
                            }
                        }
                        p++;
                    }
                    if (additionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(mauiFilter.getNormalizedFormIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(mauiFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (debugMode) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }

        if (printGraph) {
            String graphFile = documentTopicsFile.getAbsolutePath().replace(".key", ".gv");
            computeGraph(topics, root, graphFile);
        }
        if (numExtracted > 0) {
            if (debugMode) {
                System.err.println("-- " + numCorrect + " correct");
            }
            double totalCorrect = mauiFilter.getTotalCorrect();
            correctStatistics.addElement(new Double(numCorrect));
            precisionStatistics.addElement(new Double(numCorrect / numExtracted));
            recallStatistics.addElement(new Double(numCorrect / totalCorrect));

        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
        for (int i = 0; i < topicsPerDocument; i++) {
            System.out.println(topRankedInstances[i].stringValue(mauiFilter.getOutputFormIndex()));
        }
    }

    if (correctStatistics.size() != 0) {

        double[] st = new double[correctStatistics.size()];
        for (int i = 0; i < correctStatistics.size(); i++) {
            st[i] = correctStatistics.elementAt(i).doubleValue();
        }
        double avg = Utils.mean(st);
        double stdDev = Math.sqrt(Utils.variance(st));

        if (correctStatistics.size() == 1) {
            System.err.println("\n-- Evaluation results based on 1 document:");

        } else {
            System.err.println("\n-- Evaluation results based on " + correctStatistics.size() + " documents:");
        }
        System.err.println("Avg. number of correct keyphrases per document: " + Utils.doubleToString(avg, 2)
                + " +/- " + Utils.doubleToString(stdDev, 2));

        st = new double[precisionStatistics.size()];
        for (int i = 0; i < precisionStatistics.size(); i++) {
            st[i] = precisionStatistics.elementAt(i).doubleValue();
        }
        double avgPrecision = Utils.mean(st);
        double stdDevPrecision = Math.sqrt(Utils.variance(st));

        System.err.println("Precision: " + Utils.doubleToString(avgPrecision * 100, 2) + " +/- "
                + Utils.doubleToString(stdDevPrecision * 100, 2));

        st = new double[recallStatistics.size()];
        for (int i = 0; i < recallStatistics.size(); i++) {
            st[i] = recallStatistics.elementAt(i).doubleValue();
        }
        double avgRecall = Utils.mean(st);
        double stdDevRecall = Math.sqrt(Utils.variance(st));

        System.err.println("Recall: " + Utils.doubleToString(avgRecall * 100, 2) + " +/- "
                + Utils.doubleToString(stdDevRecall * 100, 2));

        double fMeasure = 2 * avgRecall * avgPrecision / (avgRecall + avgPrecision);
        System.err.println("F-Measure: " + Utils.doubleToString(fMeasure * 100, 2));

        System.err.println("");
    }
    mauiFilter.batchFinished();
}

From source file:project.MauiTopicExtractor.java

License:Open Source License

/**
* Builds the model from the files/*w w  w.  j  ava  2 s.  c om*/
*/
public LinkedList<String> extractKeyphrases(String abstractContents) throws Exception {

    mauiFilter.setVocabularyName(getVocabularyName());
    mauiFilter.setVocabularyFormat(getVocabularyFormat());
    mauiFilter.setDocumentLanguage(getDocumentLanguage());
    mauiFilter.setStemmer(getStemmer());
    mauiFilter.setStopwords(getStopwords());
    if (wikipedia != null) {
        mauiFilter.setWikipedia(wikipedia);
    } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) {
        mauiFilter.setWikipedia(wikipedia);
    } else {
        mauiFilter.setWikipedia(wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory);
    }
    if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) {
        mauiFilter.loadThesaurus(getStemmer(), getStopwords());
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    System.err.println("-- Extracting keyphrases... ");

    Vector<Double> correctStatistics = new Vector<Double>();
    Vector<Double> precisionStatistics = new Vector<Double>();
    Vector<Double> recallStatistics = new Vector<Double>();
    boolean doneAlready = false;
    while (!doneAlready) {

        double[] newInst = new double[3];
        String fileName = "cailen";
        newInst[0] = (double) data.attribute(0).addStringValue(fileName);
        ;

        File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt");
        File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key");

        try {

            // Adding the text of the document to the instance
            newInst[1] = (double) data.attribute(1).addStringValue(abstractContents);

        } catch (Exception e) {
            System.err.println("Problem with reading " + documentTextFile);
            e.printStackTrace();
            newInst[1] = Instance.missingValue();
        }

        try {

            InputStreamReader is;
            if (!documentEncoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile), documentEncoding);
            } else {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile));
            }

            // Reading the content of the keyphrase file
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }

            // Adding the topics to the file
            newInst[2] = (double) data.attribute(2).addStringValue(abstractContents);

        } catch (Exception e) {
            if (debugMode) {
                System.err.println("No existing topics for " + documentTextFile);
            }
            newInst[2] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));

        mauiFilter.input(data.instance(0));

        data = data.stringFreeStructure();
        if (debugMode) {
            System.err.println("-- Processing document: " + fileName);
        }
        Instance[] topRankedInstances = new Instance[topicsPerDocument];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = mauiFilter.output()) != null) {

            int index = (int) inst.value(mauiFilter.getRankIndex()) - 1;

            if (index < topicsPerDocument) {
                topRankedInstances[index] = inst;
            }
        }

        if (debugMode) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;

        if (!documentTopicsFile.exists()) {
            out = new FileOutputStream(documentTopicsFile);
            if (!documentEncoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, documentEncoding));
            } else {
                printer = new PrintWriter(out);
            }
        }

        double numExtracted = 0, numCorrect = 0;
        wikipedia = mauiFilter.getWikipedia();

        HashMap<Article, Integer> topics = null;

        if (printGraph) {
            topics = new HashMap<Article, Integer>();
        }

        int p = 0;
        String root = "";
        for (int i = 0; i < topicsPerDocument; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    String topic = topRankedInstances[i].stringValue(mauiFilter.getOutputFormIndex());
                    printer.print(topic);

                    if (printGraph) {

                        Article article = wikipedia.getArticleByTitle(topic);
                        if (article == null) {
                            article = wikipedia.getMostLikelyArticle(topic, new CaseFolder());
                        }
                        if (article != null) {
                            if (root == "") {
                                root = article.getTitle();
                            }
                            topics.put(article, new Integer(p));
                        } else {
                            if (debugMode) {
                                System.err.println(
                                        "Couldn't find article for " + topic + " in " + documentTopicsFile);
                            }
                        }
                        p++;
                    }
                    if (additionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(mauiFilter.getNormalizedFormIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(mauiFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (debugMode) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }

        if (printGraph) {
            String graphFile = documentTopicsFile.getAbsolutePath().replace(".key", ".gv");
            computeGraph(topics, root, graphFile);
        }
        if (numExtracted > 0) {
            if (debugMode) {
                System.err.println("-- " + numCorrect + " correct");
            }
            double totalCorrect = mauiFilter.getTotalCorrect();
            correctStatistics.addElement(new Double(numCorrect));
            precisionStatistics.addElement(new Double(numCorrect / numExtracted));
            recallStatistics.addElement(new Double(numCorrect / totalCorrect));

        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }

        for (int i = 0; i < topicsPerDocument; i++) {
            topicsList.add(topRankedInstances[i].stringValue(mauiFilter.getOutputFormIndex()));
        }
        doneAlready = true;

    }

    if (correctStatistics.size() != 0) {

        double[] st = new double[correctStatistics.size()];
        for (int i = 0; i < correctStatistics.size(); i++) {
            st[i] = correctStatistics.elementAt(i).doubleValue();
        }
        double avg = Utils.mean(st);
        double stdDev = Math.sqrt(Utils.variance(st));

        if (correctStatistics.size() == 1) {
            System.err.println("\n-- Evaluation results based on 1 document:");

        } else {
            System.err.println("\n-- Evaluation results based on " + correctStatistics.size() + " documents:");
        }
        System.err.println("Avg. number of correct keyphrases per document: " + Utils.doubleToString(avg, 2)
                + " +/- " + Utils.doubleToString(stdDev, 2));

        st = new double[precisionStatistics.size()];
        for (int i = 0; i < precisionStatistics.size(); i++) {
            st[i] = precisionStatistics.elementAt(i).doubleValue();
        }
        double avgPrecision = Utils.mean(st);
        double stdDevPrecision = Math.sqrt(Utils.variance(st));

        System.err.println("Precision: " + Utils.doubleToString(avgPrecision * 100, 2) + " +/- "
                + Utils.doubleToString(stdDevPrecision * 100, 2));

        st = new double[recallStatistics.size()];
        for (int i = 0; i < recallStatistics.size(); i++) {
            st[i] = recallStatistics.elementAt(i).doubleValue();
        }
        double avgRecall = Utils.mean(st);
        double stdDevRecall = Math.sqrt(Utils.variance(st));

        System.err.println("Recall: " + Utils.doubleToString(avgRecall * 100, 2) + " +/- "
                + Utils.doubleToString(stdDevRecall * 100, 2));

        double fMeasure = 2 * avgRecall * avgPrecision / (avgRecall + avgPrecision);
        System.err.println("F-Measure: " + Utils.doubleToString(fMeasure * 100, 2));

        System.err.println("");
    }
    mauiFilter.batchFinished();

    return topicsList;
}

From source file:py.fpuna.lib.ExtendedInstanceQuery.java

License:Open Source License

/**
 * Executes a database query and converts its result set into a set of
 * instances.
 *
 * <p>The method works in three passes: it first maps every result column to
 * an attribute type, then reads all rows into value arrays (collecting the
 * distinct labels of nominal and string columns on the way), and finally
 * builds the header and the resulting dataset named "QueryResult".
 *
 * <p>Columns whose database type is not recognised become string attributes
 * whose values are all missing.
 *
 * @param query the query to convert to instances
 * @return the instances contained in the result of the query, NULL if the
 * SQL query doesn't return a ResultSet, e.g., DELETE/INSERT/UPDATE
 * @throws Exception if an error occurs
 */
public Instances retrieveInstances(String query) throws Exception {

    if (m_Debug)
        System.err.println("Executing query: " + query);
    connectToDatabase();
    if (execute(query) == false) {
        if (m_PreparedStatement.getUpdateCount() == -1) {
            throw new Exception("Query didn't produce results");
        } else {
            if (m_Debug)
                System.err.println(m_PreparedStatement.getUpdateCount() + " rows affected.");
            close();
            return null;
        }
    }
    ResultSet rs = getResultSet();
    // The result set is released in the finally block so that it is not left
    // open when any of the JDBC calls below throws.
    try {
        if (m_Debug)
            System.err.println("Getting metadata...");
        ResultSetMetaData md = rs.getMetaData();
        if (m_Debug)
            System.err.println("Completed getting metadata...");

        // Determine structure of the instances. JDBC columns are 1-based,
        // the per-attribute arrays are 0-based, hence the "i - 1" below.
        int numAttributes = md.getColumnCount();
        int[] attributeTypes = new int[numAttributes];
        Hashtable[] nominalIndexes = new Hashtable[numAttributes];
        FastVector[] nominalStrings = new FastVector[numAttributes];
        for (int i = 1; i <= numAttributes; i++) {
            switch (translateDBColumnType(md.getColumnTypeName(i))) {

            case STRING:
                // String --> nominal
                attributeTypes[i - 1] = Attribute.NOMINAL;
                nominalIndexes[i - 1] = new Hashtable();
                nominalStrings[i - 1] = new FastVector();
                break;
            case TEXT:
                // Text --> string
                attributeTypes[i - 1] = Attribute.STRING;
                nominalIndexes[i - 1] = new Hashtable();
                nominalStrings[i - 1] = new FastVector();
                break;
            case BOOL:
                // boolean --> nominal with the fixed labels "false" (0) and "true" (1)
                attributeTypes[i - 1] = Attribute.NOMINAL;
                nominalIndexes[i - 1] = new Hashtable();
                nominalIndexes[i - 1].put("false", Double.valueOf(0));
                nominalIndexes[i - 1].put("true", Double.valueOf(1));
                nominalStrings[i - 1] = new FastVector();
                nominalStrings[i - 1].addElement("false");
                nominalStrings[i - 1].addElement("true");
                break;
            case DOUBLE:
            case BYTE:
            case SHORT:
            case INTEGER:
            case LONG:
            case FLOAT:
                // every numeric database type --> numeric
                attributeTypes[i - 1] = Attribute.NUMERIC;
                break;
            case DATE:
            case TIME:
                attributeTypes[i - 1] = Attribute.DATE;
                break;
            default:
                // Unknown column type --> string attribute without values.
                // The lookup structures must still exist because the header
                // construction below iterates over nominalStrings for every
                // string attribute.
                attributeTypes[i - 1] = Attribute.STRING;
                nominalIndexes[i - 1] = new Hashtable();
                nominalStrings[i - 1] = new FastVector();
            }
        }

        // For sqlite
        // cache column names because the last while(rs.next()) { iteration for
        // the tuples below will close the md object:
        Vector<String> columnNames = new Vector<String>();
        for (int i = 0; i < numAttributes; i++) {
            columnNames.add(md.getColumnLabel(i + 1));
        }

        // Step through the tuples
        if (m_Debug)
            System.err.println("Creating instances...");
        FastVector instances = new FastVector();
        int rowCount = 0;
        while (rs.next()) {
            if (m_Debug && rowCount % 100 == 0) {
                System.err.print("read " + rowCount + " instances \r");
                System.err.flush();
            }
            double[] vals = new double[numAttributes];
            for (int i = 1; i <= numAttributes; i++) {
                switch (translateDBColumnType(md.getColumnTypeName(i))) {
                case STRING:
                    String str = rs.getString(i);

                    if (rs.wasNull()) {
                        vals[i - 1] = Instance.missingValue();
                    } else {
                        // Labels are numbered in order of first appearance.
                        Double index = (Double) nominalIndexes[i - 1].get(str);
                        if (index == null) {
                            index = Double.valueOf(nominalStrings[i - 1].size());
                            nominalIndexes[i - 1].put(str, index);
                            nominalStrings[i - 1].addElement(str);
                        }
                        vals[i - 1] = index.doubleValue();
                    }
                    break;
                case TEXT:
                    String txt = rs.getString(i);

                    if (rs.wasNull()) {
                        vals[i - 1] = Instance.missingValue();
                    } else {
                        Double index = (Double) nominalIndexes[i - 1].get(txt);
                        if (index == null) {

                            // Need to add one because first value in
                            // string attribute is dummy value.
                            index = Double.valueOf(nominalStrings[i - 1].size() + 1);
                            nominalIndexes[i - 1].put(txt, index);
                            nominalStrings[i - 1].addElement(txt);
                        }
                        vals[i - 1] = index.doubleValue();
                    }
                    break;
                case BOOL:
                    boolean boo = rs.getBoolean(i);
                    if (rs.wasNull()) {
                        vals[i - 1] = Instance.missingValue();
                    } else {
                        vals[i - 1] = (boo ? 1.0 : 0.0);
                    }
                    break;
                case DOUBLE:
                    double dd = rs.getDouble(i);
                    if (rs.wasNull()) {
                        vals[i - 1] = Instance.missingValue();
                    } else {
                        vals[i - 1] = dd;
                    }
                    break;
                case BYTE:
                    byte by = rs.getByte(i);
                    if (rs.wasNull()) {
                        vals[i - 1] = Instance.missingValue();
                    } else {
                        vals[i - 1] = (double) by;
                    }
                    break;
                case SHORT:
                    short sh = rs.getShort(i);
                    if (rs.wasNull()) {
                        vals[i - 1] = Instance.missingValue();
                    } else {
                        vals[i - 1] = (double) sh;
                    }
                    break;
                case INTEGER:
                    int in = rs.getInt(i);
                    if (rs.wasNull()) {
                        vals[i - 1] = Instance.missingValue();
                    } else {
                        vals[i - 1] = (double) in;
                    }
                    break;
                case LONG:
                    long lo = rs.getLong(i);
                    if (rs.wasNull()) {
                        vals[i - 1] = Instance.missingValue();
                    } else {
                        vals[i - 1] = (double) lo;
                    }
                    break;
                case FLOAT:
                    float fl = rs.getFloat(i);
                    if (rs.wasNull()) {
                        vals[i - 1] = Instance.missingValue();
                    } else {
                        vals[i - 1] = (double) fl;
                    }
                    break;
                case DATE:
                    Date date = rs.getDate(i);
                    if (rs.wasNull()) {
                        vals[i - 1] = Instance.missingValue();
                    } else {
                        // TODO: Do a value check here.
                        vals[i - 1] = (double) date.getTime();
                    }
                    break;
                case TIME:
                    Time time = rs.getTime(i);
                    if (rs.wasNull()) {
                        vals[i - 1] = Instance.missingValue();
                    } else {
                        // TODO: Do a value check here.
                        vals[i - 1] = (double) time.getTime();
                    }
                    break;
                default:
                    vals[i - 1] = Instance.missingValue();
                }
            }
            Instance newInst;
            if (m_CreateSparseData) {
                newInst = new SparseInstance(1.0, vals);
            } else {
                newInst = new Instance(1.0, vals);
            }
            instances.addElement(newInst);
            rowCount++;
        }
        //disconnectFromDatabase();  (perhaps other queries might be made)

        // Create the header and add the instances to the dataset
        if (m_Debug)
            System.err.println("Creating header...");
        FastVector attribInfo = new FastVector();
        for (int i = 0; i < numAttributes; i++) {
            /* Fix for databases that uppercase column names */
            String attribName = attributeCaseFix(columnNames.get(i));
            switch (attributeTypes[i]) {
            case Attribute.NOMINAL:
                attribInfo.addElement(new Attribute(attribName, nominalStrings[i]));
                break;
            case Attribute.NUMERIC:
                attribInfo.addElement(new Attribute(attribName));
                break;
            case Attribute.STRING:
                Attribute att = new Attribute(attribName, (FastVector) null);
                attribInfo.addElement(att);
                for (int n = 0; n < nominalStrings[i].size(); n++) {
                    att.addStringValue((String) nominalStrings[i].elementAt(n));
                }
                break;
            case Attribute.DATE:
                attribInfo.addElement(new Attribute(attribName, (String) null));
                break;
            default:
                throw new Exception("Unknown attribute type");
            }
        }
        Instances result = new Instances("QueryResult", attribInfo, instances.size());
        for (int i = 0; i < instances.size(); i++) {
            result.add((Instance) instances.elementAt(i));
        }
        return result;
    } finally {
        close(rs);
    }
}

From source file:qa.experiment.ProcessFeatureVector.java

/**
 * Trains a Naive Bayes classifier on the stored feature vectors whose
 * process names fuzzy-match one of the given names, then predicts which of
 * those names best fits the question.
 *
 * <p>Each stored feature vector is a tab-separated line of integer
 * bag-of-words counts; the class value of a training instance is the entry
 * of {@code processNames} that the stored process matched.
 *
 * @param processNames the candidate process names (the class labels)
 * @param question the question text to classify
 * @return the entry of {@code processNames} with the highest predicted
 * probability
 * @throws Exception if building or applying the classifier fails
 */
public String trainAndPredict(String[] processNames, String question) throws Exception {
    FastVector fvWekaAttribute = generateWEKAFeatureVector(processNames);
    // The class attribute sits directly after the bag-of-words attributes.
    int numFeatures = bowFeature.size();
    Instances trainingSet = new Instances("Rel", fvWekaAttribute, numFeatures + 1);
    trainingSet.setClassIndex(numFeatures);

    for (int i = 0; i < arrProcessFeature.size(); i++) {
        String[] names = arrProcessFeature.get(i).getProcessName().split("\\|");
        int sim = isNameFuzzyMatch(processNames, names);
        if (sim == -1) {
            // Stored process matches none of the requested names.
            continue;
        }
        ArrayList<String> featureVector = arrProcessFeature.get(i).getFeatureVectors();
        for (String line : featureVector) {
            Instance trainInstance = new Instance(numFeatures + 1);
            String[] attrValues = line.split("\t");
            for (int k = 0; k < numFeatures; k++) {
                trainInstance.setValue((Attribute) fvWekaAttribute.elementAt(k),
                        Integer.parseInt(attrValues[k]));
            }
            trainInstance.setValue((Attribute) fvWekaAttribute.elementAt(numFeatures),
                    processNames[sim]);
            trainingSet.add(trainInstance);
        }
    }

    Classifier cl = new NaiveBayes();
    cl.buildClassifier(trainingSet);

    // Tokenize the question once; it does not change between features.
    List<String> tokens = slem.tokenize(question);
    String[] tokArr = tokens.toArray(new String[tokens.size()]);

    Instance inst = new Instance(numFeatures + 1);
    for (int j = 0; j < numFeatures; j++) {
        int freq = getFrequency(bowFeature.get(j), tokArr);
        inst.setValue((Attribute) fvWekaAttribute.elementAt(j), freq);
    }

    inst.setDataset(trainingSet);
    int idxMax = ArrUtil.getIdxMax(cl.distributionForInstance(inst));
    return processNames[idxMax];
}

From source file:rdfsystem.data.DataMining.java

/**
 * Converts the papers held by the manager into a dataset named "paper"
 * with one true/false nominal attribute per word returned by
 * {@code getAllWords}, optionally preceded by the publication year.
 *
 * <p>Every word attribute of a paper starts as "false"; the attributes
 * named "label_&lt;label&gt;" and "author_&lt;id&gt;" for the paper's
 * labels and authors are then switched to "true". When the year is
 * included, the dataset is finally passed through a NumericToNominal
 * filter.
 *
 * @param manager the source of the papers
 * @param hasYear whether the year is added as the first attribute
 * @return the resulting dataset
 * @throws Exception if applying the filter fails
 */
private static Instances transformData(RdfManager manager, boolean hasYear) throws Exception {
    // All word attributes share the same two nominal labels.
    FastVector boolLabels = new FastVector();
    boolLabels.addElement("true");
    boolLabels.addElement("false");

    FastVector schema = new FastVector();
    if (hasYear) {
        schema.addElement(new Attribute("year"));
    }
    for (String token : getAllWords(manager)) {
        schema.addElement(new Attribute(token, boolLabels));
    }

    Instances dataset = new Instances("paper", schema, 0);
    final int firstWordColumn = hasYear ? 1 : 0;

    for (Map.Entry<String, Paper> entry : manager) {
        Paper paper = entry.getValue();
        double[] values = new double[dataset.numAttributes()];

        if (hasYear) {
            values[0] = paper.getYear();
        }

        // Default every word column to "false".
        for (int col = firstWordColumn; col < values.length; col++) {
            values[col] = dataset.attribute(col).indexOfValue("false");
        }

        // Switch on the columns of the paper's labels ...
        for (String label : paper.getLabel()) {
            int col = dataset.attribute("label_" + label).index();
            values[col] = dataset.attribute(col).indexOfValue("true");
        }

        // ... and of its authors.
        for (Author author : paper.getList()) {
            int col = dataset.attribute("author_" + author.getId()).index();
            values[col] = dataset.attribute(col).indexOfValue("true");
        }

        dataset.add(new Instance(1.0, values));
    }

    if (!hasYear) {
        return dataset;
    }

    NumericToNominal yearToNominal = new NumericToNominal();
    yearToNominal.setInputFormat(dataset);
    return Filter.useFilter(dataset, yearToNominal);
}