Example usage for weka.core Instances attribute

List of usage examples for weka.core Instances attribute

Introduction

This page lists example usage of the weka.core Instances attribute method.

Prototype

public Attribute attribute(String name) 

Document

Returns an attribute given its name.
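
For orientation, here is a minimal, self-contained sketch of a typical call to attribute(String name). The file name "data.arff" and the attribute name "class" are assumptions made purely for illustration; note that the method returns null when no attribute with the given name exists.

import weka.core.Attribute;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class AttributeByNameExample {

    public static void main(String[] args) throws Exception {
        // Load a dataset; "data.arff" is a hypothetical file used only for this sketch.
        Instances data = DataSource.read("data.arff");

        // Look up an attribute by name; attribute(String) returns null for an unknown name.
        Attribute classAttr = data.attribute("class");
        if (classAttr == null) {
            System.err.println("No attribute named \"class\" in " + data.relationName());
            return;
        }

        // Use the attribute's index, for example to declare it as the class attribute.
        data.setClassIndex(classAttr.index());
        System.out.println("Class attribute: " + classAttr.name() + " (index " + classAttr.index() + ")");
    }
}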

Usage

From source file: kea.KEAFilter.java

License: Open Source License

/**
 * Sets the format of the input instances.
 *
 * @param instanceInfo an Instances object containing the input
 * instance structure (any instances contained in the object are
 * ignored - only the structure is required).
 * @return true if the outputFormat may be collected immediately 
 */
public boolean setInputFormat(Instances instanceInfo) throws Exception {

    if (instanceInfo.classIndex() >= 0) {
        throw new Exception("Don't know what do to if class index set!");
    }
    if (!instanceInfo.attribute(m_KeyphrasesAtt).isString()
            || !instanceInfo.attribute(m_DocumentAtt).isString()) {
        throw new Exception("Keyphrase attribute and document attribute " + "need to be string attributes.");
    }
    m_PunctFilter = new KEAPhraseFilter();
    int[] arr = new int[1];
    arr[0] = m_DocumentAtt;
    m_PunctFilter.setAttributeIndicesArray(arr);
    m_PunctFilter.setInputFormat(instanceInfo);
    m_PunctFilter.setDisallowInternalPeriods(getDisallowInternalPeriods());
    m_NumbersFilter = new NumbersFilter();
    m_NumbersFilter.setInputFormat(m_PunctFilter.getOutputFormat());
    super.setInputFormat(m_NumbersFilter.getOutputFormat());
    return false;
}

From source file: kea.KEAKeyphraseExtractor.java

License: Open Source License

/**
 * Extracts keyphrases from the documents.
 */
public void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Extract keyphrases
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i]
                        .value(topRankedInstances[i].numAttributes() - 1) == topRankedInstances[i]
                                .attribute(topRankedInstances[i].numAttributes() - 1).indexOfValue("True")) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.err.println("Avg. number of correct keyphrases: " + Utils.doubleToString(avg, 2) + " +/- "
            + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    m_KEAFilter.batchFinished();
}

From source file: kea.KEAModelBuilder.java

License: Open Source License

/**
 * Builds the model from the files.
 */
public void buildModel(Hashtable stems) throws Exception {

    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build model
    m_KEAFilter = new KEAFilter();
    m_KEAFilter.setDebug(m_debug);
    m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
    m_KEAFilter.setKFused(getUseKFrequency());
    m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
    m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
    m_KEAFilter.setMinNumOccur(getMinNumOccur());
    m_KEAFilter.setInputFormat(data);
    m_KEAFilter.setStemmer(getStemmer());
    m_KEAFilter.setStopwords(getStopwords());
    m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            BufferedReader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't find document for stem " + str + ".");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            BufferedReader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't find keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    m_KEAFilter.batchFinished();

    // Get rid of instances in filter
    while (m_KEAFilter.output() != null) {
    }
}

From source file: kea.main.KEAKeyphraseExtractor.java

License: Open Source License

/**
 * Extracts keyphrases from the documents.
 */
public synchronized void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    // i.e. whether there are any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    this.m_KEAFilter.setNumPhrases(m_numPhrases);
    this.m_KEAFilter.setVocabulary(m_vocabulary);
    this.m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    this.m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    this.m_KEAFilter.setStemmer(m_Stemmer);
    this.m_KEAFilter.setStopwords(m_Stopwords);

    if (getVocabulary().equals("none")) {
        this.m_KEAFilter.m_NODEfeature = false;
    } else {
        // The thesaurus is already loaded in the constructor
        //m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords, vocabularyDir, manager);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (this.m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    System.out.println("-- Extracting Keyphrases... ");
    // Extract keyphrases
    Enumeration elem = stems.keys();
    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();

        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();

            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());

        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;

            // keyStr = keyphrases in the str.key file.
            // KEA assumes that these keyphrases were assigned by the author
            // and evaluates the extracted keyphrases against them.

            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }

            is.close();

            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));

        this.m_KEAFilter.input(data.instance(0), vocabulary);

        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = this.m_KEAFilter.output()) != null) {

            int index = (int) inst.value(this.m_KEAFilter.getRankIndex()) - 1;

            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }

        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));

            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;

        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(
                            topRankedInstances[i].stringValue(this.m_KEAFilter.getUnstemmedPhraseIndex()));

                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(
                                topRankedInstances[i].stringValue(this.m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(this.m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));

    System.out.println("Avg. number of matching keyphrases compared to existing ones : "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    System.out.println("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}

From source file: learn.Classification.Chinese.TextDirectoryLoader.java

License: Open Source License

/**
 * Return the full data set. If the structure hasn't yet been determined by
 * a call to getStructure then method should do so before processing the
 * rest of the data set.
 * 
 * @return the structure of the data set as an empty set of Instances
 * @throws IOException
 *             if there is no source or parsing fails
 */
public Instances getDataSet() throws IOException {
    if (getDirectory() == null)
        throw new IOException("No directory/source has been specified");

    String directoryPath = getDirectory().getAbsolutePath();
    ArrayList<String> classes = new ArrayList<String>();
    Enumeration enm = getStructure().classAttribute().enumerateValues();
    while (enm.hasMoreElements())
        classes.add((String) enm.nextElement());

    Instances data = getStructure();
    int fileCount = 0;
    for (int k = 0; k < classes.size(); k++) {
        String subdirPath = (String) classes.get(k);
        File subdir = new File(directoryPath + File.separator + subdirPath);
        String[] files = subdir.list();
        for (int j = 0; j < files.length; j++) {
            try {
                fileCount++;
                if (getDebug())
                    System.err.println("processing " + fileCount + " : " + subdirPath + " : " + files[j]);

                double[] newInst = null;
                if (m_OutputFilename)
                    newInst = new double[3];
                else
                    newInst = new double[2];
                File txt = new File(directoryPath + File.separator + subdirPath + File.separator + files[j]);
                BufferedInputStream is;
                is = new BufferedInputStream(new FileInputStream(txt));
                StringBuffer txtStr = new StringBuffer();
                int c;
                /*
                 * while ((c = is.read()) != -1) { txtStr.append((char) c);
                 * }
                 */
                //FileReader fr = new FileReader(txt);

                BufferedReader br = new BufferedReader(
                        new InputStreamReader(new FileInputStream(txt), "UTF-8"));

                String line;

                while ((line = br.readLine()) != null) {

                    txtStr.append(line + "\n");

                }
                newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
                if (m_OutputFilename)
                    newInst[1] = (double) data.attribute(1)
                            .addStringValue(subdirPath + File.separator + files[j]);
                newInst[data.classIndex()] = (double) k;
                data.add(new DenseInstance(1.0, newInst));
                is.close();
            } catch (Exception e) {
                System.err.println("failed to convert file: " + directoryPath + File.separator + subdirPath
                        + File.separator + files[j]);
            }
        }
    }

    return data;
}

From source file: les.negocio.LerWeka.java

@Override
public String processar(EntidadeDominio entidade) {
    Arquivo arq = (Arquivo) entidade;
    String path = "/home/gustavo/Documents/weka/";
    String full_path = path + arq.getNomeDoArquivo();
    List<String> nm_att = new ArrayList<String>();
    int qt_att = 0;

    String s = null;
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new FileReader(full_path));
    } catch (FileNotFoundException e1) {
        // TODO Auto-generated catch block
        e1.printStackTrace();
    }
    ArffReader arff = null;
    try {
        arff = new ArffReader(reader);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    Instances data = arff.getData();
    data.setClassIndex(data.numAttributes() - 1);
    int num_atributos = data.numAttributes() - 1;

    for (int i = 3; i < num_atributos; i++) {
        // the index starts at three because the previous attributes are fixed by the system
        if (data.attribute(i).isNominal()) {
            qt_att++;
            nm_att.add(data.attribute(i).name().toString());

        }
    }

    arq.setAtributos_weka(nm_att);
    arq.setQt_perguntas(qt_att);

    return null;
}

From source file: lfsom.data.LFSData.java

License: Apache License

/**
 * Gets the data from a data file (CSV, ARFF, JSON, Matlab, XRFF or C4.5).
 * 
 * @param fileName path of the data file to load
 */

public LFSData(String fileName) {
    Class claseCargador = CSVLoader.class;

    if (fileName.endsWith(ArffLoader.FILE_EXTENSION)) {
        claseCargador = ArffLoader.class;
    } else {
        if (fileName.endsWith(JSONLoader.FILE_EXTENSION)) {
            claseCargador = JSONLoader.class;
        } else {
            if (fileName.endsWith(MatlabLoader.FILE_EXTENSION)) {
                claseCargador = MatlabLoader.class;
            } else {
                if (fileName.endsWith(XRFFLoader.FILE_EXTENSION)) {
                    claseCargador = XRFFLoader.class;
                } else {
                    if (fileName.endsWith(C45Loader.FILE_EXTENSION)) {
                        claseCargador = C45Loader.class;
                    }
                }
            }
        }
    }

    try {
        AbstractFileLoader cargador = (AbstractFileLoader) claseCargador.getConstructor().newInstance();
        boolean cambio_col = false;

        cargador.setSource(new File(fileName));

        Instances data1 = cargador.getDataSet();

        double[][] matrix2 = new double[data1.size()][data1.numAttributes()];

        for (int i = 0; i < data1.size(); i++) {
            matrix2[i] = data1.get(i).toDoubleArray();
        }

        // Now check whether all the columns are valid

        Integer[] colVale;
        dim = 0;

        if (data1.size() > 0) {
            colVale = new Integer[matrix2[0].length];
            double[] stdevX = StatisticSample.stddeviation(matrix2);

            for (int k = 0; k < matrix2[0].length; k++) {
                if (Math.abs(stdevX[k]) >= 0.000000001) {
                    colVale[k] = dim;
                    dim++;
                } else {
                    colVale[k] = -1;
                    cambio_col = true;
                }
            }

        } else {
            dim = data1.numAttributes();
            colVale = new Integer[dim];
            for (int k = 0; k < dim; k++) {
                colVale[k] = k;
            }
        }

        double[][] matrixAssign = new double[matrix2.length][dim];

        if (cambio_col) {
            for (int k = 0; k < matrix2.length; k++) {
                for (int w = 0; w < matrix2[0].length; w++) {
                    if (colVale[w] != -1) {
                        matrixAssign[k][colVale[w]] = matrix2[k][w];
                    }
                }

            }

        } else {
            matrixAssign = matrix2;
        }

        // End of the check

        setLabels(new String[dim]);
        for (int i = 0; i < data1.numAttributes(); i++) {
            if (colVale[i] != -1) {
                getLabels()[colVale[i]] = data1.attribute(i).name();
            }
        }

        BufferedWriter br = new BufferedWriter(new FileWriter("d:/tmp/fich.csv"));
        StringBuilder sb = new StringBuilder();

        for (int i = 0; i < matrixAssign.length; i++) {
            String cad = String.valueOf(matrixAssign[i][0]);
            for (int k = 1; k < matrixAssign[i].length; k++)
                cad += "," + matrixAssign[i][k];
            sb.append(cad + "\n");
        }

        br.write(sb.toString());
        br.close();

        setMatrix(matrixAssign);

    } catch (Exception e) {
        e.printStackTrace();
        System.exit(1);
    }
}

From source file: licensedetect.Classify.java

public String classify(Instances instances) {
    int dec;
    Attribute a = instances.attribute(instances.numAttributes() - 1);

    try {
        dec = (int) multi.classifyInstance(instances.instance(0));
        return a.value(dec);
    } catch (Exception e) {
        System.out.println("Error Classifying instance");
        return Integer.toString(-1);
    }
}

From source file: linqs.gaia.model.oc.ncc.WekaClassifier.java

License: Open Source License

@Override
public void learn(Iterable<? extends Decorable> trainitems, String targetschemaid, String targetfeatureid,
        List<String> featureids) {
    try {
        this.targetschemaid = targetschemaid;
        this.targetfeatureid = targetfeatureid;
        this.featureids = new LinkedList<String>(featureids);

        LinkedHashSet<String> uniquefids = new LinkedHashSet<String>(featureids);
        if (uniquefids.size() != featureids.size()) {
            Log.WARN("Duplicate feature ids found in set of features: " + featureids);
            this.featureids = new ArrayList<String>(uniquefids);
        }

        if (this.featureids.contains(this.targetfeatureid)) {
            throw new InvalidStateException(
                    "Cannot include target feature as a dependency feature: " + this.targetfeatureid);
        }
        Log.DEBUG("Features Used: " + ListUtils.list2string(featureids, ","));

        // Added for weka.  Will only be used for training.
        // Target will not be used as a feature itself.
        this.featureids.add(this.targetfeatureid);

        String wcclass = WekaClassifier.DEFAULT_WEKA_CLASSIFIER;
        if (this.hasParameter("wekaclassifier")) {
            wcclass = this.getStringParameter("wekaclassifier");
        }

        String wekaparams = WekaClassifier.NO_PARAMS;
        if (this.hasParameter("wekaparams")) {
            wekaparams = this.getStringParameter("wekaparams");
        }
        boolean printwekamodel = this.hasParameter("printwekamodel", "yes");

        // Support generation of class based cost matrix
        if (this.hasParameter("costbyclass", "yes")) {
            fclasscount = new KeyedCount<String>();
        }

        // Weka instances
        int numinstances = IteratorUtils.numIterable(trainitems);
        Instances traininstances = this.gaia2weka(trainitems.iterator(), numinstances, false);

        // Handle class based cost matrix
        if (fclasscount != null) {
            if (wekaparams.equals(WekaClassifier.NO_PARAMS)) {
                wekaparams = "";
            } else {
                wekaparams += ",";
            }

            wekaparams += "-cost-matrix," + this.getCostMatrix();
        }

        // Set GAIA parameters and initialize classifier
        String params[] = null;
        if (!wekaparams.equals(WekaClassifier.NO_PARAMS)) {
            Log.DEBUG("Using wekaparams: " + wekaparams);
            params = wekaparams.split(",");
        }
        wekaclassifier = Classifier.forName(wcclass, params);

        // Train classifier
        if (this.hasParameter("wekatrainfile")) {
            String savefile = this.getStringParameter("wekatrainfile");
            this.saveWekaInstances(savefile, traininstances);
        }

        Log.DEBUG("Weka building classifier");
        SimpleTimer st = new SimpleTimer();
        st.start();
        wekaclassifier.buildClassifier(traininstances);
        Log.DEBUG("Weka done building classifier: (" + st.timeLapse(true) + ")");

        // Print Weka Model, if requested
        if (printwekamodel) {
            Log.INFO("Learned Weka Model:\n" + this.wekaclassifier);
        }

        // Print attributes
        if (Log.SHOWDEBUG) {

            String features = null;
            for (int f = 0; f < traininstances.numAttributes(); f++) {
                if (features == null) {
                    features = "";
                } else {
                    features += ",";
                }

                features += traininstances.attribute(f).name();
            }

            String options[] = wekaclassifier.getOptions();
            Log.DEBUG("Weka Options: " + ArrayUtils.array2String(options, ","));
        }

        // Clear instances once training is complete
        traininstances.delete();
    } catch (RuntimeException e) {
        throw e;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file: linqs.gaia.model.oc.ncc.WekaClassifier.java

License: Open Source License

/**
 * Creates a Weka instance for the given decorable item and adds it to the dataset.
 * 
 * @param instances Weka instances to add the new instance to
 * @param di Decorable item to convert
 * @param ispredict whether the item is created for prediction rather than training
 */
private void createInstance(Instances instances, Decorable di, boolean ispredict) {
    double[] instvalues = new double[attinfosize];
    int attindex = 0;

    Schema schema = di.getSchema();
    for (String fid : featureids) {
        FeatureValue fvalue = di.getFeatureValue(fid);
        Attribute a = instances.attribute(attindex);

        Feature f = schema.getFeature(fid);
        if (!(f instanceof CompositeFeature)) {
            // Handle non multi-valued feature
            instvalues[attindex] = this.gaiavalues2weka(f, fid, fvalue, a, ispredict);
            attindex++;
        } else {
            // Handle multi-valued feature
            CompositeFeature mv = (CompositeFeature) f;
            UnmodifiableList<SimplePair<String, CVFeature>> mvfeatures = mv.getFeatures();
            CompositeValue mvvalue = (CompositeValue) di.getFeatureValue(fid);
            UnmodifiableList<FeatureValue> mvfvalues = mvvalue.getFeatureValues();
            int num = mvfvalues.size();
            for (int j = 0; j < num; j++) {
                if (fvalue.equals(FeatureValue.UNKNOWN_VALUE)) {
                    attindex++;
                    continue;
                }

                a = instances.attribute(attindex);
                f = mvfeatures.get(j).getSecond();
                fvalue = mvfvalues.get(j);
                instvalues[attindex] = this.gaiavalues2weka(f, fid, fvalue, a, ispredict);
                attindex++;
            }
        }
    }

    // Create instance of weight 1 and the specified values
    Instance inst = new SparseInstance(1, instvalues);
    inst.setDataset(instances);

    instances.add(inst);
}