List of usage examples for weka.core Instances setClassIndex
public void setClassIndex(int classIndex)
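setClassIndex tells Weka which attribute of a dataset is the class (target). The index is zero-based, and -1 marks the dataset as having no class attribute; a freshly loaded dataset has no class set, so most classifiers, filters, and evaluators require a call like the one below first. A minimal sketch, assuming an ARFF file named iris.arff exists in the working directory:

import java.io.BufferedReader;
import java.io.FileReader;
import weka.core.Instances;

public class SetClassIndexExample {
    public static void main(String[] args) throws Exception {
        // Load a dataset; "iris.arff" is a placeholder path.
        Instances data = new Instances(new BufferedReader(new FileReader("iris.arff")));

        // By convention the class is usually the last attribute.
        data.setClassIndex(data.numAttributes() - 1);

        System.out.println("Class attribute: " + data.classAttribute().name());
    }
}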
From source file:cz.vse.fis.keg.entityclassifier.core.salience.EntitySaliencer.java
License:Open Source License
public void computeSalience(List<Entity> entities) {
    try {
        if (!initialized) {
            initialize();
            initialized = true;
        }
        ArrayList<SEntity> processedEntities = new ArrayList<SEntity>();

        for (Entity e : entities) {
            SEntity entityMention = new SEntity();
            entityMention.setBeginIndex(e.getStartOffset().intValue());
            entityMention.setEntityType(e.getEntityType());

            ArrayList<Type> types = e.getTypes();
            ArrayList<String> loggedURIs = new ArrayList<String>();

            if (types != null) {
                for (Type t : types) {
                    String entityURI = t.getEntityURI();
                    if (!loggedURIs.contains(entityURI)) {
                        loggedURIs.add(entityURI);
                        entityMention.getUrls().add(entityURI);
                    }
                }
            }

            boolean entityAlreadyLogged = false;
            for (SEntity sEntity : processedEntities) {
                boolean isThisEntitySame = false;
                ArrayList<String> entityURIs1 = sEntity.getUrls();
                ArrayList<String> entityURIs2 = entityMention.getUrls();

                for (String eURI1 : entityURIs1) {
                    for (String eURI2 : entityURIs2) {
                        if (!entityAlreadyLogged) {
                            if (eURI1.equals(eURI2)) {
                                entityAlreadyLogged = true;
                                isThisEntitySame = true;
                                sEntity.setNumOccurrences(sEntity.getNumOccurrences() + 1);
                            }
                        }
                    }
                }

                if (isThisEntitySame) {
                    for (String uri : entityMention.getUrls()) {
                        if (!sEntity.getUrls().contains(uri)) {
                            sEntity.getUrls().add(uri);
                        }
                    }
                }
            }

            // Entity seen for the first time in the document.
            if (!entityAlreadyLogged) {
                entityMention.setNumOccurrences(1);
                processedEntities.add(entityMention);
            }
        }

        // Preparing the test data container.
        FastVector attributes = new FastVector(6);
        attributes.add(new Attribute("beginIndex"));
        attributes.add(new Attribute("numUniqueEntitiesInDoc"));
        attributes.add(new Attribute("numOfOccurrencesOfEntityInDoc"));
        attributes.add(new Attribute("numOfEntityMentionsInDoc"));

        FastVector entityTypeNominalAttVal = new FastVector(2);
        entityTypeNominalAttVal.addElement("named_entity");
        entityTypeNominalAttVal.addElement("common_entity");
        Attribute entityTypeAtt = new Attribute("type", entityTypeNominalAttVal);
        attributes.add(entityTypeAtt);

        FastVector classNominalAttVal = new FastVector(3);
        classNominalAttVal.addElement("not_salient");
        classNominalAttVal.addElement("less_salient");
        classNominalAttVal.addElement("most_salient");
        Attribute classAtt = new Attribute("class", classNominalAttVal);
        attributes.add(classAtt);

        Instances evalData = new Instances("MyRelation", attributes, 0);
        evalData.setClassIndex(evalData.numAttributes() - 1);

        for (int i = 0; i < processedEntities.size(); i++) {
            String entityType = "";
            if (processedEntities.get(i).getEntityType().equals("named entity")) {
                entityType = "named_entity";
            } else if (processedEntities.get(i).getEntityType().equals("common entity")) {
                entityType = "common_entity";
            }

            Instance inst = new DenseInstance(6);
            inst.setValue(evalData.attribute(0), processedEntities.get(i).getBeginIndex());     // begin index
            inst.setValue(evalData.attribute(1), processedEntities.size());                     // num of unique entities in doc
            inst.setValue(evalData.attribute(2), processedEntities.get(i).getNumOccurrences()); // num of entity occurrences in doc
            inst.setValue(evalData.attribute(3), entities.size());                              // num of entity mentions in doc
            inst.setValue(evalData.attribute(4), entityType);                                   // type of the entity
            evalData.add(inst);
        }

        for (int i = 0; i < processedEntities.size(); i++) {
            SEntity sEntity = processedEntities.get(i);
            int classIndex = (int) classifier.classifyInstance(evalData.get(i));
            String classLabel = evalData.firstInstance().classAttribute().value(classIndex);

            double[] pred = classifier.distributionForInstance(evalData.get(i));
            double probability = pred[classIndex];
            double salienceScore = pred[1] * 0.5 + pred[2];

            sEntity.setSalienceScore(salienceScore);
            sEntity.setSalienceConfidence(probability);
            sEntity.setSalienceClass(classLabel);

            for (Entity e : entities) {
                ArrayList<Type> types = e.getTypes();
                if (types != null) {
                    for (Type t : types) {
                        if (sEntity.getUrls().contains(t.getEntityURI())) {
                            Salience s = new Salience();
                            s.setClassLabel(classLabel);
                            DecimalFormat df = new DecimalFormat("0.000");
                            double fProbability = df.parse(df.format(probability)).doubleValue();
                            double fSalience = df.parse(df.format(salienceScore)).doubleValue();
                            s.setConfidence(fProbability);
                            s.setScore(fSalience);
                            t.setSalience(s);
                        }
                    }
                }
            }
        }
    } catch (Exception ex) {
        Logger.getLogger(EntitySaliencer.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:cz.vse.fis.keg.entityclassifier.core.salience.EntitySaliencer.java
License:Open Source License
private void trainModel() {
    BufferedReader reader = null;
    try {
        URL fileURL = THDController.getInstance().getClass().getResource(Settings.SALIENCE_DATASET);
        File arffFile = new File(fileURL.getFile());

        reader = new BufferedReader(new FileReader(arffFile));
        Instances data = new Instances(reader);
        data.setClassIndex(data.numAttributes() - 1);

        // classifier = new NaiveBayes();
        classifier = new RandomForest();

        // Train the classifier.
        classifier.buildClassifier(data);
        System.out.println("Model was successfully trained.");
    } catch (FileNotFoundException ex) {
        Logger.getLogger(EntitySaliencer.class.getName()).log(Level.SEVERE, null, ex);
    } catch (IOException ex) {
        Logger.getLogger(EntitySaliencer.class.getName()).log(Level.SEVERE, null, ex);
    } catch (Exception ex) {
        Logger.getLogger(EntitySaliencer.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
        // Guard against reader being null if the file was never opened.
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ex) {
                Logger.getLogger(EntitySaliencer.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    }
}
From source file:data.generation.target.utils.PrincipalComponents.java
License:Open Source License
/**
 * Set up the header for the PC->original space dataset.
 *
 * @return the output format
 * @throws Exception if something goes wrong
 */
private Instances setOutputFormatOriginal() throws Exception {
    FastVector attributes = new FastVector();
    for (int i = 0; i < m_numAttribs; i++) {
        String att = m_trainInstances.attribute(i).name();
        attributes.addElement(new Attribute(att));
    }

    if (m_hasClass) {
        attributes.addElement(m_trainHeader.classAttribute().copy());
    }

    Instances outputFormat = new Instances(m_trainHeader.relationName() + "->PC->original space", attributes, 0);

    // Set the class to be the last attribute if necessary.
    if (m_hasClass) {
        outputFormat.setClassIndex(outputFormat.numAttributes() - 1);
    }
    return outputFormat;
}
From source file:data.generation.target.utils.PrincipalComponents.java
License:Open Source License
/**
 * Set the format for the transformed data.
 *
 * @return a set of empty Instances (header only) in the new format
 * @throws Exception if the output format can't be set
 */
private Instances setOutputFormat() throws Exception {
    if (m_eigenvalues == null) {
        return null;
    }

    double cumulative = 0.0;
    FastVector attributes = new FastVector();
    for (int i = m_numAttribs - 1; i >= 0; i--) {
        StringBuffer attName = new StringBuffer();

        // Build the array of coefficient magnitudes.
        double[] coeff_mags = new double[m_numAttribs];
        for (int j = 0; j < m_numAttribs; j++)
            coeff_mags[j] = -Math.abs(m_eigenvectors[j][m_sortedEigens[i]]);

        int num_attrs = (m_maxAttrsInName > 0) ? Math.min(m_numAttribs, m_maxAttrsInName) : m_numAttribs;

        // This array contains the sorted indices of the coefficients.
        int[] coeff_inds;
        if (m_maxAttrsInName > 0) {
            // if m_maxAttrsInName > 0, sort coefficients by decreasing magnitude
            coeff_inds = Utils.sort(coeff_mags);
        } else {
            // if m_maxAttrsInName <= 0, use all coeffs in original order
            coeff_inds = new int[m_numAttribs];
            for (int j = 0; j < m_numAttribs; j++)
                coeff_inds[j] = j;
        }

        // Build the final attName string.
        for (int j = 0; j < num_attrs; j++) {
            double coeff_value = m_eigenvectors[coeff_inds[j]][m_sortedEigens[i]];
            if (j > 0 && coeff_value >= 0)
                attName.append("+");
            attName.append(Utils.doubleToString(coeff_value, 5, 3)
                    + m_trainInstances.attribute(coeff_inds[j]).name());
        }
        if (num_attrs < m_numAttribs)
            attName.append("...");

        attributes.addElement(new Attribute(attName.toString()));
        cumulative += m_eigenvalues[m_sortedEigens[i]];

        if ((cumulative / m_sumOfEigenValues) >= m_coverVariance) {
            break;
        }
    }

    if (m_hasClass) {
        attributes.addElement(m_trainHeader.classAttribute().copy());
    }

    Instances outputFormat = new Instances(m_trainInstances.relationName() + "_principal components", attributes, 0);

    // Set the class to be the last attribute if necessary.
    if (m_hasClass) {
        outputFormat.setClassIndex(outputFormat.numAttributes() - 1);
    }

    m_outputNumAtts = outputFormat.numAttributes();
    return outputFormat;
}
From source file:data.Regression.java
public int regression(String fileName) {
    String arffName = FileTransfer.transfer(fileName);
    try {
        // Load data.
        Instances data = new Instances(new BufferedReader(new FileReader(arffName)));
        data.setClassIndex(data.numAttributes() - 1);

        // Build the model; the last instance, with a missing class, is not used.
        LinearRegression model = new LinearRegression();
        model.buildClassifier(data);
        System.out.println(model);

        // Classify the last instance.
        Instance num = data.lastInstance();
        int people = (int) model.classifyInstance(num);
        System.out.println("NumOfEnrolled (" + num + "): " + people);
        return people;
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Regression failed");
    }
    return 0;
}
From source file:data.RegressionDrop.java
public void regression() throws Exception {
    // Load data.
    Instances data = new Instances(new BufferedReader(new FileReader("NumOfDroppedByYear.arff")));
    data.setClassIndex(data.numAttributes() - 1);

    // Build the model; the last instance, with a missing class, is not used.
    LinearRegression model = new LinearRegression();
    model.buildClassifier(data);
    System.out.println(model);

    // Classify the last instance.
    Instance num = data.lastInstance();
    int people = (int) model.classifyInstance(num);
    System.out.println("NumOfDropped (" + num + "): " + people);
}
From source file:data.statistics.MLStatistics.java
License:Open Source License
/**
 * Calculates the Phi and Chi-square correlation matrices.
 *
 * @param dataSet A multi-label dataset.
 * @throws java.lang.Exception To be handled in an upper level.
 */
public void calculatePhiChi2(MultiLabelInstances dataSet) throws Exception {
    numLabels = dataSet.getNumLabels();

    // The indices of the label attributes.
    int[] labelIndices;
    labelIndices = dataSet.getLabelIndices();
    numLabels = dataSet.getNumLabels();
    phi = new double[numLabels][numLabels];
    chi2 = new double[numLabels][numLabels];

    Remove remove = new Remove();
    remove.setInvertSelection(true);
    remove.setAttributeIndicesArray(labelIndices);
    remove.setInputFormat(dataSet.getDataSet());
    Instances result = Filter.useFilter(dataSet.getDataSet(), remove);
    result.setClassIndex(result.numAttributes() - 1);

    for (int i = 0; i < numLabels; i++) {
        int[] a = new int[numLabels];
        int[] b = new int[numLabels];
        int[] c = new int[numLabels];
        int[] d = new int[numLabels];
        double[] e = new double[numLabels];
        double[] f = new double[numLabels];
        double[] g = new double[numLabels];
        double[] h = new double[numLabels];

        for (int j = 0; j < result.numInstances(); j++) {
            for (int l = 0; l < numLabels; l++) {
                if (result.instance(j).stringValue(i).equals("0")) {
                    if (result.instance(j).stringValue(l).equals("0")) {
                        a[l]++;
                    } else {
                        c[l]++;
                    }
                } else {
                    if (result.instance(j).stringValue(l).equals("0")) {
                        b[l]++;
                    } else {
                        d[l]++;
                    }
                }
            }
        }

        for (int l = 0; l < numLabels; l++) {
            e[l] = a[l] + b[l];
            f[l] = c[l] + d[l];
            g[l] = a[l] + c[l];
            h[l] = b[l] + d[l];
            double mult = e[l] * f[l] * g[l] * h[l];
            double denominator = Math.sqrt(mult);
            double numerator = a[l] * d[l] - b[l] * c[l];
            phi[i][l] = numerator / denominator;
            chi2[i][l] = phi[i][l] * phi[i][l] * (a[l] + b[l] + c[l] + d[l]);
        }
    }
}
From source file:DataMiningLogHistoriKIRI.ArffIO.java
public Instances readArff(String name) throws IOException {
    // Read the ARFF file given by the name parameter (the original hardcoded "temp.arff" and ignored it).
    Instances data = new Instances(new BufferedReader(new FileReader(name)));
    data.setClassIndex(data.numAttributes() - 1);
    return data;
}
From source file:de.citec.sc.matoll.classifiers.WEKAclassifier.java
public void train(List<Provenance> provenances, Set<String> pattern_lookup, Set<String> pos_lookup)
        throws IOException {
    String path = "matoll" + Language.toString() + ".arff";
    writeVectors(provenances, path, pattern_lookup, pos_lookup);

    Instances inst = new Instances(new BufferedReader(new FileReader(path)));
    inst.setClassIndex(inst.numAttributes() - 1);

    try {
        cls.buildClassifier(inst);

        // Serialize the trained model.
        ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(path.replace(".arff", ".model")));
        oos.writeObject(cls);
        oos.flush();
        oos.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:de.citec.sc.matoll.classifiers.WEKAclassifier.java
public HashMap<Integer, Double> predict(Provenance provenance, Set<String> pattern_lookup,
        Set<String> pos_lookup) throws IOException, Exception {
    // We want to predict that the entry is true.
    provenance.setAnnotation(1);
    List<Provenance> tmp_prov = new ArrayList<Provenance>();
    tmp_prov.add(provenance);
    writeVectors(tmp_prov, "tmp.arff", pattern_lookup, pos_lookup);

    ArffLoader loader = new ArffLoader();
    loader.setFile(new File("tmp.arff"));
    Instances structure = loader.getStructure();
    structure.setClassIndex(structure.numAttributes() - 1);

    HashMap<Integer, Double> hm = new HashMap<Integer, Double>();
    Instance current;
    while ((current = loader.getNextInstance(structure)) != null) {
        // The predicted value can only be 0 or 1, as only two classes are given.
        double value = cls.classifyInstance(current);
        double[] percentage = cls.distributionForInstance(current);
        int prediction = (int) value;
        double distribution = percentage[(int) value];
        hm.put(prediction, distribution);
    }
    return hm;
}