List of usage examples for weka.core Instances setClassIndex
public void setClassIndex(int classIndex)
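setClassIndex tells Weka which attribute of a dataset is the class (target). The index is zero-based, and -1 marks the dataset as having no class attribute; a freshly loaded dataset has no class set, so most classifiers, filters, and evaluators require a call like the one below first. A minimal sketch, assuming an ARFF file named iris.arff exists in the working directory:

import java.io.BufferedReader;
import java.io.FileReader;
import weka.core.Instances;

public class SetClassIndexExample {
    public static void main(String[] args) throws Exception {
        // Load a dataset; "iris.arff" is a placeholder path.
        Instances data = new Instances(new BufferedReader(new FileReader("iris.arff")));

        // By convention the class is usually the last attribute.
        data.setClassIndex(data.numAttributes() - 1);

        System.out.println("Class attribute: " + data.classAttribute().name());
    }
}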
From source file:cz.vse.fis.keg.entityclassifier.core.salience.EntitySaliencer.java
License:Open Source License
public void computeSalience(List<Entity> entities) {
    try {
        if (!initialized) {
            initialize();
            initialized = true;
        }
        ArrayList<SEntity> processedEntities = new ArrayList<SEntity>();

        for (Entity e : entities) {
            SEntity entityMention = new SEntity();
            entityMention.setBeginIndex(e.getStartOffset().intValue());
            entityMention.setEntityType(e.getEntityType());

            ArrayList<Type> types = e.getTypes();
            ArrayList<String> loggedURIs = new ArrayList<String>();

            if (types != null) {
                for (Type t : types) {
                    String entityURI = t.getEntityURI();
                    if (!loggedURIs.contains(entityURI)) {
                        loggedURIs.add(entityURI);
                        entityMention.getUrls().add(entityURI);
                    }
                }
            }

            boolean entityAlreadyLogged = false;
            for (SEntity sEntity : processedEntities) {
                boolean isThisEntitySame = false;
                ArrayList<String> entityURIs1 = sEntity.getUrls();
                ArrayList<String> entityURIs2 = entityMention.getUrls();

                for (String eURI1 : entityURIs1) {
                    for (String eURI2 : entityURIs2) {
                        if (!entityAlreadyLogged) {
                            if (eURI1.equals(eURI2)) {
                                entityAlreadyLogged = true;
                                isThisEntitySame = true;
                                sEntity.setNumOccurrences(sEntity.getNumOccurrences() + 1);
                            }
                        }
                    }
                }

                if (isThisEntitySame) {
                    for (String uri : entityMention.getUrls()) {
                        if (!sEntity.getUrls().contains(uri)) {
                            sEntity.getUrls().add(uri);
                        }
                    }
                }
            }

            // Entity seen for the first time in the document.
            if (!entityAlreadyLogged) {
                entityMention.setNumOccurrences(1);
                processedEntities.add(entityMention);
            }
        }

        // Preparing the test data container.
        FastVector attributes = new FastVector(6);
        attributes.add(new Attribute("beginIndex"));
        attributes.add(new Attribute("numUniqueEntitiesInDoc"));
        attributes.add(new Attribute("numOfOccurrencesOfEntityInDoc"));
        attributes.add(new Attribute("numOfEntityMentionsInDoc"));

        FastVector entityTypeNominalAttVal = new FastVector(2);
        entityTypeNominalAttVal.addElement("named_entity");
        entityTypeNominalAttVal.addElement("common_entity");
        Attribute entityTypeAtt = new Attribute("type", entityTypeNominalAttVal);
        attributes.add(entityTypeAtt);

        FastVector classNominalAttVal = new FastVector(3);
        classNominalAttVal.addElement("not_salient");
        classNominalAttVal.addElement("less_salient");
        classNominalAttVal.addElement("most_salient");
        Attribute classAtt = new Attribute("class", classNominalAttVal);
        attributes.add(classAtt);

        Instances evalData = new Instances("MyRelation", attributes, 0);
        evalData.setClassIndex(evalData.numAttributes() - 1);

        for (int i = 0; i < processedEntities.size(); i++) {
            String entityType = "";
            if (processedEntities.get(i).getEntityType().equals("named entity")) {
                entityType = "named_entity";
            } else if (processedEntities.get(i).getEntityType().equals("common entity")) {
                entityType = "common_entity";
            }

            Instance inst = new DenseInstance(6);
            inst.setValue(evalData.attribute(0), processedEntities.get(i).getBeginIndex());     // begin index
            inst.setValue(evalData.attribute(1), processedEntities.size());                     // num of unique entities in doc
            inst.setValue(evalData.attribute(2), processedEntities.get(i).getNumOccurrences()); // num of entity occurrences in doc
            inst.setValue(evalData.attribute(3), entities.size());                              // num of entity mentions in doc
            inst.setValue(evalData.attribute(4), entityType);                                   // type of the entity
            evalData.add(inst);
        }

        for (int i = 0; i < processedEntities.size(); i++) {
            SEntity sEntity = processedEntities.get(i);
            int classIndex = (int) classifier.classifyInstance(evalData.get(i));
            String classLabel = evalData.firstInstance().classAttribute().value(classIndex);

            double[] pred = classifier.distributionForInstance(evalData.get(i));
            double probability = pred[classIndex];
            double salienceScore = pred[1] * 0.5 + pred[2];

            sEntity.setSalienceScore(salienceScore);
            sEntity.setSalienceConfidence(probability);
            sEntity.setSalienceClass(classLabel);

            for (Entity e : entities) {
                ArrayList<Type> types = e.getTypes();
                if (types != null) {
                    for (Type t : types) {
                        if (sEntity.getUrls().contains(t.getEntityURI())) {
                            Salience s = new Salience();
                            s.setClassLabel(classLabel);
                            DecimalFormat df = new DecimalFormat("0.000");
                            double fProbability = df.parse(df.format(probability)).doubleValue();
                            double fSalience = df.parse(df.format(salienceScore)).doubleValue();
                            s.setConfidence(fProbability);
                            s.setScore(fSalience);
                            t.setSalience(s);
                        }
                    }
                }
            }
        }
    } catch (Exception ex) {
        Logger.getLogger(EntitySaliencer.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:cz.vse.fis.keg.entityclassifier.core.salience.EntitySaliencer.java
License:Open Source License
private void trainModel() {
    BufferedReader reader = null;
    try {
        URL fileURL = THDController.getInstance().getClass().getResource(Settings.SALIENCE_DATASET);
        File arffFile = new File(fileURL.getFile());

        reader = new BufferedReader(new FileReader(arffFile));
        Instances data = new Instances(reader);
        data.setClassIndex(data.numAttributes() - 1);

        // classifier = new NaiveBayes();
        classifier = new RandomForest();

        // Train the classifier.
        classifier.buildClassifier(data);
        System.out.println("Model was successfully trained.");
    } catch (FileNotFoundException ex) {
        Logger.getLogger(EntitySaliencer.class.getName()).log(Level.SEVERE, null, ex);
    } catch (IOException ex) {
        Logger.getLogger(EntitySaliencer.class.getName()).log(Level.SEVERE, null, ex);
    } catch (Exception ex) {
        Logger.getLogger(EntitySaliencer.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
        // Guard against reader being null if the file was never opened.
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ex) {
                Logger.getLogger(EntitySaliencer.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    }
}
From source file:data.generation.target.utils.PrincipalComponents.java
License:Open Source License
/**
 * Set up the header for the PC->original space dataset.
 *
 * @return the output format
 * @throws Exception if something goes wrong
 */
private Instances setOutputFormatOriginal() throws Exception {
    FastVector attributes = new FastVector();
    for (int i = 0; i < m_numAttribs; i++) {
        String att = m_trainInstances.attribute(i).name();
        attributes.addElement(new Attribute(att));
    }

    if (m_hasClass) {
        attributes.addElement(m_trainHeader.classAttribute().copy());
    }

    Instances outputFormat = new Instances(m_trainHeader.relationName() + "->PC->original space", attributes, 0);

    // Set the class to be the last attribute if necessary.
    if (m_hasClass) {
        outputFormat.setClassIndex(outputFormat.numAttributes() - 1);
    }
    return outputFormat;
}
From source file:data.generation.target.utils.PrincipalComponents.java
License:Open Source License
/**
 * Set the format for the transformed data.
 *
 * @return a set of empty Instances (header only) in the new format
 * @throws Exception if the output format can't be set
 */
private Instances setOutputFormat() throws Exception {
    if (m_eigenvalues == null) {
        return null;
    }

    double cumulative = 0.0;
    FastVector attributes = new FastVector();
    for (int i = m_numAttribs - 1; i >= 0; i--) {
        StringBuffer attName = new StringBuffer();

        // Build the array of coefficient magnitudes.
        double[] coeff_mags = new double[m_numAttribs];
        for (int j = 0; j < m_numAttribs; j++)
            coeff_mags[j] = -Math.abs(m_eigenvectors[j][m_sortedEigens[i]]);

        int num_attrs = (m_maxAttrsInName > 0) ? Math.min(m_numAttribs, m_maxAttrsInName) : m_numAttribs;

        // This array contains the sorted indices of the coefficients.
        int[] coeff_inds;
        if (m_maxAttrsInName > 0) {
            // if m_maxAttrsInName > 0, sort coefficients by decreasing magnitude
            coeff_inds = Utils.sort(coeff_mags);
        } else {
            // if m_maxAttrsInName <= 0, use all coeffs in original order
            coeff_inds = new int[m_numAttribs];
            for (int j = 0; j < m_numAttribs; j++)
                coeff_inds[j] = j;
        }

        // Build the final attName string.
        for (int j = 0; j < num_attrs; j++) {
            double coeff_value = m_eigenvectors[coeff_inds[j]][m_sortedEigens[i]];
            if (j > 0 && coeff_value >= 0)
                attName.append("+");
            attName.append(Utils.doubleToString(coeff_value, 5, 3)
                    + m_trainInstances.attribute(coeff_inds[j]).name());
        }
        if (num_attrs < m_numAttribs)
            attName.append("...");

        attributes.addElement(new Attribute(attName.toString()));
        cumulative += m_eigenvalues[m_sortedEigens[i]];

        if ((cumulative / m_sumOfEigenValues) >= m_coverVariance) {
            break;
        }
    }

    if (m_hasClass) {
        attributes.addElement(m_trainHeader.classAttribute().copy());
    }

    Instances outputFormat = new Instances(m_trainInstances.relationName() + "_principal components", attributes, 0);

    // Set the class to be the last attribute if necessary.
    if (m_hasClass) {
        outputFormat.setClassIndex(outputFormat.numAttributes() - 1);
    }

    m_outputNumAtts = outputFormat.numAttributes();
    return outputFormat;
}
From source file:data.Regression.java
public int regression(String fileName) {
    String arffName = FileTransfer.transfer(fileName);
    try {
        // Load data.
        Instances data = new Instances(new BufferedReader(new FileReader(arffName)));
        data.setClassIndex(data.numAttributes() - 1);

        // Build the model; the last instance, with a missing class, is not used.
        LinearRegression model = new LinearRegression();
        model.buildClassifier(data);
        System.out.println(model);

        // Classify the last instance.
        Instance num = data.lastInstance();
        int people = (int) model.classifyInstance(num);
        System.out.println("NumOfEnrolled (" + num + "): " + people);
        return people;
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Regression failed");
    }
    return 0;
}
From source file:data.RegressionDrop.java
public void regression() throws Exception {
    // Load data.
    Instances data = new Instances(new BufferedReader(new FileReader("NumOfDroppedByYear.arff")));
    data.setClassIndex(data.numAttributes() - 1);

    // Build the model; the last instance, with a missing class, is not used.
    LinearRegression model = new LinearRegression();
    model.buildClassifier(data);
    System.out.println(model);

    // Classify the last instance.
    Instance num = data.lastInstance();
    int people = (int) model.classifyInstance(num);
    System.out.println("NumOfDropped (" + num + "): " + people);
}
From source file:data.statistics.MLStatistics.java
License:Open Source License
/**
 * Calculates the Phi and Chi-square correlation matrices.
 *
 * @param dataSet A multi-label dataset.
 * @throws java.lang.Exception To be handled in an upper level.
 */
public void calculatePhiChi2(MultiLabelInstances dataSet) throws Exception {
    numLabels = dataSet.getNumLabels();

    // The indices of the label attributes.
    int[] labelIndices;
    labelIndices = dataSet.getLabelIndices();
    numLabels = dataSet.getNumLabels();
    phi = new double[numLabels][numLabels];
    chi2 = new double[numLabels][numLabels];

    Remove remove = new Remove();
    remove.setInvertSelection(true);
    remove.setAttributeIndicesArray(labelIndices);
    remove.setInputFormat(dataSet.getDataSet());
    Instances result = Filter.useFilter(dataSet.getDataSet(), remove);
    result.setClassIndex(result.numAttributes() - 1);

    for (int i = 0; i < numLabels; i++) {
        int[] a = new int[numLabels];
        int[] b = new int[numLabels];
        int[] c = new int[numLabels];
        int[] d = new int[numLabels];
        double[] e = new double[numLabels];
        double[] f = new double[numLabels];
        double[] g = new double[numLabels];
        double[] h = new double[numLabels];

        for (int j = 0; j < result.numInstances(); j++) {
            for (int l = 0; l < numLabels; l++) {
                if (result.instance(j).stringValue(i).equals("0")) {
                    if (result.instance(j).stringValue(l).equals("0")) {
                        a[l]++;
                    } else {
                        c[l]++;
                    }
                } else {
                    if (result.instance(j).stringValue(l).equals("0")) {
                        b[l]++;
                    } else {
                        d[l]++;
                    }
                }
            }
        }

        for (int l = 0; l < numLabels; l++) {
            e[l] = a[l] + b[l];
            f[l] = c[l] + d[l];
            g[l] = a[l] + c[l];
            h[l] = b[l] + d[l];
            double mult = e[l] * f[l] * g[l] * h[l];
            double denominator = Math.sqrt(mult);
            double numerator = a[l] * d[l] - b[l] * c[l];
            phi[i][l] = numerator / denominator;
            chi2[i][l] = phi[i][l] * phi[i][l] * (a[l] + b[l] + c[l] + d[l]);
        }
    }
}
From source file:DataMiningLogHistoriKIRI.ArffIO.java
public Instances readArff(String name) throws IOException {
    // Read the ARFF file given by the name parameter (the original hardcoded "temp.arff" and ignored it).
    Instances data = new Instances(new BufferedReader(new FileReader(name)));
    data.setClassIndex(data.numAttributes() - 1);
    return data;
}
From source file:de.citec.sc.matoll.classifiers.WEKAclassifier.java
public void train(List<Provenance> provenances, Set<String> pattern_lookup, Set<String> pos_lookup)
        throws IOException {
    String path = "matoll" + Language.toString() + ".arff";
    writeVectors(provenances, path, pattern_lookup, pos_lookup);

    Instances inst = new Instances(new BufferedReader(new FileReader(path)));
    inst.setClassIndex(inst.numAttributes() - 1);

    try {
        cls.buildClassifier(inst);

        // Serialize the trained model.
        ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(path.replace(".arff", ".model")));
        oos.writeObject(cls);
        oos.flush();
        oos.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:de.citec.sc.matoll.classifiers.WEKAclassifier.java
public HashMap<Integer, Double> predict(Provenance provenance, Set<String> pattern_lookup,
        Set<String> pos_lookup) throws IOException, Exception {
    // We want to predict that the entry is true.
    provenance.setAnnotation(1);
    List<Provenance> tmp_prov = new ArrayList<Provenance>();
    tmp_prov.add(provenance);
    writeVectors(tmp_prov, "tmp.arff", pattern_lookup, pos_lookup);

    ArffLoader loader = new ArffLoader();
    loader.setFile(new File("tmp.arff"));
    Instances structure = loader.getStructure();
    structure.setClassIndex(structure.numAttributes() - 1);

    HashMap<Integer, Double> hm = new HashMap<Integer, Double>();
    Instance current;
    while ((current = loader.getNextInstance(structure)) != null) {
        // The predicted value can only be 0 or 1, as only two classes are given.
        double value = cls.classifyInstance(current);
        double[] percentage = cls.distributionForInstance(current);
        int prediction = (int) value;
        double distribution = percentage[(int) value];
        hm.put(prediction, distribution);
    }
    return hm;
}