List of usage examples for weka.core Instance setDataset
public void setDataset(Instances instances);
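All of the examples below share one call pattern: an Instance created on its own carries no attribute or class information until it is attached to an Instances header via setDataset. The following minimal sketch shows that typical order of operations; the dataset name, attribute names, and values are made up for illustration, and it assumes the Weka 3.7+ API (DenseInstance), not any particular example's code.

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class SetDatasetSketch {
    public static void main(String[] args) {
        // Build a header: two numeric attributes and a nominal class (names are illustrative).
        ArrayList<Attribute> attrs = new ArrayList<>();
        attrs.add(new Attribute("length"));
        attrs.add(new Attribute("width"));
        ArrayList<String> classValues = new ArrayList<>();
        classValues.add("yes");
        classValues.add("no");
        attrs.add(new Attribute("class", classValues));

        Instances data = new Instances("demo", attrs, 0);
        data.setClassIndex(data.numAttributes() - 1);

        // A free-standing instance knows nothing about attribute types or the class
        // attribute until it is attached to a dataset header via setDataset().
        Instance inst = new DenseInstance(3);
        inst.setDataset(data);
        inst.setValue(0, 5.1);
        inst.setValue(1, 3.5);
        // Only legal after setDataset(), which supplies the nominal class labels.
        inst.setClassValue("yes");

        data.add(inst);
        System.out.println(data);
    }
}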
From source file:nl.bioinf.roelen.thema11.classifier_tools.ClassifierUser.java
License:Open Source License
/**
 * Use the classifier to test the sequences in a genbank or fasta file for boundaries.
 * @param fileLocation the location of the genbank or fasta file
 * @param classifier the classifier to use
 * @return the classified nucleotides
 */
public static ArrayList<ClassifiedNucleotide> getPossibleBoundaries(String fileLocation, Classifier classifier) {
    ArrayList<Gene> genesFromFile = new ArrayList<>();
    ArrayList<ClassifiedNucleotide> classifiedNucleotides = new ArrayList<>();
    // read from fasta
    if (fileLocation.toUpperCase().endsWith(".FASTA") || fileLocation.toUpperCase().endsWith(".FA")
            || fileLocation.toUpperCase().endsWith(".FAN")) {
        genesFromFile.addAll(readFasta(fileLocation));
    }
    // read from genbank
    else if (fileLocation.toUpperCase().endsWith(".GENBANK") || fileLocation.toUpperCase().endsWith(".GB")) {
        GenBankReader gbr = new GenBankReader();
        gbr.readFile(fileLocation);
        GenbankResult gbresult = gbr.getResult();
        genesFromFile = gbresult.getGenes();
    }
    // get the test data
    HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> geneTestResults;
    geneTestResults = TestGenes.testForIntronExonBoundaries(genesFromFile, 1);

    ArrayList<InstanceToClassify> instanceNucs = new ArrayList<>();
    try {
        // write our results to a temporary file
        File tempArrf = File.createTempFile("realSet", ".arff");
        ArffWriter.write(tempArrf.getAbsolutePath(), geneTestResults, null);
        // get data
        ConverterUtils.DataSource source = new ConverterUtils.DataSource(tempArrf.getAbsolutePath());
        // set data and options
        Instances data = source.getDataSet();
        for (int i = 0; i < data.numInstances(); i++) {
            Instance in = data.instance(i);
            // get the name of the gene or sequence tested
            String nameOfInstance = in.stringValue(in.numAttributes() - 3);
            // get the tested position
            int testedPosition = (int) in.value(in.numAttributes() - 2);
            // set the class as missing, because we want to find it
            in.setMissing(in.numAttributes() - 1);
            Instance instanceNoExtras = new Instance(in);
            // delete the name and position, they are irrelevant for classifying
            instanceNoExtras.deleteAttributeAt(instanceNoExtras.numAttributes() - 2);
            instanceNoExtras.deleteAttributeAt(instanceNoExtras.numAttributes() - 2);
            InstanceToClassify ic = new InstanceToClassify(instanceNoExtras, testedPosition, nameOfInstance);
            instanceNucs.add(ic);
        }
        for (InstanceToClassify ic : instanceNucs) {
            Instance in = ic.getInstance();
            in.setDataset(data);
            data.setClassIndex(data.numAttributes() - 1);
            // classify our instance
            classifier.classifyInstance(in);
            // save the likelihood that the position is (or is not) a boundary
            double likelyhoodBoundary = classifier.distributionForInstance(in)[0];
            double likelyhoodNotBoundary = classifier.distributionForInstance(in)[1];
            // create a classified nucleotide and give it the added data
            ClassifiedNucleotide cn = new ClassifiedNucleotide(likelyhoodBoundary, likelyhoodNotBoundary,
                    ic.getName(), ic.getPosition());
            classifiedNucleotides.add(cn);
        }
    } catch (IOException ex) {
        Logger.getLogger(ClassifierUser.class.getName()).log(Level.SEVERE, null, ex);
    } catch (Exception ex) {
        Logger.getLogger(ClassifierUser.class.getName()).log(Level.SEVERE, null, ex);
    }
    return classifiedNucleotides;
}
From source file:ocr.ARFFSymbolFilter.java
License:Apache License
public static void writeWeka(final String filenameout, final ArrayList<?> symbolData) {
    final int nsold = ARFFSymbolFilter.ns;
    ARFFSymbolFilter.tangent = (ARFFSymbolFilter.times > 1);
    try {
        if (!ARFFSymbolFilter.strokenumber) {
            ARFFSymbolFilter.ns = 1;
        }
        final DataOutputStream[] fileout = new DataOutputStream[ARFFSymbolFilter.ns];
        final Instances[] instances = new Instances[ARFFSymbolFilter.ns];
        System.out.println("Writing file");
        for (int i = 0; i < ARFFSymbolFilter.ns; ++i) {
            final int k = ARFFSymbolFilter.strokenumber ? i : (nsold - 1);
            fileout[ARFFSymbolFilter.strokenumber ? i : 0] = new DataOutputStream(new FileOutputStream(
                    filenameout + (ARFFSymbolFilter.strokenumber ? ("" + (k + 1)) : "") + ".arff#"));
        }
        final int tot = symbolData.size();
        for (int j = 0; j < symbolData.size(); ++j) {
            final ArrayList<?> group = (ArrayList<?>) symbolData.get(j);
            for (int i = 0; i < group.size(); ++i) {
                final Symbol sym = (Symbol) group.get(i);
                final int k = ARFFSymbolFilter.strokenumber ? (sym.size() - 1) : 0;
                if (sym.name.equals("no_name") || sym.name.equals("empty_symbol")) {
                    System.out.print("#" + sym.name + "#");
                } else {
                    for (int t = 0; t < ARFFSymbolFilter.times; ++t) {
                        final String line = constructStringInstance(sym, ARFFSymbolFilter.alpha);
                        if (line == null) {
                            System.out.print("line=null!");
                        } else {
                            if (instances[k] == null) {
                                final StringTokenizer st = new StringTokenizer(line, " ");
                                final int nt = st.countTokens() / 2;
                                final FastVector att = new FastVector();
                                for (int kk = 0; kk < nt; ++kk) {
                                    final String token = st.nextToken();
                                    att.addElement(new Attribute(new String(token)));
                                    st.nextToken();
                                }
                                att.addElement(new Attribute("class", (FastVector) null));
                                (instances[k] = new Instances("Symbols of Size " + (k + 1), att, 1))
                                        .setClassIndex(att.size() - 1);
                            }
                            final StringTokenizer st = new StringTokenizer(line, " ");
                            final int nt = st.countTokens() / 2;
                            final Instance inst = new Instance(nt + 1);
                            for (int kk = 0; kk < nt; ++kk) {
                                st.nextToken();
                                final String token = new String(st.nextToken());
                                inst.setValue(kk, Double.parseDouble(token));
                            }
                            inst.setDataset(instances[k]);
                            inst.setClassValue(oldReplace(sym.name, "\\", ""));
                            instances[k].add(inst);
                        }
                    }
                }
            }
            if ((int) (100.0 * j) / tot % 10 == 0) {
                System.out.print((int) (100.0 * j) / tot + "%-");
            }
        }
        for (int k = 0; k < ARFFSymbolFilter.ns; ++k) {
            if (fileout[ARFFSymbolFilter.strokenumber ? k : 0] == null) {
                System.out.println("fo" + fileout[ARFFSymbolFilter.strokenumber ? k : 0]);
            }
            if (instances[ARFFSymbolFilter.strokenumber ? k : 0] == null) {
                System.out.println("in:" + instances[ARFFSymbolFilter.strokenumber ? k : 0]);
            }
            fileout[ARFFSymbolFilter.strokenumber ? k : 0]
                    .writeBytes(instances[ARFFSymbolFilter.strokenumber ? k : 0].toString());
            fileout[ARFFSymbolFilter.strokenumber ? k : 0].close();
        }
        final StringToNominal filter = new StringToNominal();
        final String[] args = new String[4];
        for (int k = 0; k < ARFFSymbolFilter.ns; ++k) {
            args[0] = "-i";
            args[1] = filenameout + (ARFFSymbolFilter.strokenumber ? ("" + (k + 1)) : "") + ".arff#";
            args[2] = "-o";
            args[3] = filenameout + (ARFFSymbolFilter.strokenumber ? ("" + (k + 1)) : "") + ".arff";
            Filter.filterFile(filter, args);
            new File(args[1]).delete();
        }
        System.out.println("100.0%");
    } catch (FileNotFoundException fnfe) {
        fnfe.printStackTrace();
    } catch (Exception ioe) {
        ioe.printStackTrace();
    }
}
From source file:OnTheFlyMethods.FastImplementations.RedefinedWeightedNodePruning.java
License:Open Source License
protected boolean verifyValidEntities(int entityId, int xxx, List<AbstractBlock> newBlocks,
        ExecuteBlockComparisons ebc, Instances trainingInstances) {
    int index;
    retainedNeighbors.clear();
    if (!cleanCleanER) {
        // for (int neighborId : validEntities) {
        //     if (isValidComparison(entityId, neighborId, ebc)) {
        //         totalComparisons++;
        //         duplicatePropagation.isSuperfluous(getComparison(entityId, neighborId));
        //     }
        // }
    } else {
        if (entityId < datasetLimit) {
            // Iterator<Integer> temp = validEntitiesB.iterator();
            int size = validEntities.size();
            Iterator<Integer> it = validEntitiesB.iterator();
            for (int neighborId : validEntities) {
                Integer value = map.get(entityId);
                if (value != null && value == neighborId) {
                    // System.out.println("----");
                    continue;
                }
                value = map.get(neighborId);
                if (value != null && value == entityId) {
                    // System.out.println("----");
                    continue;
                }
                map.put(entityId, neighborId);
                // if (entityId == 1178 && neighborId == 2562)
                //     System.out.println("ok");
                // index = temp.next();
                int blockIndex = it.next();
                if (isValidComparison(entityId, neighborId, ebc)) {
                    totalComparisons++;
                    duplicatePropagation.isSuperfluous(getComparison(entityId, neighborId));
                    // if (apagar++ % 1000 == 0)
                    //     System.out.println(apagar);
                    Comparison c;
                    if (entityId < datasetLimit)
                        c = new Comparison(true, entityId, neighborId - datasetLimit);
                    else
                        c = new Comparison(true, entityId - datasetLimit, neighborId);
                    final List<Integer> commonBlockIndices = entityIndex.getCommonBlockIndices(blockIndex, c);
                    if (commonBlockIndices == null)
                        continue;
                    // if (!retainedEntitiesD1.contains(comparison.getEntityId1()))
                    //     retainedEntitiesD1.add(comparison.getEntityId1());
                    // if (!retainedEntitiesD2.contains(comparison.getEntityId2()))
                    //     retainedEntitiesD2.add(comparison.getEntityId2());
                    // if (c.getEntityId1() == 1 && c.getEntityId2() == 12088)
                    //     System.out.println();
                    double[] instanceValues = new double[8];
                    // int entityId2 = comparison.getEntityId2() + entityIndex.getDatasetLimit();
                    double ibf1 = Math.log(noOfBlocks / entityIndex.getNoOfEntityBlocks(c.getEntityId1(), 0));
                    double ibf2 = Math.log(noOfBlocks / entityIndex.getNoOfEntityBlocks(c.getEntityId2(), 1));
                    instanceValues[0] = commonBlockIndices.size() * ibf1 * ibf2;
                    double raccb = 0;
                    for (Integer index1 : commonBlockIndices) {
                        raccb += 1.0 / comparisonsPerBlock[index1];
                    }
                    if (raccb < 1.0E-6) {
                        raccb = 1.0E-6;
                    }
                    instanceValues[1] = raccb;
                    String temp = Integer.toString(entityId) + "00" + Integer.toString(neighborId - datasetLimit);
                    instanceValues[2] = commonBlockIndices.size()
                            / (redundantCPE[c.getEntityId1()] + redundantCPE[c.getEntityId2()]
                                    - commonBlockIndices.size());
                    instanceValues[3] = nonRedundantCPE[c.getEntityId1()];
                    instanceValues[4] = nonRedundantCPE[c.getEntityId2()];
                    // instanceValues[5] = ebc.getSimilarityAttribute(c.getEntityId1(), c.getEntityId2());
                    instanceValues[5] = getWeight(entityId, neighborId, ebc);
                    instanceValues[6] = (Math.sqrt(Math.pow(averageWeight[entityId], 2)
                            + Math.pow(averageWeight[neighborId], 2)) / 4) * getWeight(entityId, neighborId, ebc);
                    instanceValues[7] = adp.isSuperfluous(getComparison(entityId, neighborId)) ? 1 : 0;

                    Instance newInstance = new DenseInstance(1.0, instanceValues);
                    newInstance.setDataset(trainingInstances);
                    trainingInstances.add(newInstance);
                }
            }
        } else {
            Iterator<Integer> it = validEntitiesB.iterator();
            for (int neighborId : validEntities) {
                Integer value = map.get(entityId);
                if (value != null && value == neighborId) {
                    // System.out.println("----");
                    continue;
                }
                value = map.get(neighborId);
                if (value != null && value == entityId) {
                    // System.out.println("----");
                    continue;
                }
                map.put(entityId, neighborId);
                int blockIndex = it.next();
                // if (isValidComparison(entityId, neighborId, ebc)) {
                //     totalComparisons++;
                //     duplicatePropagation.isSuperfluous(getComparison(entityId, neighborId));
                //     // if (apagar++ % 1000 == 0)
                //     //     System.out.println(apagar);
                //     // if (apagar == 3)
                //     //     System.out.println();
                //     Comparison c;
                //     if (entityId < datasetLimit)
                //         c = new Comparison(true, entityId, neighborId - datasetLimit);
                //     else
                //         c = new Comparison(true, entityId - datasetLimit, neighborId);
                //     final List<Integer> commonBlockIndices = entityIndex.getCommonBlockIndices(blockIndex, c);
                //     if (commonBlockIndices == null)
                //         continue;
                //     // if (!retainedEntitiesD1.contains(comparison.getEntityId1()))
                //     //     retainedEntitiesD1.add(comparison.getEntityId1());
                //     // if (!retainedEntitiesD2.contains(comparison.getEntityId2()))
                //     //     retainedEntitiesD2.add(comparison.getEntityId2());
                //     double[] instanceValues = new double[8];
                //     // int entityId2 = comparison.getEntityId2() + entityIndex.getDatasetLimit();
                //     double ibf1 = Math.log(noOfBlocks / entityIndex.getNoOfEntityBlocks(entityId, 0));
                //     double ibf2 = Math.log(noOfBlocks / entityIndex.getNoOfEntityBlocks(neighborId - datasetLimit, 1));
                //     instanceValues[0] = commonBlockIndices.size() * ibf1 * ibf2;
                //     double raccb = 0;
                //     for (Integer index1 : commonBlockIndices) {
                //         raccb += 1.0 / comparisonsPerBlock[index1];
                //     }
                //     if (raccb < 1.0E-6) {
                //         raccb = 1.0E-6;
                //     }
                //     instanceValues[1] = raccb;
                //     instanceValues[2] = commonBlockIndices.size() / (redundantCPE[c.getEntityId1()]
                //             + redundantCPE[neighborId - datasetLimit] - commonBlockIndices.size());
                //     instanceValues[3] = nonRedundantCPE[entityId];
                //     instanceValues[4] = nonRedundantCPE[neighborId - datasetLimit];
                //     instanceValues[5] = (Math.sqrt(Math.pow(averageWeight[entityId], 2)
                //             + Math.pow(averageWeight[neighborId], 2)) / 4) * getWeight(entityId, neighborId, ebc);
                //     // instanceValues[5] = ebc.getSimilarityAttribute(c.getEntityId1(), c.getEntityId2());
                //     instanceValues[5] = getWeight(entityId, neighborId, ebc);
                //     instanceValues[6] = (Math.sqrt(Math.pow(averageWeight[entityId], 2)
                //             + Math.pow(averageWeight[neighborId], 2)) / 4) * getWeight(entityId, neighborId, ebc);
                //     instanceValues[7] = adp.isSuperfluous(getComparison(entityId, neighborId)) ? 1 : 0;
                //     Instance newInstance = new DenseInstance(1.0, instanceValues);
                //     newInstance.setDataset(trainingInstances);
                //     trainingInstances.add(newInstance);
                //     // return true;
                // }
            }
        }
    }
    return false;
}
From source file:org.conqat.engine.commons.machine_learning.DataSetCreator.java
License:Apache License
/**
 * Creates a weka instance for the given classification object and the given
 * label and adds it to the given data set.
 */
private Instance createInstance(T classificationObject, LABEL label, Instances dataSet) {
    Instance instance = instanceCreator.createWekaInstance(classificationObject, label);
    dataSet.add(instance);
    instance.setDataset(dataSet);
    return instance;
}
From source file:org.goai.classification.impl.WekaClassifier.java
License:Apache License
/**
 * Classify an item as one of the known classes.
 * @param item double[]
 * @return String label of the class
 */
@Override
public String classify(double[] item) {
    try {
        // build an Instance out of the double array
        Instance testInstance = new DenseInstance(1, item);
        // attach the data set to the instance
        testInstance.setDataset(wekaDataSet);
        // classifyInstance returns a double
        Double classDoubleValue = wekaClassifier.classifyInstance(testInstance);
        return classValsDoubleAsKey.get(classDoubleValue);
    } catch (Exception ex) {
        Logger.getLogger(WekaClassifier.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}
From source file:org.goai.classification.impl.WekaClassifier.java
License:Apache License
/**
 * Calculate the probability of each class the item can be classified as.
 * @param item double[]
 * @return Map<String, Double>
 */
@Override
public Map<String, Double> classDistribution(double[] item) {
    try {
        // build an Instance out of the double array
        Instance testInstance = new DenseInstance(1, item);
        // attach the data set to the instance
        testInstance.setDataset(wekaDataSet);
        // map from class value to its predicted probability
        Map<String, Double> map = new HashMap<String, Double>();
        // calculate the predicted probabilities
        double[] predict = wekaClassifier.distributionForInstance(testInstance);
        // the number of class values and predicted values should be the same
        if (classValues.size() != predict.length) {
            throw new RuntimeException(
                    "Class values Set should be same size as double array with predict values");
        }
        // fill the map with class values and their predicted probabilities
        int i = 0;
        for (String val : classValues) {
            map.put(val, predict[i]);
            i++;
        }
        return map;
    } catch (Exception ex) {
        Logger.getLogger(WekaClassifier.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}
From source file:org.goai.classification.impl.WekaClassifier.java
License:Apache License
/**
 * Converts a map to a weka data set.
 * @param itemClassMap Map<double[], String>
 * @return Instances Weka data set
 */
public Instances convertItemClassMapToInstances(Map<double[], String> itemClassMap) {
    if (itemClassMap.isEmpty()) {
        throw new RuntimeException("Map should have at least one element!");
    }
    // use the first row as an example for mapping attributes from the sample
    Map.Entry<double[], String> row = itemClassMap.entrySet().iterator().next();
    // number of attributes without the class attribute
    int numOfAttr = row.getKey().length;
    // possible class values
    fillClassValues(itemClassMap);
    // sample size
    int capacity = itemClassMap.entrySet().size();
    // create an empty Instances data set
    Instances newDataSet = createEmptyInstancesDataSet(numOfAttr, capacity);
    // set the class attribute index
    newDataSet.setClassIndex(numOfAttr);
    // iterate through the sample rows
    for (Map.Entry<double[], String> entry : itemClassMap.entrySet()) {
        // double array of values, with the class given as a String
        double[] el = entry.getKey();
        String klasa = entry.getValue();
        // row of values including the class attribute value
        double[] rowValues = new double[numOfAttr + 1];
        // copy the values of the regular attributes
        for (int i = 0; i < numOfAttr; i++) {
            rowValues[i] = el[i];
        }
        // double value of the class attribute
        rowValues[numOfAttr] = classVals.get(klasa);
        // dataRow as a DenseInstance with weight 1 and the values of all attributes
        Instance dataRow = new DenseInstance(1, rowValues);
        dataRow.setDataset(newDataSet);
        newDataSet.add(dataRow);
    }
    return newDataSet;
}
From source file:org.hypknowsys.wumprep.WUMprepWrapper.java
License:Open Source License
/**
 * Creates a dummy dataset from the input format, sends it to the script and
 * reads the script output's ARFF information that in turn is used to set
 * <code>this</code>' output format.
 *
 * This mechanism allows a WUMprep script to alter the recordset layout as
 * long as this change is documented by the output ARFF header. For example,
 * the <tt>dnsLookup.pl</tt> script changes the <code>host_ip</code> field
 * to <code>host_dns</code> when performing IP lookups.
 *
 * @param instanceInfo
 *          The input format.
 * @return Object containing the output instance structure.
 */
public Instances getScriptOutputFormat(Instances instanceInfo) {
    Instances outputFormat = instanceInfo;
    Instances testData = new Instances(instanceInfo);
    Instance testInstance = new Instance(testData.numAttributes());

    testData.delete();
    testInstance.setDataset(testData);

    // Initialize the testInstance's attribute values
    for (int i = 0; i < testInstance.numAttributes(); i++) {
        String aName = testInstance.attribute(i).name();
        if (aName.equals("host_ip"))
            testInstance.setValue(i, "127.0.0.1");
        else if (aName.equals("ts_day"))
            testInstance.setValue(i, "01");
        else if (aName.equals("ts_month"))
            testInstance.setValue(i, "Jan");
        else if (aName.equals("ts_year"))
            testInstance.setValue(i, "2005");
        else if (aName.equals("ts_hour"))
            testInstance.setValue(i, "11");
        else if (aName.equals("ts_minutes"))
            testInstance.setValue(i, "55");
        else if (aName.equals("ts_seconds"))
            testInstance.setValue(i, "00");
        else if (aName.equals("tz"))
            testInstance.setValue(i, "+0200");
        else
            testInstance.setValue(i, aName + "-dummy");
    }
    testData.add(testInstance);

    WUMprepWrapper testWrapper = new WUMprepWrapper(m_scriptName, m_args);
    testWrapper.start();
    testWrapper.push(testData.toString());
    testWrapper.push((Instance) null);

    class ErrorReader extends Thread implements Serializable {
        /** */
        private static final long serialVersionUID = -488779846603045891L;
        PipedReader m_input = null;

        /**
         * Helper class for reading stderr output from the WUMprep script
         *
         * @param input The script's wrapper's stderr pipe reader
         */
        ErrorReader(PipedReader input) {
            m_input = input;
            this.start();
        }

        public void run() {
            try {
                while (m_input.read() >= 0)
                    ;
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
    }

    // read the stderr output
    new ErrorReader(testWrapper.getErrorPipe());

    try {
        // ignore the stderr output
        outputFormat = new org.hypknowsys.wumprep4weka.core.Instances(testWrapper.getOutputPipe());
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return outputFormat;
}
From source file:org.knime.knip.suise.node.boundarymodel.contourdata.IRI.java
License:Open Source License
private Instances toSingleInstanceDataset(Instances miData, Instances flatData) throws Exception {
    MultiInstanceToPropositional convertToProp = new MultiInstanceToPropositional();
    convertToProp.setInputFormat(miData);
    for (int i = 0; i < miData.numInstances(); i++) {
        convertToProp.input(miData.instance(i));
    }
    convertToProp.batchFinished();

    if (flatData == null) {
        flatData = convertToProp.getOutputFormat();
        flatData.deleteAttributeAt(0); // remove the bag index attribute
    }

    Instance processed;
    while ((processed = convertToProp.output()) != null) {
        processed.setDataset(null);
        processed.deleteAttributeAt(0); // remove the bag index attribute
        flatData.add(processed);
    }

    // remove class attribute
    // flatData.setClassIndex(-1);
    // flatData.deleteAttributeAt(flatData.numAttributes() - 1);

    // set weights
    int instanceIdx = 0;
    for (Instance bag : miData) {
        for (Instance instance : bag.relationalValue(1)) {
            flatData.get(instanceIdx).setWeight(instance.weight());
            instanceIdx++;
        }
    }
    return flatData;
}
From source file:org.knime.knip.suise.node.boundarymodel.contourdata.WekaMIContourDataClassifier.java
License:Open Source License
/**
 * {@inheritDoc}
 */
@Override
public void buildClassifier(ContourDataGrid cData, VectorDataList bgData) throws Exception {
    // transform input data to weka mi-instances
    m_data = initDataset(cData.numFeatures(), 2, cData.totalLength() + bgData.numVectors(), cData.width());
    for (int r = 0; r < cData.totalLength(); r++) {
        Instances bagData = new Instances(m_data.attribute(1).relation(), cData.width());
        for (int c = 0; c < cData.width(); c++) {
            int vecIdx = cData.getVectorIdx(c, r);
            Instance inst = new DenseInstance(cData.weight(vecIdx), cData.getVector(vecIdx));
            inst.setDataset(bagData);
            bagData.add(inst);
        }
        int value = m_data.attribute(1).addRelation(bagData);
        Instance newBag = new DenseInstance(3);
        newBag.setValue(0, r); // bag id
        newBag.setValue(2, 1); // class attribute
        newBag.setValue(1, value);
        newBag.setWeight(1);
        newBag.setDataset(m_data);
        m_data.add(newBag);
    }
    for (int i = 0; i < bgData.numVectors(); i++) {
        Instances bagData = new Instances(m_data.attribute(1).relation(), cData.width());
        Instance inst = new DenseInstance(bgData.weight(i), bgData.getVector(i));
        inst.setDataset(bagData);
        bagData.add(inst);
        int value = m_data.attribute(1).addRelation(bagData);
        Instance newBag = new DenseInstance(3);
        newBag.setValue(0, cData.totalLength() + i);
        newBag.setValue(2, 0);
        newBag.setValue(1, value);
        newBag.setWeight(1);
        newBag.setDataset(m_data);
        m_data.add(newBag);
    }
    m_classifier.buildClassifier(m_data);
}