List of usage examples for weka.core.Instances.add
@Override public boolean add(Instance instance)
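Before the examples from real projects, here is a minimal, self-contained sketch of the call using the Weka 3.7+ API. The attribute names and values are illustrative only, not taken from any source file below. Note that add() appends a shallow copy of the instance, so the caller's object can be reused afterwards.

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class InstancesAddExample {
    public static void main(String[] args) {
        // two numeric attributes plus a nominal class (names are illustrative)
        ArrayList<String> classValues = new ArrayList<String>();
        classValues.add("yes");
        classValues.add("no");
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("width"));
        atts.add(new Attribute("height"));
        atts.add(new Attribute("class", classValues));

        Instances data = new Instances("example", atts, 0);
        data.setClassIndex(data.numAttributes() - 1);

        // add() appends a shallow copy of the instance to the dataset
        double[] vals = { 1.5, 2.0, classValues.indexOf("yes") };
        data.add(new DenseInstance(1.0, vals));
        System.out.println(data);
    }
}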
From source file: cn.ict.zyq.bestConf.COMT2.COMT2.java
License: Open Source License
private void train() throws Exception {
    models = new M5P[ModelNum];
    for (int i = 0; i < ModelNum; i++) {
        models[i] = buildModel(labeledInstances, M[i]);
    }
    for (int i = 0; i < this.comtIterations; i++) {
        ArrayList<Instance>[] InstancePiSet = new ArrayList[ModelNum];
        for (int j = 0; j < ModelNum; j++)
            InstancePiSet[j] = new ArrayList<Instance>();
        for (int m = 0; m < ModelNum; m++) {
            double maxDelta = 0;
            Instance maxDeltaXY = null;
            Enumeration<Instance> enu = this.unlabeledInstances.enumerateInstances();
            while (enu.hasMoreElements()) {
                Instance ulIns = enu.nextElement();
                Instances omega = getSiblings(models[m], ulIns);
                double y = models[m].classifyInstance(ulIns);
                if (indexOfClass == -1)
                    indexOfClass = labeledInstances.classIndex();
                ulIns.setValue(indexOfClass, y);
                Instances instancesPi = new Instances(models[m].getM5RootNode().zyqGetTrainingSet());
                instancesPi.add(ulIns);
                M5P modelPi = buildModel(instancesPi, M[m]);
                double delta = computeOmegaDelta(models[m], modelPi, omega);
                if (maxDelta < delta) {
                    maxDelta = delta;
                    maxDeltaXY = ulIns;
                }
            }
            // now check facts about delta
            if (maxDelta > 0) {
                InstancePiSet[m].add(maxDeltaXY);
                this.unlabeledInstances.delete(this.unlabeledInstances.indexOf(maxDeltaXY));
            }
        }
        // check for both models
        boolean toExit = true;
        for (int m = 0; m < ModelNum; m++) {
            if (InstancePiSet[m].size() > 0) {
                toExit = false;
                break;
            }
        }
        if (toExit)
            break;
        else {
            // update the models
            int toGen = 0;
            for (int m = 0; m < ModelNum; m++) {
                Instances set = models[m].getM5RootNode().zyqGetTrainingSet();
                toGen += InstancePiSet[m].size();
                for (Instance ins : InstancePiSet[m])
                    set.add(ins);
                models[m] = buildModel(set, M[m]);
            }
            // replenish pool U' to size p
            Instances toAdd = retrieveMore(toGen);
            unlabeledInstances.addAll(toAdd);
        }
        // we will go to another round of iteration
    } // iterate for a number of rounds or break out on empty InstancePiSet
    // now we have the model as y = 0.5 * sum(models[m].predict(x))
}
From source file: cn.ict.zyq.bestConf.COMT2.COMT2.java
License: Open Source License
private Instances retrieveMore(int toGen) {
    Instances retval = new Instances(this.unlabeldPool, toGen);
    for (int i = 0; i < toGen; i++) {
        retval.add(this.unlabeldPool.remove(rand.nextInt(this.unlabeldPool.size())));
    }
    return retval;
}
From source file: cn.ict.zyq.bestConf.util.LHSInitializer.java
License: Open Source License
/**
 * Assumptions: (1) numeric attributes are continuous and have lower/upper bounds;
 * (2) nominal attributes have permutable domains.
 *
 * @param useMid true to use the middle point of a subdomain, false to use a random point within a subdomain
 */
public static Instances getMultiDim(ArrayList<Attribute> atts, int sampleSetSize, boolean useMid) {
    int L = Math.min(7, Math.max(sampleSetSize, atts.size())); // 7 is chosen for no special reason
    double maxMinDist = 0, crntMinDist; // works as the threshold to select the sample set
    ArrayList<Integer>[] setWithMaxMinDist = null;
    // generate L sets of sampleSetSize points
    for (int i = 0; i < L; i++) {
        ArrayList<Integer>[] setPerm = generateOneSampleSet(sampleSetSize, atts.size());
        // compute the minimum distance minDist between any sample pair for each set
        crntMinDist = minDistForSet(setPerm);
        // select the set with the maximum minDist
        if (crntMinDist > maxMinDist) {
            setWithMaxMinDist = setPerm;
            maxMinDist = crntMinDist;
        }
    }
    // generate and output the set with the maximum minDist as the result
    // first, divide the domain of each attribute into sampleSetSize equal subdomains
    double[][] bounds = new double[atts.size()][sampleSetSize + 1]; // sampleSetSize+1 to include the lower and upper bounds
    Iterator<Attribute> itr = atts.iterator();
    Attribute crntAttr;
    double pace;
    for (int i = 0; i < bounds.length; i++) {
        crntAttr = itr.next();
        if (crntAttr.isNumeric()) {
            bounds[i][0] = crntAttr.getLowerNumericBound();
            bounds[i][sampleSetSize] = crntAttr.getUpperNumericBound();
            pace = (crntAttr.getUpperNumericBound() - crntAttr.getLowerNumericBound()) / sampleSetSize;
            for (int j = 1; j < sampleSetSize; j++) {
                bounds[i][j] = bounds[i][j - 1] + pace;
            }
        } else { // crntAttr.isNominal()
            if (crntAttr.numValues() >= sampleSetSize) {
                // randomly select among the set
                for (int j = 0; j <= sampleSetSize; j++)
                    bounds[i][j] = uniRand.nextInt(crntAttr.numValues()); // the position of one of the nominal values
            } else {
                // first round-robin
                int lastPart = sampleSetSize % crntAttr.numValues();
                for (int j = 0; j < sampleSetSize - lastPart; j++)
                    bounds[i][j] = j % crntAttr.numValues();
                // then randomly select
                for (int j = sampleSetSize - lastPart; j <= sampleSetSize; j++)
                    bounds[i][j] = uniRand.nextInt(crntAttr.numValues());
            }
        } // nominal attribute
    } // get all subdomains
    // second, generate the set according to setWithMaxMinDist
    Instances data = new Instances("InitialSetByLHS", atts, sampleSetSize);
    for (int i = 0; i < sampleSetSize; i++) {
        double[] vals = new double[atts.size()];
        for (int j = 0; j < vals.length; j++) {
            if (atts.get(j).isNumeric()) {
                vals[j] = useMid
                        ? (bounds[j][setWithMaxMinDist[j].get(i)] + bounds[j][setWithMaxMinDist[j].get(i) + 1]) / 2
                        : bounds[j][setWithMaxMinDist[j].get(i)]
                                + ((bounds[j][setWithMaxMinDist[j].get(i) + 1] - bounds[j][setWithMaxMinDist[j].get(i)])
                                        * uniRand.nextDouble());
            } else { // isNominal()
                vals[j] = bounds[j][setWithMaxMinDist[j].get(i)];
            }
        }
        data.add(new DenseInstance(1.0, vals));
    }
    // third, return the generated points
    return data;
}
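The numeric branch relies on getLowerNumericBound()/getUpperNumericBound(), which Weka fills in from attribute metadata. Below is a hedged sketch of a possible caller — the attribute names and ranges are invented, and it assumes the bounds are supplied through a ProtectedProperties "range" entry, which Weka's Attribute parses into numeric bounds.

// Hypothetical caller: two bounded numeric attributes, 10 LHS samples at bin midpoints.
java.util.Properties propsMem = new java.util.Properties();
propsMem.setProperty("range", "[64,4096]");
Attribute memoryMB = new Attribute("memoryMB", new weka.core.ProtectedProperties(propsMem));

java.util.Properties propsThr = new java.util.Properties();
propsThr.setProperty("range", "[1,64]");
Attribute threads = new Attribute("threads", new weka.core.ProtectedProperties(propsThr));

ArrayList<Attribute> atts = new ArrayList<Attribute>();
atts.add(memoryMB);
atts.add(threads);
Instances samples = LHSInitializer.getMultiDim(atts, 10, true);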
From source file: cn.ict.zyq.bestConf.util.LHSInitializer.java
License: Open Source License
/**
 * In the current version, we assume all attributes are numeric and have bounds.
 *
 * Let PACE be log10(upper/lower).
 *
 * @param useMid true to use the middle point of a subdomain, false to use a random point within a subdomain
 */
public static Instances getMultiDimContinuousLog(ArrayList<Attribute> atts, int sampleSetSize, boolean useMid) {
    int L = Math.min(7, Math.max(sampleSetSize, atts.size())); // 7 is chosen for no special reason
    double maxMinDist = 0, crntMinDist; // works as the threshold to select the sample set
    ArrayList<Integer>[] setWithMaxMinDist = null;
    // generate L sets of sampleSetSize points
    for (int i = 0; i < L; i++) {
        ArrayList<Integer>[] setPerm = generateOneSampleSet(sampleSetSize, atts.size());
        // compute the minimum distance minDist between any sample pair for each set
        crntMinDist = minDistForSet(setPerm);
        // select the set with the maximum minDist
        if (crntMinDist > maxMinDist) {
            setWithMaxMinDist = setPerm;
            maxMinDist = crntMinDist;
        }
    }
    // generate and output the set with the maximum minDist as the result
    // first, divide the domain of each attribute into sampleSetSize equal subdomains
    double[][] bounds = new double[atts.size()][sampleSetSize + 1]; // sampleSetSize+1 to include the lower and upper bounds
    Iterator<Attribute> itr = atts.iterator();
    Attribute crntAttr;
    int step, crntStep;
    for (int i = 0; i < bounds.length; i++) {
        crntAttr = itr.next();
        bounds[i][0] = crntAttr.getLowerNumericBound();
        bounds[i][sampleSetSize] = crntAttr.getUpperNumericBound();
        crntStep = (int) Math.log10(bounds[i][sampleSetSize] - bounds[i][0]);
        step = sampleSetSize / crntStep; // number of points drawn after each multiplication by 10
        int left = sampleSetSize % crntStep;
        if (bounds[i][0] == 0)
            bounds[i][0] = uniRand.nextInt(10);
        crntStep = 1;
        double theBound = bounds[i][sampleSetSize] / 10;
        for (int j = 1; j < sampleSetSize; j++) {
            if (crntStep >= step && bounds[i][j - 1] <= theBound)
                crntStep = 0;
            if (crntStep == 0)
                bounds[i][j] = bounds[i][j - step] * 10;
            else if (crntStep < step)
                bounds[i][j] = bounds[i][j - crntStep] * ((double) crntStep * 10. / ((double) step + 1.));
            else if (crntStep >= step)
                bounds[i][j] = bounds[i][j - crntStep] * ((double) crntStep * 10. / (double) (left + step + 1));
            if (bounds[i][j] >= bounds[i][sampleSetSize])
                System.err.println("be careful!!!!");
            crntStep++;
        }
    }
    // second, generate the set according to setWithMaxMinDist
    Instances data = new Instances("InitialSetByLHS", atts, sampleSetSize);
    for (int i = 0; i < sampleSetSize; i++) {
        double[] vals = new double[atts.size()];
        for (int j = 0; j < vals.length; j++) {
            vals[j] = useMid
                    ? (bounds[j][setWithMaxMinDist[j].get(i)] + bounds[j][setWithMaxMinDist[j].get(i) + 1]) / 2
                    : bounds[j][setWithMaxMinDist[j].get(i)]
                            + ((bounds[j][setWithMaxMinDist[j].get(i) + 1] - bounds[j][setWithMaxMinDist[j].get(i)])
                                    * uniRand.nextDouble());
        }
        data.add(new DenseInstance(1.0, vals));
    }
    // third, return the generated points
    return data;
}
From source file: cn.ict.zyq.bestConf.util.LHSInitializer.java
License: Open Source License
/**
 * In the current version, we assume all attributes are numeric and have bounds.
 *
 * Let PACE be log10(upper/lower).
 *
 * @param useMid true to use the middle point of a subdomain, false to use a random point within a subdomain
 */
public static Instances getMultiDimContinuous(ArrayList<Attribute> atts, int sampleSetSize, boolean useMid) {
    int L = Math.min(7, Math.max(sampleSetSize, atts.size())); // 7 is chosen for no special reason
    double maxMinDist = 0, crntMinDist; // works as the threshold to select the sample set
    ArrayList<Integer>[] setWithMaxMinDist = null;
    // generate L sets of sampleSetSize points
    for (int i = 0; i < L; i++) {
        ArrayList<Integer>[] setPerm = generateOneSampleSet(sampleSetSize, atts.size());
        // compute the minimum distance minDist between any sample pair for each set
        crntMinDist = minDistForSet(setPerm);
        // select the set with the maximum minDist
        if (crntMinDist > maxMinDist) {
            setWithMaxMinDist = setPerm;
            maxMinDist = crntMinDist;
        }
    }
    // generate and output the set with the maximum minDist as the result
    // first, divide the domain of each attribute into sampleSetSize equal subdomains
    double[][] bounds = new double[atts.size()][sampleSetSize + 1]; // sampleSetSize+1 to include the lower and upper bounds
    Iterator<Attribute> itr = atts.iterator();
    Attribute crntAttr;
    boolean[] roundToInt = new boolean[atts.size()];
    for (int i = 0; i < bounds.length; i++) {
        crntAttr = itr.next();
        uniBoundsGeneration(bounds[i], crntAttr, sampleSetSize);
        //flexibleBoundsGeneration(bounds[i], crntAttr, sampleSetSize);
        if (bounds[i][sampleSetSize] - bounds[i][0] > sampleSetSize)
            roundToInt[i] = true;
    }
    // second, generate the set according to setWithMaxMinDist
    Instances data = new Instances("InitialSetByLHS", atts, sampleSetSize);
    for (int i = 0; i < sampleSetSize; i++) {
        double[] vals = new double[atts.size()];
        for (int j = 0; j < vals.length; j++) {
            vals[j] = useMid
                    ? (bounds[j][setWithMaxMinDist[j].get(i)] + bounds[j][setWithMaxMinDist[j].get(i) + 1]) / 2
                    : bounds[j][setWithMaxMinDist[j].get(i)]
                            + ((bounds[j][setWithMaxMinDist[j].get(i) + 1] - bounds[j][setWithMaxMinDist[j].get(i)])
                                    * uniRand.nextDouble());
            if (roundToInt[j])
                vals[j] = (int) vals[j];
        }
        data.add(new DenseInstance(1.0, vals));
    }
    // third, return the generated points
    return data;
}
From source file: cn.ict.zyq.bestConf.util.LHSInitializer.java
License: Open Source License
/**
 * In the current version, we assume all attributes are numeric and have bounds.
 *
 * Let PACE be (upper - lower) divided by sampleSetSize.
 *
 * @param useMid true to use the middle point of a subdomain, false to use a random point within a subdomain
 */
public static Instances getMultiDimContinuousDiv(ArrayList<Attribute> atts, int sampleSetSize, boolean useMid) {
    int L = Math.min(7, Math.max(sampleSetSize, atts.size())); // 7 is chosen for no special reason
    double maxMinDist = 0, crntMinDist; // works as the threshold to select the sample set
    ArrayList<Integer>[] setWithMaxMinDist = null;
    // generate L sets of sampleSetSize points
    for (int i = 0; i < L; i++) {
        ArrayList<Integer>[] setPerm = generateOneSampleSet(sampleSetSize, atts.size());
        // compute the minimum distance minDist between any sample pair for each set
        crntMinDist = minDistForSet(setPerm);
        // select the set with the maximum minDist
        if (crntMinDist > maxMinDist) {
            setWithMaxMinDist = setPerm;
            maxMinDist = crntMinDist;
        }
    }
    // generate and output the set with the maximum minDist as the result
    // first, divide the domain of each attribute into sampleSetSize equal subdomains
    double[][] bounds = new double[atts.size()][sampleSetSize + 1]; // sampleSetSize+1 to include the lower and upper bounds
    Iterator<Attribute> itr = atts.iterator();
    Attribute crntAttr;
    double pace;
    for (int i = 0; i < bounds.length; i++) {
        crntAttr = itr.next();
        bounds[i][0] = crntAttr.getLowerNumericBound();
        bounds[i][sampleSetSize] = crntAttr.getUpperNumericBound();
        pace = (bounds[i][sampleSetSize] - bounds[i][0]) / sampleSetSize;
        for (int j = 1; j < sampleSetSize; j++) {
            bounds[i][j] = bounds[i][j - 1] + pace;
        }
    }
    // second, generate the set according to setWithMaxMinDist
    Instances data = new Instances("InitialSetByLHS", atts, sampleSetSize);
    for (int i = 0; i < sampleSetSize; i++) {
        double[] vals = new double[atts.size()];
        for (int j = 0; j < vals.length; j++) {
            vals[j] = useMid
                    ? (bounds[j][setWithMaxMinDist[j].get(i)] + bounds[j][setWithMaxMinDist[j].get(i) + 1]) / 2
                    : bounds[j][setWithMaxMinDist[j].get(i)]
                            + ((bounds[j][setWithMaxMinDist[j].get(i) + 1] - bounds[j][setWithMaxMinDist[j].get(i)])
                                    * uniRand.nextDouble());
        }
        data.add(new DenseInstance(1.0, vals));
    }
    // third, return the generated points
    return data;
}
From source file: com.actelion.research.orbit.imageAnalysis.imaging.TMAPoints.java
License: Open Source License
/**
 * Returns x/y pairs for each input point.
 *
 * @param pList
 * @return
 */
private HashMap<Point, Point> clusterLines(List<Point> pList) {
    ArrayList<Attribute> attrListX = new ArrayList<Attribute>(2);
    attrListX.add(new Attribute("xvalue"));
    ArrayList<Attribute> attrListY = new ArrayList<Attribute>(2);
    attrListY.add(new Attribute("yvalue"));
    //attrList.add(new Attribute("class"));
    Instances xInst = new Instances("xlines", attrListX, pList.size());
    Instances yInst = new Instances("ylines", attrListY, pList.size());
    //instances.setClassIndex(1);
    for (Point p : pList) {
        //Instance inst = new DenseInstance(1d, new double[]{p.x, Double.NaN});
        Instance instX = new DenseInstance(1d, new double[] { p.x });
        instX.setDataset(xInst);
        //inst.setClassMissing();
        xInst.add(instX);
        Instance instY = new DenseInstance(1d, new double[] { p.y });
        instY.setDataset(yInst);
        yInst.add(instY);
    }
    try {
        EM colClusterer = new EM();
        int numCols = guessNumClusters(colClusterer, xInst, 1, 20);
        colClusterer.setNumClusters(numCols);
        colClusterer.buildClusterer(xInst);
        logger.debug("NumCols: " + colClusterer.getNumClusters());

        EM rowClusterer = new EM();
        int numRows = guessNumClusters(rowClusterer, yInst, 1, 20);
        rowClusterer.setNumClusters(numRows);
        rowClusterer.buildClusterer(yInst);
        logger.debug("NumRows: " + rowClusterer.getNumClusters());

        logger.trace("ColClusterer:");
        HashMap<Integer, Integer> colHash = sortAndpPrintCluster(colClusterer);
        logger.trace("RowClusterer:");
        HashMap<Integer, Integer> rowHash = sortAndpPrintCluster(rowClusterer);
        if (logger.isTraceEnabled()) {
            logger.trace("ColHash:");
            for (Integer i : colHash.keySet()) {
                logger.trace("cluster " + i + ": " + colHash.get(i));
            }
            logger.trace("RowHash:");
            for (Integer i : rowHash.keySet()) {
                logger.trace("cluster " + i + ": " + rowHash.get(i));
            }
        }
        // classify points
        HashMap<Point, Point> pMap = new HashMap<Point, Point>();
        for (Point p : pList) {
            Instance instX = new DenseInstance(1d, new double[] { p.x });
            instX.setDataset(xInst);
            Instance instY = new DenseInstance(1d, new double[] { p.y });
            instY.setDataset(yInst);
            int x = colClusterer.clusterInstance(instX);
            int y = rowClusterer.clusterInstance(instY);
            x = colHash.get(x);
            y = rowHash.get(y);
            logger.trace(p + ": " + x + "/" + y);
            pMap.put(p, new Point(x, y));
        }
        return pMap;
    } catch (Exception e) {
        e.printStackTrace();
        logger.error("error while clustering points", e);
        return null;
    }
}
From source file: com.daniel.convert.IncrementalClassifier.java
License: Open Source License
/**
 * Expects an ARFF file as the first argument (the class attribute is assumed to be
 * the last attribute).
 *
 * @param args the command-line arguments
 * @throws Exception if something goes wrong
 */
public static BayesNet treinar(String[] args) throws Exception {
    // load data
    ArffLoader loader = new ArffLoader();
    loader.setFile(new File(args[0]));
    Instances structure = loader.getStructure();
    structure.setClassIndex(structure.numAttributes() - 1);

    // train a BayesNet, accumulating the instances one at a time
    BayesNet BayesNet = new BayesNet();
    Instance current;
    while ((current = loader.getNextInstance(structure)) != null) {
        structure.add(current);
    }
    BayesNet.buildClassifier(structure);
    // output generated model
    // System.out.println(nb);

    // test-set BayesNet
    BayesNet BayesNetTest = new BayesNet();
    // test the model
    Evaluation eTest = new Evaluation(structure);
    // eTest.evaluateModel(nb, structure);
    eTest.crossValidateModel(BayesNetTest, structure, 15, new Random(1));

    // print the result as in the Weka Explorer
    String strSummary = eTest.toSummaryString();
    System.out.println(strSummary);

    return BayesNet;
}
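Despite the class name, the method above accumulates every row into structure via Instances.add and then trains in one batch. For genuinely incremental training, Weka's UpdateableClassifier implementations (e.g. NaiveBayesUpdateable) can consume instances one at a time without retaining them. A minimal sketch, assuming a similar ARFF input (the file path is a placeholder):

import java.io.File;
import weka.classifiers.bayes.NaiveBayesUpdateable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

// ...
ArffLoader loader = new ArffLoader();
loader.setFile(new File("train.arff")); // placeholder path
Instances structure = loader.getStructure();
structure.setClassIndex(structure.numAttributes() - 1);

NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
nb.buildClassifier(structure); // initialize from the header only
Instance current;
while ((current = loader.getNextInstance(structure)) != null) {
    nb.updateClassifier(current); // no structure.add(current) needed
}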
From source file: com.dhamacher.sentimentanalysis4tweets.preprocessing.TweetFeatureExtractor.java
License: Apache License
/**
 * Method which constructs the ARFF file for Weka with the training data.
 */
public static void constructModel() {
    Instances instdata = null;
    try {
        FastVector atts = new FastVector();
        atts.addElement(new Attribute("content", (FastVector) null));
        FastVector fvClassVal = new FastVector(4);
        fvClassVal.addElement("");
        fvClassVal.addElement("neutral");
        fvClassVal.addElement("negative");
        fvClassVal.addElement("positive");
        Attribute ClassAttribute = new Attribute("Class", fvClassVal);
        atts.addElement(ClassAttribute);
        instdata = new Instances("tweetData", atts, 0);

        CsvReader data = new CsvReader("../classified data/traindata.csv");
        int i = 0;
        while (data.readRecord()) {
            double[] vals = new double[instdata.numAttributes()];
            String class_id = data.get(0);
            switch (Integer.parseInt(class_id)) {
            case 0:
                class_id = "negative";
                break;
            case 2:
                class_id = "neutral";
                break;
            case 4:
                class_id = "positive";
                break;
            }
            String tweet_content = data.get(5);
            Instance iInst = new Instance(2);
            iInst.setValue((Attribute) atts.elementAt(0), tweet_content);
            iInst.setValue((Attribute) atts.elementAt(1), class_id);
            instdata.add(iInst);
            System.out.println("[" + i + "] " + class_id + ":" + tweet_content);
            i++;
        }
        data.close();

        StringToWordVector filter = new StringToWordVector();
        instdata.setClassIndex(instdata.numAttributes() - 1);
        filter.setInputFormat(instdata);
        Instances newdata = Filter.useFilter(instdata, filter);

        ArffSaver saver = new ArffSaver();
        saver.setInstances(newdata);
        saver.setFile(new File("./data/train2data.arff"));
        saver.writeBatch();
    } catch (Exception ex) {
        Logger.getLogger(TweetFeatureExtractor.class.getName()).log(Level.SEVERE, null, ex);
    }
}
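The snippet above targets the pre-3.7 Weka API (FastVector, the concrete Instance class). In Weka 3.7+, Instance became an interface and FastVector was deprecated, so an equivalent construction would use ArrayList and DenseInstance. A hedged sketch with the same attribute layout (the tweet text here is a placeholder):

// Weka 3.7+ equivalent of the header construction and one add() call above
ArrayList<String> classValues = new ArrayList<String>();
classValues.add("");
classValues.add("neutral");
classValues.add("negative");
classValues.add("positive");
ArrayList<Attribute> atts = new ArrayList<Attribute>();
atts.add(new Attribute("content", (ArrayList<String>) null)); // null value list => string attribute
atts.add(new Attribute("Class", classValues));
Instances instdata = new Instances("tweetData", atts, 0);
instdata.setClassIndex(instdata.numAttributes() - 1);

double[] vals = new double[instdata.numAttributes()];
// use the dataset's own attribute object, in case the constructor copied the header
vals[0] = instdata.attribute(0).addStringValue("placeholder tweet text");
vals[1] = classValues.indexOf("positive");
instdata.add(new DenseInstance(1.0, vals));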
From source file: com.entopix.maui.main.MauiModelBuilder.java
License: Open Source License
/**
 * Builds the model from the training data.
 * @throws MauiFilterException
 */
public MauiFilter buildModel(List<MauiDocument> documents) throws MauiFilterException {
    log.info("-- Building the model... ");
    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("document", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    mauiFilter = new MauiFilter();
    mauiFilter.setMaxPhraseLength(maxPhraseLength);
    mauiFilter.setMinPhraseLength(minPhraseLength);
    mauiFilter.setMinNumOccur(minNumOccur);
    mauiFilter.setStemmer(stemmer);
    mauiFilter.setDocumentLanguage(documentLanguage);
    mauiFilter.setVocabularyName(vocabularyName);
    mauiFilter.setVocabularyFormat(vocabularyFormat);
    mauiFilter.setStopwords(stopwords);
    mauiFilter.setVocabulary(vocabulary);
    if (classifier != null) {
        mauiFilter.setClassifier(classifier);
    }
    mauiFilter.setInputFormat(data);

    // set the feature configuration
    mauiFilter.setBasicFeatures(useBasicFeatures);
    mauiFilter.setKeyphrasenessFeature(useKeyphrasenessFeature);
    mauiFilter.setFrequencyFeatures(useFrequencyFeatures);
    mauiFilter.setPositionsFeatures(usePositionsFeatures);
    mauiFilter.setLengthFeature(useLengthFeature);
    mauiFilter.setThesaurusFeatures(useThesaurusFeatures);
    mauiFilter.setWikipediaFeatures(useWikipediaFeatures, wikiFeatures);
    mauiFilter.setClassifier(classifier);

    if (!vocabularyName.equals("none")) {
        loadVocabulary();
        mauiFilter.setVocabulary(vocabulary);
    }

    log.info("-- Adding documents as instances... ");
    for (MauiDocument document : documents) {
        double[] newInst = new double[3];
        newInst[0] = data.attribute(0).addStringValue(document.getFileName());
        // add the text and the topics of the document to the instance
        if (document.getTextContent().length() > 0) {
            newInst[1] = data.attribute(1).addStringValue(document.getTextContent());
        } else {
            newInst[1] = Instance.missingValue();
        }
        if (document.getTopicsString().length() > 0) {
            newInst[2] = data.attribute(2).addStringValue(document.getTopicsString());
        } else {
            newInst[2] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        // push the single instance through the filter, then reset to a
        // header-only copy so string values don't accumulate in memory
        mauiFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    log.info("-- Building the model... ");
    mauiFilter.batchFinished();

    // drain the filter's output queue
    while ((mauiFilter.output()) != null) {
    }
    return mauiFilter;
}