List of usage examples for weka.core.Instances.classIndex()
public int classIndex()
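Before the per-project examples, here is a minimal, self-contained sketch of the method's contract (the dataset path is a placeholder): classIndex() returns -1 until a class attribute has been assigned, and the 0-based index of that attribute afterwards. Weka's command-line options such as -c expect the 1-based position, which is why several examples below pass classIndex() + 1.

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class ClassIndexDemo {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("iris.arff"); // placeholder dataset path
        System.out.println(data.classIndex());         // -1: no class attribute set yet
        data.setClassIndex(data.numAttributes() - 1);  // common convention: last attribute is the class
        System.out.println(data.classIndex());         // 0-based index of the class attribute
        // command-line tools expect the 1-based position:
        String cOption = Integer.toString(data.classIndex() + 1);
    }
}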
From source file: org.opentox.jaqpot3.qsar.trainer.SvmRegression.java
License: Open Source License
@Override
public Model train(Instances data) throws JaqpotException {
    try {
        Attribute target = data.attribute(targetUri.toString());
        if (target == null) {
            throw new QSARException("The prediction feature you provided was not found in the dataset");
        } else {
            if (!target.isNumeric()) {
                throw new QSARException("The prediction feature you provided is not numeric.");
            }
        }
        data.setClass(target);
        //data.deleteAttributeAt(0); // remove the first attribute, i.e. 'compound_uri' or 'URI'
        /* Very important: place the target feature at the end! (target = last) */
        int numAttributes = data.numAttributes();
        int classIndex = data.classIndex();
        Instances orderedTrainingSet = null;
        List<String> properOrder = new ArrayList<String>(numAttributes);
        for (int j = 0; j < numAttributes; j++) {
            if (j != classIndex) {
                properOrder.add(data.attribute(j).name());
            }
        }
        properOrder.add(data.attribute(classIndex).name());
        try {
            orderedTrainingSet = InstancesUtil.sortByFeatureAttrList(properOrder, data, -1);
        } catch (JaqpotException ex) {
            logger.error(null, ex);
        }
        orderedTrainingSet.setClass(orderedTrainingSet.attribute(targetUri.toString()));
        getTask().getMeta()
                .addComment("Dataset successfully retrieved and converted into a weka.core.Instances object");
        UpdateTask firstTaskUpdater = new UpdateTask(getTask());
        firstTaskUpdater.setUpdateMeta(true);
        firstTaskUpdater.setUpdateTaskStatus(true); // TODO: Is this necessary?
        try {
            firstTaskUpdater.update();
        } catch (DbException ex) {
            throw new JaqpotException(ex);
        } finally {
            try {
                firstTaskUpdater.close();
            } catch (DbException ex) {
                throw new JaqpotException(ex);
            }
        }
        Model m = new Model(Configuration.getBaseUri().augment("model", getUuid().toString()));

        // INITIALIZE THE REGRESSOR
        SVMreg regressor = new SVMreg();
        final String[] regressorOptions = { "-P", Double.toString(epsilon), "-T", Double.toString(tolerance) };
        Kernel svm_kernel = null;
        if (kernel.equalsIgnoreCase("rbf")) {
            RBFKernel rbf_kernel = new RBFKernel();
            rbf_kernel.setGamma(gamma);
            rbf_kernel.setCacheSize(cacheSize);
            svm_kernel = rbf_kernel;
        } else if (kernel.equalsIgnoreCase("polynomial")) {
            PolyKernel poly_kernel = new PolyKernel();
            poly_kernel.setExponent(degree);
            poly_kernel.setCacheSize(cacheSize);
            poly_kernel.setUseLowerOrder(true);
            svm_kernel = poly_kernel;
        } else if (kernel.equalsIgnoreCase("linear")) {
            PolyKernel poly_kernel = new PolyKernel();
            poly_kernel.setExponent(1.0);
            poly_kernel.setCacheSize(cacheSize);
            poly_kernel.setUseLowerOrder(true);
            svm_kernel = poly_kernel;
        }
        try {
            regressor.setOptions(regressorOptions);
        } catch (final Exception ex) {
            throw new QSARException("Bad options in SVM trainer for epsilon = {" + epsilon + "} or "
                    + "tolerance = {" + tolerance + "}.", ex);
        }
        regressor.setKernel(svm_kernel);

        // START TRAINING & CREATE MODEL
        try {
            regressor.buildClassifier(orderedTrainingSet);
            // evaluate the regressor and keep some summary statistics
            Evaluation eval = new Evaluation(orderedTrainingSet);
            eval.evaluateModel(regressor, orderedTrainingSet);
            String stats = eval.toSummaryString("", false);
            ActualModel am = new ActualModel(regressor);
            am.setStatistics(stats);
            m.setActualModel(am);
        } catch (NotSerializableException ex) {
            String message = "Model is not serializable";
            logger.error(message, ex);
            throw new JaqpotException(message, ex);
        } catch (final Exception ex) {
            throw new QSARException("Unexpected condition while trying to train "
                    + "the model. Possible explanation : {" + ex.getMessage() + "}", ex);
        }
        m.setAlgorithm(getAlgorithm());
        m.setCreatedBy(getTask().getCreatedBy());
        m.setDataset(datasetUri);
        try {
            dependentFeature.loadFromRemote();
        } catch (ServiceInvocationException ex) {
            java.util.logging.Logger.getLogger(SvmRegression.class.getName()).log(Level.SEVERE, null, ex);
        }
        m.addDependentFeatures(dependentFeature);
        m.setIndependentFeatures(independentFeatures);
        Feature predictedFeature = publishFeature(m, dependentFeature.getUnits(),
                "Feature created as prediction feature for SVM model " + m.getUri(), datasetUri, featureService);
        m.addPredictedFeatures(predictedFeature);
        String predictionFeatureUri = predictedFeature.getUri().toString();
        getTask().getMeta().addComment("Prediction feature " + predictionFeatureUri + " was created.");

        /* SET PARAMETERS FOR THE TRAINED MODEL */
        m.setParameters(new HashSet<Parameter>());
        Parameter<String> kernelParam = new Parameter("kernel", new LiteralValue<String>(kernel))
                .setScope(Parameter.ParameterScope.OPTIONAL);
        kernelParam.setUri(Services.anonymous().augment("parameter", RANDOM.nextLong()));
        Parameter<Double> costParam = new Parameter("cost", new LiteralValue<Double>(cost))
                .setScope(Parameter.ParameterScope.OPTIONAL);
        costParam.setUri(Services.anonymous().augment("parameter", RANDOM.nextLong()));
        Parameter<Double> gammaParam = new Parameter("gamma", new LiteralValue<Double>(gamma))
                .setScope(Parameter.ParameterScope.OPTIONAL);
        gammaParam.setUri(Services.anonymous().augment("parameter", RANDOM.nextLong()));
        Parameter<Double> epsilonParam = new Parameter("epsilon", new LiteralValue<Double>(epsilon))
                .setScope(Parameter.ParameterScope.OPTIONAL);
        epsilonParam.setUri(Services.anonymous().augment("parameter", RANDOM.nextLong()));
        Parameter<Integer> degreeParam = new Parameter("degree", new LiteralValue<Integer>(degree))
                .setScope(Parameter.ParameterScope.OPTIONAL);
        degreeParam.setUri(Services.anonymous().augment("parameter", RANDOM.nextLong()));
        Parameter<Double> toleranceParam = new Parameter("tolerance", new LiteralValue<Double>(tolerance))
                .setScope(Parameter.ParameterScope.OPTIONAL);
        toleranceParam.setUri(Services.anonymous().augment("parameter", RANDOM.nextLong()));
        m.getParameters().add(kernelParam);
        m.getParameters().add(costParam);
        m.getParameters().add(gammaParam);
        m.getParameters().add(epsilonParam);
        m.getParameters().add(degreeParam);
        m.getParameters().add(toleranceParam);

        // save the instances being predicted to the abstract trainer for calculating the DoA
        predictedInstances = orderedTrainingSet;
        excludeAttributesDoA.add(dependentFeature.getUri().toString());
        return m;
    } catch (QSARException ex) {
        logger.debug(null, ex);
        throw new JaqpotException(ex);
    }
}
From source file: org.opentox.qsar.processors.trainers.classification.NaiveBayesTrainer.java
License: Open Source License
public QSARModel train(Instances data) throws QSARException {
    // GET A UUID AND DEFINE THE TEMPORARY FILE WHERE THE TRAINING DATA
    // ARE STORED IN ARFF FORMAT PRIOR TO TRAINING.
    final String rand = java.util.UUID.randomUUID().toString();
    final String temporaryFilePath = ServerFolders.temp + "/" + rand + ".arff";
    final File tempFile = new File(temporaryFilePath);

    // SAVE THE DATA IN THE TEMPORARY FILE
    try {
        ArffSaver dataSaver = new ArffSaver();
        dataSaver.setInstances(data);
        dataSaver.setDestination(new FileOutputStream(tempFile));
        dataSaver.writeBatch();
        if (!tempFile.exists()) {
            throw new IOException("Temporary File was not created");
        }
    } catch (final IOException ex) {
        /*
         * The content of the dataset cannot be written to the
         * destination file due to some communication issue.
         */
        tempFile.delete();
        throw new RuntimeException("Unexpected condition while trying to save the "
                + "dataset in a temporary ARFF file", ex);
    }

    NaiveBayes classifier = new NaiveBayes();
    String[] generalOptions = { "-c", Integer.toString(data.classIndex() + 1), "-t", temporaryFilePath,
            // Save the model in the following directory
            "-d", ServerFolders.models_weka + "/" + uuid };
    try {
        Evaluation.evaluateModel(classifier, generalOptions);
    } catch (final Exception ex) {
        tempFile.delete();
        throw new QSARException(Cause.XQReg350, "Unexpected condition while trying to train "
                + "a Naive Bayes model. Possible explanation : {" + ex.getMessage() + "}", ex);
    }

    QSARModel model = new QSARModel();
    model.setParams(getParameters());
    model.setCode(uuid.toString());
    model.setAlgorithm(YaqpAlgorithms.NAIVE_BAYES);
    model.setDataset(datasetUri);
    model.setModelStatus(ModelStatus.UNDER_DEVELOPMENT);

    ArrayList<Feature> independentFeatures = new ArrayList<Feature>();
    for (int i = 0; i < data.numAttributes(); i++) {
        Feature f = new Feature(data.attribute(i).name());
        if (data.classIndex() != i) {
            independentFeatures.add(f);
        }
    }
    Feature dependentFeature = new Feature(data.classAttribute().name());
    Feature predictedFeature = dependentFeature;
    model.setDependentFeature(dependentFeature);
    model.setIndependentFeatures(independentFeatures);
    model.setPredictionFeature(predictedFeature);
    tempFile.delete();
    return model;
}
From source file: org.opentox.qsar.processors.trainers.classification.SVCTrainer.java
License: Open Source License
public QSARModel train(Instances data) throws QSARException {
    // GET A UUID AND DEFINE THE TEMPORARY FILE WHERE THE TRAINING DATA
    // ARE STORED IN ARFF FORMAT PRIOR TO TRAINING.
    final String rand = java.util.UUID.randomUUID().toString();
    final String temporaryFilePath = ServerFolders.temp + "/" + rand + ".arff";
    final File tempFile = new File(temporaryFilePath);

    // SAVE THE DATA IN THE TEMPORARY FILE
    try {
        ArffSaver dataSaver = new ArffSaver();
        dataSaver.setInstances(data);
        dataSaver.setDestination(new FileOutputStream(tempFile));
        dataSaver.writeBatch();
        if (!tempFile.exists()) {
            throw new IOException("Temporary File was not created");
        }
    } catch (final IOException ex) {
        /*
         * The content of the dataset cannot be written to the
         * destination file due to some communication issue.
         */
        tempFile.delete();
        throw new RuntimeException("Unexpected condition while trying to save the "
                + "dataset in a temporary ARFF file", ex);
    }

    // INITIALIZE THE CLASSIFIER
    SMO classifier = new SMO();
    classifier.setEpsilon(0.1);
    classifier.setToleranceParameter(tolerance);

    // CONSTRUCT A KERNEL ACCORDING TO THE POSTED PARAMETERS
    // SUPPORTED KERNELS ARE {rbf, linear, polynomial}
    Kernel svc_kernel = null;
    if (this.kernel.equalsIgnoreCase("rbf")) {
        RBFKernel rbf_kernel = new RBFKernel();
        rbf_kernel.setGamma(gamma);
        rbf_kernel.setCacheSize(cacheSize);
        svc_kernel = rbf_kernel;
    } else if (this.kernel.equalsIgnoreCase("polynomial")) {
        PolyKernel poly_kernel = new PolyKernel();
        poly_kernel.setExponent(degree);
        poly_kernel.setCacheSize(cacheSize);
        poly_kernel.setUseLowerOrder(true);
        svc_kernel = poly_kernel;
    } else if (this.kernel.equalsIgnoreCase("linear")) {
        PolyKernel linear_kernel = new PolyKernel();
        linear_kernel.setExponent(1.0);
        linear_kernel.setCacheSize(cacheSize);
        linear_kernel.setUseLowerOrder(true);
        svc_kernel = linear_kernel;
    }
    classifier.setKernel(svc_kernel);

    String modelFilePath = ServerFolders.models_weka + "/" + uuid.toString();
    String[] generalOptions = { "-c", Integer.toString(data.classIndex() + 1), "-t", temporaryFilePath,
            // Save the model in the following directory
            "-d", modelFilePath };

    // AFTER ALL, BUILD THE CLASSIFICATION MODEL AND SAVE IT AS A SERIALIZED
    // WEKA FILE IN THE CORRESPONDING DIRECTORY.
    try {
        Evaluation.evaluateModel(classifier, generalOptions);
    } catch (final Exception ex) {
        tempFile.delete();
        throw new QSARException(Cause.XQReg350, "Unexpected condition while trying to train "
                + "a support vector classification model. Possible explanation : {" + ex.getMessage() + "}", ex);
    }

    ArrayList<Feature> independentFeatures = new ArrayList<Feature>();
    for (int i = 0; i < data.numAttributes(); i++) {
        Feature f = new Feature(data.attribute(i).name());
        if (data.classIndex() != i) {
            independentFeatures.add(f);
        }
    }
    Feature dependentFeature = new Feature(data.classAttribute().name());
    Feature predictedFeature = dependentFeature;

    QSARModel model = new QSARModel();
    model.setCode(uuid.toString());
    model.setAlgorithm(YaqpAlgorithms.SVC);
    model.setPredictionFeature(predictedFeature);
    model.setDependentFeature(dependentFeature);
    model.setIndependentFeatures(independentFeatures);
    model.setDataset(datasetUri);
    model.setParams(getParameters());
    model.setModelStatus(ModelStatus.UNDER_DEVELOPMENT);
    tempFile.delete();
    return model;
}
From source file: org.opentox.qsar.processors.trainers.regression.MLRTrainer.java
License: Open Source License
/**
 * Trains the MLR model given an Instances object with the training data. The prediction
 * feature (class attribute) is specified in the constructor of the class.
 *
 * @param data The training data as a <code>weka.core.Instances</code> object.
 * @return The QSARModel corresponding to the trained model.
 * @throws QSARException In case the model cannot be trained.
 * <p>
 * <table>
 * <thead>
 * <tr><td><b>Code</b></td><td><b>Explanation</b></td></tr>
 * </thead>
 * <tbody>
 * <tr><td>XQReg1</td><td>Could not train the MLR model</td></tr>
 * <tr><td>XQReg2</td><td>Could not generate the PMML representation for the model</td></tr>
 * <tr><td>XQReg202</td><td>The prediction feature you provided is not a valid numeric attribute of the dataset</td></tr>
 * </tbody>
 * </table>
 * </p>
 * @throws NullPointerException In case the provided training data is null.
 */
public QSARModel train(Instances data) throws QSARException {
    // GET A UUID AND DEFINE THE TEMPORARY FILE WHERE THE TRAINING DATA
    // ARE STORED IN ARFF FORMAT PRIOR TO TRAINING.
    final String rand = java.util.UUID.randomUUID().toString();
    final String temporaryFilePath = ServerFolders.temp + "/" + rand + ".arff";
    final File tempFile = new File(temporaryFilePath);

    // SAVE THE DATA IN THE TEMPORARY FILE
    try {
        ArffSaver dataSaver = new ArffSaver();
        dataSaver.setInstances(data);
        dataSaver.setDestination(new FileOutputStream(tempFile));
        dataSaver.writeBatch();
    } catch (final IOException ex) {
        tempFile.delete();
        throw new RuntimeException("Unexpected condition while trying to save the "
                + "dataset in a temporary ARFF file", ex);
    }

    LinearRegression linreg = new LinearRegression();
    String[] linRegOptions = { "-S", "1", "-C" };
    try {
        linreg.setOptions(linRegOptions);
        linreg.buildClassifier(data);
    } catch (final Exception ex) {
        // illegal options or the classifier could not be built
        String message = "MLR Model could not be trained";
        YaqpLogger.LOG.log(new Trace(getClass(), message + " :: " + ex));
        throw new QSARException(Cause.XQReg1, message, ex);
    }

    try {
        generatePMML(linreg, data);
    } catch (final YaqpIOException ex) {
        String message = "Could not generate PMML representation for MLR model :: " + ex;
        throw new QSARException(Cause.XQReg2, message, ex);
    }

    // PERFORM THE TRAINING
    String[] generalOptions = { "-c", Integer.toString(data.classIndex() + 1), "-t", temporaryFilePath,
            // Save the model in the following directory
            "-d", ServerFolders.models_weka + "/" + uuid };
    try {
        Evaluation.evaluateModel(linreg, generalOptions);
    } catch (final Exception ex) {
        tempFile.delete();
        throw new QSARException(Cause.XQReg350, "Unexpected condition while trying to train "
                + "an MLR model. Possible explanation : {" + ex.getMessage() + "}", ex);
    }

    ArrayList<Feature> independentFeatures = new ArrayList<Feature>();
    for (int i = 0; i < data.numAttributes(); i++) {
        Feature f = new Feature(data.attribute(i).name());
        if (data.classIndex() != i) {
            independentFeatures.add(f);
        }
    }
    Feature dependentFeature = new Feature(data.classAttribute().name());
    Feature predictedFeature = dependentFeature;
    QSARModel model = new QSARModel(uuid.toString(), predictedFeature, dependentFeature, independentFeatures,
            YaqpAlgorithms.MLR, new User(), null, datasetUri, ModelStatus.UNDER_DEVELOPMENT);
    model.setParams(new HashMap<String, AlgorithmParameter>());
    return model;
}
From source file: org.opentox.qsar.processors.trainers.regression.MLRTrainer.java
License: Open Source License
/**
 * Generates the PMML representation of the model and stores it on the hard
 * disk.
 *
 * @param wekaModel The trained MLR model whose coefficients are serialized.
 * @param data The training data (used for attribute names and the class index).
 * TODO: build the XML using some XML editor
 */
// <editor-fold defaultstate="collapsed" desc="PMML generation routine!">
private void generatePMML(final LinearRegression wekaModel, final Instances data) throws YaqpIOException {
    final double[] coefficients = wekaModel.coefficients();
    StringBuilder pmml = new StringBuilder();
    pmml.append("<?xml version=\"1.0\" ?>");
    pmml.append(PMMLIntro);
    pmml.append("<Model ID=\"" + uuid.toString() + "\" Name=\"MLR Model\">\n");
    pmml.append("<AlgorithmID href=\"" + Configuration.BASE_URI + "/algorithm/mlr\"/>\n");
    pmml.append("<DatasetID href=\"" + datasetUri + "\"/>\n");
    pmml.append("<AlgorithmParameters />\n");
    pmml.append("<FeatureDefinitions>\n");
    for (int k = 0; k < data.numAttributes(); k++) {
        pmml.append("<link href=\"" + data.attribute(k).name() + "\"/>\n");
    }
    pmml.append("<target index=\"" + data.attribute(predictionFeature).index() + "\" name=\""
            + predictionFeature + "\"/>\n");
    pmml.append("</FeatureDefinitions>\n");
    pmml.append("<Timestamp>" + java.util.GregorianCalendar.getInstance().getTime() + "</Timestamp>\n");
    pmml.append("</Model>\n");
    pmml.append("<DataDictionary numberOfFields=\"" + data.numAttributes() + "\" >\n");
    for (int k = 0; k < data.numAttributes(); k++) {
        pmml.append("<DataField name=\"" + data.attribute(k).name()
                + "\" optype=\"continuous\" dataType=\"double\" />\n");
    }
    pmml.append("</DataDictionary>\n");
    // RegressionModel
    pmml.append("<RegressionModel modelName=\"" + uuid.toString() + "\"" + " functionName=\"regression\""
            + " modelType=\"linearRegression\"" + " algorithmName=\"linearRegression\""
            + " targetFieldName=\"" + data.classAttribute().name() + "\"" + ">\n");
    // RegressionModel::MiningSchema
    pmml.append("<MiningSchema>\n");
    for (int k = 0; k < data.numAttributes(); k++) {
        if (k != data.classIndex()) {
            pmml.append("<MiningField name=\"" + data.attribute(k).name() + "\" />\n");
        }
    }
    pmml.append("<MiningField name=\"" + data.attribute(data.classIndex()).name() + "\" "
            + "usageType=\"predicted\"/>\n");
    pmml.append("</MiningSchema>\n");
    // RegressionModel::RegressionTable
    pmml.append("<RegressionTable intercept=\"" + coefficients[coefficients.length - 1] + "\">\n");
    for (int k = 0; k < data.numAttributes(); k++) {
        if (!(predictionFeature.equals(data.attribute(k).name()))) {
            pmml.append("<NumericPredictor name=\"" + data.attribute(k).name() + "\" " + " exponent=\"1\" "
                    + "coefficient=\"" + coefficients[k] + "\"/>\n");
        }
    }
    pmml.append("</RegressionTable>\n");
    pmml.append("</RegressionModel>\n");
    pmml.append("</PMML>\n\n");
    try {
        FileWriter fwriter = new FileWriter(ServerFolders.models_pmml + "/" + uuid.toString());
        BufferedWriter writer = new BufferedWriter(fwriter);
        writer.write(pmml.toString());
        writer.flush();
        writer.close();
    } catch (IOException ex) {
        throw new YaqpIOException(Cause.XQReg3, "Could not write data to PMML file :" + uuid.toString(), ex);
    }
}
From source file: org.opentox.qsar.processors.trainers.regression.SVMTrainer.java
License: Open Source License
/**
 * Trains an SVM regression model on the given training data.
 *
 * @param data the training data (the null check and the prediction-feature
 *        validation are performed in WekaRegressor, in preprocessData(Instances))
 * @return the trained QSARModel
 * @throws QSARException if the model cannot be trained
 */
public QSARModel train(Instances data) throws QSARException {
    // GET A UUID AND DEFINE THE TEMPORARY FILE WHERE THE TRAINING DATA
    // ARE STORED IN ARFF FORMAT PRIOR TO TRAINING.
    final String rand = java.util.UUID.randomUUID().toString();
    final String temporaryFilePath = ServerFolders.temp + "/" + rand + ".arff";
    final File tempFile = new File(temporaryFilePath);

    // SAVE THE DATA IN THE TEMPORARY FILE
    try {
        ArffSaver dataSaver = new ArffSaver();
        dataSaver.setInstances(data);
        dataSaver.setDestination(new FileOutputStream(tempFile));
        dataSaver.writeBatch();
    } catch (final IOException ex) {
        tempFile.delete();
        throw new RuntimeException("Unexpected condition while trying to save the "
                + "dataset in a temporary ARFF file", ex);
    }

    // INITIALIZE THE REGRESSOR
    SVMreg regressor = new SVMreg();
    final String[] regressorOptions = { "-P", Double.toString(epsilon), "-T", Double.toString(tolerance) };
    Kernel svm_kernel = null;
    if (kernel.equalsIgnoreCase("rbf")) {
        RBFKernel rbf_kernel = new RBFKernel();
        rbf_kernel.setGamma(gamma);
        rbf_kernel.setCacheSize(cacheSize);
        svm_kernel = rbf_kernel;
    } else if (kernel.equalsIgnoreCase("polynomial")) {
        PolyKernel poly_kernel = new PolyKernel();
        poly_kernel.setExponent(degree);
        poly_kernel.setCacheSize(cacheSize);
        poly_kernel.setUseLowerOrder(true);
        svm_kernel = poly_kernel;
    } else if (kernel.equalsIgnoreCase("linear")) {
        PolyKernel poly_kernel = new PolyKernel();
        poly_kernel.setExponent(1.0);
        poly_kernel.setCacheSize(cacheSize);
        poly_kernel.setUseLowerOrder(true);
        svm_kernel = poly_kernel;
    }
    regressor.setKernel(svm_kernel);
    try {
        regressor.setOptions(regressorOptions);
    } catch (final Exception ex) {
        tempFile.delete();
        throw new IllegalArgumentException("Bad options in SVM trainer for epsilon = {" + epsilon + "} or "
                + "tolerance = {" + tolerance + "}.", ex);
    }

    // PERFORM THE TRAINING
    String[] generalOptions = { "-c", Integer.toString(data.classIndex() + 1), "-t", temporaryFilePath,
            // Save the model in the following directory
            "-d", ServerFolders.models_weka + "/" + uuid };
    try {
        Evaluation.evaluateModel(regressor, generalOptions);
    } catch (final Exception ex) {
        tempFile.delete();
        throw new QSARException(Cause.XQReg350, "Unexpected condition while trying to train "
                + "an SVM model. Possible explanation : {" + ex.getMessage() + "}", ex);
    }

    QSARModel model = new QSARModel();
    model.setParams(getParameters());
    model.setCode(uuid.toString());
    model.setAlgorithm(YaqpAlgorithms.SVM);
    model.setDataset(datasetUri);
    model.setModelStatus(ModelStatus.UNDER_DEVELOPMENT);

    ArrayList<Feature> independentFeatures = new ArrayList<Feature>();
    for (int i = 0; i < data.numAttributes(); i++) {
        Feature f = new Feature(data.attribute(i).name());
        if (data.classIndex() != i) {
            independentFeatures.add(f);
        }
    }
    Feature dependentFeature = new Feature(data.classAttribute().name());
    Feature predictedFeature = dependentFeature;
    model.setDependentFeature(dependentFeature);
    model.setIndependentFeatures(independentFeatures);
    model.setPredictionFeature(predictedFeature);
    tempFile.delete();
    return model;
}
From source file: org.packDataMining.SMOTE.java
License: Open Source License
/**
 * The procedure implementing the SMOTE algorithm. The output
 * instances are pushed onto the output queue for collection.
 *
 * @throws Exception if provided options cannot be executed
 *         on input instances
 */
protected void doSMOTE() throws Exception {
    int minIndex = 0;
    int min = Integer.MAX_VALUE;
    if (m_DetectMinorityClass) {
        // find the minority class
        int[] classCounts = getInputFormat().attributeStats(getInputFormat().classIndex()).nominalCounts;
        for (int i = 0; i < classCounts.length; i++) {
            if (classCounts[i] != 0 && classCounts[i] < min) {
                min = classCounts[i];
                minIndex = i;
            }
        }
    } else {
        String classVal = getClassValue();
        if (classVal.equalsIgnoreCase("first")) {
            minIndex = 1;
        } else if (classVal.equalsIgnoreCase("last")) {
            minIndex = getInputFormat().numClasses();
        } else {
            minIndex = Integer.parseInt(classVal);
        }
        if (minIndex > getInputFormat().numClasses()) {
            throw new Exception("value index must be <= the number of classes");
        }
        minIndex--; // make it an index
    }

    int nearestNeighbors;
    if (min <= getNearestNeighbors()) {
        nearestNeighbors = min - 1;
    } else {
        nearestNeighbors = getNearestNeighbors();
    }
    if (nearestNeighbors < 1)
        throw new Exception("Cannot use 0 neighbors!");

    // compose the minority class dataset
    // while also pushing all dataset instances to the output queue
    Instances sample = getInputFormat().stringFreeStructure();
    Enumeration instanceEnum = getInputFormat().enumerateInstances();
    while (instanceEnum.hasMoreElements()) {
        Instance instance = (Instance) instanceEnum.nextElement();
        push((Instance) instance.copy());
        if ((int) instance.classValue() == minIndex) {
            sample.add(instance);
        }
    }

    // compute Value Distance Metric matrices for nominal features
    Map vdmMap = new HashMap();
    Enumeration attrEnum = getInputFormat().enumerateAttributes();
    while (attrEnum.hasMoreElements()) {
        Attribute attr = (Attribute) attrEnum.nextElement();
        if (!attr.equals(getInputFormat().classAttribute())) {
            if (attr.isNominal() || attr.isString()) {
                double[][] vdm = new double[attr.numValues()][attr.numValues()];
                vdmMap.put(attr, vdm);
                int[] featureValueCounts = new int[attr.numValues()];
                int[][] featureValueCountsByClass =
                        new int[getInputFormat().classAttribute().numValues()][attr.numValues()];
                instanceEnum = getInputFormat().enumerateInstances();
                while (instanceEnum.hasMoreElements()) {
                    Instance instance = (Instance) instanceEnum.nextElement();
                    int value = (int) instance.value(attr);
                    int classValue = (int) instance.classValue();
                    featureValueCounts[value]++;
                    featureValueCountsByClass[classValue][value]++;
                }
                for (int valueIndex1 = 0; valueIndex1 < attr.numValues(); valueIndex1++) {
                    for (int valueIndex2 = 0; valueIndex2 < attr.numValues(); valueIndex2++) {
                        double sum = 0;
                        for (int classValueIndex = 0; classValueIndex < getInputFormat().numClasses(); classValueIndex++) {
                            double c1i = (double) featureValueCountsByClass[classValueIndex][valueIndex1];
                            double c2i = (double) featureValueCountsByClass[classValueIndex][valueIndex2];
                            double c1 = (double) featureValueCounts[valueIndex1];
                            double c2 = (double) featureValueCounts[valueIndex2];
                            double term1 = c1i / c1;
                            double term2 = c2i / c2;
                            sum += Math.abs(term1 - term2);
                        }
                        vdm[valueIndex1][valueIndex2] = sum;
                    }
                }
            }
        }
    }

    // use this random source for all required randomness
    Random rand = new Random(getRandomSeed());

    // find the set of extra indices to use if the percentage is not evenly divisible by 100
    List extraIndices = new LinkedList();
    double percentageRemainder = (getPercentage() / 100) - Math.floor(getPercentage() / 100.0);
    int extraIndicesCount = (int) (percentageRemainder * sample.numInstances());
    if (extraIndicesCount >= 1) {
        for (int i = 0; i < sample.numInstances(); i++) {
            extraIndices.add(i);
        }
    }
    Collections.shuffle(extraIndices, rand);
    extraIndices = extraIndices.subList(0, extraIndicesCount);
    Set extraIndexSet = new HashSet(extraIndices);

    // the main loop to handle computing nearest neighbors and generating SMOTE
    // examples from each instance in the original minority class data
    Instance[] nnArray = new Instance[nearestNeighbors];
    for (int i = 0; i < sample.numInstances(); i++) {
        Instance instanceI = sample.instance(i);
        // find the k nearest neighbors for each instance
        List distanceToInstance = new LinkedList();
        for (int j = 0; j < sample.numInstances(); j++) {
            Instance instanceJ = sample.instance(j);
            if (i != j) {
                double distance = 0;
                attrEnum = getInputFormat().enumerateAttributes();
                while (attrEnum.hasMoreElements()) {
                    Attribute attr = (Attribute) attrEnum.nextElement();
                    if (!attr.equals(getInputFormat().classAttribute())) {
                        double iVal = instanceI.value(attr);
                        double jVal = instanceJ.value(attr);
                        if (attr.isNumeric()) {
                            distance += Math.pow(iVal - jVal, 2);
                        } else {
                            distance += ((double[][]) vdmMap.get(attr))[(int) iVal][(int) jVal];
                        }
                    }
                }
                distance = Math.pow(distance, .5);
                distanceToInstance.add(new Object[] { distance, instanceJ });
            }
        }

        // sort the neighbors according to distance
        Collections.sort(distanceToInstance, new Comparator() {
            public int compare(Object o1, Object o2) {
                double distance1 = (Double) ((Object[]) o1)[0];
                double distance2 = (Double) ((Object[]) o2)[0];
                // compare the distances directly; casting the rounded difference
                // to int would report small differences as equality
                return Double.compare(distance1, distance2);
            }
        });

        // populate the actual nearest neighbor instance array
        Iterator entryIterator = distanceToInstance.iterator();
        int j = 0;
        while (entryIterator.hasNext() && j < nearestNeighbors) {
            nnArray[j] = (Instance) ((Object[]) entryIterator.next())[1];
            j++;
        }

        // create synthetic examples
        int n = (int) Math.floor(getPercentage() / 100);
        while (n > 0 || extraIndexSet.remove(i)) {
            double[] values = new double[sample.numAttributes()];
            int nn = rand.nextInt(nearestNeighbors);
            attrEnum = getInputFormat().enumerateAttributes();
            while (attrEnum.hasMoreElements()) {
                Attribute attr = (Attribute) attrEnum.nextElement();
                if (!attr.equals(getInputFormat().classAttribute())) {
                    if (attr.isNumeric()) {
                        double dif = nnArray[nn].value(attr) - instanceI.value(attr);
                        double gap = rand.nextDouble();
                        values[attr.index()] = (double) (instanceI.value(attr) + gap * dif);
                    } else if (attr.isDate()) {
                        double dif = nnArray[nn].value(attr) - instanceI.value(attr);
                        double gap = rand.nextDouble();
                        values[attr.index()] = (long) (instanceI.value(attr) + gap * dif);
                    } else {
                        int[] valueCounts = new int[attr.numValues()];
                        int iVal = (int) instanceI.value(attr);
                        valueCounts[iVal]++;
                        for (int nnEx = 0; nnEx < nearestNeighbors; nnEx++) {
                            int val = (int) nnArray[nnEx].value(attr);
                            valueCounts[val]++;
                        }
                        int maxIndex = 0;
                        int max = Integer.MIN_VALUE;
                        for (int index = 0; index < attr.numValues(); index++) {
                            if (valueCounts[index] > max) {
                                max = valueCounts[index];
                                maxIndex = index;
                            }
                        }
                        values[attr.index()] = maxIndex;
                    }
                }
            }
            values[sample.classIndex()] = minIndex;
            Instance synthetic = new Instance(1.0, values);
            push(synthetic);
            n--;
        }
    }
}
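The filter above reads the class attribute through getInputFormat().classIndex(), so callers must set a class index before applying it. A minimal, hedged usage sketch (the dataset path is a placeholder, and it assumes this SMOTE class follows the standard weka.filters.Filter life cycle, which its use of push() and getInputFormat() suggests):

import org.packDataMining.SMOTE;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;

public class SmoteDemo {
    public static void main(String[] args) throws Exception {
        Instances train = DataSource.read("train.arff"); // placeholder path
        train.setClassIndex(train.numAttributes() - 1);  // required: doSMOTE() reads classIndex()
        SMOTE smote = new SMOTE();                       // the filter defined above
        smote.setInputFormat(train);                     // passes the header, including the class index
        Instances balanced = Filter.useFilter(train, smote);
        System.out.println(balanced.numInstances());     // original plus synthetic instances
    }
}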
From source file: org.pentaho.di.scoring.WekaScoringData.java
License: Open Source License
/**
 * Finds a mapping between the attributes that a Weka model has been trained
 * with and the incoming Kettle row format. Returns an array of indices, where
 * the element at index 0 of the array is the index of the Kettle field that
 * corresponds to the first attribute in the Instances structure, the element
 * at index 1 is the index of the Kettle field that corresponds to the second
 * attribute, and so on.
 *
 * @param header the Instances header
 * @param inputRowMeta the meta data for the incoming rows
 * @param updateIncrementalModel true if the model is incremental and should
 *        be updated on the incoming instances
 * @param log the log to use
 */
public void mapIncomingRowMetaData(Instances header, RowMetaInterface inputRowMeta,
        boolean updateIncrementalModel, LogChannelInterface log) {
    m_mappingIndexes = WekaScoringData.findMappings(header, inputRowMeta);
    m_updateIncrementalModel = updateIncrementalModel;

    // If updating of incremental models has been selected, then
    // check on the ability to do this
    if (m_updateIncrementalModel && m_model.isSupervisedLearningModel()) {
        if (m_model.isUpdateableModel()) {
            // Has the class been mapped successfully to an incoming Kettle field?
            if (m_mappingIndexes[header.classIndex()] == WekaScoringData.NO_MATCH
                    || m_mappingIndexes[header.classIndex()] == WekaScoringData.TYPE_MISMATCH) {
                m_updateIncrementalModel = false;
                log.logError(BaseMessages.getString(WekaScoringMeta.PKG, "WekaScoringMeta.Log.NoMatchForClass")); //$NON-NLS-1$
            }
        } else {
            m_updateIncrementalModel = false;
            log.logError(BaseMessages.getString(WekaScoringMeta.PKG, "WekaScoringMeta.Log.ModelNotUpdateable")); //$NON-NLS-1$
        }
    }
}
From source file: org.pentaho.di.scoring.WekaScoringDialog.java
License: Open Source License
private void checkAbilityToProduceProbabilities(WekaScoringModel tempM) {
    // take a look at the model type and then the class
    // attribute (if set and if necessary) in order
    // to determine whether to disable/enable the
    // output-probabilities checkbox
    if (!tempM.isSupervisedLearningModel()) {
        // now, does the clusterer produce probabilities?
        if (((WekaScoringClusterer) tempM).canProduceProbabilities()) {
            m_wOutputProbs.setEnabled(true);
        } else {
            m_wOutputProbs.setSelection(false);
            m_wOutputProbs.setEnabled(false);
        }
    } else {
        // take a look at the header and disable the output
        // probs checkbox if there is a class attribute set
        // and the class is numeric
        Instances header = tempM.getHeader();
        if (header.classIndex() >= 0) {
            if (header.classAttribute().isNumeric()) {
                m_wOutputProbs.setSelection(false);
                m_wOutputProbs.setEnabled(false);
            } else {
                m_wOutputProbs.setEnabled(true);
            }
        }
    }
}
From source file: org.scify.NewSumServer.Server.MachineLearning.labelTagging.java
License: Apache License
/**
 * Finds the recommended labels from the classifier.
 *
 * @return the recommended labels
 */
public static String recommendation(INSECTDB file, String text) {
    String labelList = "-none-";
    // create the instance vector
    String Ivector = vector.labellingVector(text, file);
    // take the similarity vectors for each class graph
    try {
        Instances dataTrainSet = dataSets.trainingSet(file);          // the training dataset
        Instances dataLabelSet = dataSets.labelingSet(file, Ivector); // the labeling dataset
        ArffSaver saver = new ArffSaver();
        saver.setInstances(dataTrainSet);
        saver.setFile(new File("./data/dataTrainSet.arff"));
        saver.writeBatch();
        ArffSaver saver2 = new ArffSaver();
        saver2.setInstances(dataLabelSet);
        saver2.setFile(new File("./data/dataLabelSet.arff"));
        saver2.writeBatch();
        File temp = File.createTempFile("exportFile", null);

        // TODO: create classifier
        // String option = "-S 2 -K 2 -D 3 -G 0.0 -R 0.0 -N 0.5 -M 40.0 -C 1.0 -E 0.001 -P 0.1"; // classifier options
        // String[] options = option.split("\\s+");
        if (dataTrainSet.classIndex() == -1) {
            dataTrainSet.setClassIndex(dataTrainSet.numAttributes() - 1);
        }
        // Create a classifier (J48 here; NaiveBayes and RandomForest were alternatives)
        // NaiveBayes nb = new NaiveBayes();
        // RandomForest nb = new RandomForest();
        J48 nb = new J48();
        // nb.setOptions(options);
        nb.buildClassifier(dataTrainSet); // end of the training step

        if (dataLabelSet.classIndex() == -1) {
            dataLabelSet.setClassIndex(dataLabelSet.numAttributes() - 1);
        }
        StringBuffer writer = new StringBuffer();
        PlainText output = new PlainText();
        output.setBuffer(writer);
        output.setHeader(dataLabelSet);
        output.printClassifications(nb, dataLabelSet);
        // PrintStream ps2 = new PrintStream(classGname);
        // ps2.print(writer.toString());
        // ps2.close();
        PrintStream ps = new PrintStream(temp); // write the classification results to the temp file
        ps.print(writer.toString());
        ps.close();

        // export the result: if it is positive, the current class graph name is added to the label list
        // labelList = result(temp);
        labelList = result(temp) + " --------->> " + text;
        Utilities.appendToFile(labelList);
    } catch (Exception ex) {
        Logger.getLogger(labelTagging.class.getName()).log(Level.SEVERE, null, ex);
    }
    return labelList;
}