List of usage examples for weka.core Instance isMissing
public boolean isMissing(Attribute att);
From source file:com.esda.util.StringToWordVector.java
License:Open Source License
/** * Converts the instance w/o normalization. * * @oaram instance the instance to convert * @param v/*from w w w. j a v a2 s. c o m*/ * @return the conerted instance */ private int convertInstancewoDocNorm(Instance instance, FastVector v) { // Convert the instance into a sorted set of indexes TreeMap contained = new TreeMap(); // Copy all non-converted attributes from input to output int firstCopy = 0; for (int i = 0; i < getInputFormat().numAttributes(); i++) { if (!m_SelectedRange.isInRange(i)) { if (getInputFormat().attribute(i).type() != Attribute.STRING && getInputFormat().attribute(i).type() != Attribute.RELATIONAL) { // Add simple nominal and numeric attributes directly if (instance.value(i) != 0.0) { contained.put(new Integer(firstCopy), new Double(instance.value(i))); } } else { if (instance.isMissing(i)) { contained.put(new Integer(firstCopy), new Double(Double.NaN)); } else if (getInputFormat().attribute(i).type() == Attribute.STRING) { // If this is a string attribute, we have to first add // this value to the range of possible values, then add // its new internal index. if (outputFormatPeek().attribute(firstCopy).numValues() == 0) { // Note that the first string value in a // SparseInstance doesn't get printed. 
outputFormatPeek().attribute(firstCopy) .addStringValue("Hack to defeat SparseInstance bug"); } int newIndex = outputFormatPeek().attribute(firstCopy) .addStringValue(instance.stringValue(i)); contained.put(new Integer(firstCopy), new Double(newIndex)); } else { // relational if (outputFormatPeek().attribute(firstCopy).numValues() == 0) { Instances relationalHeader = outputFormatPeek().attribute(firstCopy).relation(); // hack to defeat sparse instances bug outputFormatPeek().attribute(firstCopy).addRelation(relationalHeader); } int newIndex = outputFormatPeek().attribute(firstCopy) .addRelation(instance.relationalValue(i)); contained.put(new Integer(firstCopy), new Double(newIndex)); } } firstCopy++; } } for (int j = 0; j < instance.numAttributes(); j++) { // if ((getInputFormat().attribute(j).type() == Attribute.STRING) if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) { m_Tokenizer.tokenize(instance.stringValue(j)); while (m_Tokenizer.hasMoreElements()) { String word = (String) m_Tokenizer.nextElement(); if (this.m_lowerCaseTokens == true) word = word.toLowerCase(); word = m_Stemmer.stem(word); Integer index = (Integer) m_Dictionary.get(word); if (index != null) { if (m_OutputCounts) { // Separate if here rather than // two lines down to avoid // hashtable lookup Double count = (Double) contained.get(index); if (count != null) { contained.put(index, new Double(count.doubleValue() + 1.0)); } else { contained.put(index, new Double(1)); } } else { contained.put(index, new Double(1)); } } } } } // Doing TFTransform if (m_TFTransform == true) { Iterator it = contained.keySet().iterator(); for (int i = 0; it.hasNext(); i++) { Integer index = (Integer) it.next(); if (index.intValue() >= firstCopy) { double val = ((Double) contained.get(index)).doubleValue(); val = Math.log(val + 1); contained.put(index, new Double(val)); } } } // Doing IDFTransform if (m_IDFTransform == true) { Iterator it = contained.keySet().iterator(); for (int i = 0; 
it.hasNext(); i++) { Integer index = (Integer) it.next(); if (index.intValue() >= firstCopy) { double val = ((Double) contained.get(index)).doubleValue(); val = val * Math.log(m_NumInstances / (double) m_DocsCounts[index.intValue()]); contained.put(index, new Double(val)); } } } // Convert the set to structures needed to create a sparse instance. double[] values = new double[contained.size()]; int[] indices = new int[contained.size()]; Iterator it = contained.keySet().iterator(); for (int i = 0; it.hasNext(); i++) { Integer index = (Integer) it.next(); Double value = (Double) contained.get(index); values[i] = value.doubleValue(); indices[i] = index.intValue(); } Instance inst = new SparseInstance(instance.weight(), values, indices, outputFormatPeek().numAttributes()); inst.setDataset(outputFormatPeek()); v.addElement(inst); return firstCopy; }
From source file:com.openkm.kea.filter.KEAFilter.java
License:Open Source License
/** * Converts an instance.//from w ww . ja v a2 s. c om */ private FastVector convertInstance(Instance instance, boolean training) throws Exception { FastVector vector = new FastVector(); if (m_Debug) { log.info("-- Converting instance"); } // Get the key phrases for the document HashMap<String, Counter> hashKeyphrases = null; HashMap<String, Counter> hashKeysEval = null; if (!instance.isMissing(m_KeyphrasesAtt)) { String keyphrases = instance.stringValue(m_KeyphrasesAtt); hashKeyphrases = getGivenKeyphrases(keyphrases, false); hashKeysEval = getGivenKeyphrases(keyphrases, true); } // Get the phrases for the document HashMap<String, FastVector> hash = new HashMap<String, FastVector>(); int length = getPhrases(hash, instance.stringValue(m_DocumentAtt)); // hash = getComposits(hash); /* Experimental: To compute how many of the manual keyphrases appear in the documents: log.info("Doc phrases found " + hash.size()); log.info("Manual keyphrases: "); Iterator iter = hashKeyphrases.keySet().iterator(); int count = 0; while (iter.hasNext()) { String id = (String)iter.next(); if (hash.containsKey(id)) { count++; } } double max_recall = (double)count/(double)hashKeyphrases.size(); m_max_recall += max_recall; doc++; double avg_m_max_recall = m_max_recall/(double)doc; String file = instance.stringValue(2); log.info(count + " out of " + hashKeyphrases.size() + " are in the document "); log.info("Max recall : " + avg_m_max_recall + " on " + doc + " documents "); */ // Compute number of extra attributes int numFeatures = 5; if (m_Debug) { if (m_KFused) { numFeatures = numFeatures + 1; } } if (m_STDEVfeature) { numFeatures = numFeatures + 1; } if (m_NODEfeature) { numFeatures = numFeatures + 1; } if (m_LENGTHfeature) { numFeatures = numFeatures + 1; } // Set indices of key attributes //int phraseAttIndex = m_DocumentAtt; int tfidfAttIndex = m_DocumentAtt + 2; int distAttIndex = m_DocumentAtt + 3; int probsAttIndex = m_DocumentAtt + numFeatures - 1; //int classAttIndex = 
numFeatures; // Go through the phrases and convert them into instances Iterator<String> it = hash.keySet().iterator(); while (it.hasNext()) { String id = it.next(); FastVector phraseInfo = (FastVector) hash.get(id); double[] vals = featVals(id, phraseInfo, training, hashKeysEval, hashKeyphrases, length, hash); Instance inst = new Instance(instance.weight(), vals); inst.setDataset(m_ClassifierData); // Get probability of a phrase being key phrase double[] probs = m_Classifier.distributionForInstance(inst); // If simple Naive Bayes used, change here to //double prob = probs[1]; double prob = probs[0]; // Compute attribute values for final instance double[] newInst = new double[instance.numAttributes() + numFeatures]; int pos = 0; for (int i = 0; i < instance.numAttributes(); i++) { if (i == m_DocumentAtt) { // output of values for a given phrase: // Add phrase int index = outputFormatPeek().attribute(pos).addStringValue(id); newInst[pos++] = index; // Add original version String orig = (String) phraseInfo.elementAt(2); if (orig != null) { index = outputFormatPeek().attribute(pos).addStringValue(orig); } else { index = outputFormatPeek().attribute(pos).addStringValue(id); } newInst[pos++] = index; // Add TFxIDF newInst[pos++] = inst.value(m_TfidfIndex); // Add distance newInst[pos++] = inst.value(m_FirstOccurIndex); // Add other features if (m_Debug) { if (m_KFused) { newInst[pos++] = inst.value(m_KeyFreqIndex); } } if (m_STDEVfeature) { newInst[pos++] = inst.value(m_STDEVIndex); } if (m_NODEfeature) { newInst[pos++] = inst.value(m_NodeIndex); } if (m_LENGTHfeature) { newInst[pos++] = inst.value(m_LengthIndex); } // Add probability probsAttIndex = pos; newInst[pos++] = prob; // Set rank to missing (computed below) newInst[pos++] = Instance.missingValue(); } else if (i == m_KeyphrasesAtt) { newInst[pos++] = inst.classValue(); } else { newInst[pos++] = instance.value(i); } } Instance ins = new Instance(instance.weight(), newInst); ins.setDataset(outputFormatPeek()); 
vector.addElement(ins); } // Add dummy instances for keyphrases that don't occur // in the document if (hashKeysEval != null) { Iterator<String> phrases = hashKeysEval.keySet().iterator(); while (phrases.hasNext()) { String phrase = phrases.next(); double[] newInst = new double[instance.numAttributes() + numFeatures]; int pos = 0; for (int i = 0; i < instance.numAttributes(); i++) { if (i == m_DocumentAtt) { // log.info("Here: " + phrase); // Add phrase int index = outputFormatPeek().attribute(pos).addStringValue(phrase); newInst[pos++] = (double) index; // Add original version index = outputFormatPeek().attribute(pos).addStringValue(phrase); newInst[pos++] = (double) index; // Add TFxIDF newInst[pos++] = Instance.missingValue(); // Add distance newInst[pos++] = Instance.missingValue(); // Add other features if (m_Debug) { if (m_KFused) { newInst[pos++] = Instance.missingValue(); } } if (m_STDEVfeature) { newInst[pos++] = Instance.missingValue(); } if (m_NODEfeature) { newInst[pos++] = Instance.missingValue(); } if (m_LENGTHfeature) { newInst[pos++] = Instance.missingValue(); } // Add probability and rank newInst[pos++] = -Double.MAX_VALUE; // newInst[pos++] = Instance.missingValue(); } else if (i == m_KeyphrasesAtt) { newInst[pos++] = 1; // Keyphrase } else { newInst[pos++] = instance.value(i); } Instance inst = new Instance(instance.weight(), newInst); inst.setDataset(outputFormatPeek()); vector.addElement(inst); } } } // Sort phrases according to their distance (stable sort) double[] vals = new double[vector.size()]; for (int i = 0; i < vals.length; i++) { vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex); } FastVector newVector = new FastVector(vector.size()); int[] sortedIndices = Utils.stableSort(vals); for (int i = 0; i < vals.length; i++) { newVector.addElement(vector.elementAt(sortedIndices[i])); } vector = newVector; // Sort phrases according to their tfxidf value (stable sort) for (int i = 0; i < vals.length; i++) { vals[i] = -((Instance) 
vector.elementAt(i)).value(tfidfAttIndex); } newVector = new FastVector(vector.size()); sortedIndices = Utils.stableSort(vals); for (int i = 0; i < vals.length; i++) { newVector.addElement(vector.elementAt(sortedIndices[i])); } vector = newVector; // Sort phrases according to their probability (stable sort) for (int i = 0; i < vals.length; i++) { vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex); } newVector = new FastVector(vector.size()); sortedIndices = Utils.stableSort(vals); for (int i = 0; i < vals.length; i++) { newVector.addElement(vector.elementAt(sortedIndices[i])); } vector = newVector; // Compute rank of phrases. Check for subphrases that are ranked // lower than superphrases and assign probability -1 and set the // rank to Integer.MAX_VALUE int rank = 1; for (int i = 0; i < vals.length; i++) { Instance currentInstance = (Instance) vector.elementAt(i); // Short cut: if phrase very unlikely make rank very low and continue if (Utils.grOrEq(vals[i], 1.0)) { currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE); continue; } // Otherwise look for super phrase starting with first phrase // in list that has same probability, TFxIDF value, and distance as // current phrase. We do this to catch all superphrases // that have same probability, TFxIDF value and distance as current phrase. int startInd = i; while (startInd < vals.length) { Instance inst = (Instance) vector.elementAt(startInd); if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex)) || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex)) || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) { break; } startInd++; } currentInstance.setValue(probsAttIndex + 1, rank++); } return vector; }
From source file:com.openkm.kea.filter.KEAPhraseFilter.java
License:Open Source License
/** * Converts an instance by removing all non-alphanumeric characters * from its string attribute values./* w w w . j av a 2s .c o m*/ */ private void convertInstance(Instance instance) throws Exception { double[] instVals = new double[instance.numAttributes()]; for (int i = 0; i < instance.numAttributes(); i++) { if (!instance.attribute(i).isString() || instance.isMissing(i)) { instVals[i] = instance.value(i); } else { if (!m_SelectCols.isInRange(i)) { int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i)); instVals[i] = (double) index; continue; } // aly: str = text of the document String str = instance.stringValue(i); String tokenized = tokenize(str); // aly: resultStr is the clean version of str // log.info(resultStr.toString()); int index = getOutputFormat().attribute(i).addStringValue(tokenized); instVals[i] = (double) index; } } Instance inst = new Instance(instance.weight(), instVals); inst.setDataset(getOutputFormat()); push(inst); }
From source file:com.openkm.kea.filter.NumbersFilter.java
License:Open Source License
/** * Converts an instance. A phrase boundary is inserted where * a number is found./*from w ww . jav a 2 s . c om*/ */ private void convertInstance(Instance instance) throws Exception { double[] instVals = new double[instance.numAttributes()]; for (int i = 0; i < instance.numAttributes(); i++) { if ((!instance.attribute(i).isString()) || instance.isMissing(i)) { instVals[i] = instance.value(i); } else { String str = instance.stringValue(i); StringBuffer resultStr = new StringBuffer(); StringTokenizer tok = new StringTokenizer(str, " \t\n", true); while (tok.hasMoreTokens()) { String token = tok.nextToken(); // Everything that doesn't contain at least // one letter is considered to be a number boolean isNumber = true; for (int j = 0; j < token.length(); j++) { if (Character.isLetter(token.charAt(j))) { isNumber = false; break; } } if (!isNumber) { resultStr.append(token); } else { if (token.equals(" ") || token.equals("\t") || token.equals("\n")) { resultStr.append(token); } else { resultStr.append(" \n "); } } } int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString()); instVals[i] = (double) index; } } Instance inst = new Instance(instance.weight(), instVals); inst.setDataset(getOutputFormat()); push(inst); }
From source file:com.tum.classifiertest.FastRandomTree.java
License:Open Source License
/** * Computes class distribution of an instance using the FastRandomTree.<p> * * In Weka's RandomTree, the distributions were normalized so that all * probabilities sum to 1; this would abolish the effect of instance weights * on voting. In FastRandomForest 0.97 onwards, the distributions are * normalized by dividing with the number of instances going into a leaf.<p> * /*w ww . j a v a2 s.c om*/ * @param instance the instance to compute the distribution for * @return the computed class distribution * @throws Exception if computation fails */ @Override public double[] distributionForInstance(Instance instance) throws Exception { double[] returnedDist = null; if (m_Attribute > -1) { // ============================ node is not a leaf if (instance.isMissing(m_Attribute)) { // ---------------- missing value returnedDist = new double[m_MotherForest.getM_Info().numClasses()]; // split instance up for (int i = 0; i < m_Successors.length; i++) { double[] help = m_Successors[i].distributionForInstance(instance); if (help != null) { for (int j = 0; j < help.length; j++) { returnedDist[j] += m_Prop[i] * help[j]; } } } } else if (m_MotherForest.getM_Info().attribute(m_Attribute).isNominal()) { // ------ nominal //returnedDist = m_Successors[(int) instance.value(m_Attribute)] // .distributionForInstance(instance); // 0.99: new - binary splits (also) for nominal attributes if (instance.value(m_Attribute) == m_SplitPoint) { returnedDist = m_Successors[0].distributionForInstance(instance); } else { returnedDist = m_Successors[1].distributionForInstance(instance); } } else { // ------------------------------------------ numeric attributes if (instance.value(m_Attribute) < m_SplitPoint) { returnedDist = m_Successors[0].distributionForInstance(instance); } else { returnedDist = m_Successors[1].distributionForInstance(instance); } } return returnedDist; } else { // =============================================== node is a leaf return m_ClassProbs; } }
From source file:core.DatabaseSaverEx.java
License:Open Source License
/** * inserts the given instance into the table. * //from ww w . j av a 2 s .co m * @param inst the instance to insert * @throws Exception if something goes wrong */ public void writeInstance(Instance inst) throws Exception { StringBuffer insert = new StringBuffer(); insert.append("INSERT INTO "); insert.append(m_tableName); insert.append(" VALUES ( "); if (m_id) { insert.append(m_count); insert.append(", "); m_count++; } for (int j = 0; j < inst.numAttributes(); j++) { if (inst.isMissing(j)) insert.append("NULL"); else { if ((inst.attribute(j)).isDate()) insert.append("'" + m_DateFormat.format((long) inst.value(j)) + "'"); else if ((inst.attribute(j)).isNumeric()) insert.append(inst.value(j)); else { String stringInsert = "'" + inst.stringValue(j) + "'"; if (stringInsert.length() > 2) stringInsert = stringInsert.replaceAll("''", "'"); insert.append(stringInsert); } } if (j != inst.numAttributes() - 1) insert.append(", "); } insert.append(" )"); //System.out.println(insert.toString()); if (m_DataBaseConnection.update(insert.toString()) < 1) { throw new IOException("Tuple cannot be inserted."); } else { m_DataBaseConnection.close(); } }
From source file:de.ugoe.cs.cpdp.loader.NetgeneLoader.java
License:Apache License
/**
 * Loads the file metrics CSV together with its two sibling files
 * {@code <project>_bugs_per_file.csv} and {@code <project>_network_metrics.csv},
 * where {@code <project>} is the part of the metrics file name before the
 * first underscore, and merges them into one data set.
 *
 * <p>Network metrics (columns 2 and up) are converted to numeric attributes
 * and appended, matched by file name. A nominal class attribute {@code bug}
 * with values 0/1 is added from the bug counts, the file name attribute and
 * an {@code eigenvector} attribute (if present) are removed, and all
 * remaining missing values are set to 0.
 *
 * @param fileMetricsFile the CSV file with the file-level metrics
 * @return the merged data, or {@code null} if one of the files could not be read
 */
@Override
public Instances load(File fileMetricsFile) {
    // first determine all files; resolving to an absolute file first avoids a
    // NullPointerException for a bare file name that has no parent component
    String path = fileMetricsFile.getAbsoluteFile().getParent();
    String project = fileMetricsFile.getName().split("_")[0];
    File bugsFile = new File(path + "/" + project + "_bugs_per_file.csv");
    File networkMetrics = new File(path + "/" + project + "_network_metrics.csv");
    Instances metricsData = null;

    try {
        CSVLoader wekaCsvLoader = new CSVLoader();
        wekaCsvLoader.setSource(fileMetricsFile);
        metricsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(bugsFile);
        Instances bugsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(networkMetrics);
        Instances networkData = wekaCsvLoader.getDataSet();

        metricsData.setRelationName(project);

        // fix nominal attributes (i.e., NA values) and string attributes
        for (int j = 2; j < networkData.numAttributes(); j++) {
            Attribute att = networkData.attribute(j);
            if (att.isNominal() || att.isString()) {
                convertToNumericAttribute(networkData, j);
            }
        }

        // index of each file name in the metrics data
        Map<String, Integer> filenames = new HashMap<>();
        for (int j = 0; j < metricsData.size(); j++) {
            filenames.put(metricsData.instance(j).stringValue(0), j);
        }

        // merge with network data
        int attributeIndex;
        for (int j = 2; j < networkData.numAttributes(); j++) {
            attributeIndex = metricsData.numAttributes();
            metricsData.insertAttributeAt(networkData.attribute(j), attributeIndex);
            for (int i = 0; i < networkData.size(); i++) {
                Integer instanceIndex = filenames.get(networkData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex,
                            networkData.instance(i).value(j));
                }
            }
        }

        // add bug information
        attributeIndex = metricsData.numAttributes();
        final ArrayList<String> classAttVals = new ArrayList<String>();
        classAttVals.add("0");
        classAttVals.add("1");
        final Attribute classAtt = new Attribute("bug", classAttVals);
        metricsData.insertAttributeAt(classAtt, attributeIndex);
        for (int i = 0; i < bugsData.size(); i++) {
            if (bugsData.instance(i).value(2) > 0.0d) {
                Integer instanceIndex = filenames.get(bugsData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex, 1.0);
                }
            }
        }

        // remove filenames
        metricsData.deleteAttributeAt(0);
        Attribute eigenvector = metricsData.attribute("eigenvector");
        if (eigenvector != null) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.attribute(j) == eigenvector) {
                    metricsData.deleteAttributeAt(j);
                    // stop here instead of iterating over the modified attribute list
                    break;
                }
            }
        }

        metricsData.setClassIndex(metricsData.numAttributes() - 1);

        // set all missing values to 0 (for the class attribute this selects "0")
        for (int i = 0; i < metricsData.size(); i++) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.instance(i).isMissing(j)) {
                    metricsData.instance(i).setValue(j, 0.0d);
                }
            }
        }
    } catch (IOException e) {
        Console.traceln(Level.SEVERE, "failure reading file: " + e.getMessage());
        metricsData = null;
    }
    return metricsData;
}

/**
 * Replaces the attribute at {@code attIndex} with a numeric attribute of the
 * same name. Each value is parsed from its string form; missing values and
 * values that are not numbers become 0.0.
 */
private static void convertToNumericAttribute(Instances data, int attIndex) {
    String attributeName = data.attribute(attIndex).name();

    // get temporary values before the attribute is dropped
    double[] tmpVals = new double[data.size()];
    for (int i = 0; i < data.size(); i++) {
        Instance inst = data.instance(i);
        if (inst.isMissing(attIndex)) {
            tmpVals[i] = 0.0;
            continue;
        }
        try {
            tmpVals[i] = Double.parseDouble(inst.stringValue(attIndex));
        } catch (NumberFormatException ignored) {
            // not a number, using 0.0
            tmpVals[i] = 0.0;
        }
    }

    // replace attribute
    data.deleteAttributeAt(attIndex);
    data.insertAttributeAt(new Attribute(attributeName), attIndex);
    for (int i = 0; i < data.size(); i++) {
        data.instance(i).setValue(attIndex, tmpVals[i]);
    }
}
From source file:edu.drexel.psal.jstylo.verifiers.WLSVM.java
License:Open Source License
/** * Converts an ARFF Instance into a string in the sparse format accepted by * LIBSVM/*from w ww .j a va2 s .c om*/ * * @param instance * @return */ protected String InstanceToSparse(Instance instance) { String line = new String(); int c = (int) instance.classValue(); if (c == 0) c = -1; line = c + " "; for (int j = 1; j < instance.numAttributes(); j++) { if (j - 1 == instance.classIndex()) { continue; } if (instance.isMissing(j - 1)) continue; if (instance.value(j - 1) != 0) line += " " + j + ":" + instance.value(j - 1); } // LOG.info(line); return (line + "\n"); }
From source file:en_deep.mlprocess.manipulation.featmodif.ReplaceMissing.java
License:Open Source License
/** * Convert a single instance over if the class is nominal. The converted * instance is added to the end of the output queue. * * @param instance the instance to convert */// w w w. j a va 2 s .c o m private void convertInstance(Instance instance) { // create a copy of the input instance Instance inst = null; if (instance instanceof SparseInstance) { inst = new SparseInstance(instance.weight(), instance.toDoubleArray()); } else { inst = new DenseInstance(instance.weight(), instance.toDoubleArray()); } // copy the string values from this instance as well (only the existing ones) inst.setDataset(getOutputFormat()); copyValues(inst, false, instance.dataset(), getOutputFormat()); // beware of weird behavior of this function (see source)!! inst.setDataset(getOutputFormat()); // find the missing values to be filled + the double values for the new "missing" label and store it double[] vals = instance.toDoubleArray(); for (int j = 0; j < getInputFormat().numAttributes(); j++) { Attribute att = instance.attribute(j); if (m_Columns.isInRange(j) && instance.isMissing(j)) { // find the "missing" value in the output nominal attribute if (att.isNominal()) { vals[j] = inst.dataset().attribute(j).indexOfValue(m_ReplVal); } // add a string value for the new "missing" label else if (att.isString()) { vals[j] = inst.dataset().attribute(j).addStringValue(m_ReplVal); } } } // fill in the missing values found inst.replaceMissingValues(vals); push(inst); }
From source file:etc.aloe.filters.AbstractRegexFilter.java
License:Open Source License
@Override protected Instance process(Instance instance) throws Exception { if (stringAttributeIndex < 0) { throw new IllegalStateException("String attribute not set"); }//from www .j a v a 2s. c o m String stringValue = instance.stringValue(stringAttributeIndex); NamedRegex[] regexFeatures = getRegexFeatures(); int numOldValues = instance.numAttributes(); int numNewFeatures = regexFeatures.length; if (countRegexLengths) { numNewFeatures = regexFeatures.length * 2; } double[] newValues = new double[numOldValues + numNewFeatures]; // Copy all attributes from input to output for (int i = 0; i < getInputFormat().numAttributes(); i++) { if (getInputFormat().attribute(i).type() != Attribute.STRING) { // Add simple nominal and numeric attributes directly if (instance.value(i) != 0.0) { newValues[i] = instance.value(i); } } else { if (instance.isMissing(i)) { newValues[i] = Utils.missingValue(); } else { // If this is a string attribute, we have to first add // this value to the range of possible values, then add // its new internal index. if (outputFormatPeek().attribute(i).numValues() == 0) { // Note that the first string value in a // SparseInstance doesn't get printed. outputFormatPeek().attribute(i).addStringValue("Hack to defeat SparseInstance bug"); } int newIndex = outputFormatPeek().attribute(i).addStringValue(instance.stringValue(i)); newValues[i] = newIndex; } } } for (int i = 0; i < regexFeatures.length; i++) { Pattern pattern = regexFeatures[i].getPattern(); Matcher matches = pattern.matcher(stringValue); int count = 0; int maxLength = 0; while (matches.find()) { count++; int len = matches.group().length(); if (len > maxLength) { maxLength = len; } } int index = numOldValues + i; if (countRegexLengths) { index = numOldValues + 2 * i; } newValues[index] = count; if (countRegexLengths) { newValues[numOldValues + 2 * i + 1] = maxLength; } } Instance result = new SparseInstance(instance.weight(), newValues); return result; }