List of usage examples for weka.core Instance numAttributes
public int numAttributes();
From source file:pk.lums.edu.sma.processing.ml.DBSCAN.EuclideanDataObject.java
License:Open Source License
/** * Calculates the euclidian-distance between dataObject and this.dataObject * /*from www . j a v a2 s. c o m*/ * @param dataObject * The DataObject, that is used for distance-calculation with * this.dataObject; now assumed to be of the same type and with * the same structure * @return double-value The euclidian-distance between dataObject and * this.dataObject */ public double distance(DataObject dataObject) { double dist = 0.0; Instance firstInstance = getInstance(); Instance secondInstance = dataObject.getInstance(); int firstNumValues = firstInstance.numValues(); int secondNumValues = secondInstance.numValues(); int numAttributes = firstInstance.numAttributes(); int firstI, secondI; for (int p1 = 0, p2 = 0; p1 < firstNumValues || p2 < secondNumValues;) { if (p1 >= firstNumValues) { firstI = numAttributes; } else { firstI = firstInstance.index(p1); } if (p2 >= secondNumValues) { secondI = numAttributes; } else { secondI = secondInstance.index(p2); } double cDistance = 0; if (firstI == secondI) { cDistance = computeDistance(firstI, firstInstance.valueSparse(p1), secondInstance.valueSparse(p2)); p1++; p2++; } else if (firstI > secondI) { cDistance = computeDistance(secondI, 0, secondInstance.valueSparse(p2)); p2++; } else { cDistance = computeDistance(firstI, firstInstance.valueSparse(p1), 0); p1++; } dist += cDistance * cDistance; } return Math.sqrt(dist); }
From source file:pk.lums.edu.sma.processing.ml.DBSCAN.ManhattanDataObject.java
License:Open Source License
/**
 * Calculates the Manhattan (L1) distance between {@code dataObject} and
 * this data object. The given object is assumed to be of the same type and
 * to share the same attribute structure as this one.
 *
 * @param dataObject the DataObject used for the distance calculation with
 *                   this.dataObject
 * @return the Manhattan distance between the two underlying instances;
 *         NaN if the computation could not be performed
 */
public double distance(DataObject dataObject) {
    double dist = 0.0;
    Instance firstInstance = getInstance();
    Instance secondInstance = dataObject.getInstance();
    int firstNumValues = firstInstance.numValues();
    int secondNumValues = secondInstance.numValues();
    // numAttributes serves as a sentinel index once a sparse list is exhausted.
    int numAttributes = firstInstance.numAttributes();
    int firstI, secondI;
    // Merge-join over the two sorted sparse index lists.
    for (int p1 = 0, p2 = 0; p1 < firstNumValues || p2 < secondNumValues;) {
        if (p1 >= firstNumValues) {
            firstI = numAttributes;
        } else {
            firstI = firstInstance.index(p1);
        }
        if (p2 >= secondNumValues) {
            secondI = numAttributes;
        } else {
            secondI = secondInstance.index(p2);
        }
        double cDistance = 0;
        if (firstI == secondI) {
            // Attribute present in both instances.
            cDistance = computeDistance(firstI, firstInstance.valueSparse(p1), secondInstance.valueSparse(p2));
            p1++;
            p2++;
        } else if (firstI > secondI) {
            // Attribute only present in the second instance; first is implicitly 0.
            cDistance = computeDistance(secondI, 0, secondInstance.valueSparse(p2));
            p2++;
        } else {
            // Attribute only present in the first instance.
            cDistance = computeDistance(firstI, firstInstance.valueSparse(p1), 0);
            p1++;
        }
        // L1 norm: accumulate absolute per-attribute differences.
        dist += Math.abs(cDistance);
    }
    return dist;
}
From source file:preprocess.StringToWordVector.java
License:Open Source License
/**
 * Determines the dictionary of words over the input data.
 *
 * Builds one word-count map per class value (or a single map when not
 * operating per class), periodically prunes rare words, derives a
 * per-class frequency threshold from m_WordsToKeep / m_minTermFreq,
 * then creates the output attribute set (untouched attributes first,
 * followed by one attribute per surviving word) and fills in the
 * document-frequency counts used by the IDF transform.
 */
private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
        try {
            if (getStopwords().exists() && !getStopwords().isDirectory())
                stopwords.read(getStopwords());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
        values = getInputFormat().attribute(classInd).numValues();
    }
    // One word->Count dictionary per class value.
    TreeMap[] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
        dictionaryArr[i] = new TreeMap();
    }
    // Make sure we know which fields to convert
    determineSelectedRange();
    // Tokenize all training text into an orderedMap of "words".
    // pruneRate: every pruneRate instances, words seen only once are dropped.
    long pruneRate = Math.round((m_PeriodicPruningRate / 100.0) * getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
        Instance instance = getInputFormat().instance(i);
        int vInd = 0;
        if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
            vInd = (int) instance.classValue();
        }
        // Iterate through all relevant string attributes of the current instance;
        // h collects the distinct words seen in this one instance (document).
        Hashtable h = new Hashtable();
        for (int j = 0; j < instance.numAttributes(); j++) {
            if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
                // Get tokenizer
                m_Tokenizer.tokenize(instance.stringValue(j));
                // Iterate through tokens, perform stemming, and remove stopwords
                // (if required)
                while (m_Tokenizer.hasMoreElements()) {
                    String word = ((String) m_Tokenizer.nextElement()).intern();
                    if (this.m_lowerCaseTokens == true)
                        word = word.toLowerCase();
                    word = m_Stemmer.stem(word);
                    if (this.m_useStoplist == true)
                        if (stopwords.is(word))
                            continue;
                    // NOTE(review): Hashtable.contains() checks VALUES, not keys —
                    // this condition looks like it was meant to be containsKey();
                    // the end result (h's key set = distinct words) is unaffected.
                    if (!(h.contains(word)))
                        h.put(word, new Integer(0));
                    Count count = (Count) dictionaryArr[vInd].get(word);
                    if (count == null) {
                        dictionaryArr[vInd].put(word, new Count(1));
                    } else {
                        count.count++;
                    }
                }
            }
        }
        //updating the docCount for the words that have occurred in this
        //instance(document).
        Enumeration e = h.keys();
        while (e.hasMoreElements()) {
            String word = (String) e.nextElement();
            Count c = (Count) dictionaryArr[vInd].get(word);
            if (c != null) {
                c.docCount++;
            } else
                System.err.println(
                        "Warning: A word should definitely be in the " + "dictionary.Please check the code");
        }
        // Periodic pruning: drop words that have occurred at most once so far.
        if (pruneRate > 0) {
            if (i % pruneRate == 0 && i > 0) {
                for (int z = 0; z < values; z++) {
                    Vector d = new Vector(1000);
                    Iterator it = dictionaryArr[z].keySet().iterator();
                    while (it.hasNext()) {
                        String word = (String) it.next();
                        Count count = (Count) dictionaryArr[z].get(word);
                        if (count.count <= 1) {
                            d.add(word);
                        }
                    }
                    Iterator iter = d.iterator();
                    while (iter.hasNext()) {
                        String word = (String) iter.next();
                        dictionaryArr[z].remove(word);
                    }
                }
            }
        }
    }
    // Figure out the minimum required word frequency
    int totalsize = 0;
    int prune[] = new int[values];
    for (int z = 0; z < values; z++) {
        totalsize += dictionaryArr[z].size();
        int array[] = new int[dictionaryArr[z].size()];
        int pos = 0;
        Iterator it = dictionaryArr[z].keySet().iterator();
        while (it.hasNext()) {
            String word = (String) it.next();
            Count count = (Count) dictionaryArr[z].get(word);
            array[pos] = count.count;
            pos++;
        }
        // sort the array
        sortArray(array);
        if (array.length < m_WordsToKeep) {
            // if there aren't enough words, set the threshold to
            // minFreq
            prune[z] = m_minTermFreq;
        } else {
            // otherwise set it to be at least minFreq
            prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]);
        }
    }
    // Convert the dictionary into an attribute index
    // and create one attribute per word
    FastVector attributes = new FastVector(totalsize + getInputFormat().numAttributes());
    // Add the non-converted attributes
    int classIndex = -1;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().classIndex() == i) {
                classIndex = attributes.size();
            }
            attributes.addElement(getInputFormat().attribute(i).copy());
        }
    }
    // Add the word vector attributes (eliminating duplicates
    // that occur in multiple classes)
    TreeMap newDictionary = new TreeMap();
    int index = attributes.size();
    for (int z = 0; z < values; z++) {
        Iterator it = dictionaryArr[z].keySet().iterator();
        while (it.hasNext()) {
            String word = (String) it.next();
            Count count = (Count) dictionaryArr[z].get(word);
            if (count.count >= prune[z]) {
                if (newDictionary.get(word) == null) {
                    newDictionary.put(word, new Integer(index++));
                    attributes.addElement(new Attribute(m_Prefix + word));
                }
            }
        }
    }
    // Compute document frequencies
    m_DocsCounts = new int[attributes.size()];
    Iterator it = newDictionary.keySet().iterator();
    while (it.hasNext()) {
        String word = (String) it.next();
        int idx = ((Integer) newDictionary.get(word)).intValue();
        int docsCount = 0;
        // Sum docCounts for this word across all per-class dictionaries.
        for (int j = 0; j < values; j++) {
            Count c = (Count) dictionaryArr[j].get(word);
            if (c != null)
                docsCount += c.docCount;
        }
        m_DocsCounts[idx] = docsCount;
    }
    // Trim vector and set instance variables
    attributes.trimToSize();
    m_Dictionary = newDictionary;
    m_NumInstances = getInputFormat().numInstances();
    // Set the filter's output format
    Instances outputFormat = new Instances(getInputFormat().relationName(), attributes, 0);
    outputFormat.setClassIndex(classIndex);
    setOutputFormat(outputFormat);
}
From source file:preprocess.StringToWordVector.java
License:Open Source License
/**
 * Converts the instance without document-length normalization.
 *
 * Non-selected attributes are copied to the front of the output vector;
 * selected string attributes are tokenized, stemmed and mapped through
 * the dictionary into word-count entries, optionally TF/IDF transformed.
 * The resulting sparse instance is appended to {@code v}.
 *
 * @param instance the instance to convert
 * @param v        the vector the converted sparse instance is added to
 * @return the number of attributes copied unconverted (the offset at
 *         which the word attributes start)
 */
private int convertInstancewoDocNorm(Instance instance, FastVector v) {
    // Convert the instance into a sorted set of indexes
    TreeMap contained = new TreeMap();
    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().attribute(i).type() != Attribute.STRING) {
                // Add simple nominal and numeric attributes directly;
                // zeros are omitted (sparse representation).
                if (instance.value(i) != 0.0) {
                    contained.put(new Integer(firstCopy), new Double(instance.value(i)));
                }
            } else {
                if (instance.isMissing(i)) {
                    contained.put(new Integer(firstCopy), new Double(Instance.missingValue()));
                } else {
                    // If this is a string attribute, we have to first add
                    // this value to the range of possible values, then add
                    // its new internal index.
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        // Note that the first string value in a
                        // SparseInstance doesn't get printed.
                        outputFormatPeek().attribute(firstCopy)
                                .addStringValue("Hack to defeat SparseInstance bug");
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addStringValue(instance.stringValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                }
            }
            firstCopy++;
        }
    }
    // Tokenize the selected string attributes and count dictionary words.
    for (int j = 0; j < instance.numAttributes(); j++) {
        //if ((getInputFormat().attribute(j).type() == Attribute.STRING)
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
            m_Tokenizer.tokenize(instance.stringValue(j));
            while (m_Tokenizer.hasMoreElements()) {
                String word = (String) m_Tokenizer.nextElement();
                if (this.m_lowerCaseTokens == true)
                    word = word.toLowerCase();
                word = m_Stemmer.stem(word);
                Integer index = (Integer) m_Dictionary.get(word);
                if (index != null) {
                    if (m_OutputCounts) {
                        // Separate if here rather than two lines down to avoid hashtable lookup
                        Double count = (Double) contained.get(index);
                        if (count != null) {
                            contained.put(index, new Double(count.doubleValue() + 1.0));
                        } else {
                            contained.put(index, new Double(1));
                        }
                    } else {
                        // Binary presence indicator instead of a count.
                        contained.put(index, new Double(1));
                    }
                }
            }
        }
    }
    //Doing TFTransform: value -> log(value + 1), word attributes only.
    if (m_TFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = Math.log(val + 1);
                contained.put(index, new Double(val));
            }
        }
    }
    //Doing IDFTransform: value -> value * log(numInstances / docFreq).
    if (m_IDFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = val * Math.log(m_NumInstances / (double) m_DocsCounts[index.intValue()]);
                contained.put(index, new Double(val));
            }
        }
    }
    // Convert the set to structures needed to create a sparse instance.
    double[] values = new double[contained.size()];
    int[] indices = new int[contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
        Integer index = (Integer) it.next();
        Double value = (Double) contained.get(index);
        values[i] = value.doubleValue();
        indices[i] = index.intValue();
    }
    Instance inst = new SparseInstance(instance.weight(), values, indices, outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());
    v.addElement(inst);
    return firstCopy;
}
From source file:regulyasocjacyjne.RegulyAsocjacyjne.java
public static void infoObj() throws Exception { Instances data = loadData("./src/date/irysy.arff"); for (int i = 0; i < data.numInstances(); i++) //Przegladanie obiektow {//from www . j av a 2s .co m System.out.println("Wiersz numer " + i + ":"); Instance instance = data.instance(i); //Pobranie obiektu (wiersza danych) o podanym numerze for (int j = 0; j < instance.numAttributes(); j++) //Przegladanie atrybutow w obiekcie { String textValue = instance.toString(j); //Pobranie wartosci atrybutu o podanym numerze (tzn. pobranie tekstowej reprezentacji wartosci) System.out.print(textValue + ", "); } System.out.println(); } }
From source file:sirius.predictor.main.PredictorFrame.java
License:Open Source License
/**
 * Runs the given classifier over every loaded sequence and writes
 * per-position scores to a ".scores" file in the output directory.
 *
 * Used for a type 1 classifier (all positions, or motif-list filtered via
 * {@code allPositions == false}) and for a type 2 classifier on all
 * positions; for type 2, classifier two is then run over classifier one's
 * score windows. The resulting score file is loaded into the UI at the end.
 *
 * @param classifierData the trained classifier(s) plus their configuration
 * @param allPositions   if false, only positions matching a motif in the
 *                       motif list are scored by classifier one
 */
private void runClassifier(ClassifierData classifierData, boolean allPositions) {
    //this method is for type 1 classifier with all positions and motif list
    //and type 2 classifier with all positions
    if (sequenceNameTableModel.getRowCount() < 1) {
        JOptionPane.showMessageDialog(this, "Please load File first!", "No Sequence",
                JOptionPane.INFORMATION_MESSAGE);
        return;
    }
    if (loadFastaFileMenuItem.getState() == false) {
        JOptionPane.showMessageDialog(this, "Please load Fasta File! Currently, you have score file!",
                "Wrong File Format", JOptionPane.INFORMATION_MESSAGE);
        return;
    }
    if (onAllPositionsMenuItem.getState() == false && motifListTableModel.getSize() == 0) {
        JOptionPane.showMessageDialog(this, "There are no Motifs chosen in Motif List!", "No Motifs",
                JOptionPane.INFORMATION_MESSAGE);
        MotifListDialog dialog = new MotifListDialog(motifListTableModel);
        dialog.setLocationRelativeTo(this);
        dialog.setVisible(true);
        return;
    }
    // Keep prompting until the user picks an output directory.
    while (outputDirectory == null) {
        JOptionPane.showMessageDialog(this, "Please set output directory first!", "Output Directory not set",
                JOptionPane.INFORMATION_MESSAGE);
        setOutputDirectory();
        //return;
    }
    try {
        BufferedWriter output = new BufferedWriter(new FileWriter(
                outputDirectory + File.separator + "classifierone_" + classifierData.getClassifierName() + "_"
                        + classifierData.getClassifierType() + "_" + fastaFilename + ".scores"));
        Classifier classifierOne = classifierData.getClassifierOne();
        int leftMostPosition = classifierData.getLeftMostPosition();
        int rightMostPosition = classifierData.getRightMostPosition();
        //Reading and Storing the featureList
        Instances inst = classifierData.getInstances();
        ArrayList<Feature> featureDataArrayList = new ArrayList<Feature>();
        for (int x = 0; x < inst.numAttributes() - 1; x++) {
            //-1 because class attribute must be ignored
            featureDataArrayList.add(Feature.levelOneClassifierPane(inst.attribute(x).name()));
        }
        // Score every loaded sequence with classifier one.
        for (int x = 0; x < sequenceNameTableModel.getRowCount(); x++) {
            if (stopClassifier == true) {
                // User requested cancellation.
                statusPane.setText("Running of Classifier Stopped!");
                stopClassifier = false;
                output.close();
                return;
            }
            //if(x%100 == 0)
            statusPane.setText("Running " + classifierData.getClassifierName() + " - ClassifierOne @ " + x
                    + " / " + sequenceNameTableModel.getRowCount());
            //Header
            output.write(sequenceNameTableModel.getHeader(x));
            output.newLine();
            output.write(sequenceNameTableModel.getSequence(x));
            output.newLine();
            //Sequence Score -> index-score, index-score
            String sequence = sequenceNameTableModel.getSequence(x);
            int minSequenceLengthRequired;
            int targetLocationIndex;
            // Derive window length and the offset of the target position
            // from the signed leftmost/rightmost positions.
            if (leftMostPosition < 0 && rightMostPosition > 0) {// -ve and +ve
                minSequenceLengthRequired = (leftMostPosition * -1) + rightMostPosition;
                targetLocationIndex = (leftMostPosition * -1);
            } else if (leftMostPosition < 0 && rightMostPosition < 0) {//-ve and -ve
                minSequenceLengthRequired = rightMostPosition - leftMostPosition + 1;
                targetLocationIndex = (leftMostPosition * -1);
            } else {//+ve and +ve
                minSequenceLengthRequired = rightMostPosition - leftMostPosition + 1;
                targetLocationIndex = (leftMostPosition * -1);
            }
            boolean firstEntryForClassifierOne = true;
            // Slide the window along the sequence.
            for (int y = 0; y + (minSequenceLengthRequired - 1) < sequence.length(); y++) {
                //Check if targetLocation match any motif in motif List
                if (allPositions == false && motifListTableModel
                        .gotMotifMatch(sequence.substring(y + 0, y + targetLocationIndex)) == false)
                    continue;
                String line2 = sequence.substring(y + 0, y + minSequenceLengthRequired);
                Instance tempInst;
                tempInst = new Instance(inst.numAttributes());
                tempInst.setDataset(inst);
                for (int z = 0; z < inst.numAttributes() - 1; z++) {
                    //-1 because class attribute can be ignored
                    //Give the sequence and the featureList to get the feature freqs on the sequence
                    Object obj = GenerateArff.getMatchCount("+1_Index(" + targetLocationIndex + ")", line2,
                            featureDataArrayList.get(z), classifierData.getScoringMatrixIndex(),
                            classifierData.getCountingStyleIndex(), classifierData.getScoringMatrix());
                    // Feature values may come back as Integer, Double or String.
                    if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer"))
                        tempInst.setValue(z, (Integer) obj);
                    else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double"))
                        tempInst.setValue(z, (Double) obj);
                    else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String"))
                        tempInst.setValue(z, (String) obj);
                    else {
                        output.close();
                        throw new Error("Unknown: " + obj.getClass().getName());
                    }
                }
                //note that pos or neg does not matter as this is not used
                tempInst.setValue(inst.numAttributes() - 1, "neg");
                double[] results = classifierOne.distributionForInstance(tempInst);
                if (firstEntryForClassifierOne)
                    firstEntryForClassifierOne = false;
                else
                    output.write(",");
                // Emit "position=score", comma separated.
                output.write(y + targetLocationIndex + "=" + results[0]);
            }
            output.newLine();
            output.flush();
        }
        output.flush();
        output.close();
        statusPane.setText("ClassifierOne finished running...");
        //Run classifier Two if it is type 2
        if (classifierData.getClassifierType() == 2) {
            BufferedWriter output2 = new BufferedWriter(new FileWriter(
                    outputDirectory + File.separator + "classifiertwo_" + classifierData.getClassifierName()
                            + "_" + classifierData.getClassifierType() + "_" + fastaFilename + ".scores"));
            // Classifier two reads classifier one's score file back in.
            BufferedReader input2 = new BufferedReader(new FileReader(
                    outputDirectory + File.separator + "classifierone_" + classifierData.getClassifierName()
                            + "_" + classifierData.getClassifierType() + "_" + fastaFilename + ".scores"));
            Classifier classifierTwo = classifierData.getClassifierTwo();
            Instances inst2 = classifierData.getInstances2();
            int setUpstream = classifierData.getSetUpstream();
            int setDownstream = classifierData.getSetDownstream();
            int minScoreWindowRequired;
            // Window size over classifier-one scores, from signed bounds.
            if (setUpstream < 0 && setDownstream < 0) {//-ve and -ve
                minScoreWindowRequired = setDownstream - setUpstream + 1;
            } else if (setUpstream < 0 && setDownstream > 0) {//-ve and +ve
                minScoreWindowRequired = (setUpstream * -1) + setDownstream;
            } else {//+ve and +ve
                minScoreWindowRequired = setDownstream - setUpstream + 1;
            }
            String lineHeader;
            String lineSequence;
            int lineCounter2 = 0;
            // Each record is three lines: header, sequence, score list.
            while ((lineHeader = input2.readLine()) != null) {
                if (stopClassifier == true) {
                    statusPane.setText("Running of Classifier Stopped!");
                    stopClassifier = false;
                    output2.close();
                    input2.close();
                    return;
                }
                //if(lineCounter2%100 == 0)
                statusPane.setText("Running " + classifierData.getClassifierName() + " - ClassifierTwo @ "
                        + lineCounter2 + " / " + sequenceNameTableModel.getRowCount());
                lineSequence = input2.readLine();
                output2.write(lineHeader);
                output2.newLine();
                output2.write(lineSequence);
                output2.newLine();
                // Parse "location=score" pairs into a [totalTokens][2] table.
                StringTokenizer locationScore = new StringTokenizer(input2.readLine(), ",");
                int totalTokens = locationScore.countTokens();
                String[][] scores = new String[totalTokens][2];
                int scoreIndex = 0;
                while (locationScore.hasMoreTokens()) {
                    StringTokenizer locationScoreToken = new StringTokenizer(locationScore.nextToken(), "=");
                    scores[scoreIndex][0] = locationScoreToken.nextToken();//location
                    scores[scoreIndex][1] = locationScoreToken.nextToken();//score
                    scoreIndex++;
                }
                int targetLocationIndex2;
                if (setUpstream == 0 || setDownstream == 0) {
                    output2.close();
                    input2.close();
                    throw new Exception("setUpstream == 0 || setDownstream == 0");
                }
                if (setUpstream < 0) {
                    targetLocationIndex2 = Integer.parseInt(scores[0][0]) + (-setUpstream);
                } else {//setUpstream > 0
                    targetLocationIndex2 = Integer.parseInt(scores[0][0]); //first location
                }
                // Slide classifier two over windows of classifier-one scores.
                for (int x = 0; x + minScoreWindowRequired - 1 < totalTokens; x++) {
                    //+1 is for the class index
                    if (x != 0)
                        output2.write(",");
                    Instance tempInst2 = new Instance(minScoreWindowRequired + 1);
                    tempInst2.setDataset(inst2);
                    for (int y = 0; y < minScoreWindowRequired; y++) {
                        tempInst2.setValue(y, Double.parseDouble(scores[x + y][1]));
                    }
                    tempInst2.setValue(tempInst2.numAttributes() - 1, "pos");
                    double[] results = classifierTwo.distributionForInstance(tempInst2);
                    output2.write(targetLocationIndex2 + "=" + results[0]);
                    targetLocationIndex2++;
                }
                lineCounter2++;
                output2.newLine();
            }
            input2.close();
            output2.close();
            statusPane.setText("ClassifierTwo finished running...");
        }
        // Show whichever score file corresponds to the classifier type.
        if (classifierData.getClassifierType() == 1)
            loadScoreFile(
                    outputDirectory + File.separator + "classifierone_" + classifierData.getClassifierName()
                            + "_" + classifierData.getClassifierType() + "_" + fastaFilename + ".scores");
        else
            loadScoreFile(
                    outputDirectory + File.separator + "classifiertwo_" + classifierData.getClassifierName()
                            + "_" + classifierData.getClassifierType() + "_" + fastaFilename + ".scores");
    } catch (Exception e) {
        JOptionPane.showMessageDialog(null, "Exception Occured", "Error", JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    }
}
From source file:sirius.predictor.main.PredictorFrame.java
License:Open Source License
/**
 * Runs a type 2 (two-stage) classifier restricted to motif-matching
 * positions. Classifier one scores sequence windows (expanded upstream/
 * downstream around each motif hit, caching scores so overlapping regions
 * are not re-scored), then classifier two scores the resulting window of
 * classifier-one scores. Both stages write ".scores" files; the classifier
 * two file is loaded into the UI at the end.
 *
 * @param classifierData the trained two-stage classifier and its configuration
 */
private void runType2ClassifierWithMotifList(ClassifierData classifierData) {
    //Checking..
    if (sequenceNameTableModel.getRowCount() < 1) {
        JOptionPane.showMessageDialog(this, "Please load File first!", "No Sequence",
                JOptionPane.INFORMATION_MESSAGE);
        return;
    }
    if (loadFastaFileMenuItem.getState() == false) {
        JOptionPane.showMessageDialog(this, "Please load Fasta File! Currently, you have score file!",
                "Wrong File Format", JOptionPane.INFORMATION_MESSAGE);
        return;
    }
    if (motifListTableModel.getSize() == 0) {
        JOptionPane.showMessageDialog(this, "There are no Motifs chosen in Motif List!", "No Motifs",
                JOptionPane.INFORMATION_MESSAGE);
        MotifListDialog dialog = new MotifListDialog(motifListTableModel);
        dialog.setLocationRelativeTo(this);
        dialog.setVisible(true);
        return;
    }
    //Proper running start
    try {
        //classifierOne score output
        BufferedWriter output = new BufferedWriter(new FileWriter(
                outputDirectory + File.separator + "classifierone_" + classifierData.getClassifierName() + "_"
                        + classifierData.getClassifierType() + "_" + fastaFilename + ".scores"));
        Classifier classifierOne = classifierData.getClassifierOne();
        int leftMostPosition = classifierData.getLeftMostPosition();
        int rightMostPosition = classifierData.getRightMostPosition();
        //Reading and Storing the featureList
        Instances inst = classifierData.getInstances();
        ArrayList<Feature> featureDataArrayList = new ArrayList<Feature>();
        for (int x = 0; x < inst.numAttributes() - 1; x++) {
            //-1 because class attribute must be ignored
            featureDataArrayList.add(Feature.levelOneClassifierPane(inst.attribute(x).name()));
        }
        //initialization for type 2 classifier
        BufferedWriter output2 = new BufferedWriter(new FileWriter(
                outputDirectory + File.separator + "classifiertwo_" + classifierData.getClassifierName() + "_"
                        + classifierData.getClassifierType() + "_" + fastaFilename + ".scores"));
        int setUpstream = classifierData.getSetUpstream();
        int setDownstream = classifierData.getSetDownstream();
        int minScoreWindowRequired;
        // Window size over classifier-one scores, from signed bounds.
        if (setUpstream < 0 && setDownstream < 0) {//-ve and -ve
            minScoreWindowRequired = setDownstream - setUpstream + 1;
        } else if (setUpstream < 0 && setDownstream > 0) {//-ve and +ve
            minScoreWindowRequired = (setUpstream * -1) + setDownstream;
        } else {//+ve and +ve
            minScoreWindowRequired = setDownstream - setUpstream + 1;
        }
        Classifier classifierTwo = classifierData.getClassifierTwo();
        Instances inst2 = classifierData.getInstances2();
        if (setUpstream == 0 || setDownstream == 0) {
            output.close();
            output2.close();
            throw new Exception("setUpstream == 0 || setDownstream == 0");
        }
        //for each sequence
        for (int x = 0; x < sequenceNameTableModel.getRowCount(); x++) {
            if (stopClassifier == true) {
                // User requested cancellation.
                statusPane.setText("Running of Classifier Stopped!");
                stopClassifier = false;
                output.close();
                output2.close();
                return;
            }
            //if(x%100 == 0)
            statusPane.setText("Running " + classifierData.getClassifierName() + " - ClassifierOne @ " + x
                    + " / " + sequenceNameTableModel.getRowCount());
            //Header
            output.write(sequenceNameTableModel.getHeader(x));
            output.newLine();
            output.write(sequenceNameTableModel.getSequence(x));
            output.newLine();
            output2.write(sequenceNameTableModel.getHeader(x));
            output2.newLine();
            output2.write(sequenceNameTableModel.getSequence(x));
            output2.newLine();
            //Sequence Score -> index-score, index-score
            String sequence = sequenceNameTableModel.getSequence(x);
            int minSequenceLengthRequired;
            int targetLocationIndex;
            //set the targetLocationIndex and minSequenceLengthRequired
            if (leftMostPosition < 0 && rightMostPosition > 0) {// -ve and +ve
                minSequenceLengthRequired = (leftMostPosition * -1) + rightMostPosition;
                targetLocationIndex = (leftMostPosition * -1);
            } else if (leftMostPosition < 0 && rightMostPosition < 0) {//-ve and -ve
                minSequenceLengthRequired = rightMostPosition - leftMostPosition + 1;
                targetLocationIndex = (leftMostPosition * -1);
            } else {//+ve and +ve
                minSequenceLengthRequired = rightMostPosition - leftMostPosition + 1;
                targetLocationIndex = (leftMostPosition * -1);
            }
            //This hashtable is used to ensure that on positions where predictions are already made,
            //we just skip. This will happen only if it is a type 2 classifier
            Hashtable<Integer, Double> scoreTable = new Hashtable<Integer, Double>();
            boolean firstEntryForClassifierOne = true;
            boolean firstEntryForClassifierTwo = true;
            for (int y = 0; y + (minSequenceLengthRequired - 1) < sequence.length(); y++) {
                int endPoint = y;//endPoint should be the exact position
                int currentY = y;
                int startPoint = y;
                //run only on Motifs?
                if (onMotifsOnlyMenuItem.getState()) {
                    //Check if targetLocation match any motif in motif List
                    if (motifListTableModel
                            .gotMotifMatch(sequence.substring(y + 0, y + targetLocationIndex)) == false)
                        continue; //position not found in motif list
                    else
                        //rollback to upstream and make prediction all the way till downstream
                        //needed for type 2 classifier
                        currentY += setUpstream;
                    if (setUpstream > 0)
                        currentY--;
                    startPoint = currentY; //note that y starts from 0 so y is surely >= 0
                    endPoint += setDownstream;
                    if (setDownstream > 0)
                        endPoint--;
                    //check still within bound of the sequence
                    if (startPoint < 0 || endPoint >= sequence.length() - (minSequenceLengthRequired - 1))
                        continue;//out of bounds
                }
                // Run classifier one on every position in [currentY, endPoint]
                // not already in the score cache.
                while (currentY <= endPoint) {
                    if (scoreTable.get(currentY + targetLocationIndex) != null) {
                        currentY++;
                        continue;
                    }
                    String line2 = sequence.substring(currentY + 0, currentY + minSequenceLengthRequired);
                    Instance tempInst;
                    tempInst = new Instance(inst.numAttributes());
                    tempInst.setDataset(inst);
                    for (int z = 0; z < inst.numAttributes() - 1; z++) {
                        //-1 because class attribute can be ignored
                        //Give the sequence and the featureList to get the feature freqs on the sequence
                        Object obj = GenerateArff.getMatchCount("+1_Index(" + targetLocationIndex + ")", line2,
                                featureDataArrayList.get(z), classifierData.getScoringMatrixIndex(),
                                classifierData.getCountingStyleIndex(), classifierData.getScoringMatrix());
                        // Feature values may come back as Integer, Double or String.
                        if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer"))
                            tempInst.setValue(z, (Integer) obj);
                        else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double"))
                            tempInst.setValue(z, (Double) obj);
                        else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String"))
                            tempInst.setValue(z, (String) obj);
                        else {
                            output.close();
                            output2.close();
                            throw new Error("Unknown: " + obj.getClass().getName());
                        }
                    }
                    //note that pos or neg does not matter as this is not used
                    tempInst.setValue(inst.numAttributes() - 1, "neg");
                    double[] results = classifierOne.distributionForInstance(tempInst);
                    if (firstEntryForClassifierOne)
                        firstEntryForClassifierOne = false;
                    else
                        output.write(",");
                    output.write(currentY + targetLocationIndex + "=" + results[0]);
                    scoreTable.put(currentY + targetLocationIndex, results[0]);
                    currentY++;
                }
                // Feed the cached classifier-one scores for this window to
                // classifier two.
                Instance tempInst2 = new Instance(minScoreWindowRequired + 1);//+1 for class attribute
                tempInst2.setDataset(inst2);
                int indexForClassifier2Inst = 0;
                for (int z = startPoint; z <= endPoint; z++) {
                    tempInst2.setValue(indexForClassifier2Inst, scoreTable.get(targetLocationIndex + z));
                    indexForClassifier2Inst++;
                }
                //note that pos or neg does not matter as this is not used
                tempInst2.setValue(tempInst2.numAttributes() - 1, "pos");
                double[] results = classifierTwo.distributionForInstance(tempInst2);
                if (firstEntryForClassifierTwo == true)
                    firstEntryForClassifierTwo = false;
                else
                    output2.write(",");
                output2.write(y + targetLocationIndex + "=" + results[0]);
            } //end of for loop
            output2.newLine();
            output2.flush();
            output.newLine();
            output.flush();
        }
        output.close();
        output2.close();
        statusPane.setText("Classifier Finished running...");
        loadScoreFile(outputDirectory + File.separator + "classifiertwo_" + classifierData.getClassifierName()
                + "_" + classifierData.getClassifierType() + "_" + fastaFilename + ".scores");
    } catch (Exception e) {
        JOptionPane.showMessageDialog(null, "Exception Occured", "Error", JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    }
}
From source file:sirius.trainer.step4.RunClassifier.java
License:Open Source License
/**
 * Trains "Classifier Two" (the second-level classifier) on Dataset2.arff and, when
 * {@code test} is true, runs the cascaded pair (classifierOne -> classifierTwo) over
 * Dataset 3, writing per-position scores to ClassifierTwo.scores and updating the
 * result panes/graph.
 *
 * @param parent                        owning frame, used as dialog parent on error
 * @param applicationData               shared state: working dir, dataset ranges, UI status pane
 * @param classifierTwoDisplayTextArea  text area receiving the statistics display
 * @param m_ClassifierEditor2           editor holding the user-chosen classifier-two instance
 * @param classifierOne                 already-trained first-level classifier
 * @param myGraph                       graph pane that will show the blind-test statistics
 * @param test                          if false, only train and return (no Dataset 3 evaluation)
 * @param classifierResults             list model collecting classifier/result summary rows
 * @param range                         range parameter forwarded to PredictionStats
 * @param threshold                     decision threshold forwarded to PredictionStats
 * @return the trained classifier two, or null if Dataset2 generation failed or an exception occurred
 */
public static Classifier startClassifierTwo(JInternalFrame parent, ApplicationData applicationData,
        JTextArea classifierTwoDisplayTextArea, GenericObjectEditor m_ClassifierEditor2, Classifier classifierOne,
        GraphPane myGraph, boolean test, ClassifierResults classifierResults, int range, double threshold) {
    // Declared outside the try so the catch block can report them on failure.
    int arraySize = 0;
    int lineCount = 0;
    try {
        StatusPane statusPane = applicationData.getStatusPane();
        //Initialising
        long totalTimeStart = System.currentTimeMillis();
        Step1TableModel positiveStep1TableModel = applicationData.getPositiveStep1TableModel();
        Step1TableModel negativeStep1TableModel = applicationData.getNegativeStep1TableModel();
        // Dataset 3 (blind test set) sequence index ranges for the pos/neg tables.
        int positiveDataset3FromInt = applicationData.getPositiveDataset3FromField();
        int positiveDataset3ToInt = applicationData.getPositiveDataset3ToField();
        int negativeDataset3FromInt = applicationData.getNegativeDataset3FromField();
        int negativeDataset3ToInt = applicationData.getNegativeDataset3ToField();
        //Preparing Dataset2.arff to train Classifier Two
        statusPane.setText("Preparing Dataset2.arff...");
        //This step generates Dataset2.arff
        if (DatasetGenerator.generateDataset2(parent, applicationData, applicationData.getSetUpstream(),
                applicationData.getSetDownstream(), classifierOne) == false) {
            //Interrupted or Error occurred
            return null;
        }
        //Training Classifier Two
        statusPane.setText("Training Classifier Two... May take a while... Please wait...");
        Instances inst2 = new Instances(new BufferedReader(
                new FileReader(applicationData.getWorkingDirectory() + File.separator + "Dataset2.arff")));
        // Weka convention: the class attribute is the last column.
        inst2.setClassIndex(inst2.numAttributes() - 1);
        long trainTimeStart = 0;
        long trainTimeElapsed = 0;
        Classifier classifierTwo = (Classifier) m_ClassifierEditor2.getValue();
        trainTimeStart = System.currentTimeMillis();
        applicationData.setDataset2Instances(inst2);
        classifierTwo.buildClassifier(inst2);
        trainTimeElapsed = System.currentTimeMillis() - trainTimeStart;
        //Running Classifier Two
        String classifierName = m_ClassifierEditor2.getValue().getClass().getName();
        classifierResults.updateList(classifierResults.getClassifierList(), "Classifier: ", classifierName);
        classifierResults.updateList(classifierResults.getClassifierList(), "Training Data: ",
                applicationData.getWorkingDirectory() + File.separator + "Dataset2.arff");
        classifierResults.updateList(classifierResults.getClassifierList(), "Time Used: ",
                Utils.doubleToString(trainTimeElapsed / 1000.0, 2) + " seconds");
        if (test == false) {
            // Caller only wanted the trained model; skip the Dataset 3 evaluation.
            statusPane.setText("Classifier Two Trained...Done...");
            return classifierTwo;
        }
        if (applicationData.terminateThread == true) {
            // NOTE(review): message says "Classifier One" - likely copy-pasted from
            // startClassifierOne; confirm intended wording.
            statusPane.setText("Interrupted - Classifier One Training Completed");
            return classifierTwo;
        }
        statusPane.setText("Running Classifier Two on Dataset 3...");
        //Generate the header for ClassifierTwo.scores on Dataset3
        BufferedWriter classifierTwoOutput = new BufferedWriter(new FileWriter(
                applicationData.getWorkingDirectory() + File.separator + "ClassifierTwo.scores"));
        if (m_ClassifierEditor2.getValue() instanceof OptionHandler)
            classifierName += " "
                    + Utils.joinOptions(((OptionHandler) m_ClassifierEditor2.getValue()).getOptions());
        //Generating an Instance given a sequence with the current attributes
        // Window size = number of positions between upstream and downstream; the three
        // branches cover the sign combinations (position 0 does not exist in this scheme).
        int setClassifierTwoUpstreamInt = applicationData.getSetUpstream();
        int setClassifierTwoDownstreamInt = applicationData.getSetDownstream();
        int classifierTwoWindowSize;
        if (setClassifierTwoUpstreamInt < 0 && setClassifierTwoDownstreamInt > 0)
            classifierTwoWindowSize = (setClassifierTwoUpstreamInt * -1) + setClassifierTwoDownstreamInt;
        else if (setClassifierTwoUpstreamInt < 0 && setClassifierTwoDownstreamInt < 0)
            classifierTwoWindowSize = (setClassifierTwoUpstreamInt - setClassifierTwoDownstreamInt - 1) * -1;
        else//both +ve
            classifierTwoWindowSize = (setClassifierTwoDownstreamInt - setClassifierTwoUpstreamInt + 1);
        Instances inst = applicationData.getDataset1Instances();
        //NOTE: need to take care of this function;
        FastaFileManipulation fastaFile = new FastaFileManipulation(positiveStep1TableModel,
                negativeStep1TableModel, positiveDataset3FromInt, positiveDataset3ToInt, negativeDataset3FromInt,
                negativeDataset3ToInt, applicationData.getWorkingDirectory());
        //loading in all the features..
        ArrayList<Feature> featureDataArrayList = new ArrayList<Feature>();
        for (int x = 0; x < inst.numAttributes() - 1; x++) {
            //-1 because class attribute must be ignored
            featureDataArrayList.add(Feature.levelOneClassifierPane(inst.attribute(x).name()));
        }
        //Reading the fastaFile
        String _class = "pos";
        lineCount = 0;
        int totalPosSequences = positiveDataset3ToInt - positiveDataset3FromInt + 1;
        FastaFormat fastaFormat;
        // First totalPosSequences sequences are positives; afterwards _class flips to "neg".
        while ((fastaFormat = fastaFile.nextSequence(_class)) != null) {
            if (applicationData.terminateThread == true) {
                statusPane.setText("Interrupted - Classifier Two Trained");
                classifierTwoOutput.close();
                return classifierTwo;
            }
            lineCount++;
            classifierTwoOutput.write(fastaFormat.getHeader());
            classifierTwoOutput.newLine();
            classifierTwoOutput.write(fastaFormat.getSequence());
            classifierTwoOutput.newLine();
            //if((lineCount % 100) == 0){
            statusPane.setText("Running ClassifierTwo on Dataset 3...@ " + lineCount + " / "
                    + applicationData.getTotalSequences(3) + " Sequences");
            //}
            arraySize = fastaFormat.getArraySize(applicationData.getLeftMostPosition(),
                    applicationData.getRightMostPosition());
            //This area always generate -ve arraySize~! WHY?? Exception always occur here
            // NOTE(review): original author's comment above - a negative arraySize here
            // makes the allocation below throw NegativeArraySizeException; root cause in
            // FastaFormat.getArraySize has not been confirmed from this file.
            double scores[] = new double[arraySize];
            int predictPosition[] = fastaFormat.getPredictPositionForClassifierOne(
                    applicationData.getLeftMostPosition(), applicationData.getRightMostPosition());
            //Doing shift from upstream till downstream
            SequenceManipulation seq = new SequenceManipulation(fastaFormat.getSequence(), predictPosition[0],
                    predictPosition[1]);
            int scoreCount = 0;
            String line2;
            // Score every shifted window with classifier one.
            while ((line2 = seq.nextShift()) != null) {
                Instance tempInst = new Instance(inst.numAttributes());
                tempInst.setDataset(inst);
                //-1 because class attribute can be ignored
                for (int x = 0; x < inst.numAttributes() - 1; x++) {
                    Object obj = GenerateArff.getMatchCount(fastaFormat.getHeader(), line2,
                            featureDataArrayList.get(x), applicationData.getScoringMatrixIndex(),
                            applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix());
                    // Dispatch on the runtime type of the computed feature value.
                    if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer"))
                        tempInst.setValue(x, (Integer) obj);
                    else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double"))
                        tempInst.setValue(x, (Double) obj);
                    else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String"))
                        tempInst.setValue(x, (String) obj);
                    else {
                        classifierTwoOutput.close();
                        throw new Error("Unknown: " + obj.getClass().getName());
                    }
                }
                tempInst.setValue(inst.numAttributes() - 1, _class);
                //Run classifierOne
                double[] results = classifierOne.distributionForInstance(tempInst);
                scores[scoreCount++] = results[0];
            }
            //Run classifierTwo
            int currentPosition = fastaFormat.getPredictionFromForClassifierTwo(
                    applicationData.getLeftMostPosition(), applicationData.getRightMostPosition(),
                    applicationData.getSetUpstream());
            classifierTwoOutput.write(_class);
            // Slide the classifier-two window over the classifier-one score array.
            for (int y = 0; y < arraySize - classifierTwoWindowSize + 1; y++) {
                //+1 is for the class index
                Instance tempInst2 = new Instance(classifierTwoWindowSize + 1);
                tempInst2.setDataset(inst2);
                for (int x = 0; x < classifierTwoWindowSize; x++) {
                    tempInst2.setValue(x, scores[x + y]);
                }
                tempInst2.setValue(tempInst2.numAttributes() - 1, _class);
                double[] results = classifierTwo.distributionForInstance(tempInst2);
                classifierTwoOutput.write("," + currentPosition + "=" + results[0]);
                currentPosition++;
                // Position 0 does not exist in the upstream/downstream numbering; skip it.
                if (currentPosition == 0)
                    currentPosition++;
            }
            classifierTwoOutput.newLine();
            classifierTwoOutput.flush();
            if (lineCount == totalPosSequences)
                _class = "neg";
        }
        classifierTwoOutput.close();
        statusPane.setText("Done!");
        PredictionStats classifierTwoStatsOnBlindTest = new PredictionStats(
                applicationData.getWorkingDirectory() + File.separator + "ClassifierTwo.scores", range,
                threshold);
        //display(double range)
        long totalTimeElapsed = System.currentTimeMillis() - totalTimeStart;
        // NOTE(review): totalTimeElapsed / 60000 is integer division, so the minutes
        // figure is truncated before formatting - confirm that is the intended display.
        classifierResults.updateList(classifierResults.getResultsList(), "Total Time Used: ",
                Utils.doubleToString(totalTimeElapsed / 60000, 2) + " minutes "
                        + Utils.doubleToString((totalTimeElapsed / 1000.0) % 60.0, 2) + " seconds");
        classifierTwoStatsOnBlindTest.updateDisplay(classifierResults, classifierTwoDisplayTextArea, true);
        applicationData.setClassifierTwoStats(classifierTwoStatsOnBlindTest);
        myGraph.setMyStats(classifierTwoStatsOnBlindTest);
        fastaFile.cleanUp();
        return classifierTwo;
    } catch (Exception ex) {
        ex.printStackTrace();
        JOptionPane.showMessageDialog(parent,
                ex.getMessage() + "Classifier Two On Blind Test Set - Check Console Output",
                "Evaluate classifier two", JOptionPane.ERROR_MESSAGE);
        // Dump the state most often implicated in failures (see arraySize note above).
        System.err.println("applicationData.getLeftMostPosition(): " + applicationData.getLeftMostPosition());
        System.err.println("applicationData.getRightMostPosition(): " + applicationData.getRightMostPosition());
        System.err.println("arraySize: " + arraySize);
        System.err.println("lineCount: " + lineCount);
        return null;
    }
}
From source file:sirius.trainer.step4.RunClassifier.java
License:Open Source License
public static Classifier xValidateClassifierTwo(JInternalFrame parent, ApplicationData applicationData, JTextArea classifierTwoDisplayTextArea, GenericObjectEditor m_ClassifierEditor2, Classifier classifierOne, int folds, GraphPane myGraph, ClassifierResults classifierResults, int range, double threshold, boolean outputClassifier) { try {/*w ww.j av a 2 s .c o m*/ StatusPane statusPane = applicationData.getStatusPane(); long totalTimeStart = System.currentTimeMillis(), totalTimeElapsed; //Classifier tempClassifier = (Classifier) m_ClassifierEditor2.getValue(); final int positiveDataset2FromInt = applicationData.getPositiveDataset2FromField(); final int positiveDataset2ToInt = applicationData.getPositiveDataset2ToField(); final int negativeDataset2FromInt = applicationData.getNegativeDataset2FromField(); final int negativeDataset2ToInt = applicationData.getNegativeDataset2ToField(); final int totalDataset2Sequences = (positiveDataset2ToInt - positiveDataset2FromInt + 1) + (negativeDataset2ToInt - negativeDataset2FromInt + 1); final int classifierTwoUpstream = applicationData.getSetUpstream(); final int classifierTwoDownstream = applicationData.getSetDownstream(); Step1TableModel positiveStep1TableModel = applicationData.getPositiveStep1TableModel(); Step1TableModel negativeStep1TableModel = applicationData.getNegativeStep1TableModel(); //Train classifier two with the full dataset first then do cross-validation to gauge its accuracy //Preparing Dataset2.arff to train Classifier Two long trainTimeStart = 0, trainTimeElapsed = 0; statusPane.setText("Preparing Dataset2.arff..."); //This step generates Dataset2.arff if (DatasetGenerator.generateDataset2(parent, applicationData, applicationData.getSetUpstream(), applicationData.getSetDownstream(), classifierOne) == false) { //Interrupted or Error occurred return null; } Instances instOfDataset2 = new Instances(new BufferedReader( new FileReader(applicationData.getWorkingDirectory() + File.separator + "Dataset2.arff"))); 
instOfDataset2.setClassIndex(instOfDataset2.numAttributes() - 1); applicationData.setDataset2Instances(instOfDataset2); Classifier classifierTwo = (Classifier) m_ClassifierEditor2.getValue(); statusPane.setText("Training Classifier Two... May take a while... Please wait..."); //Record Start Time trainTimeStart = System.currentTimeMillis(); if (outputClassifier) classifierTwo.buildClassifier(instOfDataset2); //Record Total Time used to build classifier one trainTimeElapsed = System.currentTimeMillis() - trainTimeStart; //Training Done String classifierName = m_ClassifierEditor2.getValue().getClass().getName(); classifierResults.updateList(classifierResults.getClassifierList(), "Classifier: ", classifierName); classifierResults.updateList(classifierResults.getClassifierList(), "Training Data: ", folds + " fold cross-validation on Dataset2.arff"); classifierResults.updateList(classifierResults.getClassifierList(), "Time Used: ", Utils.doubleToString(trainTimeElapsed / 1000.0, 2) + " seconds"); Instances instOfDataset1 = new Instances(applicationData.getDataset1Instances()); instOfDataset1.setClassIndex(applicationData.getDataset1Instances().numAttributes() - 1); //Reading and Storing the featureList ArrayList<Feature> featureDataArrayList = new ArrayList<Feature>(); for (int y = 0; y < instOfDataset1.numAttributes() - 1; y++) { featureDataArrayList.add(Feature.levelOneClassifierPane(instOfDataset1.attribute(y).name())); } //Generating an Instance given a sequence with the current attributes int setClassifierTwoUpstreamInt = applicationData.getSetUpstream(); int setClassifierTwoDownstreamInt = applicationData.getSetDownstream(); int classifierTwoWindowSize; if (setClassifierTwoUpstreamInt < 0 && setClassifierTwoDownstreamInt > 0) classifierTwoWindowSize = (setClassifierTwoUpstreamInt * -1) + setClassifierTwoDownstreamInt; else if (setClassifierTwoUpstreamInt < 0 && setClassifierTwoDownstreamInt < 0) classifierTwoWindowSize = (setClassifierTwoUpstreamInt - 
setClassifierTwoDownstreamInt - 1) * -1; else//both +ve classifierTwoWindowSize = (setClassifierTwoDownstreamInt - setClassifierTwoUpstreamInt + 1); int posTestSequenceCounter = 0; BufferedWriter outputCrossValidation = new BufferedWriter(new FileWriter( applicationData.getWorkingDirectory() + File.separator + "classifierTwo.scores")); for (int x = 0; x < folds; x++) { File trainFile = new File(applicationData.getWorkingDirectory() + File.separator + "trainingDataset2_" + (x + 1) + ".arff"); File testFile = new File(applicationData.getWorkingDirectory() + File.separator + "testingDataset2_" + (x + 1) + ".fasta"); statusPane.setText("Preparing Training Data for Fold " + (x + 1) + ".."); FastaFileManipulation fastaFile = new FastaFileManipulation(positiveStep1TableModel, negativeStep1TableModel, positiveDataset2FromInt, positiveDataset2ToInt, negativeDataset2FromInt, negativeDataset2ToInt, applicationData.getWorkingDirectory()); //1) generate trainingDataset2.arff headings BufferedWriter trainingOutputFile = new BufferedWriter( new FileWriter(applicationData.getWorkingDirectory() + File.separator + "trainingDataset2_" + (x + 1) + ".arff")); trainingOutputFile.write("@relation 'A temp file for X-validation purpose' "); trainingOutputFile.newLine(); trainingOutputFile.newLine(); trainingOutputFile.flush(); for (int y = classifierTwoUpstream; y <= classifierTwoDownstream; y++) { if (y != 0) { trainingOutputFile.write("@attribute (" + y + ") numeric"); trainingOutputFile.newLine(); trainingOutputFile.flush(); } } if (positiveDataset2FromInt > 0 && negativeDataset2FromInt > 0) trainingOutputFile.write("@attribute Class {pos,neg}"); else if (positiveDataset2FromInt > 0 && negativeDataset2FromInt == 0) trainingOutputFile.write("@attribute Class {pos}"); else if (positiveDataset2FromInt == 0 && negativeDataset2FromInt > 0) trainingOutputFile.write("@attribute Class {neg}"); trainingOutputFile.newLine(); trainingOutputFile.newLine(); trainingOutputFile.write("@data"); 
trainingOutputFile.newLine(); trainingOutputFile.newLine(); trainingOutputFile.flush(); //AHFU_DEBUG BufferedWriter testingOutputFileArff = new BufferedWriter( new FileWriter(applicationData.getWorkingDirectory() + File.separator + "testingDataset2_" + (x + 1) + ".arff")); testingOutputFileArff.write("@relation 'A temp file for X-validation purpose' "); testingOutputFileArff.newLine(); testingOutputFileArff.newLine(); testingOutputFileArff.flush(); for (int y = classifierTwoUpstream; y <= classifierTwoDownstream; y++) { if (y != 0) { testingOutputFileArff.write("@attribute (" + y + ") numeric"); testingOutputFileArff.newLine(); testingOutputFileArff.flush(); } } if (positiveDataset2FromInt > 0 && negativeDataset2FromInt > 0) testingOutputFileArff.write("@attribute Class {pos,neg}"); else if (positiveDataset2FromInt > 0 && negativeDataset2FromInt == 0) testingOutputFileArff.write("@attribute Class {pos}"); else if (positiveDataset2FromInt == 0 && negativeDataset2FromInt > 0) testingOutputFileArff.write("@attribute Class {neg}"); testingOutputFileArff.newLine(); testingOutputFileArff.newLine(); testingOutputFileArff.write("@data"); testingOutputFileArff.newLine(); testingOutputFileArff.newLine(); testingOutputFileArff.flush(); //AHFU_DEBUG END //2) generate testingDataset2.fasta BufferedWriter testingOutputFile = new BufferedWriter( new FileWriter(applicationData.getWorkingDirectory() + File.separator + "testingDataset2_" + (x + 1) + ".fasta")); //Now, populating datas for both the training and testing files int fastaFileLineCounter = 0; posTestSequenceCounter = 0; int totalTestSequenceCounter = 0; int totalTrainTestSequenceCounter = 0; FastaFormat fastaFormat; //For pos sequences while ((fastaFormat = fastaFile.nextSequence("pos")) != null) { if (applicationData.terminateThread == true) { statusPane.setText("Interrupted - Classifier Two Trained"); outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); 
return classifierTwo; } totalTrainTestSequenceCounter++; //if(totalTrainTestSequenceCounter%100 == 0) statusPane.setText("Preparing Training Data for Fold " + (x + 1) + ".. @ " + totalTrainTestSequenceCounter + " / " + totalDataset2Sequences); if ((fastaFileLineCounter % folds) == x) {//This sequence is for testing testingOutputFile.write(fastaFormat.getHeader()); testingOutputFile.newLine(); testingOutputFile.write(fastaFormat.getSequence()); testingOutputFile.newLine(); testingOutputFile.flush(); posTestSequenceCounter++; totalTestSequenceCounter++; //AHFU DEBUG SequenceManipulation seq = new SequenceManipulation(fastaFormat.getSequence(), classifierTwoUpstream, classifierTwoDownstream); String line2; while ((line2 = seq.nextShift()) != null) { Instance tempInst = new Instance(instOfDataset1.numAttributes()); tempInst.setDataset(instOfDataset1); //-1 because class attribute can be ignored for (int w = 0; w < instOfDataset1.numAttributes() - 1; w++) { Object obj = GenerateArff.getMatchCount(fastaFormat.getHeader(), line2, featureDataArrayList.get(w), applicationData.getScoringMatrixIndex(), applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix()); if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer")) tempInst.setValue(w, (Integer) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double")) tempInst.setValue(w, (Double) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String")) tempInst.setValue(w, (String) obj); else { outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); throw new Error("Unknown: " + obj.getClass().getName()); } } tempInst.setValue(tempInst.numAttributes() - 1, "pos"); double[] results = classifierOne.distributionForInstance(tempInst); testingOutputFileArff.write(results[0] + ","); } testingOutputFileArff.write("pos"); testingOutputFileArff.newLine(); testingOutputFileArff.flush(); //AHFU DEBUG END } else {//This 
sequence is for training SequenceManipulation seq = new SequenceManipulation(fastaFormat.getSequence(), classifierTwoUpstream, classifierTwoDownstream); String line2; while ((line2 = seq.nextShift()) != null) { Instance tempInst = new Instance(instOfDataset1.numAttributes()); tempInst.setDataset(instOfDataset1); //-1 because class attribute can be ignored for (int w = 0; w < instOfDataset1.numAttributes() - 1; w++) { Object obj = GenerateArff.getMatchCount(fastaFormat.getHeader(), line2, featureDataArrayList.get(w), applicationData.getScoringMatrixIndex(), applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix()); if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer")) tempInst.setValue(w, (Integer) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double")) tempInst.setValue(w, (Double) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String")) tempInst.setValue(w, (String) obj); else { outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); throw new Error("Unknown: " + obj.getClass().getName()); } } tempInst.setValue(tempInst.numAttributes() - 1, "pos"); double[] results = classifierOne.distributionForInstance(tempInst); trainingOutputFile.write(results[0] + ","); } trainingOutputFile.write("pos"); trainingOutputFile.newLine(); trainingOutputFile.flush(); } fastaFileLineCounter++; } //For neg sequences fastaFileLineCounter = 0; while ((fastaFormat = fastaFile.nextSequence("neg")) != null) { if (applicationData.terminateThread == true) { statusPane.setText("Interrupted - Classifier Two Trained"); outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); return classifierTwo; } totalTrainTestSequenceCounter++; //if(totalTrainTestSequenceCounter%100 == 0) statusPane.setText("Preparing Training Data for Fold " + (x + 1) + ".. 
@ " + totalTrainTestSequenceCounter + " / " + totalDataset2Sequences); if ((fastaFileLineCounter % folds) == x) {//This sequence is for testing testingOutputFile.write(fastaFormat.getHeader()); testingOutputFile.newLine(); testingOutputFile.write(fastaFormat.getSequence()); testingOutputFile.newLine(); testingOutputFile.flush(); totalTestSequenceCounter++; //AHFU DEBUG SequenceManipulation seq = new SequenceManipulation(fastaFormat.getSequence(), classifierTwoUpstream, classifierTwoDownstream); String line2; while ((line2 = seq.nextShift()) != null) { Instance tempInst = new Instance(instOfDataset1.numAttributes()); tempInst.setDataset(instOfDataset1); //-1 because class attribute can be ignored for (int w = 0; w < instOfDataset1.numAttributes() - 1; w++) { Object obj = GenerateArff.getMatchCount(fastaFormat.getHeader(), line2, featureDataArrayList.get(w), applicationData.getScoringMatrixIndex(), applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix()); if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer")) tempInst.setValue(w, (Integer) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double")) tempInst.setValue(w, (Double) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String")) tempInst.setValue(w, (String) obj); else { outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); throw new Error("Unknown: " + obj.getClass().getName()); } } tempInst.setValue(tempInst.numAttributes() - 1, "pos");//pos or neg does not matter here - not used double[] results = classifierOne.distributionForInstance(tempInst); testingOutputFileArff.write(results[0] + ","); } testingOutputFileArff.write("neg"); testingOutputFileArff.newLine(); testingOutputFileArff.flush(); //AHFU DEBUG END } else {//This sequence is for training SequenceManipulation seq = new SequenceManipulation(fastaFormat.getSequence(), classifierTwoUpstream, classifierTwoDownstream); 
String line2; while ((line2 = seq.nextShift()) != null) { Instance tempInst = new Instance(instOfDataset1.numAttributes()); tempInst.setDataset(instOfDataset1); //-1 because class attribute can be ignored for (int w = 0; w < instOfDataset1.numAttributes() - 1; w++) { Object obj = GenerateArff.getMatchCount(fastaFormat.getHeader(), line2, featureDataArrayList.get(w), applicationData.getScoringMatrixIndex(), applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix()); if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer")) tempInst.setValue(w, (Integer) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double")) tempInst.setValue(w, (Double) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String")) tempInst.setValue(w, (String) obj); else { outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); throw new Error("Unknown: " + obj.getClass().getName()); } } tempInst.setValue(tempInst.numAttributes() - 1, "pos");//pos or neg does not matter here - not used double[] results = classifierOne.distributionForInstance(tempInst); trainingOutputFile.write(results[0] + ","); } trainingOutputFile.write("neg"); trainingOutputFile.newLine(); trainingOutputFile.flush(); } fastaFileLineCounter++; } trainingOutputFile.close(); testingOutputFile.close(); //AHFU_DEBUG testingOutputFileArff.close(); //AHFU DEBUG END //3) train and test classifier two then store the statistics statusPane.setText("Building Fold " + (x + 1) + ".."); //open an input stream to the arff file BufferedReader trainingInput = new BufferedReader( new FileReader(applicationData.getWorkingDirectory() + File.separator + "trainingDataset2_" + (x + 1) + ".arff")); //getting ready to train a foldClassifier using arff file Instances instOfTrainingDataset2 = new Instances( new BufferedReader(new FileReader(applicationData.getWorkingDirectory() + File.separator + "trainingDataset2_" + (x + 1) + 
".arff"))); instOfTrainingDataset2.setClassIndex(instOfTrainingDataset2.numAttributes() - 1); Classifier foldClassifier = (Classifier) m_ClassifierEditor2.getValue(); foldClassifier.buildClassifier(instOfTrainingDataset2); trainingInput.close(); //Reading the test file statusPane.setText("Evaluating fold " + (x + 1) + ".."); BufferedReader testingInput = new BufferedReader( new FileReader(applicationData.getWorkingDirectory() + File.separator + "testingDataset2_" + (x + 1) + ".fasta")); int lineCounter = 0; String lineHeader; String lineSequence; while ((lineHeader = testingInput.readLine()) != null) { if (applicationData.terminateThread == true) { statusPane.setText("Interrupted - Classifier Two Not Trained"); outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); testingInput.close(); return classifierTwo; } lineSequence = testingInput.readLine(); outputCrossValidation.write(lineHeader); outputCrossValidation.newLine(); outputCrossValidation.write(lineSequence); outputCrossValidation.newLine(); lineCounter++; fastaFormat = new FastaFormat(lineHeader, lineSequence); int arraySize = fastaFormat.getArraySize(applicationData.getLeftMostPosition(), applicationData.getRightMostPosition()); double scores[] = new double[arraySize]; int predictPosition[] = fastaFormat.getPredictPositionForClassifierOne( applicationData.getLeftMostPosition(), applicationData.getRightMostPosition()); //For each sequence, you want to shift from upstream till downstream //ie changing the +1 location //to get the scores by classifier one so that can use it to train classifier two later //Doing shift from upstream till downstream //if(lineCounter % 100 == 0) statusPane.setText("Evaluating fold " + (x + 1) + ".. 
@ " + lineCounter + " / " + totalTestSequenceCounter); SequenceManipulation seq = new SequenceManipulation(lineSequence, predictPosition[0], predictPosition[1]); int scoreCount = 0; String line2; while ((line2 = seq.nextShift()) != null) { Instance tempInst = new Instance(instOfDataset1.numAttributes()); tempInst.setDataset(instOfDataset1); for (int i = 0; i < instOfDataset1.numAttributes() - 1; i++) { //-1 because class attribute can be ignored //Give the sequence and the featureList to get the feature freqs on the sequence Object obj = GenerateArff.getMatchCount(lineHeader, line2, featureDataArrayList.get(i), applicationData.getScoringMatrixIndex(), applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix()); if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer")) tempInst.setValue(i, (Integer) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double")) tempInst.setValue(i, (Double) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String")) tempInst.setValue(i, (String) obj); else { outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); testingInput.close(); throw new Error("Unknown: " + obj.getClass().getName()); } } if (lineCounter > posTestSequenceCounter) {//for neg tempInst.setValue(tempInst.numAttributes() - 1, "neg"); } else { tempInst.setValue(tempInst.numAttributes() - 1, "pos"); } double[] results = classifierOne.distributionForInstance(tempInst); scores[scoreCount++] = results[0]; } //end of sequence shift //Run classifierTwo int currentPosition = fastaFormat.getPredictionFromForClassifierTwo( applicationData.getLeftMostPosition(), applicationData.getRightMostPosition(), applicationData.getSetUpstream()); if (lineCounter > posTestSequenceCounter)//neg outputCrossValidation.write("neg"); else outputCrossValidation.write("pos"); for (int y = 0; y < arraySize - classifierTwoWindowSize + 1; y++) { //+1 is for the class index 
Instance tempInst2 = new Instance(classifierTwoWindowSize + 1); tempInst2.setDataset(instOfTrainingDataset2); for (int l = 0; l < classifierTwoWindowSize; l++) { tempInst2.setValue(l, scores[l + y]); } if (lineCounter > posTestSequenceCounter)//for neg tempInst2.setValue(tempInst2.numAttributes() - 1, "neg"); else//for pos tempInst2.setValue(tempInst2.numAttributes() - 1, "pos"); double[] results = foldClassifier.distributionForInstance(tempInst2); outputCrossValidation.write("," + currentPosition + "=" + results[0]); currentPosition++; if (currentPosition == 0) currentPosition++; } outputCrossValidation.newLine(); outputCrossValidation.flush(); } //end of reading test file outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); testingInput.close(); fastaFile.cleanUp(); //AHFU_DEBUG trainFile.deleteOnExit(); testFile.deleteOnExit(); //NORMAL MODE //trainFile.delete(); //testFile.delete(); } //end of for loop for xvalidation PredictionStats classifierTwoStatsOnXValidation = new PredictionStats( applicationData.getWorkingDirectory() + File.separator + "classifierTwo.scores", range, threshold); //display(double range) totalTimeElapsed = System.currentTimeMillis() - totalTimeStart; classifierResults.updateList(classifierResults.getResultsList(), "Total Time Used: ", Utils.doubleToString(totalTimeElapsed / 60000, 2) + " minutes " + Utils.doubleToString((totalTimeElapsed / 1000.0) % 60.0, 2) + " seconds"); classifierTwoStatsOnXValidation.updateDisplay(classifierResults, classifierTwoDisplayTextArea, true); applicationData.setClassifierTwoStats(classifierTwoStatsOnXValidation); myGraph.setMyStats(classifierTwoStatsOnXValidation); statusPane.setText("Done!"); return classifierTwo; } catch (Exception e) { e.printStackTrace(); JOptionPane.showMessageDialog(parent, e.getMessage(), "ERROR", JOptionPane.ERROR_MESSAGE); return null; } }
From source file:swm.project.mappings.OurDistance.java
/**
 * Distance between two Weka instances.
 *
 * <p>NOTE(review): this is an unfinished stub - the per-attribute comparison loop is
 * empty, the rating lists are never populated, and the method always returns 1, i.e.
 * every pair of distinct instances is treated as equally far apart. Confirm whether a
 * real metric was intended before relying on clustering results built on this.
 *
 * @param instnc  first instance
 * @param instnc1 second instance (currently ignored)
 * @return always 1
 */
@Override
public double distance(Instance instnc, Instance instnc1) {
    int num = instnc.numAttributes();
    // NOTE(review): dead locals left from the unfinished implementation - never read.
    List<Double> movieClusterRating1 = new ArrayList<Double>(), movieClusterRating2 = new ArrayList<Double>();
    Attribute id = instnc.attribute(0); // NOTE(review): unused
    for (int index = 1; index < num; index++) {
        // NOTE(review): intentionally(?) empty - the attribute comparison was never written.
    }
    return 1; // constant distance
}