List of usage examples for weka.core Instance numAttributes
public int numAttributes();
From source file:pk.lums.edu.sma.processing.ml.DBSCAN.EuclideanDataObject.java
License:Open Source License
/** * Calculates the euclidian-distance between dataObject and this.dataObject * /*from www . j a v a2 s. c o m*/ * @param dataObject * The DataObject, that is used for distance-calculation with * this.dataObject; now assumed to be of the same type and with * the same structure * @return double-value The euclidian-distance between dataObject and * this.dataObject */ public double distance(DataObject dataObject) { double dist = 0.0; Instance firstInstance = getInstance(); Instance secondInstance = dataObject.getInstance(); int firstNumValues = firstInstance.numValues(); int secondNumValues = secondInstance.numValues(); int numAttributes = firstInstance.numAttributes(); int firstI, secondI; for (int p1 = 0, p2 = 0; p1 < firstNumValues || p2 < secondNumValues;) { if (p1 >= firstNumValues) { firstI = numAttributes; } else { firstI = firstInstance.index(p1); } if (p2 >= secondNumValues) { secondI = numAttributes; } else { secondI = secondInstance.index(p2); } double cDistance = 0; if (firstI == secondI) { cDistance = computeDistance(firstI, firstInstance.valueSparse(p1), secondInstance.valueSparse(p2)); p1++; p2++; } else if (firstI > secondI) { cDistance = computeDistance(secondI, 0, secondInstance.valueSparse(p2)); p2++; } else { cDistance = computeDistance(firstI, firstInstance.valueSparse(p1), 0); p1++; } dist += cDistance * cDistance; } return Math.sqrt(dist); }
From source file:pk.lums.edu.sma.processing.ml.DBSCAN.ManhattanDataObject.java
License:Open Source License
/**
 * Calculates the Manhattan (L1) distance between {@code dataObject} and
 * this data object. The given object is assumed to be of the same type and
 * to share the same attribute structure as this one.
 *
 * @param dataObject the DataObject used for the distance calculation with
 *                   this.dataObject
 * @return the Manhattan distance between the two underlying instances;
 *         NaN if the computation could not be performed
 */
public double distance(DataObject dataObject) {
    double dist = 0.0;
    Instance firstInstance = getInstance();
    Instance secondInstance = dataObject.getInstance();
    int firstNumValues = firstInstance.numValues();
    int secondNumValues = secondInstance.numValues();
    // numAttributes serves as a sentinel index once a sparse list is exhausted.
    int numAttributes = firstInstance.numAttributes();
    int firstI, secondI;
    // Merge-join over the two sorted sparse index lists.
    for (int p1 = 0, p2 = 0; p1 < firstNumValues || p2 < secondNumValues;) {
        if (p1 >= firstNumValues) {
            firstI = numAttributes;
        } else {
            firstI = firstInstance.index(p1);
        }
        if (p2 >= secondNumValues) {
            secondI = numAttributes;
        } else {
            secondI = secondInstance.index(p2);
        }
        double cDistance = 0;
        if (firstI == secondI) {
            // Attribute present in both instances.
            cDistance = computeDistance(firstI, firstInstance.valueSparse(p1), secondInstance.valueSparse(p2));
            p1++;
            p2++;
        } else if (firstI > secondI) {
            // Attribute only present in the second instance; first is implicitly 0.
            cDistance = computeDistance(secondI, 0, secondInstance.valueSparse(p2));
            p2++;
        } else {
            // Attribute only present in the first instance.
            cDistance = computeDistance(firstI, firstInstance.valueSparse(p1), 0);
            p1++;
        }
        // L1 norm: accumulate absolute per-attribute differences.
        dist += Math.abs(cDistance);
    }
    return dist;
}
From source file:preprocess.StringToWordVector.java
License:Open Source License
/**
 * Determines the dictionary of words over the input data.
 *
 * Builds one word-count map per class value (or a single map when not
 * operating per class), periodically prunes rare words, derives a
 * per-class frequency threshold from m_WordsToKeep / m_minTermFreq,
 * then creates the output attribute set (untouched attributes first,
 * followed by one attribute per surviving word) and fills in the
 * document-frequency counts used by the IDF transform.
 */
private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
        try {
            if (getStopwords().exists() && !getStopwords().isDirectory())
                stopwords.read(getStopwords());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
        values = getInputFormat().attribute(classInd).numValues();
    }
    // One word->Count dictionary per class value.
    TreeMap[] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
        dictionaryArr[i] = new TreeMap();
    }
    // Make sure we know which fields to convert
    determineSelectedRange();
    // Tokenize all training text into an orderedMap of "words".
    // pruneRate: every pruneRate instances, words seen only once are dropped.
    long pruneRate = Math.round((m_PeriodicPruningRate / 100.0) * getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
        Instance instance = getInputFormat().instance(i);
        int vInd = 0;
        if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
            vInd = (int) instance.classValue();
        }
        // Iterate through all relevant string attributes of the current instance;
        // h collects the distinct words seen in this one instance (document).
        Hashtable h = new Hashtable();
        for (int j = 0; j < instance.numAttributes(); j++) {
            if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
                // Get tokenizer
                m_Tokenizer.tokenize(instance.stringValue(j));
                // Iterate through tokens, perform stemming, and remove stopwords
                // (if required)
                while (m_Tokenizer.hasMoreElements()) {
                    String word = ((String) m_Tokenizer.nextElement()).intern();
                    if (this.m_lowerCaseTokens == true)
                        word = word.toLowerCase();
                    word = m_Stemmer.stem(word);
                    if (this.m_useStoplist == true)
                        if (stopwords.is(word))
                            continue;
                    // NOTE(review): Hashtable.contains() checks VALUES, not keys —
                    // this condition looks like it was meant to be containsKey();
                    // the end result (h's key set = distinct words) is unaffected.
                    if (!(h.contains(word)))
                        h.put(word, new Integer(0));
                    Count count = (Count) dictionaryArr[vInd].get(word);
                    if (count == null) {
                        dictionaryArr[vInd].put(word, new Count(1));
                    } else {
                        count.count++;
                    }
                }
            }
        }
        //updating the docCount for the words that have occurred in this
        //instance(document).
        Enumeration e = h.keys();
        while (e.hasMoreElements()) {
            String word = (String) e.nextElement();
            Count c = (Count) dictionaryArr[vInd].get(word);
            if (c != null) {
                c.docCount++;
            } else
                System.err.println(
                        "Warning: A word should definitely be in the " + "dictionary.Please check the code");
        }
        // Periodic pruning: drop words that have occurred at most once so far.
        if (pruneRate > 0) {
            if (i % pruneRate == 0 && i > 0) {
                for (int z = 0; z < values; z++) {
                    Vector d = new Vector(1000);
                    Iterator it = dictionaryArr[z].keySet().iterator();
                    while (it.hasNext()) {
                        String word = (String) it.next();
                        Count count = (Count) dictionaryArr[z].get(word);
                        if (count.count <= 1) {
                            d.add(word);
                        }
                    }
                    Iterator iter = d.iterator();
                    while (iter.hasNext()) {
                        String word = (String) iter.next();
                        dictionaryArr[z].remove(word);
                    }
                }
            }
        }
    }
    // Figure out the minimum required word frequency
    int totalsize = 0;
    int prune[] = new int[values];
    for (int z = 0; z < values; z++) {
        totalsize += dictionaryArr[z].size();
        int array[] = new int[dictionaryArr[z].size()];
        int pos = 0;
        Iterator it = dictionaryArr[z].keySet().iterator();
        while (it.hasNext()) {
            String word = (String) it.next();
            Count count = (Count) dictionaryArr[z].get(word);
            array[pos] = count.count;
            pos++;
        }
        // sort the array
        sortArray(array);
        if (array.length < m_WordsToKeep) {
            // if there aren't enough words, set the threshold to
            // minFreq
            prune[z] = m_minTermFreq;
        } else {
            // otherwise set it to be at least minFreq
            prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]);
        }
    }
    // Convert the dictionary into an attribute index
    // and create one attribute per word
    FastVector attributes = new FastVector(totalsize + getInputFormat().numAttributes());
    // Add the non-converted attributes
    int classIndex = -1;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().classIndex() == i) {
                classIndex = attributes.size();
            }
            attributes.addElement(getInputFormat().attribute(i).copy());
        }
    }
    // Add the word vector attributes (eliminating duplicates
    // that occur in multiple classes)
    TreeMap newDictionary = new TreeMap();
    int index = attributes.size();
    for (int z = 0; z < values; z++) {
        Iterator it = dictionaryArr[z].keySet().iterator();
        while (it.hasNext()) {
            String word = (String) it.next();
            Count count = (Count) dictionaryArr[z].get(word);
            if (count.count >= prune[z]) {
                if (newDictionary.get(word) == null) {
                    newDictionary.put(word, new Integer(index++));
                    attributes.addElement(new Attribute(m_Prefix + word));
                }
            }
        }
    }
    // Compute document frequencies
    m_DocsCounts = new int[attributes.size()];
    Iterator it = newDictionary.keySet().iterator();
    while (it.hasNext()) {
        String word = (String) it.next();
        int idx = ((Integer) newDictionary.get(word)).intValue();
        int docsCount = 0;
        // Sum docCounts for this word across all per-class dictionaries.
        for (int j = 0; j < values; j++) {
            Count c = (Count) dictionaryArr[j].get(word);
            if (c != null)
                docsCount += c.docCount;
        }
        m_DocsCounts[idx] = docsCount;
    }
    // Trim vector and set instance variables
    attributes.trimToSize();
    m_Dictionary = newDictionary;
    m_NumInstances = getInputFormat().numInstances();
    // Set the filter's output format
    Instances outputFormat = new Instances(getInputFormat().relationName(), attributes, 0);
    outputFormat.setClassIndex(classIndex);
    setOutputFormat(outputFormat);
}
From source file:preprocess.StringToWordVector.java
License:Open Source License
/**
 * Converts the instance without document-length normalization.
 *
 * Non-selected attributes are copied to the front of the output vector;
 * selected string attributes are tokenized, stemmed and mapped through
 * the dictionary into word-count entries, optionally TF/IDF transformed.
 * The resulting sparse instance is appended to {@code v}.
 *
 * @param instance the instance to convert
 * @param v        the vector the converted sparse instance is added to
 * @return the number of attributes copied unconverted (the offset at
 *         which the word attributes start)
 */
private int convertInstancewoDocNorm(Instance instance, FastVector v) {
    // Convert the instance into a sorted set of indexes
    TreeMap contained = new TreeMap();
    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().attribute(i).type() != Attribute.STRING) {
                // Add simple nominal and numeric attributes directly;
                // zeros are omitted (sparse representation).
                if (instance.value(i) != 0.0) {
                    contained.put(new Integer(firstCopy), new Double(instance.value(i)));
                }
            } else {
                if (instance.isMissing(i)) {
                    contained.put(new Integer(firstCopy), new Double(Instance.missingValue()));
                } else {
                    // If this is a string attribute, we have to first add
                    // this value to the range of possible values, then add
                    // its new internal index.
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        // Note that the first string value in a
                        // SparseInstance doesn't get printed.
                        outputFormatPeek().attribute(firstCopy)
                                .addStringValue("Hack to defeat SparseInstance bug");
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addStringValue(instance.stringValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                }
            }
            firstCopy++;
        }
    }
    // Tokenize the selected string attributes and count dictionary words.
    for (int j = 0; j < instance.numAttributes(); j++) {
        //if ((getInputFormat().attribute(j).type() == Attribute.STRING)
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
            m_Tokenizer.tokenize(instance.stringValue(j));
            while (m_Tokenizer.hasMoreElements()) {
                String word = (String) m_Tokenizer.nextElement();
                if (this.m_lowerCaseTokens == true)
                    word = word.toLowerCase();
                word = m_Stemmer.stem(word);
                Integer index = (Integer) m_Dictionary.get(word);
                if (index != null) {
                    if (m_OutputCounts) {
                        // Separate if here rather than two lines down to avoid hashtable lookup
                        Double count = (Double) contained.get(index);
                        if (count != null) {
                            contained.put(index, new Double(count.doubleValue() + 1.0));
                        } else {
                            contained.put(index, new Double(1));
                        }
                    } else {
                        // Binary presence indicator instead of a count.
                        contained.put(index, new Double(1));
                    }
                }
            }
        }
    }
    //Doing TFTransform: value -> log(value + 1), word attributes only.
    if (m_TFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = Math.log(val + 1);
                contained.put(index, new Double(val));
            }
        }
    }
    //Doing IDFTransform: value -> value * log(numInstances / docFreq).
    if (m_IDFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = val * Math.log(m_NumInstances / (double) m_DocsCounts[index.intValue()]);
                contained.put(index, new Double(val));
            }
        }
    }
    // Convert the set to structures needed to create a sparse instance.
    double[] values = new double[contained.size()];
    int[] indices = new int[contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
        Integer index = (Integer) it.next();
        Double value = (Double) contained.get(index);
        values[i] = value.doubleValue();
        indices[i] = index.intValue();
    }
    Instance inst = new SparseInstance(instance.weight(), values, indices, outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());
    v.addElement(inst);
    return firstCopy;
}
From source file:regulyasocjacyjne.RegulyAsocjacyjne.java
public static void infoObj() throws Exception { Instances data = loadData("./src/date/irysy.arff"); for (int i = 0; i < data.numInstances(); i++) //Przegladanie obiektow {//from www . j av a 2s .co m System.out.println("Wiersz numer " + i + ":"); Instance instance = data.instance(i); //Pobranie obiektu (wiersza danych) o podanym numerze for (int j = 0; j < instance.numAttributes(); j++) //Przegladanie atrybutow w obiekcie { String textValue = instance.toString(j); //Pobranie wartosci atrybutu o podanym numerze (tzn. pobranie tekstowej reprezentacji wartosci) System.out.print(textValue + ", "); } System.out.println(); } }
From source file:sirius.predictor.main.PredictorFrame.java
License:Open Source License
/**
 * Runs the given classifier over every loaded sequence and writes
 * per-position scores to a ".scores" file in the output directory.
 *
 * Used for a type 1 classifier (all positions, or motif-list filtered via
 * {@code allPositions == false}) and for a type 2 classifier on all
 * positions; for type 2, classifier two is then run over classifier one's
 * score windows. The resulting score file is loaded into the UI at the end.
 *
 * @param classifierData the trained classifier(s) plus their configuration
 * @param allPositions   if false, only positions matching a motif in the
 *                       motif list are scored by classifier one
 */
private void runClassifier(ClassifierData classifierData, boolean allPositions) {
    //this method is for type 1 classifier with all positions and motif list
    //and type 2 classifier with all positions
    if (sequenceNameTableModel.getRowCount() < 1) {
        JOptionPane.showMessageDialog(this, "Please load File first!", "No Sequence",
                JOptionPane.INFORMATION_MESSAGE);
        return;
    }
    if (loadFastaFileMenuItem.getState() == false) {
        JOptionPane.showMessageDialog(this, "Please load Fasta File! Currently, you have score file!",
                "Wrong File Format", JOptionPane.INFORMATION_MESSAGE);
        return;
    }
    if (onAllPositionsMenuItem.getState() == false && motifListTableModel.getSize() == 0) {
        JOptionPane.showMessageDialog(this, "There are no Motifs chosen in Motif List!", "No Motifs",
                JOptionPane.INFORMATION_MESSAGE);
        MotifListDialog dialog = new MotifListDialog(motifListTableModel);
        dialog.setLocationRelativeTo(this);
        dialog.setVisible(true);
        return;
    }
    // Keep prompting until the user picks an output directory.
    while (outputDirectory == null) {
        JOptionPane.showMessageDialog(this, "Please set output directory first!", "Output Directory not set",
                JOptionPane.INFORMATION_MESSAGE);
        setOutputDirectory();
        //return;
    }
    try {
        BufferedWriter output = new BufferedWriter(new FileWriter(
                outputDirectory + File.separator + "classifierone_" + classifierData.getClassifierName() + "_"
                        + classifierData.getClassifierType() + "_" + fastaFilename + ".scores"));
        Classifier classifierOne = classifierData.getClassifierOne();
        int leftMostPosition = classifierData.getLeftMostPosition();
        int rightMostPosition = classifierData.getRightMostPosition();
        //Reading and Storing the featureList
        Instances inst = classifierData.getInstances();
        ArrayList<Feature> featureDataArrayList = new ArrayList<Feature>();
        for (int x = 0; x < inst.numAttributes() - 1; x++) {
            //-1 because class attribute must be ignored
            featureDataArrayList.add(Feature.levelOneClassifierPane(inst.attribute(x).name()));
        }
        // Score every loaded sequence with classifier one.
        for (int x = 0; x < sequenceNameTableModel.getRowCount(); x++) {
            if (stopClassifier == true) {
                // User requested cancellation.
                statusPane.setText("Running of Classifier Stopped!");
                stopClassifier = false;
                output.close();
                return;
            }
            //if(x%100 == 0)
            statusPane.setText("Running " + classifierData.getClassifierName() + " - ClassifierOne @ " + x
                    + " / " + sequenceNameTableModel.getRowCount());
            //Header
            output.write(sequenceNameTableModel.getHeader(x));
            output.newLine();
            output.write(sequenceNameTableModel.getSequence(x));
            output.newLine();
            //Sequence Score -> index-score, index-score
            String sequence = sequenceNameTableModel.getSequence(x);
            int minSequenceLengthRequired;
            int targetLocationIndex;
            // Derive window length and the offset of the target position
            // from the signed leftmost/rightmost positions.
            if (leftMostPosition < 0 && rightMostPosition > 0) {// -ve and +ve
                minSequenceLengthRequired = (leftMostPosition * -1) + rightMostPosition;
                targetLocationIndex = (leftMostPosition * -1);
            } else if (leftMostPosition < 0 && rightMostPosition < 0) {//-ve and -ve
                minSequenceLengthRequired = rightMostPosition - leftMostPosition + 1;
                targetLocationIndex = (leftMostPosition * -1);
            } else {//+ve and +ve
                minSequenceLengthRequired = rightMostPosition - leftMostPosition + 1;
                targetLocationIndex = (leftMostPosition * -1);
            }
            boolean firstEntryForClassifierOne = true;
            // Slide the window along the sequence.
            for (int y = 0; y + (minSequenceLengthRequired - 1) < sequence.length(); y++) {
                //Check if targetLocation match any motif in motif List
                if (allPositions == false && motifListTableModel
                        .gotMotifMatch(sequence.substring(y + 0, y + targetLocationIndex)) == false)
                    continue;
                String line2 = sequence.substring(y + 0, y + minSequenceLengthRequired);
                Instance tempInst;
                tempInst = new Instance(inst.numAttributes());
                tempInst.setDataset(inst);
                for (int z = 0; z < inst.numAttributes() - 1; z++) {
                    //-1 because class attribute can be ignored
                    //Give the sequence and the featureList to get the feature freqs on the sequence
                    Object obj = GenerateArff.getMatchCount("+1_Index(" + targetLocationIndex + ")", line2,
                            featureDataArrayList.get(z), classifierData.getScoringMatrixIndex(),
                            classifierData.getCountingStyleIndex(), classifierData.getScoringMatrix());
                    // Feature values may come back as Integer, Double or String.
                    if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer"))
                        tempInst.setValue(z, (Integer) obj);
                    else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double"))
                        tempInst.setValue(z, (Double) obj);
                    else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String"))
                        tempInst.setValue(z, (String) obj);
                    else {
                        output.close();
                        throw new Error("Unknown: " + obj.getClass().getName());
                    }
                }
                //note that pos or neg does not matter as this is not used
                tempInst.setValue(inst.numAttributes() - 1, "neg");
                double[] results = classifierOne.distributionForInstance(tempInst);
                if (firstEntryForClassifierOne)
                    firstEntryForClassifierOne = false;
                else
                    output.write(",");
                // Emit "position=score", comma separated.
                output.write(y + targetLocationIndex + "=" + results[0]);
            }
            output.newLine();
            output.flush();
        }
        output.flush();
        output.close();
        statusPane.setText("ClassifierOne finished running...");
        //Run classifier Two if it is type 2
        if (classifierData.getClassifierType() == 2) {
            BufferedWriter output2 = new BufferedWriter(new FileWriter(
                    outputDirectory + File.separator + "classifiertwo_" + classifierData.getClassifierName()
                            + "_" + classifierData.getClassifierType() + "_" + fastaFilename + ".scores"));
            // Classifier two reads classifier one's score file back in.
            BufferedReader input2 = new BufferedReader(new FileReader(
                    outputDirectory + File.separator + "classifierone_" + classifierData.getClassifierName()
                            + "_" + classifierData.getClassifierType() + "_" + fastaFilename + ".scores"));
            Classifier classifierTwo = classifierData.getClassifierTwo();
            Instances inst2 = classifierData.getInstances2();
            int setUpstream = classifierData.getSetUpstream();
            int setDownstream = classifierData.getSetDownstream();
            int minScoreWindowRequired;
            // Window size over classifier-one scores, from signed bounds.
            if (setUpstream < 0 && setDownstream < 0) {//-ve and -ve
                minScoreWindowRequired = setDownstream - setUpstream + 1;
            } else if (setUpstream < 0 && setDownstream > 0) {//-ve and +ve
                minScoreWindowRequired = (setUpstream * -1) + setDownstream;
            } else {//+ve and +ve
                minScoreWindowRequired = setDownstream - setUpstream + 1;
            }
            String lineHeader;
            String lineSequence;
            int lineCounter2 = 0;
            // Each record is three lines: header, sequence, score list.
            while ((lineHeader = input2.readLine()) != null) {
                if (stopClassifier == true) {
                    statusPane.setText("Running of Classifier Stopped!");
                    stopClassifier = false;
                    output2.close();
                    input2.close();
                    return;
                }
                //if(lineCounter2%100 == 0)
                statusPane.setText("Running " + classifierData.getClassifierName() + " - ClassifierTwo @ "
                        + lineCounter2 + " / " + sequenceNameTableModel.getRowCount());
                lineSequence = input2.readLine();
                output2.write(lineHeader);
                output2.newLine();
                output2.write(lineSequence);
                output2.newLine();
                // Parse "location=score" pairs into a [totalTokens][2] table.
                StringTokenizer locationScore = new StringTokenizer(input2.readLine(), ",");
                int totalTokens = locationScore.countTokens();
                String[][] scores = new String[totalTokens][2];
                int scoreIndex = 0;
                while (locationScore.hasMoreTokens()) {
                    StringTokenizer locationScoreToken = new StringTokenizer(locationScore.nextToken(), "=");
                    scores[scoreIndex][0] = locationScoreToken.nextToken();//location
                    scores[scoreIndex][1] = locationScoreToken.nextToken();//score
                    scoreIndex++;
                }
                int targetLocationIndex2;
                if (setUpstream == 0 || setDownstream == 0) {
                    output2.close();
                    input2.close();
                    throw new Exception("setUpstream == 0 || setDownstream == 0");
                }
                if (setUpstream < 0) {
                    targetLocationIndex2 = Integer.parseInt(scores[0][0]) + (-setUpstream);
                } else {//setUpstream > 0
                    targetLocationIndex2 = Integer.parseInt(scores[0][0]); //first location
                }
                // Slide classifier two over windows of classifier-one scores.
                for (int x = 0; x + minScoreWindowRequired - 1 < totalTokens; x++) {
                    //+1 is for the class index
                    if (x != 0)
                        output2.write(",");
                    Instance tempInst2 = new Instance(minScoreWindowRequired + 1);
                    tempInst2.setDataset(inst2);
                    for (int y = 0; y < minScoreWindowRequired; y++) {
                        tempInst2.setValue(y, Double.parseDouble(scores[x + y][1]));
                    }
                    tempInst2.setValue(tempInst2.numAttributes() - 1, "pos");
                    double[] results = classifierTwo.distributionForInstance(tempInst2);
                    output2.write(targetLocationIndex2 + "=" + results[0]);
                    targetLocationIndex2++;
                }
                lineCounter2++;
                output2.newLine();
            }
            input2.close();
            output2.close();
            statusPane.setText("ClassifierTwo finished running...");
        }
        // Show whichever score file corresponds to the classifier type.
        if (classifierData.getClassifierType() == 1)
            loadScoreFile(
                    outputDirectory + File.separator + "classifierone_" + classifierData.getClassifierName()
                            + "_" + classifierData.getClassifierType() + "_" + fastaFilename + ".scores");
        else
            loadScoreFile(
                    outputDirectory + File.separator + "classifiertwo_" + classifierData.getClassifierName()
                            + "_" + classifierData.getClassifierType() + "_" + fastaFilename + ".scores");
    } catch (Exception e) {
        JOptionPane.showMessageDialog(null, "Exception Occured", "Error", JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    }
}
From source file:sirius.predictor.main.PredictorFrame.java
License:Open Source License
/**
 * Runs a type 2 (two-stage) classifier restricted to motif-matching
 * positions. Classifier one scores sequence windows (expanded upstream/
 * downstream around each motif hit, caching scores so overlapping regions
 * are not re-scored), then classifier two scores the resulting window of
 * classifier-one scores. Both stages write ".scores" files; the classifier
 * two file is loaded into the UI at the end.
 *
 * @param classifierData the trained two-stage classifier and its configuration
 */
private void runType2ClassifierWithMotifList(ClassifierData classifierData) {
    //Checking..
    if (sequenceNameTableModel.getRowCount() < 1) {
        JOptionPane.showMessageDialog(this, "Please load File first!", "No Sequence",
                JOptionPane.INFORMATION_MESSAGE);
        return;
    }
    if (loadFastaFileMenuItem.getState() == false) {
        JOptionPane.showMessageDialog(this, "Please load Fasta File! Currently, you have score file!",
                "Wrong File Format", JOptionPane.INFORMATION_MESSAGE);
        return;
    }
    if (motifListTableModel.getSize() == 0) {
        JOptionPane.showMessageDialog(this, "There are no Motifs chosen in Motif List!", "No Motifs",
                JOptionPane.INFORMATION_MESSAGE);
        MotifListDialog dialog = new MotifListDialog(motifListTableModel);
        dialog.setLocationRelativeTo(this);
        dialog.setVisible(true);
        return;
    }
    //Proper running start
    try {
        //classifierOne score output
        BufferedWriter output = new BufferedWriter(new FileWriter(
                outputDirectory + File.separator + "classifierone_" + classifierData.getClassifierName() + "_"
                        + classifierData.getClassifierType() + "_" + fastaFilename + ".scores"));
        Classifier classifierOne = classifierData.getClassifierOne();
        int leftMostPosition = classifierData.getLeftMostPosition();
        int rightMostPosition = classifierData.getRightMostPosition();
        //Reading and Storing the featureList
        Instances inst = classifierData.getInstances();
        ArrayList<Feature> featureDataArrayList = new ArrayList<Feature>();
        for (int x = 0; x < inst.numAttributes() - 1; x++) {
            //-1 because class attribute must be ignored
            featureDataArrayList.add(Feature.levelOneClassifierPane(inst.attribute(x).name()));
        }
        //initialization for type 2 classifier
        BufferedWriter output2 = new BufferedWriter(new FileWriter(
                outputDirectory + File.separator + "classifiertwo_" + classifierData.getClassifierName() + "_"
                        + classifierData.getClassifierType() + "_" + fastaFilename + ".scores"));
        int setUpstream = classifierData.getSetUpstream();
        int setDownstream = classifierData.getSetDownstream();
        int minScoreWindowRequired;
        // Window size over classifier-one scores, from signed bounds.
        if (setUpstream < 0 && setDownstream < 0) {//-ve and -ve
            minScoreWindowRequired = setDownstream - setUpstream + 1;
        } else if (setUpstream < 0 && setDownstream > 0) {//-ve and +ve
            minScoreWindowRequired = (setUpstream * -1) + setDownstream;
        } else {//+ve and +ve
            minScoreWindowRequired = setDownstream - setUpstream + 1;
        }
        Classifier classifierTwo = classifierData.getClassifierTwo();
        Instances inst2 = classifierData.getInstances2();
        if (setUpstream == 0 || setDownstream == 0) {
            output.close();
            output2.close();
            throw new Exception("setUpstream == 0 || setDownstream == 0");
        }
        //for each sequence
        for (int x = 0; x < sequenceNameTableModel.getRowCount(); x++) {
            if (stopClassifier == true) {
                // User requested cancellation.
                statusPane.setText("Running of Classifier Stopped!");
                stopClassifier = false;
                output.close();
                output2.close();
                return;
            }
            //if(x%100 == 0)
            statusPane.setText("Running " + classifierData.getClassifierName() + " - ClassifierOne @ " + x
                    + " / " + sequenceNameTableModel.getRowCount());
            //Header
            output.write(sequenceNameTableModel.getHeader(x));
            output.newLine();
            output.write(sequenceNameTableModel.getSequence(x));
            output.newLine();
            output2.write(sequenceNameTableModel.getHeader(x));
            output2.newLine();
            output2.write(sequenceNameTableModel.getSequence(x));
            output2.newLine();
            //Sequence Score -> index-score, index-score
            String sequence = sequenceNameTableModel.getSequence(x);
            int minSequenceLengthRequired;
            int targetLocationIndex;
            //set the targetLocationIndex and minSequenceLengthRequired
            if (leftMostPosition < 0 && rightMostPosition > 0) {// -ve and +ve
                minSequenceLengthRequired = (leftMostPosition * -1) + rightMostPosition;
                targetLocationIndex = (leftMostPosition * -1);
            } else if (leftMostPosition < 0 && rightMostPosition < 0) {//-ve and -ve
                minSequenceLengthRequired = rightMostPosition - leftMostPosition + 1;
                targetLocationIndex = (leftMostPosition * -1);
            } else {//+ve and +ve
                minSequenceLengthRequired = rightMostPosition - leftMostPosition + 1;
                targetLocationIndex = (leftMostPosition * -1);
            }
            //This hashtable is used to ensure that on positions where predictions are already made,
            //we just skip. This will happen only if it is a type 2 classifier
            Hashtable<Integer, Double> scoreTable = new Hashtable<Integer, Double>();
            boolean firstEntryForClassifierOne = true;
            boolean firstEntryForClassifierTwo = true;
            for (int y = 0; y + (minSequenceLengthRequired - 1) < sequence.length(); y++) {
                int endPoint = y;//endPoint should be the exact position
                int currentY = y;
                int startPoint = y;
                //run only on Motifs?
                if (onMotifsOnlyMenuItem.getState()) {
                    //Check if targetLocation match any motif in motif List
                    if (motifListTableModel
                            .gotMotifMatch(sequence.substring(y + 0, y + targetLocationIndex)) == false)
                        continue; //position not found in motif list
                    else
                        //rollback to upstream and make prediction all the way till downstream
                        //needed for type 2 classifier
                        currentY += setUpstream;
                    if (setUpstream > 0)
                        currentY--;
                    startPoint = currentY; //note that y starts from 0 so y is surely >= 0
                    endPoint += setDownstream;
                    if (setDownstream > 0)
                        endPoint--;
                    //check still within bound of the sequence
                    if (startPoint < 0 || endPoint >= sequence.length() - (minSequenceLengthRequired - 1))
                        continue;//out of bounds
                }
                // Run classifier one on every position in [currentY, endPoint]
                // not already in the score cache.
                while (currentY <= endPoint) {
                    if (scoreTable.get(currentY + targetLocationIndex) != null) {
                        currentY++;
                        continue;
                    }
                    String line2 = sequence.substring(currentY + 0, currentY + minSequenceLengthRequired);
                    Instance tempInst;
                    tempInst = new Instance(inst.numAttributes());
                    tempInst.setDataset(inst);
                    for (int z = 0; z < inst.numAttributes() - 1; z++) {
                        //-1 because class attribute can be ignored
                        //Give the sequence and the featureList to get the feature freqs on the sequence
                        Object obj = GenerateArff.getMatchCount("+1_Index(" + targetLocationIndex + ")", line2,
                                featureDataArrayList.get(z), classifierData.getScoringMatrixIndex(),
                                classifierData.getCountingStyleIndex(), classifierData.getScoringMatrix());
                        // Feature values may come back as Integer, Double or String.
                        if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer"))
                            tempInst.setValue(z, (Integer) obj);
                        else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double"))
                            tempInst.setValue(z, (Double) obj);
                        else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String"))
                            tempInst.setValue(z, (String) obj);
                        else {
                            output.close();
                            output2.close();
                            throw new Error("Unknown: " + obj.getClass().getName());
                        }
                    }
                    //note that pos or neg does not matter as this is not used
                    tempInst.setValue(inst.numAttributes() - 1, "neg");
                    double[] results = classifierOne.distributionForInstance(tempInst);
                    if (firstEntryForClassifierOne)
                        firstEntryForClassifierOne = false;
                    else
                        output.write(",");
                    output.write(currentY + targetLocationIndex + "=" + results[0]);
                    scoreTable.put(currentY + targetLocationIndex, results[0]);
                    currentY++;
                }
                // Feed the cached classifier-one scores for this window to
                // classifier two.
                Instance tempInst2 = new Instance(minScoreWindowRequired + 1);//+1 for class attribute
                tempInst2.setDataset(inst2);
                int indexForClassifier2Inst = 0;
                for (int z = startPoint; z <= endPoint; z++) {
                    tempInst2.setValue(indexForClassifier2Inst, scoreTable.get(targetLocationIndex + z));
                    indexForClassifier2Inst++;
                }
                //note that pos or neg does not matter as this is not used
                tempInst2.setValue(tempInst2.numAttributes() - 1, "pos");
                double[] results = classifierTwo.distributionForInstance(tempInst2);
                if (firstEntryForClassifierTwo == true)
                    firstEntryForClassifierTwo = false;
                else
                    output2.write(",");
                output2.write(y + targetLocationIndex + "=" + results[0]);
            } //end of for loop
            output2.newLine();
            output2.flush();
            output.newLine();
            output.flush();
        }
        output.close();
        output2.close();
        statusPane.setText("Classifier Finished running...");
        loadScoreFile(outputDirectory + File.separator + "classifiertwo_" + classifierData.getClassifierName()
                + "_" + classifierData.getClassifierType() + "_" + fastaFilename + ".scores");
    } catch (Exception e) {
        JOptionPane.showMessageDialog(null, "Exception Occured", "Error", JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    }
}
From source file:sirius.trainer.step4.RunClassifier.java
License:Open Source License
/**
 * Trains "Classifier Two" (the second-level classifier) on Dataset2.arff and, when
 * {@code test} is true, runs the cascaded pair (classifierOne -> classifierTwo) over
 * Dataset 3, writing per-position scores to ClassifierTwo.scores and updating the
 * result panes/graph.
 *
 * @param parent                        owning frame, used as dialog parent on error
 * @param applicationData               shared state: working dir, dataset ranges, UI status pane
 * @param classifierTwoDisplayTextArea  text area receiving the statistics display
 * @param m_ClassifierEditor2           editor holding the user-chosen classifier-two instance
 * @param classifierOne                 already-trained first-level classifier
 * @param myGraph                       graph pane that will show the blind-test statistics
 * @param test                          if false, only train and return (no Dataset 3 evaluation)
 * @param classifierResults             list model collecting classifier/result summary rows
 * @param range                         range parameter forwarded to PredictionStats
 * @param threshold                     decision threshold forwarded to PredictionStats
 * @return the trained classifier two, or null if Dataset2 generation failed or an exception occurred
 */
public static Classifier startClassifierTwo(JInternalFrame parent, ApplicationData applicationData,
        JTextArea classifierTwoDisplayTextArea, GenericObjectEditor m_ClassifierEditor2, Classifier classifierOne,
        GraphPane myGraph, boolean test, ClassifierResults classifierResults, int range, double threshold) {
    // Declared outside the try so the catch block can report them on failure.
    int arraySize = 0;
    int lineCount = 0;
    try {
        StatusPane statusPane = applicationData.getStatusPane();
        //Initialising
        long totalTimeStart = System.currentTimeMillis();
        Step1TableModel positiveStep1TableModel = applicationData.getPositiveStep1TableModel();
        Step1TableModel negativeStep1TableModel = applicationData.getNegativeStep1TableModel();
        // Dataset 3 (blind test set) sequence index ranges for the pos/neg tables.
        int positiveDataset3FromInt = applicationData.getPositiveDataset3FromField();
        int positiveDataset3ToInt = applicationData.getPositiveDataset3ToField();
        int negativeDataset3FromInt = applicationData.getNegativeDataset3FromField();
        int negativeDataset3ToInt = applicationData.getNegativeDataset3ToField();
        //Preparing Dataset2.arff to train Classifier Two
        statusPane.setText("Preparing Dataset2.arff...");
        //This step generates Dataset2.arff
        if (DatasetGenerator.generateDataset2(parent, applicationData, applicationData.getSetUpstream(),
                applicationData.getSetDownstream(), classifierOne) == false) {
            //Interrupted or Error occurred
            return null;
        }
        //Training Classifier Two
        statusPane.setText("Training Classifier Two... May take a while... Please wait...");
        Instances inst2 = new Instances(new BufferedReader(
                new FileReader(applicationData.getWorkingDirectory() + File.separator + "Dataset2.arff")));
        // Weka convention: the class attribute is the last column.
        inst2.setClassIndex(inst2.numAttributes() - 1);
        long trainTimeStart = 0;
        long trainTimeElapsed = 0;
        Classifier classifierTwo = (Classifier) m_ClassifierEditor2.getValue();
        trainTimeStart = System.currentTimeMillis();
        applicationData.setDataset2Instances(inst2);
        classifierTwo.buildClassifier(inst2);
        trainTimeElapsed = System.currentTimeMillis() - trainTimeStart;
        //Running Classifier Two
        String classifierName = m_ClassifierEditor2.getValue().getClass().getName();
        classifierResults.updateList(classifierResults.getClassifierList(), "Classifier: ", classifierName);
        classifierResults.updateList(classifierResults.getClassifierList(), "Training Data: ",
                applicationData.getWorkingDirectory() + File.separator + "Dataset2.arff");
        classifierResults.updateList(classifierResults.getClassifierList(), "Time Used: ",
                Utils.doubleToString(trainTimeElapsed / 1000.0, 2) + " seconds");
        if (test == false) {
            // Caller only wanted the trained model; skip the Dataset 3 evaluation.
            statusPane.setText("Classifier Two Trained...Done...");
            return classifierTwo;
        }
        if (applicationData.terminateThread == true) {
            // NOTE(review): message says "Classifier One" - likely copy-pasted from
            // startClassifierOne; confirm intended wording.
            statusPane.setText("Interrupted - Classifier One Training Completed");
            return classifierTwo;
        }
        statusPane.setText("Running Classifier Two on Dataset 3...");
        //Generate the header for ClassifierTwo.scores on Dataset3
        BufferedWriter classifierTwoOutput = new BufferedWriter(new FileWriter(
                applicationData.getWorkingDirectory() + File.separator + "ClassifierTwo.scores"));
        if (m_ClassifierEditor2.getValue() instanceof OptionHandler)
            classifierName += " "
                    + Utils.joinOptions(((OptionHandler) m_ClassifierEditor2.getValue()).getOptions());
        //Generating an Instance given a sequence with the current attributes
        // Window size = number of positions between upstream and downstream; the three
        // branches cover the sign combinations (position 0 does not exist in this scheme).
        int setClassifierTwoUpstreamInt = applicationData.getSetUpstream();
        int setClassifierTwoDownstreamInt = applicationData.getSetDownstream();
        int classifierTwoWindowSize;
        if (setClassifierTwoUpstreamInt < 0 && setClassifierTwoDownstreamInt > 0)
            classifierTwoWindowSize = (setClassifierTwoUpstreamInt * -1) + setClassifierTwoDownstreamInt;
        else if (setClassifierTwoUpstreamInt < 0 && setClassifierTwoDownstreamInt < 0)
            classifierTwoWindowSize = (setClassifierTwoUpstreamInt - setClassifierTwoDownstreamInt - 1) * -1;
        else//both +ve
            classifierTwoWindowSize = (setClassifierTwoDownstreamInt - setClassifierTwoUpstreamInt + 1);
        Instances inst = applicationData.getDataset1Instances();
        //NOTE: need to take care of this function;
        FastaFileManipulation fastaFile = new FastaFileManipulation(positiveStep1TableModel,
                negativeStep1TableModel, positiveDataset3FromInt, positiveDataset3ToInt, negativeDataset3FromInt,
                negativeDataset3ToInt, applicationData.getWorkingDirectory());
        //loading in all the features..
        ArrayList<Feature> featureDataArrayList = new ArrayList<Feature>();
        for (int x = 0; x < inst.numAttributes() - 1; x++) {
            //-1 because class attribute must be ignored
            featureDataArrayList.add(Feature.levelOneClassifierPane(inst.attribute(x).name()));
        }
        //Reading the fastaFile
        String _class = "pos";
        lineCount = 0;
        int totalPosSequences = positiveDataset3ToInt - positiveDataset3FromInt + 1;
        FastaFormat fastaFormat;
        // First totalPosSequences sequences are positives; afterwards _class flips to "neg".
        while ((fastaFormat = fastaFile.nextSequence(_class)) != null) {
            if (applicationData.terminateThread == true) {
                statusPane.setText("Interrupted - Classifier Two Trained");
                classifierTwoOutput.close();
                return classifierTwo;
            }
            lineCount++;
            classifierTwoOutput.write(fastaFormat.getHeader());
            classifierTwoOutput.newLine();
            classifierTwoOutput.write(fastaFormat.getSequence());
            classifierTwoOutput.newLine();
            //if((lineCount % 100) == 0){
            statusPane.setText("Running ClassifierTwo on Dataset 3...@ " + lineCount + " / "
                    + applicationData.getTotalSequences(3) + " Sequences");
            //}
            arraySize = fastaFormat.getArraySize(applicationData.getLeftMostPosition(),
                    applicationData.getRightMostPosition());
            //This area always generate -ve arraySize~! WHY?? Exception always occur here
            // NOTE(review): original author's comment above - a negative arraySize here
            // makes the allocation below throw NegativeArraySizeException; root cause in
            // FastaFormat.getArraySize has not been confirmed from this file.
            double scores[] = new double[arraySize];
            int predictPosition[] = fastaFormat.getPredictPositionForClassifierOne(
                    applicationData.getLeftMostPosition(), applicationData.getRightMostPosition());
            //Doing shift from upstream till downstream
            SequenceManipulation seq = new SequenceManipulation(fastaFormat.getSequence(), predictPosition[0],
                    predictPosition[1]);
            int scoreCount = 0;
            String line2;
            // Score every shifted window with classifier one.
            while ((line2 = seq.nextShift()) != null) {
                Instance tempInst = new Instance(inst.numAttributes());
                tempInst.setDataset(inst);
                //-1 because class attribute can be ignored
                for (int x = 0; x < inst.numAttributes() - 1; x++) {
                    Object obj = GenerateArff.getMatchCount(fastaFormat.getHeader(), line2,
                            featureDataArrayList.get(x), applicationData.getScoringMatrixIndex(),
                            applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix());
                    // Dispatch on the runtime type of the computed feature value.
                    if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer"))
                        tempInst.setValue(x, (Integer) obj);
                    else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double"))
                        tempInst.setValue(x, (Double) obj);
                    else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String"))
                        tempInst.setValue(x, (String) obj);
                    else {
                        classifierTwoOutput.close();
                        throw new Error("Unknown: " + obj.getClass().getName());
                    }
                }
                tempInst.setValue(inst.numAttributes() - 1, _class);
                //Run classifierOne
                double[] results = classifierOne.distributionForInstance(tempInst);
                scores[scoreCount++] = results[0];
            }
            //Run classifierTwo
            int currentPosition = fastaFormat.getPredictionFromForClassifierTwo(
                    applicationData.getLeftMostPosition(), applicationData.getRightMostPosition(),
                    applicationData.getSetUpstream());
            classifierTwoOutput.write(_class);
            // Slide the classifier-two window over the classifier-one score array.
            for (int y = 0; y < arraySize - classifierTwoWindowSize + 1; y++) {
                //+1 is for the class index
                Instance tempInst2 = new Instance(classifierTwoWindowSize + 1);
                tempInst2.setDataset(inst2);
                for (int x = 0; x < classifierTwoWindowSize; x++) {
                    tempInst2.setValue(x, scores[x + y]);
                }
                tempInst2.setValue(tempInst2.numAttributes() - 1, _class);
                double[] results = classifierTwo.distributionForInstance(tempInst2);
                classifierTwoOutput.write("," + currentPosition + "=" + results[0]);
                currentPosition++;
                // Position 0 does not exist in the upstream/downstream numbering; skip it.
                if (currentPosition == 0)
                    currentPosition++;
            }
            classifierTwoOutput.newLine();
            classifierTwoOutput.flush();
            if (lineCount == totalPosSequences)
                _class = "neg";
        }
        classifierTwoOutput.close();
        statusPane.setText("Done!");
        PredictionStats classifierTwoStatsOnBlindTest = new PredictionStats(
                applicationData.getWorkingDirectory() + File.separator + "ClassifierTwo.scores", range,
                threshold);
        //display(double range)
        long totalTimeElapsed = System.currentTimeMillis() - totalTimeStart;
        // NOTE(review): totalTimeElapsed / 60000 is integer division, so the minutes
        // figure is truncated before formatting - confirm that is the intended display.
        classifierResults.updateList(classifierResults.getResultsList(), "Total Time Used: ",
                Utils.doubleToString(totalTimeElapsed / 60000, 2) + " minutes "
                        + Utils.doubleToString((totalTimeElapsed / 1000.0) % 60.0, 2) + " seconds");
        classifierTwoStatsOnBlindTest.updateDisplay(classifierResults, classifierTwoDisplayTextArea, true);
        applicationData.setClassifierTwoStats(classifierTwoStatsOnBlindTest);
        myGraph.setMyStats(classifierTwoStatsOnBlindTest);
        fastaFile.cleanUp();
        return classifierTwo;
    } catch (Exception ex) {
        ex.printStackTrace();
        JOptionPane.showMessageDialog(parent,
                ex.getMessage() + "Classifier Two On Blind Test Set - Check Console Output",
                "Evaluate classifier two", JOptionPane.ERROR_MESSAGE);
        // Dump the state most often implicated in failures (see arraySize note above).
        System.err.println("applicationData.getLeftMostPosition(): " + applicationData.getLeftMostPosition());
        System.err.println("applicationData.getRightMostPosition(): " + applicationData.getRightMostPosition());
        System.err.println("arraySize: " + arraySize);
        System.err.println("lineCount: " + lineCount);
        return null;
    }
}
From source file:sirius.trainer.step4.RunClassifier.java
License:Open Source License
public static Classifier xValidateClassifierTwo(JInternalFrame parent, ApplicationData applicationData, JTextArea classifierTwoDisplayTextArea, GenericObjectEditor m_ClassifierEditor2, Classifier classifierOne, int folds, GraphPane myGraph, ClassifierResults classifierResults, int range, double threshold, boolean outputClassifier) { try {/*w ww.j av a 2 s .c o m*/ StatusPane statusPane = applicationData.getStatusPane(); long totalTimeStart = System.currentTimeMillis(), totalTimeElapsed; //Classifier tempClassifier = (Classifier) m_ClassifierEditor2.getValue(); final int positiveDataset2FromInt = applicationData.getPositiveDataset2FromField(); final int positiveDataset2ToInt = applicationData.getPositiveDataset2ToField(); final int negativeDataset2FromInt = applicationData.getNegativeDataset2FromField(); final int negativeDataset2ToInt = applicationData.getNegativeDataset2ToField(); final int totalDataset2Sequences = (positiveDataset2ToInt - positiveDataset2FromInt + 1) + (negativeDataset2ToInt - negativeDataset2FromInt + 1); final int classifierTwoUpstream = applicationData.getSetUpstream(); final int classifierTwoDownstream = applicationData.getSetDownstream(); Step1TableModel positiveStep1TableModel = applicationData.getPositiveStep1TableModel(); Step1TableModel negativeStep1TableModel = applicationData.getNegativeStep1TableModel(); //Train classifier two with the full dataset first then do cross-validation to gauge its accuracy //Preparing Dataset2.arff to train Classifier Two long trainTimeStart = 0, trainTimeElapsed = 0; statusPane.setText("Preparing Dataset2.arff..."); //This step generates Dataset2.arff if (DatasetGenerator.generateDataset2(parent, applicationData, applicationData.getSetUpstream(), applicationData.getSetDownstream(), classifierOne) == false) { //Interrupted or Error occurred return null; } Instances instOfDataset2 = new Instances(new BufferedReader( new FileReader(applicationData.getWorkingDirectory() + File.separator + "Dataset2.arff"))); 
instOfDataset2.setClassIndex(instOfDataset2.numAttributes() - 1); applicationData.setDataset2Instances(instOfDataset2); Classifier classifierTwo = (Classifier) m_ClassifierEditor2.getValue(); statusPane.setText("Training Classifier Two... May take a while... Please wait..."); //Record Start Time trainTimeStart = System.currentTimeMillis(); if (outputClassifier) classifierTwo.buildClassifier(instOfDataset2); //Record Total Time used to build classifier one trainTimeElapsed = System.currentTimeMillis() - trainTimeStart; //Training Done String classifierName = m_ClassifierEditor2.getValue().getClass().getName(); classifierResults.updateList(classifierResults.getClassifierList(), "Classifier: ", classifierName); classifierResults.updateList(classifierResults.getClassifierList(), "Training Data: ", folds + " fold cross-validation on Dataset2.arff"); classifierResults.updateList(classifierResults.getClassifierList(), "Time Used: ", Utils.doubleToString(trainTimeElapsed / 1000.0, 2) + " seconds"); Instances instOfDataset1 = new Instances(applicationData.getDataset1Instances()); instOfDataset1.setClassIndex(applicationData.getDataset1Instances().numAttributes() - 1); //Reading and Storing the featureList ArrayList<Feature> featureDataArrayList = new ArrayList<Feature>(); for (int y = 0; y < instOfDataset1.numAttributes() - 1; y++) { featureDataArrayList.add(Feature.levelOneClassifierPane(instOfDataset1.attribute(y).name())); } //Generating an Instance given a sequence with the current attributes int setClassifierTwoUpstreamInt = applicationData.getSetUpstream(); int setClassifierTwoDownstreamInt = applicationData.getSetDownstream(); int classifierTwoWindowSize; if (setClassifierTwoUpstreamInt < 0 && setClassifierTwoDownstreamInt > 0) classifierTwoWindowSize = (setClassifierTwoUpstreamInt * -1) + setClassifierTwoDownstreamInt; else if (setClassifierTwoUpstreamInt < 0 && setClassifierTwoDownstreamInt < 0) classifierTwoWindowSize = (setClassifierTwoUpstreamInt - 
setClassifierTwoDownstreamInt - 1) * -1; else//both +ve classifierTwoWindowSize = (setClassifierTwoDownstreamInt - setClassifierTwoUpstreamInt + 1); int posTestSequenceCounter = 0; BufferedWriter outputCrossValidation = new BufferedWriter(new FileWriter( applicationData.getWorkingDirectory() + File.separator + "classifierTwo.scores")); for (int x = 0; x < folds; x++) { File trainFile = new File(applicationData.getWorkingDirectory() + File.separator + "trainingDataset2_" + (x + 1) + ".arff"); File testFile = new File(applicationData.getWorkingDirectory() + File.separator + "testingDataset2_" + (x + 1) + ".fasta"); statusPane.setText("Preparing Training Data for Fold " + (x + 1) + ".."); FastaFileManipulation fastaFile = new FastaFileManipulation(positiveStep1TableModel, negativeStep1TableModel, positiveDataset2FromInt, positiveDataset2ToInt, negativeDataset2FromInt, negativeDataset2ToInt, applicationData.getWorkingDirectory()); //1) generate trainingDataset2.arff headings BufferedWriter trainingOutputFile = new BufferedWriter( new FileWriter(applicationData.getWorkingDirectory() + File.separator + "trainingDataset2_" + (x + 1) + ".arff")); trainingOutputFile.write("@relation 'A temp file for X-validation purpose' "); trainingOutputFile.newLine(); trainingOutputFile.newLine(); trainingOutputFile.flush(); for (int y = classifierTwoUpstream; y <= classifierTwoDownstream; y++) { if (y != 0) { trainingOutputFile.write("@attribute (" + y + ") numeric"); trainingOutputFile.newLine(); trainingOutputFile.flush(); } } if (positiveDataset2FromInt > 0 && negativeDataset2FromInt > 0) trainingOutputFile.write("@attribute Class {pos,neg}"); else if (positiveDataset2FromInt > 0 && negativeDataset2FromInt == 0) trainingOutputFile.write("@attribute Class {pos}"); else if (positiveDataset2FromInt == 0 && negativeDataset2FromInt > 0) trainingOutputFile.write("@attribute Class {neg}"); trainingOutputFile.newLine(); trainingOutputFile.newLine(); trainingOutputFile.write("@data"); 
trainingOutputFile.newLine(); trainingOutputFile.newLine(); trainingOutputFile.flush(); //AHFU_DEBUG BufferedWriter testingOutputFileArff = new BufferedWriter( new FileWriter(applicationData.getWorkingDirectory() + File.separator + "testingDataset2_" + (x + 1) + ".arff")); testingOutputFileArff.write("@relation 'A temp file for X-validation purpose' "); testingOutputFileArff.newLine(); testingOutputFileArff.newLine(); testingOutputFileArff.flush(); for (int y = classifierTwoUpstream; y <= classifierTwoDownstream; y++) { if (y != 0) { testingOutputFileArff.write("@attribute (" + y + ") numeric"); testingOutputFileArff.newLine(); testingOutputFileArff.flush(); } } if (positiveDataset2FromInt > 0 && negativeDataset2FromInt > 0) testingOutputFileArff.write("@attribute Class {pos,neg}"); else if (positiveDataset2FromInt > 0 && negativeDataset2FromInt == 0) testingOutputFileArff.write("@attribute Class {pos}"); else if (positiveDataset2FromInt == 0 && negativeDataset2FromInt > 0) testingOutputFileArff.write("@attribute Class {neg}"); testingOutputFileArff.newLine(); testingOutputFileArff.newLine(); testingOutputFileArff.write("@data"); testingOutputFileArff.newLine(); testingOutputFileArff.newLine(); testingOutputFileArff.flush(); //AHFU_DEBUG END //2) generate testingDataset2.fasta BufferedWriter testingOutputFile = new BufferedWriter( new FileWriter(applicationData.getWorkingDirectory() + File.separator + "testingDataset2_" + (x + 1) + ".fasta")); //Now, populating datas for both the training and testing files int fastaFileLineCounter = 0; posTestSequenceCounter = 0; int totalTestSequenceCounter = 0; int totalTrainTestSequenceCounter = 0; FastaFormat fastaFormat; //For pos sequences while ((fastaFormat = fastaFile.nextSequence("pos")) != null) { if (applicationData.terminateThread == true) { statusPane.setText("Interrupted - Classifier Two Trained"); outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); 
return classifierTwo; } totalTrainTestSequenceCounter++; //if(totalTrainTestSequenceCounter%100 == 0) statusPane.setText("Preparing Training Data for Fold " + (x + 1) + ".. @ " + totalTrainTestSequenceCounter + " / " + totalDataset2Sequences); if ((fastaFileLineCounter % folds) == x) {//This sequence is for testing testingOutputFile.write(fastaFormat.getHeader()); testingOutputFile.newLine(); testingOutputFile.write(fastaFormat.getSequence()); testingOutputFile.newLine(); testingOutputFile.flush(); posTestSequenceCounter++; totalTestSequenceCounter++; //AHFU DEBUG SequenceManipulation seq = new SequenceManipulation(fastaFormat.getSequence(), classifierTwoUpstream, classifierTwoDownstream); String line2; while ((line2 = seq.nextShift()) != null) { Instance tempInst = new Instance(instOfDataset1.numAttributes()); tempInst.setDataset(instOfDataset1); //-1 because class attribute can be ignored for (int w = 0; w < instOfDataset1.numAttributes() - 1; w++) { Object obj = GenerateArff.getMatchCount(fastaFormat.getHeader(), line2, featureDataArrayList.get(w), applicationData.getScoringMatrixIndex(), applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix()); if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer")) tempInst.setValue(w, (Integer) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double")) tempInst.setValue(w, (Double) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String")) tempInst.setValue(w, (String) obj); else { outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); throw new Error("Unknown: " + obj.getClass().getName()); } } tempInst.setValue(tempInst.numAttributes() - 1, "pos"); double[] results = classifierOne.distributionForInstance(tempInst); testingOutputFileArff.write(results[0] + ","); } testingOutputFileArff.write("pos"); testingOutputFileArff.newLine(); testingOutputFileArff.flush(); //AHFU DEBUG END } else {//This 
sequence is for training SequenceManipulation seq = new SequenceManipulation(fastaFormat.getSequence(), classifierTwoUpstream, classifierTwoDownstream); String line2; while ((line2 = seq.nextShift()) != null) { Instance tempInst = new Instance(instOfDataset1.numAttributes()); tempInst.setDataset(instOfDataset1); //-1 because class attribute can be ignored for (int w = 0; w < instOfDataset1.numAttributes() - 1; w++) { Object obj = GenerateArff.getMatchCount(fastaFormat.getHeader(), line2, featureDataArrayList.get(w), applicationData.getScoringMatrixIndex(), applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix()); if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer")) tempInst.setValue(w, (Integer) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double")) tempInst.setValue(w, (Double) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String")) tempInst.setValue(w, (String) obj); else { outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); throw new Error("Unknown: " + obj.getClass().getName()); } } tempInst.setValue(tempInst.numAttributes() - 1, "pos"); double[] results = classifierOne.distributionForInstance(tempInst); trainingOutputFile.write(results[0] + ","); } trainingOutputFile.write("pos"); trainingOutputFile.newLine(); trainingOutputFile.flush(); } fastaFileLineCounter++; } //For neg sequences fastaFileLineCounter = 0; while ((fastaFormat = fastaFile.nextSequence("neg")) != null) { if (applicationData.terminateThread == true) { statusPane.setText("Interrupted - Classifier Two Trained"); outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); return classifierTwo; } totalTrainTestSequenceCounter++; //if(totalTrainTestSequenceCounter%100 == 0) statusPane.setText("Preparing Training Data for Fold " + (x + 1) + ".. 
@ " + totalTrainTestSequenceCounter + " / " + totalDataset2Sequences); if ((fastaFileLineCounter % folds) == x) {//This sequence is for testing testingOutputFile.write(fastaFormat.getHeader()); testingOutputFile.newLine(); testingOutputFile.write(fastaFormat.getSequence()); testingOutputFile.newLine(); testingOutputFile.flush(); totalTestSequenceCounter++; //AHFU DEBUG SequenceManipulation seq = new SequenceManipulation(fastaFormat.getSequence(), classifierTwoUpstream, classifierTwoDownstream); String line2; while ((line2 = seq.nextShift()) != null) { Instance tempInst = new Instance(instOfDataset1.numAttributes()); tempInst.setDataset(instOfDataset1); //-1 because class attribute can be ignored for (int w = 0; w < instOfDataset1.numAttributes() - 1; w++) { Object obj = GenerateArff.getMatchCount(fastaFormat.getHeader(), line2, featureDataArrayList.get(w), applicationData.getScoringMatrixIndex(), applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix()); if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer")) tempInst.setValue(w, (Integer) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double")) tempInst.setValue(w, (Double) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String")) tempInst.setValue(w, (String) obj); else { outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); throw new Error("Unknown: " + obj.getClass().getName()); } } tempInst.setValue(tempInst.numAttributes() - 1, "pos");//pos or neg does not matter here - not used double[] results = classifierOne.distributionForInstance(tempInst); testingOutputFileArff.write(results[0] + ","); } testingOutputFileArff.write("neg"); testingOutputFileArff.newLine(); testingOutputFileArff.flush(); //AHFU DEBUG END } else {//This sequence is for training SequenceManipulation seq = new SequenceManipulation(fastaFormat.getSequence(), classifierTwoUpstream, classifierTwoDownstream); 
String line2; while ((line2 = seq.nextShift()) != null) { Instance tempInst = new Instance(instOfDataset1.numAttributes()); tempInst.setDataset(instOfDataset1); //-1 because class attribute can be ignored for (int w = 0; w < instOfDataset1.numAttributes() - 1; w++) { Object obj = GenerateArff.getMatchCount(fastaFormat.getHeader(), line2, featureDataArrayList.get(w), applicationData.getScoringMatrixIndex(), applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix()); if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer")) tempInst.setValue(w, (Integer) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double")) tempInst.setValue(w, (Double) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String")) tempInst.setValue(w, (String) obj); else { outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); throw new Error("Unknown: " + obj.getClass().getName()); } } tempInst.setValue(tempInst.numAttributes() - 1, "pos");//pos or neg does not matter here - not used double[] results = classifierOne.distributionForInstance(tempInst); trainingOutputFile.write(results[0] + ","); } trainingOutputFile.write("neg"); trainingOutputFile.newLine(); trainingOutputFile.flush(); } fastaFileLineCounter++; } trainingOutputFile.close(); testingOutputFile.close(); //AHFU_DEBUG testingOutputFileArff.close(); //AHFU DEBUG END //3) train and test classifier two then store the statistics statusPane.setText("Building Fold " + (x + 1) + ".."); //open an input stream to the arff file BufferedReader trainingInput = new BufferedReader( new FileReader(applicationData.getWorkingDirectory() + File.separator + "trainingDataset2_" + (x + 1) + ".arff")); //getting ready to train a foldClassifier using arff file Instances instOfTrainingDataset2 = new Instances( new BufferedReader(new FileReader(applicationData.getWorkingDirectory() + File.separator + "trainingDataset2_" + (x + 1) + 
".arff"))); instOfTrainingDataset2.setClassIndex(instOfTrainingDataset2.numAttributes() - 1); Classifier foldClassifier = (Classifier) m_ClassifierEditor2.getValue(); foldClassifier.buildClassifier(instOfTrainingDataset2); trainingInput.close(); //Reading the test file statusPane.setText("Evaluating fold " + (x + 1) + ".."); BufferedReader testingInput = new BufferedReader( new FileReader(applicationData.getWorkingDirectory() + File.separator + "testingDataset2_" + (x + 1) + ".fasta")); int lineCounter = 0; String lineHeader; String lineSequence; while ((lineHeader = testingInput.readLine()) != null) { if (applicationData.terminateThread == true) { statusPane.setText("Interrupted - Classifier Two Not Trained"); outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); testingInput.close(); return classifierTwo; } lineSequence = testingInput.readLine(); outputCrossValidation.write(lineHeader); outputCrossValidation.newLine(); outputCrossValidation.write(lineSequence); outputCrossValidation.newLine(); lineCounter++; fastaFormat = new FastaFormat(lineHeader, lineSequence); int arraySize = fastaFormat.getArraySize(applicationData.getLeftMostPosition(), applicationData.getRightMostPosition()); double scores[] = new double[arraySize]; int predictPosition[] = fastaFormat.getPredictPositionForClassifierOne( applicationData.getLeftMostPosition(), applicationData.getRightMostPosition()); //For each sequence, you want to shift from upstream till downstream //ie changing the +1 location //to get the scores by classifier one so that can use it to train classifier two later //Doing shift from upstream till downstream //if(lineCounter % 100 == 0) statusPane.setText("Evaluating fold " + (x + 1) + ".. 
@ " + lineCounter + " / " + totalTestSequenceCounter); SequenceManipulation seq = new SequenceManipulation(lineSequence, predictPosition[0], predictPosition[1]); int scoreCount = 0; String line2; while ((line2 = seq.nextShift()) != null) { Instance tempInst = new Instance(instOfDataset1.numAttributes()); tempInst.setDataset(instOfDataset1); for (int i = 0; i < instOfDataset1.numAttributes() - 1; i++) { //-1 because class attribute can be ignored //Give the sequence and the featureList to get the feature freqs on the sequence Object obj = GenerateArff.getMatchCount(lineHeader, line2, featureDataArrayList.get(i), applicationData.getScoringMatrixIndex(), applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix()); if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer")) tempInst.setValue(i, (Integer) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double")) tempInst.setValue(i, (Double) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String")) tempInst.setValue(i, (String) obj); else { outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); testingInput.close(); throw new Error("Unknown: " + obj.getClass().getName()); } } if (lineCounter > posTestSequenceCounter) {//for neg tempInst.setValue(tempInst.numAttributes() - 1, "neg"); } else { tempInst.setValue(tempInst.numAttributes() - 1, "pos"); } double[] results = classifierOne.distributionForInstance(tempInst); scores[scoreCount++] = results[0]; } //end of sequence shift //Run classifierTwo int currentPosition = fastaFormat.getPredictionFromForClassifierTwo( applicationData.getLeftMostPosition(), applicationData.getRightMostPosition(), applicationData.getSetUpstream()); if (lineCounter > posTestSequenceCounter)//neg outputCrossValidation.write("neg"); else outputCrossValidation.write("pos"); for (int y = 0; y < arraySize - classifierTwoWindowSize + 1; y++) { //+1 is for the class index 
Instance tempInst2 = new Instance(classifierTwoWindowSize + 1); tempInst2.setDataset(instOfTrainingDataset2); for (int l = 0; l < classifierTwoWindowSize; l++) { tempInst2.setValue(l, scores[l + y]); } if (lineCounter > posTestSequenceCounter)//for neg tempInst2.setValue(tempInst2.numAttributes() - 1, "neg"); else//for pos tempInst2.setValue(tempInst2.numAttributes() - 1, "pos"); double[] results = foldClassifier.distributionForInstance(tempInst2); outputCrossValidation.write("," + currentPosition + "=" + results[0]); currentPosition++; if (currentPosition == 0) currentPosition++; } outputCrossValidation.newLine(); outputCrossValidation.flush(); } //end of reading test file outputCrossValidation.close(); testingOutputFileArff.close(); testingOutputFile.close(); trainingOutputFile.close(); testingInput.close(); fastaFile.cleanUp(); //AHFU_DEBUG trainFile.deleteOnExit(); testFile.deleteOnExit(); //NORMAL MODE //trainFile.delete(); //testFile.delete(); } //end of for loop for xvalidation PredictionStats classifierTwoStatsOnXValidation = new PredictionStats( applicationData.getWorkingDirectory() + File.separator + "classifierTwo.scores", range, threshold); //display(double range) totalTimeElapsed = System.currentTimeMillis() - totalTimeStart; classifierResults.updateList(classifierResults.getResultsList(), "Total Time Used: ", Utils.doubleToString(totalTimeElapsed / 60000, 2) + " minutes " + Utils.doubleToString((totalTimeElapsed / 1000.0) % 60.0, 2) + " seconds"); classifierTwoStatsOnXValidation.updateDisplay(classifierResults, classifierTwoDisplayTextArea, true); applicationData.setClassifierTwoStats(classifierTwoStatsOnXValidation); myGraph.setMyStats(classifierTwoStatsOnXValidation); statusPane.setText("Done!"); return classifierTwo; } catch (Exception e) { e.printStackTrace(); JOptionPane.showMessageDialog(parent, e.getMessage(), "ERROR", JOptionPane.ERROR_MESSAGE); return null; } }
From source file:swm.project.mappings.OurDistance.java
/**
 * Distance between two Weka instances.
 *
 * <p>NOTE(review): this is an unfinished stub - the per-attribute comparison loop is
 * empty, the rating lists are never populated, and the method always returns 1, i.e.
 * every pair of distinct instances is treated as equally far apart. Confirm whether a
 * real metric was intended before relying on clustering results built on this.
 *
 * @param instnc  first instance
 * @param instnc1 second instance (currently ignored)
 * @return always 1
 */
@Override
public double distance(Instance instnc, Instance instnc1) {
    int num = instnc.numAttributes();
    // NOTE(review): dead locals left from the unfinished implementation - never read.
    List<Double> movieClusterRating1 = new ArrayList<Double>(), movieClusterRating2 = new ArrayList<Double>();
    Attribute id = instnc.attribute(0); // NOTE(review): unused
    for (int index = 1; index < num; index++) {
        // NOTE(review): intentionally(?) empty - the attribute comparison was never written.
    }
    return 1; // constant distance
}