Example usage for weka.core Instance setValue

Introduction

In this page you can find the example usage for weka.core Instance setValue.

Prototype

public void setValue(Attribute att, String value);

Source Link

Document

Sets a value of an nominal or string attribute to the given value.

Usage

From source file:edu.utexas.cs.tactex.utils.RegressionUtils.java

License:Open Source License

public static Instances createXInstances(XYForRegression xy) {
    // build attributes
    ArrayList<Attribute> attributes = new ArrayList<Attribute>();
    for (String attrName : xy.getAttrNames()) {
        // attribute name is the polynomial power
        Attribute xattr = new Attribute(attrName);
        attributes.add(xattr);//from   w  ww.  ja  va 2  s.  co m
    }

    double[][] Xarr = xy.getX();
    int numInstances = Xarr.length;
    int numAttr = Xarr[0].length;
    Instances Xinst = new Instances("X", attributes, numInstances);

    // fill instances 
    for (int i = 0; i < numInstances; ++i) {
        Instance row = new DenseInstance(numAttr);
        for (int j = 0; j < numAttr; ++j) {
            row.setValue(j, Xarr[i][j]);
        }
        Xinst.add(row);
    }
    return Xinst;
}

From source file:elh.eus.absa.CLI.java

License:Open Source License

/**
 * Main access to the train-atc functionalities. Train ATC using a double one vs. all classifier
 * (E and A) for E#A aspect categories//from w w w .  ja  v a  2 s . co  m
 * @throws Exception 
 */
public final void trainATC2(final InputStream inputStream) throws IOException {
    // load training parameters file
    String paramFile = parsedArguments.getString("params");
    String testFile = parsedArguments.getString("testset");
    String paramFile2 = parsedArguments.getString("params2");
    String corpusFormat = parsedArguments.getString("corpusFormat");
    //String validation = parsedArguments.getString("validation");
    String lang = parsedArguments.getString("language");
    //int foldNum = Integer.parseInt(parsedArguments.getString("foldNum"));
    //boolean printPreds = parsedArguments.getBoolean("printPreds");
    boolean nullSentenceOpinions = parsedArguments.getBoolean("nullSentences");
    boolean onlyTest = parsedArguments.getBoolean("testOnly");
    double threshold = 0.5;
    double threshold2 = 0.5;
    String modelsPath = "/home/inaki/elixa-atp/ovsaModels";

    CorpusReader reader = new CorpusReader(inputStream, corpusFormat, nullSentenceOpinions, lang);
    Features atcTrain = new Features(reader, paramFile, "3");
    Instances traindata = atcTrain.loadInstances(true, "atc");

    if (onlyTest) {
        if (FileUtilsElh.checkFile(testFile)) {
            System.err.println("read from test file");
            reader = new CorpusReader(new FileInputStream(new File(testFile)), corpusFormat,
                    nullSentenceOpinions, lang);
            atcTrain.setCorpus(reader);
            traindata = atcTrain.loadInstances(true, "atc");
        }
    }

    //setting class attribute (entCat|attCat|entAttCat|polarityCat)

    //HashMap<String, Integer> opInst = atcTrain.getOpinInst();      
    //WekaWrapper classifyAtts;
    WekaWrapper onevsall;
    try {

        //classify.printMultilabelPredictions(classify.multiLabelPrediction());      */   

        //onevsall
        Instances entdata = new Instances(traindata);
        entdata.deleteAttributeAt(entdata.attribute("attCat").index());
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        entdata.setClassIndex(entdata.attribute("entCat").index());
        onevsall = new WekaWrapper(entdata, true);

        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "entCat");
            System.out.println("trainATC: one vs all models ready");
        }
        onevsall.setTestdata(entdata);
        HashMap<Integer, HashMap<String, Double>> ovsaRes = onevsall.predictOneVsAll(modelsPath,
                paramFile + "entCat");
        System.out.println("trainATC: one vs all predictions ready");
        HashMap<Integer, String> instOps = new HashMap<Integer, String>();
        for (String oId : atcTrain.getOpinInst().keySet()) {
            instOps.put(atcTrain.getOpinInst().get(oId), oId);
        }

        atcTrain = new Features(reader, paramFile2, "3");
        entdata = atcTrain.loadInstances(true, "attTrain2_data");
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        //entdata.setClassIndex(entdata.attribute("entCat").index());

        Attribute insAtt = entdata.attribute("instanceId");
        double maxInstId = entdata.kthSmallestValue(insAtt, entdata.numDistinctValues(insAtt) - 1);
        System.err.println("last instance has index: " + maxInstId);
        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins" + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold) {
                    //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));                  
                    // for the first one update the instances
                    if (oSubId >= 1) {
                        Instance newIns = new SparseInstance(currentInst);
                        newIns.setDataset(entdata);
                        entdata.add(newIns);
                        newIns.setValue(insAtt, maxInstId + oSubId);
                        newIns.setClassValue(cl);
                        instOps.put((int) maxInstId + oSubId, oId);

                    }
                    // if the are more create new instances
                    else {
                        currentInst.setClassValue(cl);
                        //create and add opinion to the structure
                        //   trgt, offsetFrom, offsetTo, polarity, cat, sId);
                        //Opinion op = new Opinion(instOps.get(i)+"_"+oSubId, "", 0, 0, "", cl, sId);
                        //reader.addOpinion(op);
                    }
                    oSubId++;
                }
            } //finished updating instances data                                    
        }

        entdata.setClass(entdata.attribute("attCat"));
        onevsall = new WekaWrapper(entdata, true);

        /**
         *  Bigarren sailkatzailea
         * 
         * */
        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "attCat");
            System.out.println("trainATC: one vs all attcat models ready");
        }

        ovsaRes = onevsall.predictOneVsAll(modelsPath, paramFile + "entAttCat");

        insAtt = entdata.attribute("instanceId");
        maxInstId = entdata.kthSmallestValue(insAtt, insAtt.numValues());
        System.err.println("last instance has index: " + maxInstId);
        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins: " + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold2) {
                    ///System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                    if (ovsaRes.get(i).get(cl) > threshold) {
                        //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));                  
                        // for the first one update the instances
                        if (oSubId >= 1) {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            //   trgt, offsetFrom, offsetTo, polarity, cat, sId);                     
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        // if the are more create new instances
                        else {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            //   trgt, offsetFrom, offsetTo, polarity, cat, sId);
                            reader.removeOpinion(oId);
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        oSubId++;
                    }
                } //finished updating instances data                                    
            }
        }
        reader.print2Semeval2015format(paramFile + "entAttCat.xml");
    } catch (Exception e) {
        e.printStackTrace();
    }

    //traindata.setClass(traindata.attribute("entAttCat"));
    System.err.println("DONE CLI train-atc2 (oneVsAll)");
}

From source file:elh.eus.absa.Features.java

License:Open Source License

/**
 *   Function fills the attribute vectors for the instances existing in the corpus given. 
 *   Attribute vectors contain the features loaded by the creatFeatureSet() function.
 * /*from  ww w.  j av a2 s.  c om*/
 * @param boolean save : whether the Instances file should be saved to an arff file or not.
 * @return Weka Instances object containing the attribute vectors filled with the features specified
 *          in the parameter file.
 */
public Instances loadInstances(boolean save, String prefix) throws IOException {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //Properties posProp = new Properties();
    //eus.ixa.ixa.pipe.pos.Annotate postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);      
    if (params.containsKey("lemmaNgrams")) {
        Properties posProp = NLPpipelineWrapper.setPostaggerProperties(params.getProperty("pos-model"),
                params.getProperty("lemma-model"), corpus.getLang(), "bin", "false");

        postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data.
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId         
        values[rsltdata.attribute("instanceId").index()] = instId;

        // string normalization (emoticons, twitter grammar,...)
        String opNormalized = corpus.getOpinionSentence(oId);

        // compute uppercase ratio before normalization (if needed)      
        double upRatio = 0.0;
        if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes")) {
            String upper = opNormalized.replaceAll("[\\p{Ll}]", "");
            upRatio = (double) upper.length() / (double) opNormalized.length();
            values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        }

        // string normalization (emoticons, twitter grammar,...)
        if ((params.containsKey("wfngrams") || params.containsKey("lemmaNgrams"))
                && (!params.getProperty("normalization", "none").equalsIgnoreCase("noEmot"))) {
            opNormalized = normalize(opNormalized, params.getProperty("normalization", "none"));
        }

        //process the current instance with the NLP pipeline in order to get token and lemma|pos features
        KAFDocument nafinst = new KAFDocument("", "");
        String nafname = trainExamples.get(oId).getsId().replace(':', '_');
        String nafDir = params.getProperty("kafDir");
        String nafPath = nafDir + File.separator + nafname + ".kaf";
        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = 1;
        try {
            if (params.containsKey("lemmaNgrams")) //(lemmaNgrams != null) && (!lemmaNgrams.isEmpty()))
            {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTokPos(opNormalized, corpus.getLang(),
                            params.getProperty("pos-model"), params.getProperty("lemma-model"), postagger);
                    Files.createDirectories(Paths.get(nafDir));
                    nafinst.save(nafPath);
                }
                tokNum = nafinst.getWFs().size();
                //System.err.println("Features::loadInstances - postagging opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId));
            } else {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTok(opNormalized, corpus.getLang());
                }
                tokNum = nafinst.getWFs().size();
                //System.err.println("Features::loadInstances - tokenizing opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId));

            }
        } catch (IOException | JDOMException e) {
            System.err.println("Features::loadInstances() - error when NLP processing the instance " + instId
                    + "|" + oId + ") for filling the attribute vector");
            e.printStackTrace();
            System.exit(5);
        }

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        List<WF> window = nafinst.getWFs();
        Integer end = corpus.getOpinion(oId).getTo();
        // apply window if window active (>0) and if the target is not null (to=0)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer to = window.size();
            Integer from = 0;
            end++;
            for (int i = 0; i < window.size(); i++) {
                WF wf = window.get(i);
                if ((wf.getOffset() == start) && (i >= bowWin)) {
                    from = i - bowWin;
                } else if (wf.getOffset() >= end) {
                    if (i + bowWin < window.size()) {
                        to = i + bowWin;
                    }
                    break;
                }
            }
            window = window.subList(from, to);
            //System.out.println("startTgt: "+start+" - from: "+from+" | endTrgt:"+(end-1)+" - to:"+to);
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //      "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");

        List<String> windowWFIds = new ArrayList<String>();

        // word form ngram related features
        for (WF wf : window) {
            windowWFIds.add(wf.getId());

            String wfStr = wf.getForm();
            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "wf", 1, false); //toknum

            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Clark cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // Clark cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }

        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "wf", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (Term t : nafinst.getTermsFromWFs(windowWFIds)) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    String lemma = t.getLemma();

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // add ngrams to the feature vector
                    for (int i = 0; i < ngrams.size(); i++) {
                        String ng = featureFromArray(ngrams.subList(0, i + 1), "lemma");
                        //if the current lemma is in the ngram list activate the feature in the vector
                        if (params.containsKey("lemmaNgrams")
                                && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                            Attribute ngAtt = rsltdata.attribute(ng);
                            if (ngAtt != null) {
                                addNumericToFeatureVector(ng, values, 1); //tokNum                     
                            }
                        }

                        ng = featureFromArray(ngrams.subList(0, i + 1), "");
                        if (params.containsKey("polarLexiconGeneral")
                                || params.containsKey("polarLexiconDomain")) {
                            checkPolarityLexicons(ng, values, tokNum, polNgrams);
                        } //end polarity ngram checker
                    } //end ngram checking                                      
                }
                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }
                    posNgrams.add(t.getPos());

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            while (!ngrams.isEmpty()) {
                String ng = featureFromArray(ngrams, "lemma");

                //if the current lemma is in the ngram list activate the feature in the vector
                if (rsltdata.attribute(ng) != null) {
                    addNumericToFeatureVector(ng, values, 1); //tokNum
                }

                // polarity lexicons
                if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
                    checkPolarityLexicons(ng, values, tokNum, polNgrams);
                } //end polarity ngram checker

                ngrams.removeFirst();
            }

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);

        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        //create object for the current instance and associate it with the current train dataset.         
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attributte values
        String cat = trainExamples.get(oId).getCategory();

        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }

            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles )
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            //System.err.println("Features::loadInstances - pol "+pol+" for oid "+oId+" - text:"+corpus.getOpinionSentence(oId));
            if (pol != null && !pol.isEmpty()) {
                //System.err.println("polarity: _"+pol+"_");
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstances() - training data ready total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();

            saver.setInstances(rsltdata);

            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }
    return rsltdata;
}

From source file:elh.eus.absa.Features.java

License:Open Source License

/**
 *   Function fills the attribute vectors for the instances existing in the Conll tabulated formatted corpus given. 
 *   Attribute vectors contain the features loaded by the creatFeatureSet() function.
 * /* w ww  .j a v  a 2 s  .  co m*/
 * @param boolean save : whether the Instances file should be saved to an arff file or not.
 * @return Weka Instances object containing the attribute vectors filled with the features specified
 *          in the parameter file.
 */
public Instances loadInstancesTAB(boolean save, String prefix) {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data.
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstancesTAB() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstancesTAB() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId         
        values[rsltdata.attribute("instanceId").index()] = instId;

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        String[] noWindow = corpus.getOpinionSentence(oId).split("\n");

        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = noWindow.length;

        List<String> window = Arrays.asList(noWindow);
        Integer end = corpus.getOpinion(oId).getTo();
        // apply window if window active (>0) and if the target is not null (to=0)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer from = start - bowWin;
            if (from < 0) {
                from = 0;
            }
            Integer to = end + bowWin;
            if (to > noWindow.length - 1) {
                to = noWindow.length - 1;
            }
            window = Arrays.asList(Arrays.copyOfRange(noWindow, from, to));
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //      "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");

        //System.err.println(Arrays.toString(window.toArray()));

        // word form ngram related features
        for (String wf : window) {
            String[] fields = wf.split("\t");
            String wfStr = normalize(fields[0], params.getProperty("normalization", "none"));
            // blank line means we found a sentence end. Empty n-gram list and reiniciate.  
            if (wf.equals("")) {
                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, true); //toknum

                // since wf is empty no need to check for clusters and other features.
                continue;
            }

            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, false); //toknum
            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Clark cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // Clark cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }

        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (String t : window) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    //blank line means we found a sentence end. Empty n-gram list and reiniciate.
                    if (t.equals("")) {
                        // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                        checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, true, polNgrams); //toknum

                        // since t is empty no need to check for clusters and other features.
                        continue;
                    }

                    String[] fields = t.split("\t");
                    if (fields.length < 2) {
                        continue;
                    }
                    String lemma = normalize(fields[1], params.getProperty("normalization", "none"));

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                    checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, false, polNgrams);

                }

                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }

                    String[] fields = t.split("\t");
                    if (fields.length < 3) {
                        continue;
                    }
                    String pos = fields[2];

                    posNgrams.add(pos);

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            // check both lemma n-grams and polarity lexicons, and add values to the feature vector
            checkNgramsAndPolarLexicons(ngrams, values, "", 1, tokNum, true, polNgrams);

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);

        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        // compute uppercase ratio before normalization (if needed)      
        //double upRatio =0.0;
        //if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes"))
        //{
        //   String upper = opNormalized.replaceAll("[a-z]", "");
        //   upRatio = (double)upper.length() / (double)opNormalized.length();
        //   values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        //}

        //create object for the current instance and associate it with the current train dataset.         
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attributte values
        String cat = trainExamples.get(oId).getCategory();

        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }

            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles )
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            if (pol != null && !pol.isEmpty()) {
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                //System.err.println("polarity: _"+pol+"_");
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstancesTAB() - training data ready total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();

            saver.setInstances(rsltdata);

            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }

    return rsltdata;
}

From source file:elh.eus.absa.Features.java

License:Open Source License

/**
 *   Function fills the attribute vectors for the instances existing in the Conll tabulated formatted corpus given. 
 *   Attribute vectors contain the features loaded by the creatFeatureSet() function.
 * /*from  w w w .j  a  v a 2 s. c o  m*/
 * @param boolean save : whether the Instances file should be saved to an arff file or not.
 * @return Weka Instances object containing the attribute vectors filled with the features specified
 *          in the parameter file.
 */
public Instances loadInstancesConll(boolean save, String prefix) {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    String nafdir = params.getProperty("kafDir");
    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data.
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstancesConll() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstancesConll() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId         
        values[rsltdata.attribute("instanceId").index()] = instId;

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        String nafPath = nafdir + File.separator + trainExamples.get(oId).getsId().replace(':', '_');
        String taggedFile = "";
        try {
            if (!FileUtilsElh.checkFile(nafPath + ".kaf")) {
                nafPath = NLPpipelineWrapper.tagSentence(corpus.getOpinionSentence(oId), nafPath,
                        corpus.getLang(), params.getProperty("pos-model"), params.getProperty("lemma-model"),
                        postagger);
            } else {
                nafPath = nafPath + ".kaf";
            }
            InputStream reader = new FileInputStream(new File(nafPath));
            taggedFile = IOUtils.toString(reader);
            reader.close();
        } catch (IOException | JDOMException fe) {
            // TODO Auto-generated catch block
            fe.printStackTrace();
        }

        String[] noWindow = taggedFile.split("\n");

        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = noWindow.length;

        //System.err.println("Features::loadInstancesConll - tagged File read lines:"+tokNum);

        List<String> window = Arrays.asList(noWindow);
        Integer end = corpus.getOpinion(oId).getTo();
        // apply window if window active (>0) and if the target is not null (to=0)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer from = start - bowWin;
            if (from < 0) {
                from = 0;
            }
            Integer to = end + bowWin;
            if (to > noWindow.length - 1) {
                to = noWindow.length - 1;
            }
            window = Arrays.asList(Arrays.copyOfRange(noWindow, from, to));
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //      "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");

        //System.err.println(Arrays.toString(window.toArray()));

        // word form ngram related features
        for (String wf : window) {
            String[] fields = wf.split("\\s");
            String wfStr = normalize(fields[0], params.getProperty("normalization", "none"));
            // blank line means we found a sentence end. Empty n-gram list and reiniciate.  
            if (wf.equals("")) {
                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, true); //toknum

                // since wf is empty no need to check for clusters and other features.
                continue;
            }

            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, false); //toknum
            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Clark cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // Clark cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }

        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (String t : window) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    //blank line means we found a sentence end. Empty n-gram list and reiniciate.
                    if (t.equals("")) {
                        // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                        checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, true, polNgrams); //toknum

                        // since t is empty no need to check for clusters and other features.
                        continue;
                    }

                    String[] fields = t.split("\\s");
                    if (fields.length < 2) {
                        continue;
                    }
                    String lemma = normalize(fields[1], params.getProperty("normalization", "none"));

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                    checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, false, polNgrams);

                }

                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }

                    String[] fields = t.split("\\s");
                    if (fields.length < 3) {
                        continue;
                    }
                    String pos = fields[2];

                    posNgrams.add(pos);

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            // check both lemma n-grams and polarity lexicons, and add values to the feature vector
            checkNgramsAndPolarLexicons(ngrams, values, "", 1, tokNum, true, polNgrams);

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);

        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        // compute uppercase ratio before normalization (if needed)      
        //double upRatio =0.0;
        //if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes"))
        //{
        //   String upper = opNormalized.replaceAll("[a-z]", "");
        //   upRatio = (double)upper.length() / (double)opNormalized.length();
        //   values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        //}

        //create object for the current instance and associate it with the current train dataset.         
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attributte values
        String cat = trainExamples.get(oId).getCategory();

        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }

            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles )
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            if (pol != null && !pol.isEmpty()) {
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                //System.err.println("polarity: _"+pol+"_");
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstancesConll() - training data ready total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();

            saver.setInstances(rsltdata);

            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }

    return rsltdata;
}

From source file:elh.eus.absa.WekaWrapper.java

License:Open Source License

/**
 *      Train one vs all models over the given training data.
 *  /*from w w  w . j a  va 2s  . c o m*/
 * @param modelpath directory to store each model for the one vs. all method
 * @param prefix prefix the models should have (each model will have the name of its class appended
 * @throws Exception
 */
public void trainOneVsAll(String modelpath, String prefix) throws Exception {
    Instances orig = new Instances(traindata);
    Enumeration<Object> classValues = traindata.classAttribute().enumerateValues();
    String classAtt = traindata.classAttribute().name();
    while (classValues.hasMoreElements()) {
        String v = (String) classValues.nextElement();
        System.err.println("trainer onevsall for class " + v + " classifier");
        //needed because of weka's sparse data format problems THIS IS TROUBLE! ...
        if (v.equalsIgnoreCase("dummy")) {
            continue;
        }
        // copy instances and set the same class value
        Instances ovsa = new Instances(orig);
        //create a new class attribute         
        //   // Declare the class attribute along with its values
        ArrayList<String> classVal = new ArrayList<String>();
        classVal.add("dummy"); //needed because of weka's sparse data format problems...
        classVal.add(v);
        classVal.add("UNKNOWN");
        ovsa.insertAttributeAt(new Attribute(classAtt + "2", classVal), ovsa.numAttributes());
        //change all instance labels that have not the current class value to "other"
        for (int i = 0; i < ovsa.numInstances(); i++) {
            Instance inst = ovsa.instance(i);
            String instClass = inst.stringValue(ovsa.attribute(classAtt).index());
            if (instClass.equalsIgnoreCase(v)) {
                inst.setValue(ovsa.attribute(classAtt + "2").index(), v);
            } else {
                inst.setValue(ovsa.attribute(classAtt + "2").index(), "UNKNOWN");
            }
        }
        //delete the old class attribute and set the new.         
        ovsa.setClassIndex(ovsa.attribute(classAtt + "2").index());
        ovsa.deleteAttributeAt(ovsa.attribute(classAtt).index());
        ovsa.renameAttribute(ovsa.attribute(classAtt + "2").index(), classAtt);
        ovsa.setClassIndex(ovsa.attribute(classAtt).index());

        //build the classifier, crossvalidate and store the model
        setTraindata(ovsa);
        saveModel(modelpath + File.separator + prefix + "_" + v + ".model");
        setTestdata(ovsa);
        testModel(modelpath + File.separator + prefix + "_" + v + ".model");

        System.err.println("trained onevsall " + v + " classifier");
    }

    setTraindata(orig);
}

From source file:EsperTest.CEPListener.java

public void update(EventBean[] newData, EventBean[] oldData) {

    System.out.println("Event received: " + newData[0].getUnderlying());
    if (newData.length > 2) {
        //create the column name and type, these are strings
        //http://weka.wikispaces.com/Creating+an+ARFF+file
        Instances data;/*  w w w .ja  va 2 s  . c o m*/
        FastVector atts = new FastVector();

        for (int j = 0; j < columnNumbers.length; j++) {
            FastVector values = new FastVector();
            for (int i = 0; i < labels.NominalCount(j); i++) {
                values.addElement(labels.GetLabel(columnNumbers[j], i));
            }
            atts.addElement(new Attribute(labels.GetHeader(columnNumbers[j]), values));
        }

        data = new Instances("Title", atts, 0);

        for (int i = 0; i < newData.length; i++) {
            Instance inst = new Instance(columnNumbers.length);
            for (int j = 0; j < columnNumbers.length; j++) {
                inst.setValue(j, newData[i].get("eventType").toString());
            }
            data.add(inst);
        }

        Apriori aprioriObj = new weka.associations.Apriori();

        try {
            aprioriObj.buildAssociations(data);
        } catch (Exception e) {
            System.out.println(e);
        }

        FastVector rules[] = aprioriObj.getAllTheRules();

    }

}

From source file:etc.aloe.data.SegmentSet.java

License:Open Source License

/**
 * Convert the segment set into an ExampleSet (ready for feature
 * extraction). The returned example set includes an id attribute, the
 * message text, a label attribute, and several basic features extracted
 * from the segment.//from www  .j  a  va  2 s .  c  o m
 *
 * @return
 */
public ExampleSet getBasicExamples() {
    ArrayList<Attribute> attributes = new ArrayList<Attribute>();

    attributes.add(new Attribute(ExampleSet.ID_ATTR_NAME));
    attributes.add(new Attribute(ExampleSet.MESSAGE_ATTR_NAME, (List<String>) null));
    attributes.add(new Attribute(ExampleSet.LABEL_ATTR_NAME, Arrays.asList(new String[] { "false", "true" })));
    attributes.add(new Attribute(ExampleSet.PARTICIPANT_ATTR_NAME, (List<String>) null));
    attributes.add(new Attribute(DURATION_ATTR_NAME));
    attributes.add(new Attribute(LENGTH_ATTR_NAME));
    attributes.add(new Attribute(CPS_ATTR_NAME));
    attributes.add(new Attribute(RATE_ATTR_NAME));

    Instances instances = new Instances("BasicExamples", attributes, 0);
    instances.setClassIndex(2);

    Attribute idAttr = instances.attribute(ExampleSet.ID_ATTR_NAME);
    Attribute messageAttr = instances.attribute(ExampleSet.MESSAGE_ATTR_NAME);
    Attribute labelAttr = instances.attribute(ExampleSet.LABEL_ATTR_NAME);
    Attribute participantAttr = instances.attribute(ExampleSet.PARTICIPANT_ATTR_NAME);
    Attribute durationAttr = instances.attribute(DURATION_ATTR_NAME);
    Attribute lengthAttr = instances.attribute(LENGTH_ATTR_NAME);
    Attribute cpsAttr = instances.attribute(CPS_ATTR_NAME);
    Attribute rateAttr = instances.attribute(RATE_ATTR_NAME);

    for (int i = 0; i < size(); i++) {
        Segment segment = get(i);
        Instance instance = new DenseInstance(instances.numAttributes());

        String messageStr = segment.concatMessages();
        String participantStr = segment.concatParticipants();

        instance.setValue(idAttr, segment.getId());
        instance.setValue(messageAttr, messageStr);
        instance.setValue(participantAttr, participantStr);

        if (segment.hasTrueLabel()) {
            instance.setValue(labelAttr, segment.getTrueLabel() ? "true" : "false");
        }

        computeRateValues(segment, instance, messageStr, durationAttr, lengthAttr, cpsAttr, rateAttr);

        instances.add(instance);
    }

    return new ExampleSet(instances);
}

From source file:etc.aloe.data.SegmentSet.java

License:Open Source License

/**
 * Computes the basic timing-related features about a segment and applies
 * them to the given instance./* ww  w  .  j  a v  a 2 s .  c  om*/
 *
 * @param segment
 * @param instance
 * @param messageStr
 * @param durationAttr
 * @param lengthAttr
 * @param cpsAttr
 * @param rateAttr
 */
private void computeRateValues(Segment segment, Instance instance, String messageStr, Attribute durationAttr,
        Attribute lengthAttr, Attribute cpsAttr, Attribute rateAttr) {
    double duration = segment.getDurationInSeconds();
    double length = segment.getMessages().size();

    //If the length is 1, then we correct the duration.
    //Assume average typing speed (35 words per minute, 5 char/word)
    if (length <= 1) {
        double averageCharPerSecond = 35.0 * 5.0 / 60.0;
        //[seconds] = [chars] / ([chars]/[seconds])
        duration = (1 + messageStr.length()) / averageCharPerSecond;
    }

    if (duration > 100000) {
        System.err.println("Wacky segment id: " + segment.getId() + " has duration: " + duration);
    }

    double cps = messageStr.length() / duration;
    double rate = segment.getMessages().size() / duration;

    instance.setValue(durationAttr, duration);
    instance.setValue(lengthAttr, length);
    instance.setValue(cpsAttr, cps);
    instance.setValue(rateAttr, rate);
}

From source file:etc.aloe.filters.StringToDictionaryVector.java

License:Open Source License

/**
 * Normalizes given instance to average doc length (only the newly
 * constructed attributes)./*from   ww w  .j  ava 2s  .c o  m*/
 *
 * @param inst   the instance to normalize
 * @param double the document length
 * @throws Exception if avg. doc length not set
 */
private void normalizeInstance(Instance inst, double docLength) throws Exception {

    if (docLength == 0) {
        return;
    }

    int numOldValues = getInputFormat().numAttributes();

    if (m_AvgDocLength < 0) {
        throw new Exception("Average document length not set.");
    }

    // Normalize document vector
    for (int j = numOldValues; j < inst.numAttributes(); j++) {
        double val = inst.value(j) * m_AvgDocLength / docLength;
        inst.setValue(j, val);
    }
}