List of usage examples for weka.core Instance setDataset
public void setDataset(Instances instances);
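Before the collected examples, a minimal self-contained sketch of what setDataset does: it attaches an Instances header to a free-standing Instance so that attribute metadata (types, nominal labels, class index) can be resolved. The attribute names and values here are illustrative only, echoing the commented snippet in the WekaManager example further below.

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class SetDatasetSketch {
    public static void main(String[] args) throws Exception {
        // Two numeric attributes plus a nominal class (names are illustrative).
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("length"));
        atts.add(new Attribute("weight"));
        ArrayList<String> labels = new ArrayList<String>();
        labels.add("small");
        labels.add("large");
        atts.add(new Attribute("class", labels));

        Instances data = new Instances("demo", atts, 0);
        data.setClassIndex(data.numAttributes() - 1);

        // A free-standing instance knows nothing about attribute types or labels...
        Instance inst = new DenseInstance(data.numAttributes());
        inst.setDataset(data); // ...until it is associated with a header.

        // Now values can be set per attribute, including nominal values by name.
        inst.setValue(data.attribute("length"), 5.3);
        inst.setValue(data.attribute("weight"), 300);
        inst.setValue(data.classAttribute(), "small");
        data.add(inst);
        System.out.println(data);
    }
}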
From source file:edu.oregonstate.eecs.mcplan.abstraction.WekaUtil.java
License:Open Source License
public static void addInstance(final Instances instances, final Instance i) {
    instances.add(i);
    i.setDataset(instances);
}
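A side note on this helper, based on the Weka javadoc for Instances.add, which shallow-copies the instance it stores: add() links the stored copy to the dataset on its own, so the explicit i.setDataset(instances) call is what keeps the caller's original reference usable afterwards (for example, for setting nominal values by name). Without it, i would remain header-less even though a copy of it now lives in instances.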
From source file:edu.oregonstate.eecs.mcplan.abstraction.WekaUtil.java
License:Open Source License
/**
 * Creates an Instances object containing the specified feature vector
 * and with an added "dummy label".
 * @param attributes
 * @param features
 * @return
 */
public static Instances createSingletonInstances(final List<Attribute> attributes, final double[] features) {
    final ArrayList<Attribute> attr_dummy_label = new ArrayList<Attribute>(attributes);
    attr_dummy_label.add(createBinaryNominalAttribute("__dummy_label__"));
    final double[] features_dummy_label = new double[features.length + 1];
    Fn.memcpy(features_dummy_label, features, features.length);
    final Instance instance = new DenseInstance(1.0, features_dummy_label);
    final Instances x = new Instances("__eval__", attr_dummy_label, 1);
    x.setClassIndex(attr_dummy_label.size() - 1);
    x.add(instance);
    instance.setDataset(x);
    return x;
}
From source file:edu.oregonstate.eecs.mcplan.ml.ClassifierSimilarityFunction.java
License:Open Source License
@Override
public double similarity(final double[] a, final double[] b) {
    final Instance instance = makeFeatures(a, b);
    dataset_.add(instance);
    instance.setDataset(dataset_);
    double[] p;
    try {
        p = classifier_.distributionForInstance(instance);
    } catch (final Exception ex) {
        throw new RuntimeException(ex);
    }
    dataset_.remove(0);
    assert (p.length == 2);
    // p[1] = similar
    return p[1];
}
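Note that adding the instance to dataset_ and removing it afterwards is stricter than prediction requires: distributionForInstance only needs the instance to carry a dataset header, which setDataset alone provides. The dataset_.remove(0) call also silently assumes dataset_ was empty before the call; keeping the instance out of the dataset entirely would avoid that coupling.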
From source file:edu.stanford.rsl.conrad.segmentation.GridFeatureExtractor.java
License:Open Source License
/**
 * Creates a new feature vector (instance in Weka language) and adds it to
 * the local feature vector set.
 *
 * @param attValues
 */
public void addInstance(double[] attValues) {
    Instance inst = new DenseInstance(1.0, attValues);
    inst.setDataset(instances);
    instances.add(inst);
}
From source file:edu.teco.context.recognition.WekaManager.java
License:Apache License
public void classifyInstance(double[] featureValues) {
    /*
     * // Create empty instance with three attribute values
     * Instance inst = new DenseInstance(3);
     *
     * // Set instance's values for the attributes "length", "weight", and "position"
     * inst.setValue(length, 5.3);
     * inst.setValue(weight, 300);
     * inst.setValue(position, "first");
     *
     * // Set instance's dataset to be the dataset "race"
     * inst.setDataset(race);
     */
    Instance instance = new DenseInstance(1.0, featureValues);
    boolean check = trainingData.checkInstance(instance);
    if (FrameworkContext.INFO)
        Log.i("WekaData", "Result of Instance check: " + check);
    instance.setDataset(trainingData);
    if (FrameworkContext.INFO)
        Log.i("WekaData", "Try to classify FeatureVector.");
    try {
        double classValue = classifier.classifyInstance(instance);
        double[] classDistribution = classifier.distributionForInstance(instance);
        Attribute classAttribute = trainingData.classAttribute();
        String className = classAttribute.value((int) classValue);
        double classProbability = classDistribution[(int) classValue];

        StringBuilder logString = new StringBuilder();
        logString.append("----- Classification Result -----\nClass Value = ").append(classValue)
                .append("\nClass Distribution = {");
        for (double value : classDistribution) {
            logString.append(value).append(";");
        }
        logString.deleteCharAt(logString.length() - 1);
        logString.append("}\nClass Name = ").append(className);
        if (FrameworkContext.INFO)
            Log.i("WekaData", logString.toString());

        WekaEvent wekaEvent = new WekaEvent(this, className, classProbability, mPreviousCalculatedClassName,
                classDistribution);
        notifyClassCalculated(wekaEvent);
        if (mPreviousCalculatedClassName != null) {
            if (mPreviousCalculatedClassName.equals(className)) {
                notifyClassChanged(wekaEvent);
            }
        }
        mPreviousCalculatedClassName = className;
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:elh.eus.absa.CLI.java
License:Open Source License
/**
 * Main access to the train-atc functionalities. Trains ATC using a double one-vs-all classifier
 * (E and A) for E#A aspect categories.
 * @throws Exception
 */
public final void trainATC2(final InputStream inputStream) throws IOException {
    // load training parameters file
    String paramFile = parsedArguments.getString("params");
    String testFile = parsedArguments.getString("testset");
    String paramFile2 = parsedArguments.getString("params2");
    String corpusFormat = parsedArguments.getString("corpusFormat");
    //String validation = parsedArguments.getString("validation");
    String lang = parsedArguments.getString("language");
    //int foldNum = Integer.parseInt(parsedArguments.getString("foldNum"));
    //boolean printPreds = parsedArguments.getBoolean("printPreds");
    boolean nullSentenceOpinions = parsedArguments.getBoolean("nullSentences");
    boolean onlyTest = parsedArguments.getBoolean("testOnly");
    double threshold = 0.5;
    double threshold2 = 0.5;
    String modelsPath = "/home/inaki/elixa-atp/ovsaModels";

    CorpusReader reader = new CorpusReader(inputStream, corpusFormat, nullSentenceOpinions, lang);
    Features atcTrain = new Features(reader, paramFile, "3");
    Instances traindata = atcTrain.loadInstances(true, "atc");

    if (onlyTest) {
        if (FileUtilsElh.checkFile(testFile)) {
            System.err.println("read from test file");
            reader = new CorpusReader(new FileInputStream(new File(testFile)), corpusFormat,
                    nullSentenceOpinions, lang);
            atcTrain.setCorpus(reader);
            traindata = atcTrain.loadInstances(true, "atc");
        }
    }

    //setting class attribute (entCat|attCat|entAttCat|polarityCat)
    //HashMap<String, Integer> opInst = atcTrain.getOpinInst();
    //WekaWrapper classifyAtts;
    WekaWrapper onevsall;
    try {
        //classify.printMultilabelPredictions(classify.multiLabelPrediction()); */

        //onevsall
        Instances entdata = new Instances(traindata);
        entdata.deleteAttributeAt(entdata.attribute("attCat").index());
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        entdata.setClassIndex(entdata.attribute("entCat").index());
        onevsall = new WekaWrapper(entdata, true);

        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "entCat");
            System.out.println("trainATC: one vs all models ready");
        }
        onevsall.setTestdata(entdata);
        HashMap<Integer, HashMap<String, Double>> ovsaRes = onevsall.predictOneVsAll(modelsPath,
                paramFile + "entCat");
        System.out.println("trainATC: one vs all predictions ready");
        HashMap<Integer, String> instOps = new HashMap<Integer, String>();
        for (String oId : atcTrain.getOpinInst().keySet()) {
            instOps.put(atcTrain.getOpinInst().get(oId), oId);
        }

        atcTrain = new Features(reader, paramFile2, "3");
        entdata = atcTrain.loadInstances(true, "attTrain2_data");
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        //entdata.setClassIndex(entdata.attribute("entCat").index());

        Attribute insAtt = entdata.attribute("instanceId");
        double maxInstId = entdata.kthSmallestValue(insAtt, entdata.numDistinctValues(insAtt) - 1);
        System.err.println("last instance has index: " + maxInstId);
        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins" + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold) {
                    //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                    // for the first one update the instances
                    if (oSubId >= 1) {
                        Instance newIns = new SparseInstance(currentInst);
                        newIns.setDataset(entdata);
                        entdata.add(newIns);
                        newIns.setValue(insAtt, maxInstId + oSubId);
                        newIns.setClassValue(cl);
                        instOps.put((int) maxInstId + oSubId, oId);
                    }
                    // if there are more, create new instances
                    else {
                        currentInst.setClassValue(cl);
                        //create and add opinion to the structure
                        // trgt, offsetFrom, offsetTo, polarity, cat, sId);
                        //Opinion op = new Opinion(instOps.get(i)+"_"+oSubId, "", 0, 0, "", cl, sId);
                        //reader.addOpinion(op);
                    }
                    oSubId++;
                }
            } //finished updating instances data
        }

        entdata.setClass(entdata.attribute("attCat"));
        onevsall = new WekaWrapper(entdata, true);

        /*
         * Second classifier ("bigarren sailkatzailea")
         */
        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "attCat");
            System.out.println("trainATC: one vs all attcat models ready");
        }

        ovsaRes = onevsall.predictOneVsAll(modelsPath, paramFile + "entAttCat");

        insAtt = entdata.attribute("instanceId");
        maxInstId = entdata.kthSmallestValue(insAtt, insAtt.numValues());
        System.err.println("last instance has index: " + maxInstId);
        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins: " + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold2) {
                    ///System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                    if (ovsaRes.get(i).get(cl) > threshold) {
                        //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                        // for the first one update the instances
                        if (oSubId >= 1) {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            // trgt, offsetFrom, offsetTo, polarity, cat, sId);
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        // if there are more, create new instances
                        else {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            // trgt, offsetFrom, offsetTo, polarity, cat, sId);
                            reader.removeOpinion(oId);
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        oSubId++;
                    }
                } //finished updating instances data
            }
        }
        reader.print2Semeval2015format(paramFile + "entAttCat.xml");
    } catch (Exception e) {
        e.printStackTrace();
    }
    //traindata.setClass(traindata.attribute("entAttCat"));
    System.err.println("DONE CLI train-atc2 (oneVsAll)");
}
From source file:elh.eus.absa.Features.java
License:Open Source License
/**
 * Function fills the attribute vectors for the instances existing in the given corpus.
 * Attribute vectors contain the features loaded by the creatFeatureSet() function.
 *
 * @param save whether the Instances file should be saved to an arff file or not.
 * @return Weka Instances object containing the attribute vectors filled with the features specified
 *         in the parameter file.
 */
public Instances loadInstances(boolean save, String prefix) throws IOException {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //Properties posProp = new Properties();
    //eus.ixa.ixa.pipe.pos.Annotate postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);
    if (params.containsKey("lemmaNgrams")) {
        Properties posProp = NLPpipelineWrapper.setPostaggerProperties(params.getProperty("pos-model"),
                params.getProperty("lemma-model"), corpus.getLang(), "bin", "false");
        postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data)
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId
        values[rsltdata.attribute("instanceId").index()] = instId;

        // string normalization (emoticons, twitter grammar,...)
        String opNormalized = corpus.getOpinionSentence(oId);

        // compute uppercase ratio before normalization (if needed)
        double upRatio = 0.0;
        if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes")) {
            String upper = opNormalized.replaceAll("[\\p{Ll}]", "");
            upRatio = (double) upper.length() / (double) opNormalized.length();
            values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        }

        // string normalization (emoticons, twitter grammar,...)
        if ((params.containsKey("wfngrams") || params.containsKey("lemmaNgrams"))
                && (!params.getProperty("normalization", "none").equalsIgnoreCase("noEmot"))) {
            opNormalized = normalize(opNormalized, params.getProperty("normalization", "none"));
        }

        //process the current instance with the NLP pipeline in order to get token and lemma|pos features
        KAFDocument nafinst = new KAFDocument("", "");
        String nafname = trainExamples.get(oId).getsId().replace(':', '_');
        String nafDir = params.getProperty("kafDir");
        String nafPath = nafDir + File.separator + nafname + ".kaf";

        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = 1;
        try {
            if (params.containsKey("lemmaNgrams")) //(lemmaNgrams != null) && (!lemmaNgrams.isEmpty()))
            {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTokPos(opNormalized, corpus.getLang(),
                            params.getProperty("pos-model"), params.getProperty("lemma-model"), postagger);
                    Files.createDirectories(Paths.get(nafDir));
                    nafinst.save(nafPath);
                }
                tokNum = nafinst.getWFs().size();
                //System.err.println("Features::loadInstances - postagging opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId));
            } else {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTok(opNormalized, corpus.getLang());
                }
                tokNum = nafinst.getWFs().size();
                //System.err.println("Features::loadInstances - tokenizing opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId));
            }
        } catch (IOException | JDOMException e) {
            System.err.println("Features::loadInstances() - error when NLP processing the instance " + instId
                    + "|" + oId + ") for filling the attribute vector");
            e.printStackTrace();
            System.exit(5);
        }

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        List<WF> window = nafinst.getWFs();
        Integer end = corpus.getOpinion(oId).getTo();
        // apply window if window active (>0) and if the target is not null (to=0)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer to = window.size();
            Integer from = 0;
            end++;
            for (int i = 0; i < window.size(); i++) {
                WF wf = window.get(i);
                if ((wf.getOffset() == start) && (i >= bowWin)) {
                    from = i - bowWin;
                } else if (wf.getOffset() >= end) {
                    if (i + bowWin < window.size()) {
                        to = i + bowWin;
                    }
                    break;
                }
            }
            window = window.subList(from, to);
            //System.out.println("startTgt: "+start+" - from: "+from+" | endTrgt:"+(end-1)+" - to:"+to);
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //        "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");

        List<String> windowWFIds = new ArrayList<String>();

        // word form ngram related features
        for (WF wf : window) {
            windowWFIds.add(wf.getId());

            String wfStr = wf.getForm();
            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "wf", 1, false); //toknum
            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Brown cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // word2vec cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }
        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "wf", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (Term t : nafinst.getTermsFromWFs(windowWFIds)) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    String lemma = t.getLemma();

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // add ngrams to the feature vector
                    for (int i = 0; i < ngrams.size(); i++) {
                        String ng = featureFromArray(ngrams.subList(0, i + 1), "lemma");
                        //if the current lemma is in the ngram list activate the feature in the vector
                        if (params.containsKey("lemmaNgrams")
                                && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                            Attribute ngAtt = rsltdata.attribute(ng);
                            if (ngAtt != null) {
                                addNumericToFeatureVector(ng, values, 1); //tokNum
                            }
                        }

                        ng = featureFromArray(ngrams.subList(0, i + 1), "");
                        if (params.containsKey("polarLexiconGeneral")
                                || params.containsKey("polarLexiconDomain")) {
                            checkPolarityLexicons(ng, values, tokNum, polNgrams);
                        } //end polarity ngram checker
                    } //end ngram checking
                }
                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }
                    posNgrams.add(t.getPos());

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            while (!ngrams.isEmpty()) {
                String ng = featureFromArray(ngrams, "lemma");
                //if the current lemma is in the ngram list activate the feature in the vector
                if (rsltdata.attribute(ng) != null) {
                    addNumericToFeatureVector(ng, values, 1); //tokNum
                }

                // polarity lexicons
                if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
                    checkPolarityLexicons(ng, values, tokNum, polNgrams);
                } //end polarity ngram checker

                ngrams.removeFirst();
            }

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);
        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        //create object for the current instance and associate it with the current train dataset.
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();
        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }
            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            //System.err.println("Features::loadInstances - pol "+pol+" for oid "+oId+" - text:"+corpus.getOpinionSentence(oId));
            if (pol != null && !pol.isEmpty()) {
                //System.err.println("polarity: _"+pol+"_");
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstances() - training data ready, total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();
            saver.setInstances(rsltdata);
            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }

    return rsltdata;
}
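The ordering in this loader is the part worth imitating: the SparseInstance is built from a raw double array, and setDataset is called before any setValue(attIndex, String) call, because that setValue variant resolves the attribute through the instance's dataset and throws UnassignedDatasetException if no header is attached. A minimal sketch of the same ordering, with illustrative attribute names:

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SparseInstance;

public class NominalValueOrdering {
    public static void main(String[] args) {
        ArrayList<String> pols = new ArrayList<String>();
        pols.add("positive");
        pols.add("negative");

        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("instanceId"));
        atts.add(new Attribute("polarityCat", pols));

        Instances data = new Instances("train", atts, 0);

        double[] values = new double[data.numAttributes()];
        values[0] = 1.0; // instanceId feature, as in the loaders above
        Instance inst = new SparseInstance(1.0, values);

        // setValue(int, String) resolves the attribute through the instance's
        // dataset; calling it before setDataset would throw UnassignedDatasetException.
        inst.setDataset(data);
        inst.setValue(data.attribute("polarityCat").index(), "positive");

        data.add(inst);
        System.out.println(data.instance(0));
    }
}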
From source file:elh.eus.absa.Features.java
License:Open Source License
/**
 * Function fills the attribute vectors for the instances existing in the given CoNLL tabulated-format corpus.
 * Attribute vectors contain the features loaded by the creatFeatureSet() function.
 *
 * @param save whether the Instances file should be saved to an arff file or not.
 * @return Weka Instances object containing the attribute vectors filled with the features specified
 *         in the parameter file.
 */
public Instances loadInstancesTAB(boolean save, String prefix) {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data)
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstancesTAB() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstancesTAB() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId
        values[rsltdata.attribute("instanceId").index()] = instId;

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        String[] noWindow = corpus.getOpinionSentence(oId).split("\n");

        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = noWindow.length;

        List<String> window = Arrays.asList(noWindow);
        Integer end = corpus.getOpinion(oId).getTo();
        // apply window if window active (>0) and if the target is not null (to=0)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer from = start - bowWin;
            if (from < 0) {
                from = 0;
            }
            Integer to = end + bowWin;
            if (to > noWindow.length - 1) {
                to = noWindow.length - 1;
            }
            window = Arrays.asList(Arrays.copyOfRange(noWindow, from, to));
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //        "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");
        //System.err.println(Arrays.toString(window.toArray()));

        // word form ngram related features
        for (String wf : window) {
            String[] fields = wf.split("\t");
            String wfStr = normalize(fields[0], params.getProperty("normalization", "none"));

            // blank line means we found a sentence end. Empty the n-gram list and reinitialize.
            if (wf.equals("")) {
                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, true); //toknum

                // since wf is empty no need to check for clusters and other features.
                continue;
            }

            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, false); //toknum
            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Brown cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // word2vec cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }
        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (String t : window) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    //blank line means we found a sentence end. Empty the n-gram list and reinitialize.
                    if (t.equals("")) {
                        // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                        checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, true, polNgrams); //toknum

                        // since t is empty no need to check for clusters and other features.
                        continue;
                    }

                    String[] fields = t.split("\t");
                    if (fields.length < 2) {
                        continue;
                    }
                    String lemma = normalize(fields[1], params.getProperty("normalization", "none"));

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                    checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, false, polNgrams);
                }

                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }

                    String[] fields = t.split("\t");
                    if (fields.length < 3) {
                        continue;
                    }
                    String pos = fields[2];
                    posNgrams.add(pos);

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            // check both lemma n-grams and polarity lexicons, and add values to the feature vector
            checkNgramsAndPolarLexicons(ngrams, values, "", 1, tokNum, true, polNgrams);

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);
        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        // compute uppercase ratio before normalization (if needed)
        //double upRatio =0.0;
        //if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes"))
        //{
        //    String upper = opNormalized.replaceAll("[a-z]", "");
        //    upRatio = (double)upper.length() / (double)opNormalized.length();
        //    values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        //}

        //create object for the current instance and associate it with the current train dataset.
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();
        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }
            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            if (pol != null && !pol.isEmpty()) {
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                //System.err.println("polarity: _"+pol+"_");
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstancesTAB() - training data ready, total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();
            saver.setInstances(rsltdata);
            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }

    return rsltdata;
}
From source file:elh.eus.absa.Features.java
License:Open Source License
/**
 * Function fills the attribute vectors for the instances existing in the given CoNLL tabulated-format corpus.
 * Attribute vectors contain the features loaded by the creatFeatureSet() function.
 *
 * @param save whether the Instances file should be saved to an arff file or not.
 * @return Weka Instances object containing the attribute vectors filled with the features specified
 *         in the parameter file.
 */
public Instances loadInstancesConll(boolean save, String prefix) {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    String nafdir = params.getProperty("kafDir");
    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data)
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstancesConll() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstancesConll() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId
        values[rsltdata.attribute("instanceId").index()] = instId;

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        String nafPath = nafdir + File.separator + trainExamples.get(oId).getsId().replace(':', '_');
        String taggedFile = "";
        try {
            if (!FileUtilsElh.checkFile(nafPath + ".kaf")) {
                nafPath = NLPpipelineWrapper.tagSentence(corpus.getOpinionSentence(oId), nafPath,
                        corpus.getLang(), params.getProperty("pos-model"), params.getProperty("lemma-model"),
                        postagger);
            } else {
                nafPath = nafPath + ".kaf";
            }
            InputStream reader = new FileInputStream(new File(nafPath));
            taggedFile = IOUtils.toString(reader);
            reader.close();
        } catch (IOException | JDOMException fe) {
            // TODO Auto-generated catch block
            fe.printStackTrace();
        }

        String[] noWindow = taggedFile.split("\n");

        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = noWindow.length;

        //System.err.println("Features::loadInstancesConll - tagged File read lines:"+tokNum);

        List<String> window = Arrays.asList(noWindow);
        Integer end = corpus.getOpinion(oId).getTo();
        // apply window if window active (>0) and if the target is not null (to=0)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer from = start - bowWin;
            if (from < 0) {
                from = 0;
            }
            Integer to = end + bowWin;
            if (to > noWindow.length - 1) {
                to = noWindow.length - 1;
            }
            window = Arrays.asList(Arrays.copyOfRange(noWindow, from, to));
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //        "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");
        //System.err.println(Arrays.toString(window.toArray()));

        // word form ngram related features
        for (String wf : window) {
            String[] fields = wf.split("\\s");
            String wfStr = normalize(fields[0], params.getProperty("normalization", "none"));

            // blank line means we found a sentence end. Empty the n-gram list and reinitialize.
            if (wf.equals("")) {
                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, true); //toknum

                // since wf is empty no need to check for clusters and other features.
                continue;
            }

            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, false); //toknum
            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Brown cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // word2vec cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }
        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (String t : window) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    //blank line means we found a sentence end. Empty the n-gram list and reinitialize.
                    if (t.equals("")) {
                        // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                        checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, true, polNgrams); //toknum

                        // since t is empty no need to check for clusters and other features.
                        continue;
                    }

                    String[] fields = t.split("\\s");
                    if (fields.length < 2) {
                        continue;
                    }
                    String lemma = normalize(fields[1], params.getProperty("normalization", "none"));

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                    checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, false, polNgrams);
                }

                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }

                    String[] fields = t.split("\\s");
                    if (fields.length < 3) {
                        continue;
                    }
                    String pos = fields[2];
                    posNgrams.add(pos);

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            // check both lemma n-grams and polarity lexicons, and add values to the feature vector
            checkNgramsAndPolarLexicons(ngrams, values, "", 1, tokNum, true, polNgrams);

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);
        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        // compute uppercase ratio before normalization (if needed)
        //double upRatio =0.0;
        //if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes"))
        //{
        //    String upper = opNormalized.replaceAll("[a-z]", "");
        //    upRatio = (double)upper.length() / (double)opNormalized.length();
        //    values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        //}

        //create object for the current instance and associate it with the current train dataset.
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();
        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }
            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            if (pol != null && !pol.isEmpty()) {
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                //System.err.println("polarity: _"+pol+"_");
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstancesConll() - training data ready, total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();
            saver.setInstances(rsltdata);
            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }

    return rsltdata;
}
From source file:en_deep.mlprocess.manipulation.featmodif.FeatureModifierFilter.java
License:Open Source License
/**
 * Convert a single instance over if the class is nominal. The converted
 * instance is added to the end of the output queue.
 *
 * @param instance the instance to convert
 */
private void convertInstance(Instance instance) {

    double[] vals = new double[outputFormatPeek().numAttributes()];
    String[] stringVals = new String[vals.length];
    int attSoFar = 0;

    for (int j = 0; j < getInputFormat().numAttributes(); j++) {
        Attribute att = instance.attribute(j);
        if (!m_Columns.isInRange(j)) {
            vals[attSoFar] = instance.value(j);
            attSoFar++;
        } else {
            // store new string values, make double values "missing" for now (if some string
            // values are missing, the double values will remain missing)
            if (instance.value(0) == 12 && instance.value(1) == 9 && att.name().equals("sempos")) {
                attSoFar = attSoFar; // no-op, apparently left over as a debugging breakpoint hook
            }
            attSoFar += getAttributeOutputValue(att, instance.value(j), vals, stringVals, attSoFar);
        }
    }
    Instance inst = null;
    if (instance instanceof SparseInstance) {
        inst = new SparseInstance(instance.weight(), vals);
    } else {
        inst = new DenseInstance(instance.weight(), vals);
    }
    inst.setDataset(getOutputFormat());
    copyValues(inst, false, instance.dataset(), getOutputFormat());

    // add new string values to the output data set and to the instance
    for (int i = 0; i < stringVals.length; ++i) {
        if (stringVals[i] != null) {
            vals[i] = inst.dataset().attribute(i).addStringValue(stringVals[i]);
        }
    }
    inst.replaceMissingValues(vals);

    inst.setDataset(getOutputFormat());
    push(inst);
}
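A closing observation on this filter (a reading of the code, not documented Weka behavior): the first setDataset(getOutputFormat()) call is what lets copyValues and the inst.dataset().attribute(i).addStringValue(...) loop register the new string values against the output format's attributes, with replaceMissingValues then patching the returned indices into the instance. The second setDataset call before push appears redundant, since nothing in between detaches the header, but it is harmless.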