Example usage for weka.core Instances attribute

List of usage examples for weka.core Instances attribute

Introduction

This page lists usage examples for the weka.core Instances attribute method.

Prototype

public Attribute attribute(String name)

Document

Returns an attribute given its name.
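
Before the longer examples below, here is a minimal, self-contained sketch of the lookup-by-name pattern they all rely on: fetch an Attribute with attribute(String), guard against null (the method returns null when no attribute has that name), and use its index, for example to set the class attribute. The dataset and attribute names are illustrative only, and the sketch assumes a Weka 3.7+ API where Instances takes an ArrayList<Attribute>.

import java.util.ArrayList;
import java.util.Arrays;

import weka.core.Attribute;
import weka.core.Instances;

public class AttributeLookupExample {
    public static void main(String[] args) {
        // Build a tiny dataset with a numeric and a nominal attribute (names are illustrative).
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("instanceId"));
        atts.add(new Attribute("entCat", Arrays.asList("FOOD", "SERVICE")));
        Instances data = new Instances("example", atts, 0);

        // attribute(String) returns the Attribute with that name, or null if none exists.
        Attribute cls = data.attribute("entCat");
        if (cls != null) {
            data.setClassIndex(cls.index());
            System.out.println("class attribute: " + cls.name());
        } else {
            System.err.println("no attribute named 'entCat'");
        }
    }
}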

Usage

From source file: elh.eus.absa.CLI.java

License: Open Source License

/**
 * Main access to the train-atc functionalities. Trains ATC using a double one-vs-all classifier
 * (E and A) for E#A aspect categories.
 * @throws IOException
 */
public final void trainATC2(final InputStream inputStream) throws IOException {
    // load training parameters file
    String paramFile = parsedArguments.getString("params");
    String testFile = parsedArguments.getString("testset");
    String paramFile2 = parsedArguments.getString("params2");
    String corpusFormat = parsedArguments.getString("corpusFormat");
    //String validation = parsedArguments.getString("validation");
    String lang = parsedArguments.getString("language");
    //int foldNum = Integer.parseInt(parsedArguments.getString("foldNum"));
    //boolean printPreds = parsedArguments.getBoolean("printPreds");
    boolean nullSentenceOpinions = parsedArguments.getBoolean("nullSentences");
    boolean onlyTest = parsedArguments.getBoolean("testOnly");
    double threshold = 0.5;
    double threshold2 = 0.5;
    String modelsPath = "/home/inaki/elixa-atp/ovsaModels";

    CorpusReader reader = new CorpusReader(inputStream, corpusFormat, nullSentenceOpinions, lang);
    Features atcTrain = new Features(reader, paramFile, "3");
    Instances traindata = atcTrain.loadInstances(true, "atc");

    if (onlyTest) {
        if (FileUtilsElh.checkFile(testFile)) {
            System.err.println("read from test file");
            reader = new CorpusReader(new FileInputStream(new File(testFile)), corpusFormat,
                    nullSentenceOpinions, lang);
            atcTrain.setCorpus(reader);
            traindata = atcTrain.loadInstances(true, "atc");
        }
    }

    //setting class attribute (entCat|attCat|entAttCat|polarityCat)

    //HashMap<String, Integer> opInst = atcTrain.getOpinInst();      
    //WekaWrapper classifyAtts;
    WekaWrapper onevsall;
    try {

        //classify.printMultilabelPredictions(classify.multiLabelPrediction());

        //onevsall
        Instances entdata = new Instances(traindata);
        entdata.deleteAttributeAt(entdata.attribute("attCat").index());
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        entdata.setClassIndex(entdata.attribute("entCat").index());
        onevsall = new WekaWrapper(entdata, true);

        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "entCat");
            System.out.println("trainATC: one vs all models ready");
        }
        onevsall.setTestdata(entdata);
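        // ovsaRes: instance id -> (class label -> one-vs-all prediction value)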
        HashMap<Integer, HashMap<String, Double>> ovsaRes = onevsall.predictOneVsAll(modelsPath,
                paramFile + "entCat");
        System.out.println("trainATC: one vs all predictions ready");
        HashMap<Integer, String> instOps = new HashMap<Integer, String>();
        for (String oId : atcTrain.getOpinInst().keySet()) {
            instOps.put(atcTrain.getOpinInst().get(oId), oId);
        }

        atcTrain = new Features(reader, paramFile2, "3");
        entdata = atcTrain.loadInstances(true, "attTrain2_data");
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        //entdata.setClassIndex(entdata.attribute("entCat").index());

        Attribute insAtt = entdata.attribute("instanceId");
        double maxInstId = entdata.kthSmallestValue(insAtt, entdata.numDistinctValues(insAtt) - 1);
        System.err.println("last instance has index: " + maxInstId);
        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins" + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold) {
                    //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));                  
                    // subsequent classes over the threshold: create a new instance copy
                    if (oSubId >= 1) {
                        Instance newIns = new SparseInstance(currentInst);
                        newIns.setDataset(entdata);
                        entdata.add(newIns);
                        newIns.setValue(insAtt, maxInstId + oSubId);
                        newIns.setClassValue(cl);
                        instOps.put((int) maxInstId + oSubId, oId);

                    }
                    // first class over the threshold: update the current instance in place
                    else {
                        currentInst.setClassValue(cl);
                        //create and add opinion to the structure
                        //   trgt, offsetFrom, offsetTo, polarity, cat, sId);
                        //Opinion op = new Opinion(instOps.get(i)+"_"+oSubId, "", 0, 0, "", cl, sId);
                        //reader.addOpinion(op);
                    }
                    oSubId++;
                }
            } //finished updating instances data                                    
        }

        entdata.setClass(entdata.attribute("attCat"));
        onevsall = new WekaWrapper(entdata, true);

        /**
         * Second classifier
         */
        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "attCat");
            System.out.println("trainATC: one vs all attcat models ready");
        }

        ovsaRes = onevsall.predictOneVsAll(modelsPath, paramFile + "entAttCat");

        insAtt = entdata.attribute("instanceId");
        maxInstId = entdata.kthSmallestValue(insAtt, insAtt.numValues());
        System.err.println("last instance has index: " + maxInstId);
        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins: " + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold2) {
                    ///System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                    if (ovsaRes.get(i).get(cl) > threshold) {
                        //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));                  
                        // subsequent classes over the threshold: add an additional opinion
                        if (oSubId >= 1) {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            //   trgt, offsetFrom, offsetTo, polarity, cat, sId);                     
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        // first class over the threshold: replace the original opinion
                        else {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            //   trgt, offsetFrom, offsetTo, polarity, cat, sId);
                            reader.removeOpinion(oId);
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        oSubId++;
                    }
                } //finished updating instances data                                    
            }
        }
        reader.print2Semeval2015format(paramFile + "entAttCat.xml");
    } catch (Exception e) {
        e.printStackTrace();
    }

    //traindata.setClass(traindata.attribute("entAttCat"));
    System.err.println("DONE CLI train-atc2 (oneVsAll)");
}

From source file: elh.eus.absa.CLI.java

License: Open Source License

/**
 * Trains ATC using a single one-vs-all classifier for E#A aspect categories.
 * @param inputStream
 * @throws IOException
 */
public final void trainATCsingleCategory(final InputStream inputStream) throws IOException {
    // load training parameters file
    String paramFile = parsedArguments.getString("params");
    String testFile = parsedArguments.getString("testset");
    String corpusFormat = parsedArguments.getString("corpusFormat");
    //String validation = parsedArguments.getString("validation");
    String lang = parsedArguments.getString("language");
    //int foldNum = Integer.parseInt(parsedArguments.getString("foldNum"));
    //boolean printPreds = parsedArguments.getBoolean("printPreds");
    boolean nullSentenceOpinions = parsedArguments.getBoolean("nullSentences");
    boolean onlyTest = parsedArguments.getBoolean("testOnly");
    double threshold = 0.5;

    String modelsPath = "/home/inaki/Proiektuak/BOM/SEMEVAL2015/ovsaModels";

    CorpusReader reader = new CorpusReader(inputStream, corpusFormat, nullSentenceOpinions, lang);
    Features atcTrain = new Features(reader, paramFile, "3");
    Instances traindata = atcTrain.loadInstances(true, "atc");

    if (onlyTest) {
        if (FileUtilsElh.checkFile(testFile)) {
            System.err.println("read from test file");
            reader = new CorpusReader(new FileInputStream(new File(testFile)), corpusFormat,
                    nullSentenceOpinions, lang);
            atcTrain.setCorpus(reader);
            traindata = atcTrain.loadInstances(true, "atc");
        }
    }

    //setting class attribute (entCat|attCat|entAttCat|polarityCat)

    //HashMap<String, Integer> opInst = atcTrain.getOpinInst();
    //WekaWrapper classifyEnts;
    //WekaWrapper classifyAtts;
    WekaWrapper onevsall;
    try {

        //classify.printMultilabelPredictions(classify.multiLabelPrediction());

        //onevsall
        //Instances entdata = new Instances(traindata);
        traindata.deleteAttributeAt(traindata.attribute("attCat").index());
        traindata.deleteAttributeAt(traindata.attribute("entCat").index());
        traindata.setClassIndex(traindata.attribute("entAttCat").index());
        onevsall = new WekaWrapper(traindata, true);

        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "entAttCat");
            System.out.println("trainATC: one vs all models ready");
        }
        onevsall.setTestdata(traindata);
        HashMap<Integer, HashMap<String, Double>> ovsaRes = onevsall.predictOneVsAll(modelsPath,
                paramFile + "entAttCat");
        System.out.println("trainATC: one vs all predictions ready");
        HashMap<Integer, String> kk = new HashMap<Integer, String>();
        for (String oId : atcTrain.getOpinInst().keySet()) {
            kk.put(atcTrain.getOpinInst().get(oId), oId);
        }

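        // debug output: print the class labels predicted for the first instance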
        Object[] ll = ovsaRes.get(1).keySet().toArray();
        for (Object l : ll) {
            System.err.print((String) l + " - ");
        }
        System.err.print("\n");

        for (int i : ovsaRes.keySet()) {
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(kk.get(i)).getsId();
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold) {
                    //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                    oSubId++;
                    //create and add opinion to the structure
                    //trgt, offsetFrom, offsetTo, polarity, cat, sId);
                    Opinion op = new Opinion(kk.get(i) + "_" + oSubId, "", 0, 0, "", cl, sId);
                    reader.addOpinion(op);
                }
            }
        }
        reader.print2Semeval2015format(paramFile + "entAttCat.xml");
    } catch (Exception e) {
        e.printStackTrace();
    }

    //traindata.setClass(traindata.attribute("entAttCat"));
    System.err.println("DONE CLI train-atc2 (oneVsAll)");
}

From source file: elh.eus.absa.CLI.java

License: Open Source License

/**
 * Main access to the train functionalities.
 * @throws IOException
 */
public final void tagATC(final InputStream inputStream) throws IOException {
    // load training parameters file
    String paramFile = parsedArguments.getString("params");
    String corpusFormat = parsedArguments.getString("corpusFormat");
    //String validation = parsedArguments.getString("validation");
    String lang = parsedArguments.getString("language");
    int foldNum = Integer.parseInt(parsedArguments.getString("foldNum"));
    //boolean printPreds = parsedArguments.getBoolean("printPreds");

    CorpusReader reader = new CorpusReader(inputStream, corpusFormat, lang);
    Features atcTrain = new Features(reader, paramFile, "3");
    Instances traindata = atcTrain.loadInstances(true, "atc");

    //setting class attribute (entCat|attCat|entAttCat|polarityCat)

    //HashMap<String, Integer> opInst = atcTrain.getOpinInst();
    WekaWrapper classify;
    try {
        //train first classifier (entities)
        traindata.setClass(traindata.attribute("entCat"));
        classify = new WekaWrapper(traindata, true);
        classify.crossValidate(foldNum);
        //Classifier entityCl = classify.getMLclass();

        //train second classifier (attributes)
        traindata.setClass(traindata.attribute("attCat"));
        classify.setTraindata(traindata);
        classify.crossValidate(foldNum);
        //Classifier attCl = classify.getMLclass();

        classify.printMultilabelPredictions(classify.multiLabelPrediction());

    } catch (Exception e) {
        e.printStackTrace();
    }

    traindata.setClass(traindata.attribute("entAttCat"));
    System.err.println("DONE CLI train-atc");
}

From source file: elh.eus.absa.Features.java

License: Open Source License

/**
 * Creates a feature set from a previously saved model. This allows loading previously saved feature sets.
 *
 * @param model path to the serialized model containing header information
 * @throws IOException
 */
private void createFeatureSetFromModel(String model) throws IOException {
    try {
        WekaWrapper ww = new WekaWrapper(model);
        Instances header = ww.loadHeader(model);

        int attNum = header.numAttributes();
        for (int i = 0; i < attNum; i++) {
            Attribute att = header.attribute(i);
            String name = att.name();
            if (att.isNumeric()) {
                addNumericFeature(name);
                //System.out.println("numeric feature: "+name);
            } else if (att.isNominal()) {
                //System.out.println("nominal feature: "+name+" - "+att.toString());
                ArrayList<String> vals = new ArrayList<String>();
                Enumeration<Object> e = att.enumerateValues();
                while (e.hasMoreElements()) {
                    vals.add(e.nextElement().toString());
                }
                addNominalFeature(name, vals);
            }
        }

        //General polarity lexicon
        if (header.attribute("polLexGen_posScore") != null) {
            this.polarLexiconGen = new Lexicon(new File(params.getProperty("polarLexiconGeneral")), "lemma");
            System.err.println("Features : createFeatureSet() - General polarity lexicon loaded -> "
                    + params.getProperty("polarLexiconGeneral") + " (" + this.polarLexiconGen.size()
                    + " entries)");
            System.out.println("Features : createFeatureSet() - General polarity lexicon loaded -> "
                    + params.getProperty("polarLexiconGeneral") + " (" + this.polarLexiconGen.size()
                    + " entries)");
        }

        //Domain polarity lexicon
        if (header.attribute("polLexDom_posScore") != null) {
            //this.polarLexiconDom = loadPolarityLexiconFromFile(params.getProperty("polarLexiconDomain"), "polLexDom_");
            this.polarLexiconDom = new Lexicon(new File(params.getProperty("polarLexiconDomain")), "lemma");
            System.err.println("Features : createFeatureSet() - Domain polarity lexicon loaded -> "
                    + params.getProperty("polarLexiconDomain") + " (" + this.polarLexiconDom.size()
                    + " entries)");
            System.out.println("Features : createFeatureSet() - Domain polarity lexicon loaded -> "
                    + params.getProperty("polarLexiconDomain") + " (" + this.polarLexiconDom.size()
                    + " entries)");
        }

        // Load clark cluster category info from files
        loadClusterFeatures("clark");

        // Load brown cluster category info from files
        loadClusterFeatures("brown");

        // Load word2vec cluster category info from files
        loadClusterFeatures("word2vec");

    } catch (Exception e) {
        System.err.println("Features::createFeatureSetFromFile -> error when loading model header");
        e.printStackTrace();
    }

}

From source file: elh.eus.absa.Features.java

License: Open Source License

/**
 *   Fills the attribute vectors for the instances existing in the given corpus.
 *   Attribute vectors contain the features loaded by the createFeatureSet() function.
 *
 * @param save whether the Instances object should be saved to an ARFF file or not.
 * @param prefix prefix used in the name of the saved ARFF file.
 * @return Weka Instances object containing the attribute vectors filled with the features specified
 *          in the parameter file.
 */
public Instances loadInstances(boolean save, String prefix) throws IOException {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //Properties posProp = new Properties();
    //eus.ixa.ixa.pipe.pos.Annotate postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);      
    if (params.containsKey("lemmaNgrams")) {
        Properties posProp = NLPpipelineWrapper.setPostaggerProperties(params.getProperty("pos-model"),
                params.getProperty("lemma-model"), corpus.getLang(), "bin", "false");

        postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data)
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId         
        values[rsltdata.attribute("instanceId").index()] = instId;

        // string normalization (emoticons, twitter grammar,...)
        String opNormalized = corpus.getOpinionSentence(oId);

        // compute uppercase ratio before normalization (if needed)      
        double upRatio = 0.0;
        if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes")) {
            String upper = opNormalized.replaceAll("[\\p{Ll}]", "");
            upRatio = (double) upper.length() / (double) opNormalized.length();
            values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        }

        // string normalization (emoticons, twitter grammar,...)
        if ((params.containsKey("wfngrams") || params.containsKey("lemmaNgrams"))
                && (!params.getProperty("normalization", "none").equalsIgnoreCase("noEmot"))) {
            opNormalized = normalize(opNormalized, params.getProperty("normalization", "none"));
        }

        //process the current instance with the NLP pipeline in order to get token and lemma|pos features
        KAFDocument nafinst = new KAFDocument("", "");
        String nafname = trainExamples.get(oId).getsId().replace(':', '_');
        String nafDir = params.getProperty("kafDir");
        String nafPath = nafDir + File.separator + nafname + ".kaf";
        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = 1;
        try {
            if (params.containsKey("lemmaNgrams")) //(lemmaNgrams != null) && (!lemmaNgrams.isEmpty()))
            {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTokPos(opNormalized, corpus.getLang(),
                            params.getProperty("pos-model"), params.getProperty("lemma-model"), postagger);
                    Files.createDirectories(Paths.get(nafDir));
                    nafinst.save(nafPath);
                }
                tokNum = nafinst.getWFs().size();
                //System.err.println("Features::loadInstances - postagging opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId));
            } else {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTok(opNormalized, corpus.getLang());
                }
                tokNum = nafinst.getWFs().size();
                //System.err.println("Features::loadInstances - tokenizing opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId));

            }
        } catch (IOException | JDOMException e) {
            System.err.println("Features::loadInstances() - error when NLP processing the instance " + instId
                    + "|" + oId + ") for filling the attribute vector");
            e.printStackTrace();
            System.exit(5);
        }

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        List<WF> window = nafinst.getWFs();
        Integer end = corpus.getOpinion(oId).getTo();
        // apply the window if it is active (>0) and the target is not null (to == 0 means a null target)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer to = window.size();
            Integer from = 0;
            end++;
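            // scan tokens to locate the window: 'from' starts bowWin tokens before the target onset, 'to' ends bowWin tokens after the target end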
            for (int i = 0; i < window.size(); i++) {
                WF wf = window.get(i);
                if ((wf.getOffset() == start) && (i >= bowWin)) {
                    from = i - bowWin;
                } else if (wf.getOffset() >= end) {
                    if (i + bowWin < window.size()) {
                        to = i + bowWin;
                    }
                    break;
                }
            }
            window = window.subList(from, to);
            //System.out.println("startTgt: "+start+" - from: "+from+" | endTrgt:"+(end-1)+" - to:"+to);
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //      "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");

        List<String> windowWFIds = new ArrayList<String>();

        // word form ngram related features
        for (WF wf : window) {
            windowWFIds.add(wf.getId());

            String wfStr = wf.getForm();
            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "wf", 1, false); //toknum

            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Clark cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // Clark cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }

        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "wf", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (Term t : nafinst.getTermsFromWFs(windowWFIds)) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    String lemma = t.getLemma();

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // add ngrams to the feature vector
                    for (int i = 0; i < ngrams.size(); i++) {
                        String ng = featureFromArray(ngrams.subList(0, i + 1), "lemma");
                        //if the current lemma is in the ngram list activate the feature in the vector
                        if (params.containsKey("lemmaNgrams")
                                && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                            Attribute ngAtt = rsltdata.attribute(ng);
                            if (ngAtt != null) {
                                addNumericToFeatureVector(ng, values, 1); //tokNum                     
                            }
                        }

                        ng = featureFromArray(ngrams.subList(0, i + 1), "");
                        if (params.containsKey("polarLexiconGeneral")
                                || params.containsKey("polarLexiconDomain")) {
                            checkPolarityLexicons(ng, values, tokNum, polNgrams);
                        } //end polarity ngram checker
                    } //end ngram checking                                      
                }
                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }
                    posNgrams.add(t.getPos());

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            while (!ngrams.isEmpty()) {
                String ng = featureFromArray(ngrams, "lemma");

                //if the current lemma is in the ngram list activate the feature in the vector
                if (rsltdata.attribute(ng) != null) {
                    addNumericToFeatureVector(ng, values, 1); //tokNum
                }

                // polarity lexicons
                if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
                    checkPolarityLexicons(ng, values, tokNum, polNgrams);
                } //end polarity ngram checker

                ngrams.removeFirst();
            }

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);

        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        //create object for the current instance and associate it with the current train dataset.         
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();

        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }

            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            //System.err.println("Features::loadInstances - pol "+pol+" for oid "+oId+" - text:"+corpus.getOpinionSentence(oId));
            if (pol != null && !pol.isEmpty()) {
                //System.err.println("polarity: _"+pol+"_");
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstances() - training data ready total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();

            saver.setInstances(rsltdata);

            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }
    return rsltdata;
}

From source file: elh.eus.absa.Features.java

License: Open Source License

/**
 *   Fills the attribute vectors for the instances existing in the given CoNLL tabulated corpus.
 *   Attribute vectors contain the features loaded by the createFeatureSet() function.
 *
 * @param save whether the Instances object should be saved to an ARFF file or not.
 * @param prefix prefix used in the name of the saved ARFF file.
 * @return Weka Instances object containing the attribute vectors filled with the features specified
 *          in the parameter file.
 */
public Instances loadInstancesTAB(boolean save, String prefix) {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data)
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstancesTAB() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstancesTAB() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId         
        values[rsltdata.attribute("instanceId").index()] = instId;

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        String[] noWindow = corpus.getOpinionSentence(oId).split("\n");

        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = noWindow.length;

        List<String> window = Arrays.asList(noWindow);
        Integer end = corpus.getOpinion(oId).getTo();
        // apply the window if it is active (>0) and the target is not null (to == 0 means a null target)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer from = start - bowWin;
            if (from < 0) {
                from = 0;
            }
            Integer to = end + bowWin;
            if (to > noWindow.length - 1) {
                to = noWindow.length - 1;
            }
            window = Arrays.asList(Arrays.copyOfRange(noWindow, from, to));
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //      "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");

        //System.err.println(Arrays.toString(window.toArray()));

        // word form ngram related features
        for (String wf : window) {
            String[] fields = wf.split("\t");
            String wfStr = normalize(fields[0], params.getProperty("normalization", "none"));
            // a blank line marks a sentence end: flush the n-gram list and start again.
            if (wf.equals("")) {
                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, true); //toknum

                // since wf is empty no need to check for clusters and other features.
                continue;
            }

            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, false); //toknum
            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Clark cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // Clark cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }

        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (String t : window) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    // a blank line marks a sentence end: flush the n-gram list and start again.
                    if (t.equals("")) {
                        // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                        checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, true, polNgrams); //toknum

                        // since t is empty no need to check for clusters and other features.
                        continue;
                    }

                    String[] fields = t.split("\t");
                    if (fields.length < 2) {
                        continue;
                    }
                    String lemma = normalize(fields[1], params.getProperty("normalization", "none"));

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                    checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, false, polNgrams);

                }

                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }

                    String[] fields = t.split("\t");
                    if (fields.length < 3) {
                        continue;
                    }
                    String pos = fields[2];

                    posNgrams.add(pos);

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            // check both lemma n-grams and polarity lexicons, and add values to the feature vector
            checkNgramsAndPolarLexicons(ngrams, values, "", 1, tokNum, true, polNgrams);

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);

        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        // compute uppercase ratio before normalization (if needed)      
        //double upRatio =0.0;
        //if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes"))
        //{
        //   String upper = opNormalized.replaceAll("[a-z]", "");
        //   upRatio = (double)upper.length() / (double)opNormalized.length();
        //   values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        //}

        //create object for the current instance and associate it with the current train dataset.         
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();

        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }

            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            if (pol != null && !pol.isEmpty()) {
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                //System.err.println("polarity: _"+pol+"_");
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstancesTAB() - training data ready total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();

            saver.setInstances(rsltdata);

            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }

    return rsltdata;
}

From source file: elh.eus.absa.Features.java

License: Open Source License

/**
 *   Fills the attribute vectors for the instances existing in the given CoNLL tabulated corpus.
 *   Attribute vectors contain the features loaded by the createFeatureSet() function.
 *
 * @param save whether the Instances object should be saved to an ARFF file or not.
 * @param prefix prefix used in the name of the saved ARFF file.
 * @return Weka Instances object containing the attribute vectors filled with the features specified
 *          in the parameter file.
 */
public Instances loadInstancesConll(boolean save, String prefix) {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    String nafdir = params.getProperty("kafDir");
    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data)
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstancesConll() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstancesConll() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId         
        values[rsltdata.attribute("instanceId").index()] = instId;

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        String nafPath = nafdir + File.separator + trainExamples.get(oId).getsId().replace(':', '_');
        String taggedFile = "";
        try {
            if (!FileUtilsElh.checkFile(nafPath + ".kaf")) {
                nafPath = NLPpipelineWrapper.tagSentence(corpus.getOpinionSentence(oId), nafPath,
                        corpus.getLang(), params.getProperty("pos-model"), params.getProperty("lemma-model"),
                        postagger);
            } else {
                nafPath = nafPath + ".kaf";
            }
            InputStream reader = new FileInputStream(new File(nafPath));
            taggedFile = IOUtils.toString(reader);
            reader.close();
        } catch (IOException | JDOMException fe) {
            // TODO Auto-generated catch block
            fe.printStackTrace();
        }

        String[] noWindow = taggedFile.split("\n");

        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = noWindow.length;

        //System.err.println("Features::loadInstancesConll - tagged File read lines:"+tokNum);

        List<String> window = Arrays.asList(noWindow);
        Integer end = corpus.getOpinion(oId).getTo();
        // apply the window if it is active (>0) and the target is not null (to == 0 means a null target)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer from = start - bowWin;
            if (from < 0) {
                from = 0;
            }
            Integer to = end + bowWin;
            if (to > noWindow.length - 1) {
                to = noWindow.length - 1;
            }
            window = Arrays.asList(Arrays.copyOfRange(noWindow, from, to));
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //      "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");

        //System.err.println(Arrays.toString(window.toArray()));

        // word form ngram related features
        for (String wf : window) {
            String[] fields = wf.split("\\s");
            String wfStr = normalize(fields[0], params.getProperty("normalization", "none"));
            // a blank line marks a sentence end: flush the n-gram list and start again.
            if (wf.equals("")) {
                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, true); //toknum

                // since wf is empty no need to check for clusters and other features.
                continue;
            }

            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, false); //toknum
            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Clark cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // Clark cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }

        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (String t : window) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    // a blank line marks a sentence end: flush the n-gram list and start again.
                    if (t.equals("")) {
                        // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                        checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, true, polNgrams); //toknum

                        // since t is empty no need to check for clusters and other features.
                        continue;
                    }

                    String[] fields = t.split("\\s");
                    if (fields.length < 2) {
                        continue;
                    }
                    String lemma = normalize(fields[1], params.getProperty("normalization", "none"));

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                    checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, false, polNgrams);

                }

                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }

                    String[] fields = t.split("\\s");
                    if (fields.length < 3) {
                        continue;
                    }
                    String pos = fields[2];

                    posNgrams.add(pos);

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            // check both lemma n-grams and polarity lexicons, and add values to the feature vector
            checkNgramsAndPolarLexicons(ngrams, values, "", 1, tokNum, true, polNgrams);

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);

        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        // compute uppercase ratio before normalization (if needed)      
        //double upRatio =0.0;
        //if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes"))
        //{
        //   String upper = opNormalized.replaceAll("[a-z]", "");
        //   upRatio = (double)upper.length() / (double)opNormalized.length();
        //   values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        //}

        //create object for the current instance and associate it with the current train dataset.         
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();

        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }

            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            if (pol != null && !pol.isEmpty()) {
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                //System.err.println("polarity: _"+pol+"_");
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstancesConll() - training data ready total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            ArffSaver saver = new ArffSaver();

            saver.setInstances(rsltdata);

            saver.setFile(new File(savePath));
            saver.writeBatch();
            System.err.println("arff written to: " + savePath);
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }

    return rsltdata;
}
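
The word-form and lemma handling above follows one sliding-window pattern throughout: keep at most ngramDim tokens in a LinkedList, dropping the oldest before adding the newest, and emit the n-grams of the current window after each shift. A minimal, self-contained sketch of that pattern (identifiers are illustrative, and the suffix-style emission is an assumption about what checkNgramFeatures does):

import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;

public class NgramWindowSketch {
    public static void main(String[] args) {
        int ngramDim = 2; // window size, analogous to the wfngrams/lemmaNgrams parameters
        LinkedList<String> window = new LinkedList<String>();
        Map<String, Integer> counts = new HashMap<String, Integer>();

        for (String tok : new String[] { "the", "food", "was", "great" }) {
            // maintain a sliding window of at most ngramDim tokens
            if (window.size() >= ngramDim) {
                window.removeFirst();
            }
            window.add(tok);
            // emit every n-gram ending at the current token (all suffixes of the window)
            for (int start = 0; start < window.size(); start++) {
                String ngram = String.join("_", window.subList(start, window.size()));
                counts.merge(ngram, 1, Integer::sum);
            }
        }
        System.err.println(counts); // e.g. {food=1, food_was=1, the=1, ...}
    }
}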

From source file:elh.eus.absa.WekaWrapper.java

License:Open Source License

/**
 *      Train one vs all models over the given training data.
 * @param modelpath directory to store each model for the one vs. all method
 * @param prefix prefix the models should have (each model will have the name of its class appended)
 * @throws Exception
 */
public void trainOneVsAll(String modelpath, String prefix) throws Exception {
    Instances orig = new Instances(traindata);
    Enumeration<Object> classValues = traindata.classAttribute().enumerateValues();
    String classAtt = traindata.classAttribute().name();
    while (classValues.hasMoreElements()) {
        String v = (String) classValues.nextElement();
        System.err.println("trainer onevsall for class " + v + " classifier");
        //needed because of weka's sparse data format problems THIS IS TROUBLE! ...
        if (v.equalsIgnoreCase("dummy")) {
            continue;
        }
        // copy the training instances for this binary one vs. all problem
        Instances ovsa = new Instances(orig);
        //declare the new class attribute along with its values
        ArrayList<String> classVal = new ArrayList<String>();
        classVal.add("dummy"); //placeholder: Weka's sparse format does not store values equal to the first nominal value, so real labels must not map to index 0
        classVal.add(v);
        classVal.add("UNKNOWN");
        ovsa.insertAttributeAt(new Attribute(classAtt + "2", classVal), ovsa.numAttributes());
        //relabel every instance that does not have the current class value to "UNKNOWN"
        for (int i = 0; i < ovsa.numInstances(); i++) {
            Instance inst = ovsa.instance(i);
            String instClass = inst.stringValue(ovsa.attribute(classAtt).index());
            if (instClass.equalsIgnoreCase(v)) {
                inst.setValue(ovsa.attribute(classAtt + "2").index(), v);
            } else {
                inst.setValue(ovsa.attribute(classAtt + "2").index(), "UNKNOWN");
            }
        }
        //delete the old class attribute and set the new one as the class.
        ovsa.setClassIndex(ovsa.attribute(classAtt + "2").index());
        ovsa.deleteAttributeAt(ovsa.attribute(classAtt).index());
        ovsa.renameAttribute(ovsa.attribute(classAtt + "2").index(), classAtt);
        ovsa.setClassIndex(ovsa.attribute(classAtt).index());

        //build the classifier, store the model and test it on the training data
        setTraindata(ovsa);
        saveModel(modelpath + File.separator + prefix + "_" + v + ".model");
        setTestdata(ovsa);
        testModel(modelpath + File.separator + prefix + "_" + v + ".model");

        System.err.println("trained onevsall " + v + " classifier");
    }

    setTraindata(orig);
}
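
The core of the method is the relabeling trick: clone the data, append a binary {dummy, v, UNKNOWN} class attribute, and map every label other than v to UNKNOWN. A stripped-down sketch of just that step using plain weka.core calls (no WekaWrapper state involved):

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;

public class OneVsAllRelabel {
    /** Returns a copy of data whose class is binarized to {dummy, target, UNKNOWN}. */
    public static Instances binarizeClass(Instances data, String target) {
        Instances ovsa = new Instances(data);
        String classAtt = ovsa.classAttribute().name();
        ArrayList<String> classVal = new ArrayList<String>();
        classVal.add("dummy"); // placeholder so real labels never map to sparse index 0
        classVal.add(target);
        classVal.add("UNKNOWN");
        ovsa.insertAttributeAt(new Attribute(classAtt + "2", classVal), ovsa.numAttributes());
        for (int i = 0; i < ovsa.numInstances(); i++) {
            Instance inst = ovsa.instance(i);
            String label = inst.stringValue(ovsa.attribute(classAtt).index());
            inst.setValue(ovsa.attribute(classAtt + "2").index(),
                    label.equalsIgnoreCase(target) ? target : "UNKNOWN");
        }
        ovsa.setClassIndex(ovsa.attribute(classAtt + "2").index());
        ovsa.deleteAttributeAt(ovsa.attribute(classAtt).index());
        ovsa.renameAttribute(ovsa.attribute(classAtt + "2").index(), classAtt);
        return ovsa;
    }
}

Calling binarizeClass once per class value and training one model on each result reproduces the loop above.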

From source file:entities.ArffFile.java

/**
 * Given a list of parameters, runs the microaggregation filter.
 * All of these parameters are user input.
 * @param df distance function, either Euclidean or Manhattan, as specified by the user
 * @param numCluster
 * @param seed
 * @param maxIterations
 * @param replaceMissingValues
 * @param preserveInstancesOrder
 * @param attributes list of the attributes to be generalized via clustering
 */
public void microAgregacion(DistanceFunction df, int numCluster, int seed, int maxIterations,
        boolean replaceMissingValues, boolean preserveInstancesOrder, List<Integer> attributes)
        throws Exception {
    //instancesFilter = new Instances(instances);
    SimpleKMeans kMeans;
    kMeans = new SimpleKMeans();
    Instances uniqueAttributes;
    uniqueAttributes = new Instances(instancesFilter);
    List<String> names = new ArrayList<>();
    int i = 0;
    for (Integer attribute : attributes) {
        String name = instancesFilter.attribute(attribute).name();
        if (instancesFilter.attribute(attribute).isDate() || instancesFilter.attribute(attribute).isString())
            throw new Exception("Clustering is not supported for DATE or STRING attributes");
        names.add(name);
    }
    // keep only the selected attributes: delete every column whose name was not collected above
    while (uniqueAttributes.numAttributes() != attributes.size()) {
        if (!names.contains(uniqueAttributes.attribute(i).name()))
            uniqueAttributes.deleteAttributeAt(i);
        else
            i++;
    }
    try {
        kMeans.setNumClusters(numCluster);
        kMeans.setMaxIterations(maxIterations);
        kMeans.setSeed(seed);
        kMeans.setDisplayStdDevs(false);
        kMeans.setDistanceFunction(df);
        // Weka's flag is inverted ("don't replace"), so negate the user's replaceMissingValues choice
        kMeans.setDontReplaceMissingValues(!replaceMissingValues);
        kMeans.setPreserveInstancesOrder(preserveInstancesOrder);
        kMeans.buildClusterer(uniqueAttributes);
        //System.out.println(kMeans);
        for (int j = 0; j < uniqueAttributes.numInstances(); j++) {
            int cluster = kMeans.clusterInstance(uniqueAttributes.instance(j));
            for (int k = 0; k < uniqueAttributes.numAttributes(); k++) {
                // replace each value with the corresponding component of the cluster centroid
                if (uniqueAttributes.attribute(k).isNumeric())
                    uniqueAttributes.instance(j).setValue(k,
                            kMeans.getClusterCentroids().instance(cluster).value(k));
                else
                    uniqueAttributes.instance(j).setValue(k,
                            kMeans.getClusterCentroids().instance(cluster).toString(k));
            }
        }
        replaceValues(uniqueAttributes, attributes);
    } catch (Exception ex) {
        Logger.getLogger(ArffFile.class.getName()).log(Level.SEVERE, null, ex);
    }
    //saveToFile("4");
}
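
Stripped of the attribute filtering, the microaggregation itself is just "cluster, then overwrite each record with its centroid". A self-contained sketch under the assumption of an all-numeric dataset (the file name is illustrative):

import weka.clusterers.SimpleKMeans;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class MicroaggregationSketch {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("numeric.arff"); // hypothetical all-numeric data

        SimpleKMeans kMeans = new SimpleKMeans();
        kMeans.setNumClusters(5);
        kMeans.setPreserveInstancesOrder(true);
        kMeans.buildClusterer(data);

        Instances centroids = kMeans.getClusterCentroids();
        for (int j = 0; j < data.numInstances(); j++) {
            int cluster = kMeans.clusterInstance(data.instance(j));
            // microaggregation: every record in a cluster becomes its centroid,
            // so individual values can no longer be distinguished within the cluster
            for (int k = 0; k < data.numAttributes(); k++) {
                data.instance(j).setValue(k, centroids.instance(cluster).value(k));
            }
        }
    }
}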

From source file:entities.WekaBaselineBOWFeatureVector.java

public Instance fillFeatureVector(BaselineBOWFeatureVector vSource, Instances data) {
    double[] values = new double[data.numAttributes()];

    //values[0] = vSource.getCosSimilarityArrayAtIndex(0);//((vSource.getCosSimilarityArrayAtIndex(0) + vSource.getCosSimilarityArrayAtIndex(1)));
    //values[1] = vSource.getCosSimilarityArrayAtIndex(1);//((vSource.getCosSimilarityArrayAtIndex(0) + vSource.getCosSimilarityArrayAtIndex(1)));
    // copy the 64 bag-of-words frequency features
    for (int i = 0; i < 64; i++)
        values[i] = vSource.getFrequencyArrayAtIndex(i);
    // the label attribute (index 64) is stored as the index of its nominal value
    values[64] = data.attribute(64).indexOfValue(vSource.getLabel());

    Instance inst = new DenseInstance(1.0, values);

    return inst;
}
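
For values[64] and data.attribute(64) to resolve, the Instances header must declare 64 numeric attributes followed by a nominal class attribute at index 64. A minimal sketch of a matching header (attribute names and the label set are assumptions):

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.Instances;

public class BOWHeaderSketch {
    public static Instances buildHeader() {
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        for (int i = 0; i < 64; i++) {
            atts.add(new Attribute("freq_" + i)); // numeric frequency attributes
        }
        ArrayList<String> labels = new ArrayList<String>();
        labels.add("positive"); // placeholder label set
        labels.add("negative");
        atts.add(new Attribute("label", labels)); // nominal class at index 64
        Instances data = new Instances("bow", atts, 0);
        data.setClassIndex(64);
        return data;
    }
}

Note that fillFeatureVector returns the instance without attaching it to a dataset; the caller still needs inst.setDataset(data) (or data.add(inst)) before nominal values can be interpreted.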