List of usage examples for weka.core Instance setDataset
public void setDataset(Instances instances);
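Before the collected examples, a minimal self-contained sketch of what setDataset does: it attaches an Instances header to a free-standing Instance so that attribute metadata (types, nominal labels, class index) can be resolved. The attribute names and values here are illustrative only, echoing the commented snippet in the WekaManager example further below.

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class SetDatasetSketch {
    public static void main(String[] args) throws Exception {
        // Two numeric attributes plus a nominal class (names are illustrative).
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("length"));
        atts.add(new Attribute("weight"));
        ArrayList<String> labels = new ArrayList<String>();
        labels.add("small");
        labels.add("large");
        atts.add(new Attribute("class", labels));

        Instances data = new Instances("demo", atts, 0);
        data.setClassIndex(data.numAttributes() - 1);

        // A free-standing instance knows nothing about attribute types or labels...
        Instance inst = new DenseInstance(data.numAttributes());
        inst.setDataset(data); // ...until it is associated with a header.

        // Now values can be set per attribute, including nominal values by name.
        inst.setValue(data.attribute("length"), 5.3);
        inst.setValue(data.attribute("weight"), 300);
        inst.setValue(data.classAttribute(), "small");
        data.add(inst);
        System.out.println(data);
    }
}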
From source file:edu.oregonstate.eecs.mcplan.abstraction.WekaUtil.java
License:Open Source License
public static void addInstance(final Instances instances, final Instance i) {
    instances.add(i);
    i.setDataset(instances);
}
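A side note on this helper, based on the Weka javadoc for Instances.add, which shallow-copies the instance it stores: add() links the stored copy to the dataset on its own, so the explicit i.setDataset(instances) call is what keeps the caller's original reference usable afterwards (for example, for setting nominal values by name). Without it, i would remain header-less even though a copy of it now lives in instances.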
From source file:edu.oregonstate.eecs.mcplan.abstraction.WekaUtil.java
License:Open Source License
/**
 * Creates an Instances object containing the specified feature vector
 * and with an added "dummy label".
 * @param attributes
 * @param features
 * @return
 */
public static Instances createSingletonInstances(final List<Attribute> attributes, final double[] features) {
    final ArrayList<Attribute> attr_dummy_label = new ArrayList<Attribute>(attributes);
    attr_dummy_label.add(createBinaryNominalAttribute("__dummy_label__"));
    final double[] features_dummy_label = new double[features.length + 1];
    Fn.memcpy(features_dummy_label, features, features.length);
    final Instance instance = new DenseInstance(1.0, features_dummy_label);
    final Instances x = new Instances("__eval__", attr_dummy_label, 1);
    x.setClassIndex(attr_dummy_label.size() - 1);
    x.add(instance);
    instance.setDataset(x);
    return x;
}
From source file:edu.oregonstate.eecs.mcplan.ml.ClassifierSimilarityFunction.java
License:Open Source License
@Override
public double similarity(final double[] a, final double[] b) {
    final Instance instance = makeFeatures(a, b);
    dataset_.add(instance);
    instance.setDataset(dataset_);
    double[] p;
    try {
        p = classifier_.distributionForInstance(instance);
    } catch (final Exception ex) {
        throw new RuntimeException(ex);
    }
    dataset_.remove(0);
    assert (p.length == 2);
    // p[1] = similar
    return p[1];
}
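Note that adding the instance to dataset_ and removing it afterwards is stricter than prediction requires: distributionForInstance only needs the instance to carry a dataset header, which setDataset alone provides. The dataset_.remove(0) call also silently assumes dataset_ was empty before the call; keeping the instance out of the dataset entirely would avoid that coupling.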
From source file:edu.stanford.rsl.conrad.segmentation.GridFeatureExtractor.java
License:Open Source License
/**
 * Creates a new feature vector (instance in Weka language) and adds it to
 * the local feature vector set.
 *
 * @param attValues
 */
public void addInstance(double[] attValues) {
    Instance inst = new DenseInstance(1.0, attValues);
    inst.setDataset(instances);
    instances.add(inst);
}
From source file:edu.teco.context.recognition.WekaManager.java
License:Apache License
public void classifyInstance(double[] featureValues) {
    /*
     * // Create empty instance with three attribute values
     * Instance inst = new DenseInstance(3);
     *
     * // Set instance's values for the attributes "length", "weight", and "position"
     * inst.setValue(length, 5.3);
     * inst.setValue(weight, 300);
     * inst.setValue(position, "first");
     *
     * // Set instance's dataset to be the dataset "race"
     * inst.setDataset(race);
     */
    Instance instance = new DenseInstance(1.0, featureValues);
    boolean check = trainingData.checkInstance(instance);
    if (FrameworkContext.INFO)
        Log.i("WekaData", "Result of Instance check: " + check);
    instance.setDataset(trainingData);
    if (FrameworkContext.INFO)
        Log.i("WekaData", "Try to classify FeatureVector.");
    try {
        double classValue = classifier.classifyInstance(instance);
        double[] classDistribution = classifier.distributionForInstance(instance);
        Attribute classAttribute = trainingData.classAttribute();
        String className = classAttribute.value((int) classValue);
        double classProbability = classDistribution[(int) classValue];

        StringBuilder logString = new StringBuilder();
        logString.append("----- Classification Result -----\nClass Value = ").append(classValue)
                .append("\nClass Distribution = {");
        for (double value : classDistribution) {
            logString.append(value).append(";");
        }
        logString.deleteCharAt(logString.length() - 1);
        logString.append("}\nClass Name = ").append(className);
        if (FrameworkContext.INFO)
            Log.i("WekaData", logString.toString());

        WekaEvent wekaEvent = new WekaEvent(this, className, classProbability, mPreviousCalculatedClassName,
                classDistribution);
        notifyClassCalculated(wekaEvent);
        if (mPreviousCalculatedClassName != null) {
            if (mPreviousCalculatedClassName.equals(className)) {
                notifyClassChanged(wekaEvent);
            }
        }
        mPreviousCalculatedClassName = className;
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:elh.eus.absa.CLI.java
License:Open Source License
/**
 * Main access to the train-atc functionalities. Trains ATC using a double one-vs-all classifier
 * (E and A) for E#A aspect categories.
 * @throws Exception
 */
public final void trainATC2(final InputStream inputStream) throws IOException {
    // load training parameters file
    String paramFile = parsedArguments.getString("params");
    String testFile = parsedArguments.getString("testset");
    String paramFile2 = parsedArguments.getString("params2");
    String corpusFormat = parsedArguments.getString("corpusFormat");
    //String validation = parsedArguments.getString("validation");
    String lang = parsedArguments.getString("language");
    //int foldNum = Integer.parseInt(parsedArguments.getString("foldNum"));
    //boolean printPreds = parsedArguments.getBoolean("printPreds");
    boolean nullSentenceOpinions = parsedArguments.getBoolean("nullSentences");
    boolean onlyTest = parsedArguments.getBoolean("testOnly");
    double threshold = 0.5;
    double threshold2 = 0.5;
    String modelsPath = "/home/inaki/elixa-atp/ovsaModels";

    CorpusReader reader = new CorpusReader(inputStream, corpusFormat, nullSentenceOpinions, lang);
    Features atcTrain = new Features(reader, paramFile, "3");
    Instances traindata = atcTrain.loadInstances(true, "atc");

    if (onlyTest) {
        if (FileUtilsElh.checkFile(testFile)) {
            System.err.println("read from test file");
            reader = new CorpusReader(new FileInputStream(new File(testFile)), corpusFormat,
                    nullSentenceOpinions, lang);
            atcTrain.setCorpus(reader);
            traindata = atcTrain.loadInstances(true, "atc");
        }
    }

    //setting class attribute (entCat|attCat|entAttCat|polarityCat)
    //HashMap<String, Integer> opInst = atcTrain.getOpinInst();
    //WekaWrapper classifyAtts;
    WekaWrapper onevsall;
    try {
        //classify.printMultilabelPredictions(classify.multiLabelPrediction()); */

        //onevsall
        Instances entdata = new Instances(traindata);
        entdata.deleteAttributeAt(entdata.attribute("attCat").index());
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        entdata.setClassIndex(entdata.attribute("entCat").index());
        onevsall = new WekaWrapper(entdata, true);

        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "entCat");
            System.out.println("trainATC: one vs all models ready");
        }
        onevsall.setTestdata(entdata);
        HashMap<Integer, HashMap<String, Double>> ovsaRes = onevsall.predictOneVsAll(modelsPath,
                paramFile + "entCat");
        System.out.println("trainATC: one vs all predictions ready");
        HashMap<Integer, String> instOps = new HashMap<Integer, String>();
        for (String oId : atcTrain.getOpinInst().keySet()) {
            instOps.put(atcTrain.getOpinInst().get(oId), oId);
        }

        atcTrain = new Features(reader, paramFile2, "3");
        entdata = atcTrain.loadInstances(true, "attTrain2_data");
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        //entdata.setClassIndex(entdata.attribute("entCat").index());

        Attribute insAtt = entdata.attribute("instanceId");
        double maxInstId = entdata.kthSmallestValue(insAtt, entdata.numDistinctValues(insAtt) - 1);
        System.err.println("last instance has index: " + maxInstId);
        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins" + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold) {
                    //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                    // for the first one update the instances
                    if (oSubId >= 1) {
                        Instance newIns = new SparseInstance(currentInst);
                        newIns.setDataset(entdata);
                        entdata.add(newIns);
                        newIns.setValue(insAtt, maxInstId + oSubId);
                        newIns.setClassValue(cl);
                        instOps.put((int) maxInstId + oSubId, oId);
                    }
                    // if there are more, create new instances
                    else {
                        currentInst.setClassValue(cl);
                        //create and add opinion to the structure
                        // trgt, offsetFrom, offsetTo, polarity, cat, sId);
                        //Opinion op = new Opinion(instOps.get(i)+"_"+oSubId, "", 0, 0, "", cl, sId);
                        //reader.addOpinion(op);
                    }
                    oSubId++;
                }
            } //finished updating instances data
        }

        entdata.setClass(entdata.attribute("attCat"));
        onevsall = new WekaWrapper(entdata, true);

        /*
         * Second classifier ("bigarren sailkatzailea")
         */
        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "attCat");
            System.out.println("trainATC: one vs all attcat models ready");
        }

        ovsaRes = onevsall.predictOneVsAll(modelsPath, paramFile + "entAttCat");

        insAtt = entdata.attribute("instanceId");
        maxInstId = entdata.kthSmallestValue(insAtt, insAtt.numValues());
        System.err.println("last instance has index: " + maxInstId);
        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins: " + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold2) {
                    ///System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                    if (ovsaRes.get(i).get(cl) > threshold) {
                        //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                        // for the first one update the instances
                        if (oSubId >= 1) {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            // trgt, offsetFrom, offsetTo, polarity, cat, sId);
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        // if there are more, create new instances
                        else {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            // trgt, offsetFrom, offsetTo, polarity, cat, sId);
                            reader.removeOpinion(oId);
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        oSubId++;
                    }
                } //finished updating instances data
            }
        }
        reader.print2Semeval2015format(paramFile + "entAttCat.xml");
    } catch (Exception e) {
        e.printStackTrace();
    }
    //traindata.setClass(traindata.attribute("entAttCat"));
    System.err.println("DONE CLI train-atc2 (oneVsAll)");
}
From source file:elh.eus.absa.Features.java
License:Open Source License
/**
 * Function fills the attribute vectors for the instances existing in the given corpus.
 * Attribute vectors contain the features loaded by the creatFeatureSet() function.
 *
 * @param save whether the Instances file should be saved to an arff file or not.
 * @return Weka Instances object containing the attribute vectors filled with the features specified
 *         in the parameter file.
 */
public Instances loadInstances(boolean save, String prefix) throws IOException {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //Properties posProp = new Properties();
    //eus.ixa.ixa.pipe.pos.Annotate postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);
    if (params.containsKey("lemmaNgrams")) {
        Properties posProp = NLPpipelineWrapper.setPostaggerProperties(params.getProperty("pos-model"),
                params.getProperty("lemma-model"), corpus.getLang(), "bin", "false");
        postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data)
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId
        values[rsltdata.attribute("instanceId").index()] = instId;

        // string normalization (emoticons, twitter grammar,...)
        String opNormalized = corpus.getOpinionSentence(oId);

        // compute uppercase ratio before normalization (if needed)
        double upRatio = 0.0;
        if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes")) {
            String upper = opNormalized.replaceAll("[\\p{Ll}]", "");
            upRatio = (double) upper.length() / (double) opNormalized.length();
            values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        }

        // string normalization (emoticons, twitter grammar,...)
        if ((params.containsKey("wfngrams") || params.containsKey("lemmaNgrams"))
                && (!params.getProperty("normalization", "none").equalsIgnoreCase("noEmot"))) {
            opNormalized = normalize(opNormalized, params.getProperty("normalization", "none"));
        }

        //process the current instance with the NLP pipeline in order to get token and lemma|pos features
        KAFDocument nafinst = new KAFDocument("", "");
        String nafname = trainExamples.get(oId).getsId().replace(':', '_');
        String nafDir = params.getProperty("kafDir");
        String nafPath = nafDir + File.separator + nafname + ".kaf";

        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = 1;
        try {
            if (params.containsKey("lemmaNgrams")) //(lemmaNgrams != null) && (!lemmaNgrams.isEmpty()))
            {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTokPos(opNormalized, corpus.getLang(),
                            params.getProperty("pos-model"), params.getProperty("lemma-model"), postagger);
                    Files.createDirectories(Paths.get(nafDir));
                    nafinst.save(nafPath);
                }
                tokNum = nafinst.getWFs().size();
                //System.err.println("Features::loadInstances - postagging opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId));
            } else {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTok(opNormalized, corpus.getLang());
                }
                tokNum = nafinst.getWFs().size();
                //System.err.println("Features::loadInstances - tokenizing opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId));
            }
        } catch (IOException | JDOMException e) {
            System.err.println("Features::loadInstances() - error when NLP processing the instance " + instId
                    + "|" + oId + ") for filling the attribute vector");
            e.printStackTrace();
            System.exit(5);
        }

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        List<WF> window = nafinst.getWFs();
        Integer end = corpus.getOpinion(oId).getTo();
        // apply window if window active (>0) and if the target is not null (to=0)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer to = window.size();
            Integer from = 0;
            end++;
            for (int i = 0; i < window.size(); i++) {
                WF wf = window.get(i);
                if ((wf.getOffset() == start) && (i >= bowWin)) {
                    from = i - bowWin;
                } else if (wf.getOffset() >= end) {
                    if (i + bowWin < window.size()) {
                        to = i + bowWin;
                    }
                    break;
                }
            }
            window = window.subList(from, to);
            //System.out.println("startTgt: "+start+" - from: "+from+" | endTrgt:"+(end-1)+" - to:"+to);
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //        "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");

        List<String> windowWFIds = new ArrayList<String>();

        // word form ngram related features
        for (WF wf : window) {
            windowWFIds.add(wf.getId());

            String wfStr = wf.getForm();
            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "wf", 1, false); //toknum
            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Brown cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // word2vec cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }
        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "wf", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (Term t : nafinst.getTermsFromWFs(windowWFIds)) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    String lemma = t.getLemma();

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // add ngrams to the feature vector
                    for (int i = 0; i < ngrams.size(); i++) {
                        String ng = featureFromArray(ngrams.subList(0, i + 1), "lemma");
                        //if the current lemma is in the ngram list activate the feature in the vector
                        if (params.containsKey("lemmaNgrams")
                                && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                            Attribute ngAtt = rsltdata.attribute(ng);
                            if (ngAtt != null) {
                                addNumericToFeatureVector(ng, values, 1); //tokNum
                            }
                        }

                        ng = featureFromArray(ngrams.subList(0, i + 1), "");
                        if (params.containsKey("polarLexiconGeneral")
                                || params.containsKey("polarLexiconDomain")) {
                            checkPolarityLexicons(ng, values, tokNum, polNgrams);
                        } //end polarity ngram checker
                    } //end ngram checking
                }
                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }
                    posNgrams.add(t.getPos());

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            while (!ngrams.isEmpty()) {
                String ng = featureFromArray(ngrams, "lemma");
                //if the current lemma is in the ngram list activate the feature in the vector
                if (rsltdata.attribute(ng) != null) {
                    addNumericToFeatureVector(ng, values, 1); //tokNum
                }

                // polarity lexicons
                if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
                    checkPolarityLexicons(ng, values, tokNum, polNgrams);
                } //end polarity ngram checker

                ngrams.removeFirst();
            }

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);
        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        //create object for the current instance and associate it with the current train dataset.
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();
        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }
            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            //System.err.println("Features::loadInstances - pol "+pol+" for oid "+oId+" - text:"+corpus.getOpinionSentence(oId));
            if (pol != null && !pol.isEmpty()) {
                //System.err.println("polarity: _"+pol+"_");
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstances() - training data ready, total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();
            saver.setInstances(rsltdata);
            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }

    return rsltdata;
}
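The ordering in this loader is the part worth imitating: the SparseInstance is built from a raw double array, and setDataset is called before any setValue(attIndex, String) call, because that setValue variant resolves the attribute through the instance's dataset and throws UnassignedDatasetException if no header is attached. A minimal sketch of the same ordering, with illustrative attribute names:

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SparseInstance;

public class NominalValueOrdering {
    public static void main(String[] args) {
        ArrayList<String> pols = new ArrayList<String>();
        pols.add("positive");
        pols.add("negative");

        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("instanceId"));
        atts.add(new Attribute("polarityCat", pols));

        Instances data = new Instances("train", atts, 0);

        double[] values = new double[data.numAttributes()];
        values[0] = 1.0; // instanceId feature, as in the loaders above
        Instance inst = new SparseInstance(1.0, values);

        // setValue(int, String) resolves the attribute through the instance's
        // dataset; calling it before setDataset would throw UnassignedDatasetException.
        inst.setDataset(data);
        inst.setValue(data.attribute("polarityCat").index(), "positive");

        data.add(inst);
        System.out.println(data.instance(0));
    }
}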
From source file:elh.eus.absa.Features.java
License:Open Source License
/**
 * Function fills the attribute vectors for the instances existing in the given CoNLL tabulated-format corpus.
 * Attribute vectors contain the features loaded by the creatFeatureSet() function.
 *
 * @param save whether the Instances file should be saved to an arff file or not.
 * @return Weka Instances object containing the attribute vectors filled with the features specified
 *         in the parameter file.
 */
public Instances loadInstancesTAB(boolean save, String prefix) {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data)
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstancesTAB() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstancesTAB() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId
        values[rsltdata.attribute("instanceId").index()] = instId;

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        String[] noWindow = corpus.getOpinionSentence(oId).split("\n");

        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = noWindow.length;

        List<String> window = Arrays.asList(noWindow);
        Integer end = corpus.getOpinion(oId).getTo();
        // apply window if window active (>0) and if the target is not null (to=0)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer from = start - bowWin;
            if (from < 0) {
                from = 0;
            }
            Integer to = end + bowWin;
            if (to > noWindow.length - 1) {
                to = noWindow.length - 1;
            }
            window = Arrays.asList(Arrays.copyOfRange(noWindow, from, to));
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //        "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");
        //System.err.println(Arrays.toString(window.toArray()));

        // word form ngram related features
        for (String wf : window) {
            String[] fields = wf.split("\t");
            String wfStr = normalize(fields[0], params.getProperty("normalization", "none"));

            // blank line means we found a sentence end. Empty the n-gram list and reinitialize.
            if (wf.equals("")) {
                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, true); //toknum

                // since wf is empty no need to check for clusters and other features.
                continue;
            }

            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, false); //toknum
            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Brown cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // word2vec cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }
        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (String t : window) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    //blank line means we found a sentence end. Empty the n-gram list and reinitialize.
                    if (t.equals("")) {
                        // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                        checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, true, polNgrams); //toknum

                        // since t is empty no need to check for clusters and other features.
                        continue;
                    }

                    String[] fields = t.split("\t");
                    if (fields.length < 2) {
                        continue;
                    }
                    String lemma = normalize(fields[1], params.getProperty("normalization", "none"));

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                    checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, false, polNgrams);
                }

                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }

                    String[] fields = t.split("\t");
                    if (fields.length < 3) {
                        continue;
                    }
                    String pos = fields[2];
                    posNgrams.add(pos);

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            // check both lemma n-grams and polarity lexicons, and add values to the feature vector
            checkNgramsAndPolarLexicons(ngrams, values, "", 1, tokNum, true, polNgrams);

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);
        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        // compute uppercase ratio before normalization (if needed)
        //double upRatio =0.0;
        //if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes"))
        //{
        //    String upper = opNormalized.replaceAll("[a-z]", "");
        //    upRatio = (double)upper.length() / (double)opNormalized.length();
        //    values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        //}

        //create object for the current instance and associate it with the current train dataset.
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();
        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }
            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            if (pol != null && !pol.isEmpty()) {
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                //System.err.println("polarity: _"+pol+"_");
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstancesTAB() - training data ready, total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();
            saver.setInstances(rsltdata);
            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }

    return rsltdata;
}
From source file:elh.eus.absa.Features.java
License:Open Source License
/**
 * Function fills the attribute vectors for the instances existing in the given CoNLL tabulated-format corpus.
 * Attribute vectors contain the features loaded by the creatFeatureSet() function.
 *
 * @param save whether the Instances file should be saved to an arff file or not.
 * @return Weka Instances object containing the attribute vectors filled with the features specified
 *         in the parameter file.
 */
public Instances loadInstancesConll(boolean save, String prefix) {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    String nafdir = params.getProperty("kafDir");
    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data)
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstancesConll() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstancesConll() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId
        values[rsltdata.attribute("instanceId").index()] = instId;

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        String nafPath = nafdir + File.separator + trainExamples.get(oId).getsId().replace(':', '_');
        String taggedFile = "";
        try {
            if (!FileUtilsElh.checkFile(nafPath + ".kaf")) {
                nafPath = NLPpipelineWrapper.tagSentence(corpus.getOpinionSentence(oId), nafPath,
                        corpus.getLang(), params.getProperty("pos-model"), params.getProperty("lemma-model"),
                        postagger);
            } else {
                nafPath = nafPath + ".kaf";
            }
            InputStream reader = new FileInputStream(new File(nafPath));
            taggedFile = IOUtils.toString(reader);
            reader.close();
        } catch (IOException | JDOMException fe) {
            // TODO Auto-generated catch block
            fe.printStackTrace();
        }

        String[] noWindow = taggedFile.split("\n");

        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = noWindow.length;

        //System.err.println("Features::loadInstancesConll - tagged File read lines:"+tokNum);

        List<String> window = Arrays.asList(noWindow);
        Integer end = corpus.getOpinion(oId).getTo();
        // apply window if window active (>0) and if the target is not null (to=0)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer from = start - bowWin;
            if (from < 0) {
                from = 0;
            }
            Integer to = end + bowWin;
            if (to > noWindow.length - 1) {
                to = noWindow.length - 1;
            }
            window = Arrays.asList(Arrays.copyOfRange(noWindow, from, to));
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //        "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");
        //System.err.println(Arrays.toString(window.toArray()));

        // word form ngram related features
        for (String wf : window) {
            String[] fields = wf.split("\\s");
            String wfStr = normalize(fields[0], params.getProperty("normalization", "none"));

            // blank line means we found a sentence end. Empty the n-gram list and reinitialize.
            if (wf.equals("")) {
                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, true); //toknum

                // since wf is empty no need to check for clusters and other features.
                continue;
            }

            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, false); //toknum
            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Brown cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // word2vec cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }
        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (String t : window) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    //blank line means we found a sentence end. Empty the n-gram list and reinitialize.
                    if (t.equals("")) {
                        // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                        checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, true, polNgrams); //toknum

                        // since t is empty no need to check for clusters and other features.
                        continue;
                    }

                    String[] fields = t.split("\\s");
                    if (fields.length < 2) {
                        continue;
                    }
                    String lemma = normalize(fields[1], params.getProperty("normalization", "none"));

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                    checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, false, polNgrams);
                }

                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }

                    String[] fields = t.split("\\s");
                    if (fields.length < 3) {
                        continue;
                    }
                    String pos = fields[2];
                    posNgrams.add(pos);

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            // check both lemma n-grams and polarity lexicons, and add values to the feature vector
            checkNgramsAndPolarLexicons(ngrams, values, "", 1, tokNum, true, polNgrams);

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);
        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        // compute uppercase ratio before normalization (if needed)
        //double upRatio =0.0;
        //if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes"))
        //{
        //    String upper = opNormalized.replaceAll("[a-z]", "");
        //    upRatio = (double)upper.length() / (double)opNormalized.length();
        //    values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        //}

        //create object for the current instance and associate it with the current train dataset.
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();
        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }
            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            if (pol != null && !pol.isEmpty()) {
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                //System.err.println("polarity: _"+pol+"_");
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstancesConll() - training data ready, total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();
            saver.setInstances(rsltdata);
            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }

    return rsltdata;
}
From source file:en_deep.mlprocess.manipulation.featmodif.FeatureModifierFilter.java
License:Open Source License
/**
 * Convert a single instance over if the class is nominal. The converted
 * instance is added to the end of the output queue.
 *
 * @param instance the instance to convert
 */
private void convertInstance(Instance instance) {

    double[] vals = new double[outputFormatPeek().numAttributes()];
    String[] stringVals = new String[vals.length];
    int attSoFar = 0;

    for (int j = 0; j < getInputFormat().numAttributes(); j++) {
        Attribute att = instance.attribute(j);
        if (!m_Columns.isInRange(j)) {
            vals[attSoFar] = instance.value(j);
            attSoFar++;
        } else {
            // store new string values, make double values "missing" for now (if some string
            // values are missing, the double values will remain missing)
            if (instance.value(0) == 12 && instance.value(1) == 9 && att.name().equals("sempos")) {
                attSoFar = attSoFar; // no-op, apparently left over as a debugging breakpoint hook
            }
            attSoFar += getAttributeOutputValue(att, instance.value(j), vals, stringVals, attSoFar);
        }
    }
    Instance inst = null;
    if (instance instanceof SparseInstance) {
        inst = new SparseInstance(instance.weight(), vals);
    } else {
        inst = new DenseInstance(instance.weight(), vals);
    }
    inst.setDataset(getOutputFormat());
    copyValues(inst, false, instance.dataset(), getOutputFormat());

    // add new string values to the output data set and to the instance
    for (int i = 0; i < stringVals.length; ++i) {
        if (stringVals[i] != null) {
            vals[i] = inst.dataset().attribute(i).addStringValue(stringVals[i]);
        }
    }
    inst.replaceMissingValues(vals);

    inst.setDataset(getOutputFormat());
    push(inst);
}
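A closing observation on this filter (a reading of the code, not documented Weka behavior): the first setDataset(getOutputFormat()) call is what lets copyValues and the inst.dataset().attribute(i).addStringValue(...) loop register the new string values against the output format's attributes, with replaceMissingValues then patching the returned indices into the instance. The second setDataset call before push appears redundant, since nothing in between detaches the header, but it is harmless.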