Java tutorial
/* * Copyright 2014 Elhuyar Fundazioa This file is part of EliXa. EliXa is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. EliXa is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with EliXa. If not, see <http://www.gnu.org/licenses/>. */ package elh.eus.absa; import elh.eus.absa.CorpusReader; import ixa.kaflib.KAFDocument; import ixa.kaflib.Term; import ixa.kaflib.WF; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; import java.util.Enumeration; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Set; import java.util.TreeSet; import weka.core.Attribute; import weka.core.Instance; import weka.core.Instances; import weka.core.SparseInstance; import weka.core.converters.ArffSaver; import java.util.Properties; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.jdom2.JDOMException; /** * @author isanvi * */ public class Features { private CorpusReader corpus; private Properties params = new Properties(); private Instances traindata; // Create unigram base numeric attributes private ArrayList<Attribute> atts = new ArrayList<Attribute>(); //structure to control attribute indexes private HashMap<String, Integer> attIndexes = new HashMap<String, Integer>(); //structure to control instance ids wrt opinions/sentences private HashMap<String, Integer> opInst = new HashMap<String, Integer>(); //structure to control instance ids wrt opinions/sentences private HashMap<String, HashMap<String, Integer>> attributeSets = new HashMap<String, HashMap<String, Integer>>(); //structure to store word form ngram attributes private HashMap<String, Integer> wfNgrams = new HashMap<String, Integer>(); //structure to store lemma ngram attributes private HashMap<String, Integer> lemmaNgrams = new HashMap<String, Integer>(); //structure to store POS ngram attributes private HashMap<String, Integer> POSNgrams = new HashMap<String, Integer>(); private List<String> ClassificationClasses = Arrays.asList("dummy", "positive", "negative", "neutral"); //structure to control general polarity lexicon private Lexicon polarLexiconGen; //structure to control domain polarity lexicon private Lexicon polarLexiconDom; // PoS tagger object. Defined here for optimizing posTagger resource loading times private eus.ixa.ixa.pipe.pos.Annotate postagger; //MicroText Normalization object private MicroTextNormalizer MicrotxtNormalizer; // feature number private int featNum; /** * Constructor * @param InputStream ins : InputStream containing a corpus to process. * @param String paramFile : Path to the file containing the feature configuration file * (which features should be used) */ public Features(InputStream ins, String lang, String format, String paramFile, String classes) { this(new CorpusReader(ins, format, lang), paramFile, classes); } /** * Constructor * @param Corpus reader creader : An already existing corpus reader object. * @param String paramFile : Path to the file containing the feature configuration file * (which features should be used) */ public Features(CorpusReader creader, String paramFile, String classes) { //System.err.println("Features: constructor call"); this.corpus = creader; this.featNum = 0; setClasses(classes); File pfile = new File(paramFile); if (FileUtilsElh.checkFile(pfile)) { try { params.load(new FileInputStream(pfile)); String norm = params.getProperty("normalization", "none"); //preprocess if (norm.matches("(?i)(all|noHashtag)")) { MicrotxtNormalizer = new MicroTextNormalizer(corpus.getLang()); MicrotxtNormalizer .setEmodict(this.getClass().getClassLoader().getResourceAsStream("emoticons.lex")); } else if (norm.compareTo("none") != 0) { MicrotxtNormalizer = new MicroTextNormalizer(corpus.getLang()); } //System.err.println("Features: initiate feature extraction from corpus"); createFeatureSet(); } catch (IOException e) { System.err.println("Features: error when loading training parameter properties"); e.printStackTrace(); } } else { System.err.println("Features: given parameter file (" + paramFile + ") is not a valid file."); System.exit(1); } } /** * Constructor * @param Corpus reader creader : An already existing corpus reader object. * @param String paramFile : Path to the file containing the feature configuration file * (which features should be used) */ public Features(CorpusReader creader, String paramFile, String classes, String modelPath) { this.corpus = creader; this.featNum = 0; setClasses(classes); File pfile = new File(paramFile); if (FileUtilsElh.checkFile(pfile)) { try { params.load(new FileInputStream(pfile)); String norm = params.getProperty("normalization", "none"); //preprocess if (norm.matches("(?i)(all|noHashtag)")) { MicrotxtNormalizer = new MicroTextNormalizer(corpus.getLang()); MicrotxtNormalizer .setEmodict(this.getClass().getClassLoader().getResourceAsStream("emoticons.lex")); } else if (norm.compareTo("none") != 0) { MicrotxtNormalizer = new MicroTextNormalizer(corpus.getLang()); } if (FileUtilsElh.checkFile(modelPath)) { createFeatureSetFromModel(modelPath); } else { System.err.println("Features: initiate feature extraction from corpus"); createFeatureSet(); } } catch (IOException e) { System.err.println("Features: error when loading training parameter properties"); e.printStackTrace(); } } else { System.err.println("Features: given parameter file (" + paramFile + ") is not a valid file."); System.exit(1); } } /** * @return HashMap<String, Integer> containing the relation between the original opinions and * their generated attribute vector instances */ public HashMap<String, Integer> getOpinInst() { return opInst; } /** * @return Instances object containing the attribute vectors of the given corpus * */ public Instances getTraindata() { return traindata; } /** * Set the number of classification classes the classifier should be trained on. Depending on the number * of classes selected the annotation * * @param classes: String option for the number of classes to be learnt by the classifier. Options are: * (binary=p|n ; 3=p|n|neu ; 3+=p|n|neu|none ; 5+=p|n|neu|p+|n+ ; 5+=p|n|neu|p+|n+|none)" + " it defaults to 3 (p|n|neu).\n"); */ private void setClasses(String classes) { switch (classes) { case "binary": this.ClassificationClasses = Arrays.asList("dummy", "positive", "negative"); break; case "3": this.ClassificationClasses = Arrays.asList("dummy", "positive", "negative", "neutral"); break; case "3+": this.ClassificationClasses = Arrays.asList("dummy", "positive", "negative", "neutral", "none"); break; case "5": this.ClassificationClasses = Arrays.asList("dummy", "positive", "negative", "neutral", "positive+", "negative+"); break; case "5+": this.ClassificationClasses = Arrays.asList("dummy", "positive", "negative", "neutral", "positive+", "negative+", "none"); break; } } /** * @return HashMap<String, Integer> containing the name of the attributes and their indexes * in the attribute vectors */ private HashMap<String, Integer> getAttIndexes() { return attIndexes; } /** * Creates a feature set from a previously saved model. This allows to load previously saved feature sets. * * @param model string: path to the serialized model containing header information * @throws IOException */ private void createFeatureSetFromModel(String model) throws IOException { try { WekaWrapper ww = new WekaWrapper(model); Instances header = ww.loadHeader(model); int attNum = header.numAttributes(); for (int i = 0; i < attNum; i++) { Attribute att = header.attribute(i); String name = att.name(); if (att.isNumeric()) { addNumericFeature(name); //System.out.println("numeric feature: "+name); } else if (att.isNominal()) { //System.out.println("nominal feature: "+name+" - "+att.toString()); ArrayList<String> vals = new ArrayList<String>(); Enumeration<Object> e = att.enumerateValues(); while (e.hasMoreElements()) { vals.add(e.nextElement().toString()); } addNominalFeature(name, vals); } } //General polarity lexicon if (header.attribute("polLexGen_posScore") != null) { this.polarLexiconGen = new Lexicon(new File(params.getProperty("polarLexiconGeneral")), "lemma"); System.err.println("Features : createFeatureSet() - General polarity lexicon loaded -> " + params.getProperty("polarLexiconGeneral") + " (" + this.polarLexiconGen.size() + " entries)"); System.out.println("Features : createFeatureSet() - General polarity lexicon loaded -> " + params.getProperty("polarLexiconGeneral") + " (" + this.polarLexiconGen.size() + " entries)"); } //Domain polarity lexicon if (header.attribute("polLexDom_posScore") != null) { //this.polarLexiconDom = loadPolarityLexiconFromFile(params.getProperty("polarLexiconDomain"), "polLexDom_"); this.polarLexiconDom = new Lexicon(new File(params.getProperty("polarLexiconDomain")), "lemma"); System.err.println("Features : createFeatureSet() - Domain polarity lexicon loaded -> " + params.getProperty("polarLexiconDomain") + " (" + this.polarLexiconDom.size() + " entries)"); System.out.println("Features : createFeatureSet() - Domain polarity lexicon loaded -> " + params.getProperty("polarLexiconDomain") + " (" + this.polarLexiconDom.size() + " entries)"); } // Load clark cluster category info from files loadClusterFeatures("clark"); // Load brown cluster category info from files loadClusterFeatures("brown"); // Load word2vec cluster category info from files loadClusterFeatures("word2vec"); } catch (Exception e) { System.err.println("Features::createFeatureSetFromFile -> error when loading model header"); e.printStackTrace(); } } /** * create feature set starting from the training corpus provided. * * @throws IOException */ @SuppressWarnings("unchecked") private void createFeatureSet() throws IOException { // create a Id attribute to link the instances to a certain opinion. Note that this attribute won't be // used for classifying. It is used only for linking the instances with their corresponding opinions addNumericFeature("instanceId"); //naf paths for the tagged files String nafDir = params.getProperty("kafDir"); // create pos tagging dir if not exists Files.createDirectories(Paths.get(nafDir)); // dummy variable to debug the feature loading int featPos = this.featNum; //word form and lemma ngram minimum frequencies. int wfMinFreq = 1; int lemmaMinFreq = 1; // in case pos tags are used and we want to filter lemmas according to their pos List<String> discardPos = new ArrayList<String>(); if (params.containsKey("posFilter")) { String[] posTofilter = params.getProperty("posFilter").split(","); for (int i = 0; i < posTofilter.length; i++) { discardPos.add(posTofilter[i]); } } Set<String> corpSentenceIds = corpus.getSentences().keySet(); //Corpus tagging, if the corpus is not in conll tabulated format if (corpus.getFormat().equalsIgnoreCase("tabNotagged") || !corpus.getFormat().startsWith("tab")) { if ((params.containsKey("lemmaNgrams") || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))) && (corpus.getLang().compareToIgnoreCase("eu") != 0)) { Properties posProp = NLPpipelineWrapper.setPostaggerProperties(params.getProperty("pos-model"), corpus.getLang(), "3", "bin", "false"); postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp); } for (String key : corpSentenceIds) { String currentSent = corpus.getSentence(key); if ((params.containsKey("wfngrams") || params.containsKey("lemmaNgrams")) && (!params.getProperty("normalization", "none").equalsIgnoreCase("noEmot"))) { currentSent = normalize(currentSent, params.getProperty("normalization", "none")); } String nafPath = nafDir + File.separator + key.replace(':', '_'); try { String taggedPath = NLPpipelineWrapper.tagSentence(currentSent, nafPath, corpus.getLang(), params.getProperty("pos-model"), postagger); } catch (JDOMException e) { System.err.println("Features::createFeatureSet -> NAF error when tagging sentence"); e.printStackTrace(); } //System.err.println("Features::createFeatureSet -> corpus normalization step done"); } } // word form ngram features if (params.containsKey("wfngrams")) { // Min frequency for word form ngrams if (params.containsKey("wfMinFreq")) { try { wfMinFreq = Integer.parseInt(params.getProperty("wfMinFreq")); } catch (NumberFormatException nfe) { System.err.println("Features::createFeatureSet() - provided word form minimum frequency " + "is not an integer. Default value 1 will be used"); } } File test = new File(params.getProperty("wfngrams")); // If the word form ngram list is stored in a file. if (test.isFile()) { loadAttributeListFromFile(test, "wf"); } // If the corpus is in conll tabulated format else if (corpus.getFormat().startsWith("tab") && !corpus.getFormat().equalsIgnoreCase("tabNotagged")) { // N-gram Feature vector : extracted from sentences int success = extractNgramsTAB(Integer.valueOf(params.getProperty("wfngrams")), "wf", discardPos, true); addNumericFeatureSet("", wfNgrams, wfMinFreq); } // Otherwise use previously tagged files with ixa-pipes else { int wfNgramsLength = Integer.valueOf(params.getProperty("wfngrams")); System.err.println("Features::createFeatureSet -> word from ngram extraction (" + wfNgramsLength + ")-grams)..."); for (String key : corpSentenceIds) { String nafPath = nafDir + File.separator + key.replace(':', '_') + ".kaf"; //eu tagged files are conll format if (corpus.getLang().equalsIgnoreCase("eu")) { int success = extractNgramsTABString(new FileInputStream(new File(nafPath)), wfNgramsLength, "wf", discardPos, true); } else { KAFDocument naf = KAFDocument.createFromFile(new File(nafPath)); // N-gram Feature vector : extracted from sentences int success = extractWfNgramsKAF(Integer.valueOf(params.getProperty("wfngrams")), naf, true); } } addNumericFeatureSet("", wfNgrams, wfMinFreq); } System.err.println("Features : createFeatureSet() - unigram features -> " + (this.featNum - featPos)); System.out.println("Features : createFeatureSet() - unigram features -> " + (this.featNum - featPos)); } // lemma ngram features if (params.containsKey("lemmaNgrams")) { // Min frequency for word form ngrams if (params.containsKey("lemmaMinFreq")) { try { lemmaMinFreq = Integer.parseInt(params.getProperty("lemmaMinFreq")); } catch (NumberFormatException nfe) { System.err.println("Features::createFeatureSet() - provided lemma minimum frequency " + "is not an integer. Default value 1 will be used"); } } featPos = this.featNum; File test = new File(params.getProperty("lemmaNgrams")); // If N-grams are stored in a file if (test.isFile()) { loadAttributeListFromFile(test, "lemmaNgrams"); } // If the corpus is in conll tabulated format else if (corpus.getFormat().startsWith("tab") && !corpus.getFormat().equalsIgnoreCase("tabNotagged")) { // N-gram Feature vector : extracted from sentences int success = extractNgramsTAB(Integer.valueOf(params.getProperty("lemmaNgrams")), "lemma", discardPos, true); addNumericFeatureSet("", lemmaNgrams, lemmaMinFreq); } // Otherwise use previously tagged files with ixa-pipes else { int lemmaNgramsLength = Integer.valueOf(params.getProperty("lemmaNgrams")); System.err.println("Features::createFeatureSet -> lemma ngram extraction (" + lemmaNgramsLength + "-grams)..."); for (String key : corpSentenceIds) { String nafPath = nafDir + File.separator + key.replace(':', '_') + ".kaf"; //eu tagged files are conll format if (corpus.getLang().equalsIgnoreCase("eu")) { int success = extractNgramsTABString(new FileInputStream(new File(nafPath)), lemmaNgramsLength, "lemma", discardPos, true); } else { KAFDocument naf = KAFDocument.createFromFile(new File(nafPath)); // N-gram Feature vector : extracted from sentences int success = extractLemmaNgrams(Integer.valueOf(params.getProperty("lemmaNgrams")), naf, discardPos, true); } } addNumericFeatureSet("", lemmaNgrams, lemmaMinFreq); } System.out .println("Features : createFeatureSet() - lemma ngram features -> " + (this.featNum - featPos)); System.err .println("Features : createFeatureSet() - lemma ngram features -> " + (this.featNum - featPos)); } // pos tag features if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) { featPos = this.featNum; File test = new File(params.getProperty("pos")); String conll = ""; // if POS ngrams are stored in a file if (test.isFile()) { loadAttributeListFromFile(test, "POS_"); } // If the corpus is in conll tabulated format else if (corpus.getFormat().startsWith("tab") && !corpus.getFormat().equalsIgnoreCase("tabNotagged")) { // N-gram Feature vector : extracted from sentences int success = extractNgramsTAB(Integer.valueOf(params.getProperty("pos")), "pos", discardPos, true); addNumericFeatureSet("", POSNgrams, 1); } // Otherwise use previously tagged files with ixa-pipes else { int posNgramLength = Integer.valueOf(params.getProperty("pos")); System.err.println( "Features::createFeatureSet -> pos ngram extraction (" + posNgramLength + "-grams)..."); for (String key : corpSentenceIds) { String nafPath = nafDir + File.separator + key.replace(':', '_') + ".kaf"; //eu tagged files are conll format if (corpus.getLang().equalsIgnoreCase("eu")) { int success = extractNgramsTABString(new FileInputStream(new File(nafPath)), posNgramLength, "pos", discardPos, true); } else { KAFDocument naf = KAFDocument.createFromFile(new File(nafPath)); // N-gram Feature vector : extracted from sentences int success = extractPosNgrams(Integer.valueOf(params.getProperty("pos")), naf, discardPos, true); } } addNumericFeatureSet("", POSNgrams, 1); } System.out.println("Features : createFeatureSet() - pos tag features -> " + (this.featNum - featPos)); System.err.println("Features : createFeatureSet() - pos tag features -> " + (this.featNum - featPos)); } // Load clark cluster category info from files loadClusterFeatures("clark"); // Load brown cluster category info from files loadClusterFeatures("brown"); // Load word2vec cluster category info from files loadClusterFeatures("word2vec"); // add sentence length as feature if (!params.getProperty("sentenceLength", "no").equalsIgnoreCase("no")) { addNumericFeature("sentenceLength"); } // add sentence length as feature if (!params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("no")) { addNumericFeature("upperCaseRatio"); } // Category vector extracted from training set opinions TreeSet<String>[] categoryInfo = new TreeSet[3]; if (!params.getProperty("categories", "no").equalsIgnoreCase("no")) { categoryInfo = extractCategories(); } // Two separated features characterize the category infor: entity (E) and attribute (A) if (params.getProperty("categories", "no").equalsIgnoreCase("E&A")) { // Declare Nominal attribute for entity category ArrayList<String> entVal = new ArrayList<String>(categoryInfo[0]); entVal.add(0, "dummy"); //needed because of weka's sparse data format problems... addNominalFeature("entCat", entVal); // Declare Nominal attribute for entity category ArrayList<String> attVal = new ArrayList<String>(categoryInfo[1]); attVal.add(0, "dummy"); //needed because of weka's sparse data format problems... addNominalFeature("attCat", attVal); // Declare Nominal attribute for category as a whole /*ArrayList<String> entAttVal = new ArrayList<String>(categoryInfo[2]); entAttVal.add(0, "dummy"); //needed because of weka's sparse data format problems... addNominalFeature("entAttCat", entAttVal);*/ } // Category as a whole else if (params.getProperty("categories", "no").equalsIgnoreCase("E#A")) { // Declare Nominal attribute for category as a whole ArrayList<String> entAttVal = new ArrayList<String>(categoryInfo[2]); entAttVal.add(0, "dummy"); //needed because of weka's sparse data format problems... addNominalFeature("entAttCat", entAttVal); } /** * Look at the polarity lexicons * */ //General domain polarity lexicon if (params.containsKey("polarLexiconGeneral") && FileUtilsElh.checkFile(params.getProperty("polarLexiconGeneral"))) { //this.polarLexiconGen = loadPolarityLexiconFromFile(params.getProperty("polarLexiconGeneral"), "polLexGen_"); this.polarLexiconGen = new Lexicon(new File(params.getProperty("polarLexiconGeneral")), "lemma"); System.err.println("Features : createFeatureSet() - General polarity lexicon loaded -> " + params.getProperty("polarLexiconGeneral") + " (" + this.polarLexiconGen.size() + " entries)"); System.out.println("Features : createFeatureSet() - General polarity lexicon loaded -> " + params.getProperty("polarLexiconGeneral") + " (" + this.polarLexiconGen.size() + " entries)"); if (params.containsKey("polNgrams") && !params.getProperty("polNgrams").equalsIgnoreCase("no")) { for (String s : this.polarLexiconGen.getEntrySet()) { addNumericFeature("polgen_" + s); } System.err.println("Features : createFeatureSet() - General polarity lexicon lemmas loaded. -> " + this.polarLexiconGen.size()); System.out.println("Features : createFeatureSet() - General polarity lexicon lemmas loaded. -> " + this.polarLexiconGen.size()); } //add features to feature map: two features, positive|negative scores addNumericFeature("polLexGen_posScore"); addNumericFeature("polLexGen_negScore"); } //Domain polarity lexicon if (params.containsKey("polarLexiconDomain") && FileUtilsElh.checkFile(params.getProperty("polarLexiconDomain"))) { //this.polarLexiconDom = loadPolarityLexiconFromFile(params.getProperty("polarLexiconDomain"), "polLexDom_"); this.polarLexiconDom = new Lexicon(new File(params.getProperty("polarLexiconDomain")), "lemma"); System.err.println("Features : createFeatureSet() - Domain polarity lexicon loaded -> " + params.getProperty("polarLexiconDomain") + " (" + this.polarLexiconDom.size() + " entries)"); System.out.println("Features : createFeatureSet() - Domain polarity lexicon loaded -> " + params.getProperty("polarLexiconDomain") + " (" + this.polarLexiconDom.size() + " entries)"); if (params.containsKey("polNgrams") && !params.getProperty("polNgrams").equalsIgnoreCase("no")) { for (String s : this.polarLexiconDom.getEntrySet()) { addNumericFeature("poldom_" + s); } System.err.println("Features : createFeatureSet() - Domain polarity lexicon lemmas loaded -> " + this.polarLexiconDom.size()); System.out.println("Features : createFeatureSet() - Domain polarity lexicon lemmas loaded -> " + this.polarLexiconDom.size()); } //add features to feature map: two features, positive|negative scores addNumericFeature("polLexDom_posScore"); addNumericFeature("polLexDom_negScore"); } // if polarity is activated in the parameter file look for if (params.containsKey("polarity") && params.getProperty("polarity").equalsIgnoreCase("yes")) { // Declare the class attribute along with its values addNominalFeature("polarityCat", this.ClassificationClasses); } } private void loadClusterFeatures(String clname) throws IOException { // Load clark cluster category info from files HashMap<String, Integer> clMap = new HashMap<String, Integer>(); if (params.containsKey("clark")) { int featPos = this.featNum; clMap = loadAttributeMapFromFile(params.getProperty(clname), clname + "ClId_"); if (clMap.isEmpty()) { params.remove(clname); } else { attributeSets.put(clname + "Cl", clMap); } System.err.println("Features : loadClusterFeatures() - " + clname + " cluster features -> " + (this.featNum - featPos)); System.out.println("Features : loadClusterFeatures() - " + clname + " cluster features -> " + (this.featNum - featPos)); } } /** * Function fills the attribute vectors for the instances existing in the corpus given. * Attribute vectors contain the features loaded by the creatFeatureSet() function. * * @param boolean save : whether the Instances file should be saved to an arff file or not. * @return Weka Instances object containing the attribute vectors filled with the features specified * in the parameter file. */ public Instances loadInstances(boolean save, String prefix) throws IOException { String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_" + prefix; HashMap<String, Opinion> trainExamples = corpus.getOpinions(); int trainExamplesNum = trainExamples.size(); int bowWin = 0; if (params.containsKey("window")) { bowWin = Integer.parseInt(params.getProperty("window")); savePath = savePath + "_w" + bowWin; } //Properties posProp = new Properties(); //eus.ixa.ixa.pipe.pos.Annotate postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp); if (params.containsKey("lemmaNgrams")) { Properties posProp = NLPpipelineWrapper.setPostaggerProperties(params.getProperty("pos-model"), corpus.getLang(), "3", "bin", "false"); postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp); } //System.out.println("train examples: "+trainExamplesNum); //Create the Weka object for the training set Instances rsltdata = new Instances("train", atts, trainExamplesNum); // setting class attribute (last attribute in train data. //traindata.setClassIndex(traindata.numAttributes() - 1); System.err.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> " + rsltdata.numAttributes() + " - "); System.out.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> " + rsltdata.numAttributes() + " - "); int instId = 1; // fill the vectors for each training example for (String oId : trainExamples.keySet()) { //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId())); //value vector double[] values = new double[featNum]; // first element is the instanceId values[rsltdata.attribute("instanceId").index()] = instId; // string normalization (emoticons, twitter grammar,...) String opNormalized = corpus.getOpinionSentence(oId); // compute uppercase ratio before normalization (if needed) double upRatio = 0.0; if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes")) { String upper = opNormalized.replaceAll("[\\p{Ll}]", ""); upRatio = (double) upper.length() / (double) opNormalized.length(); values[rsltdata.attribute("upperCaseRation").index()] = upRatio; } // string normalization (emoticons, twitter grammar,...) if ((params.containsKey("wfngrams") || params.containsKey("lemmaNgrams")) && (!params.getProperty("normalization", "none").equalsIgnoreCase("noEmot"))) { opNormalized = normalize(opNormalized, params.getProperty("normalization", "none")); } //process the current instance with the NLP pipeline in order to get token and lemma|pos features KAFDocument nafinst = new KAFDocument("", ""); String nafname = trainExamples.get(oId).getsId().replace(':', '_'); String nafDir = params.getProperty("kafDir"); String nafPath = nafDir + File.separator + nafname + ".kaf"; //counter for opinion sentence token number. Used for computing relative values of the features int tokNum = 1; try { if (params.containsKey("lemmaNgrams")) //(lemmaNgrams != null) && (!lemmaNgrams.isEmpty())) { if (FileUtilsElh.checkFile(nafPath)) { nafinst = KAFDocument.createFromFile(new File(nafPath)); } else { nafinst = NLPpipelineWrapper.ixaPipesTokPos(opNormalized, corpus.getLang(), params.getProperty("pos-model"), postagger); Files.createDirectories(Paths.get(nafDir)); nafinst.save(nafPath); } tokNum = nafinst.getWFs().size(); //System.err.println("Features::loadInstances - postagging opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId)); } else { if (FileUtilsElh.checkFile(nafPath)) { nafinst = KAFDocument.createFromFile(new File(nafPath)); } else { nafinst = NLPpipelineWrapper.ixaPipesTok(opNormalized, corpus.getLang()); } tokNum = nafinst.getWFs().size(); //System.err.println("Features::loadInstances - tokenizing opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId)); } } catch (IOException | JDOMException e) { System.err.println("Features::loadInstances() - error when NLP processing the instance " + instId + "|" + oId + ") for filling the attribute vector"); e.printStackTrace(); System.exit(5); } LinkedList<String> ngrams = new LinkedList<String>(); int ngramDim; try { ngramDim = Integer.valueOf(params.getProperty("wfngrams")); } catch (Exception e) { ngramDim = 0; } boolean polNgrams = false; if (params.containsKey("polNgrams")) { polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes"); } List<WF> window = nafinst.getWFs(); Integer end = corpus.getOpinion(oId).getTo(); // apply window if window active (>0) and if the target is not null (to=0) if ((bowWin > 0) && (end > 0)) { Integer start = corpus.getOpinion(oId).getFrom(); Integer to = window.size(); Integer from = 0; end++; for (int i = 0; i < window.size(); i++) { WF wf = window.get(i); if ((wf.getOffset() == start) && (i >= bowWin)) { from = i - bowWin; } else if (wf.getOffset() >= end) { if (i + bowWin < window.size()) { to = i + bowWin; } break; } } window = window.subList(from, to); //System.out.println("startTgt: "+start+" - from: "+from+" | endTrgt:"+(end-1)+" - to:"+to); } //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+ // "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n"); List<String> windowWFIds = new ArrayList<String>(); // word form ngram related features for (WF wf : window) { windowWFIds.add(wf.getId()); String wfStr = wf.getForm(); if (params.containsKey("wfngrams") && ngramDim > 0) { if (!savePath.contains("_wf" + ngramDim)) { savePath = savePath + "_wf" + ngramDim; } //if the current word form is in the ngram list activate the feature in the vector if (ngrams.size() >= ngramDim) { ngrams.removeFirst(); } ngrams.add(wfStr); // add ngrams to the feature vector checkNgramFeatures(ngrams, values, "wf", 1, false); //toknum } // Clark cluster info corresponding to the current word form if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) { if (!savePath.contains("_cl")) { savePath = savePath + "_cl"; } values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++; } // Clark cluster info corresponding to the current word form if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) { if (!savePath.contains("_br")) { savePath = savePath + "_br"; } values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++; } // Clark cluster info corresponding to the current word form if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) { if (!savePath.contains("_w2v")) { savePath = savePath + "_w2v"; } values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++; } } //empty ngram list and add remaining ngrams to the feature list checkNgramFeatures(ngrams, values, "wf", 1, true); //toknum // PoS tagger related attributes: lemmas and pos tags if (params.containsKey("lemmaNgrams") || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) { ngrams = new LinkedList<String>(); if (params.containsKey("lemmaNgrams") && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) { ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams")); } else { ngramDim = 3; } LinkedList<String> posNgrams = new LinkedList<String>(); int posNgramDim = 0; if (params.containsKey("pos")) { posNgramDim = Integer.valueOf(params.getProperty("pos")); } for (Term t : nafinst.getTermsFromWFs(windowWFIds)) { //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0")) if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) { if (!savePath.contains("_l" + ngramDim)) { savePath = savePath + "_l" + ngramDim; } String lemma = t.getLemma(); if (ngrams.size() >= ngramDim) { ngrams.removeFirst(); } ngrams.add(lemma); // add ngrams to the feature vector for (int i = 0; i < ngrams.size(); i++) { String ng = featureFromArray(ngrams.subList(0, i + 1), "lemma"); //if the current lemma is in the ngram list activate the feature in the vector if (params.containsKey("lemmaNgrams") && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) { Attribute ngAtt = rsltdata.attribute(ng); if (ngAtt != null) { addNumericToFeatureVector(ng, values, 1); //tokNum } } ng = featureFromArray(ngrams.subList(0, i + 1), ""); if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) { checkPolarityLexicons(ng, values, tokNum, polNgrams); } //end polarity ngram checker } //end ngram checking } //pos tags if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) { if (!savePath.contains("_p")) { savePath = savePath + "_p"; } if (posNgrams.size() >= posNgramDim) { posNgrams.removeFirst(); } posNgrams.add(t.getPos()); // add ngrams to the feature vector checkNgramFeatures(posNgrams, values, "pos", 1, false); } } //endFor //empty ngram list and add remaining ngrams to the feature list while (!ngrams.isEmpty()) { String ng = featureFromArray(ngrams, "lemma"); //if the current lemma is in the ngram list activate the feature in the vector if (rsltdata.attribute(ng) != null) { addNumericToFeatureVector(ng, values, 1); //tokNum } // polarity lexicons if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) { checkPolarityLexicons(ng, values, tokNum, polNgrams); } //end polarity ngram checker ngrams.removeFirst(); } //empty pos ngram list and add remaining pos ngrams to the feature list checkNgramFeatures(posNgrams, values, "pos", 1, true); } // add sentence length as a feature if (params.containsKey("sentenceLength") && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) { values[rsltdata.attribute("sentenceLength").index()] = tokNum; } //create object for the current instance and associate it with the current train dataset. Instance inst = new SparseInstance(1.0, values); inst.setDataset(rsltdata); // add category attributte values String cat = trainExamples.get(oId).getCategory(); if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) { if (cat.compareTo("NULL") == 0) { inst.setValue(rsltdata.attribute("entCat").index(), cat); inst.setValue(rsltdata.attribute("attCat").index(), cat); } else { String[] splitCat = cat.split("#"); inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]); inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]); } //inst.setValue(attIndexes.get("entAttCat"), cat); } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) { inst.setValue(rsltdata.attribute("entAttCat").index(), cat); } if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) { // add class value as a double (Weka stores all values as doubles ) String pol = normalizePolarity(trainExamples.get(oId).getPolarity()); //System.err.println("Features::loadInstances - pol "+pol+" for oid "+oId+" - text:"+corpus.getOpinionSentence(oId)); if (pol != null && !pol.isEmpty()) { //System.err.println("polarity: _"+pol+"_"); inst.setValue(rsltdata.attribute("polarityCat"), pol); } else { inst.setMissing(rsltdata.attribute("polarityCat")); } } //add instance to train data rsltdata.add(inst); //store opinion Id and instance Id this.opInst.put(oId, instId); instId++; } System.err.println("Features : loadInstances() - training data ready total number of examples -> " + trainExamplesNum + " - " + rsltdata.numInstances()); if (save) { try { savePath = savePath + ".arff"; System.err.println("arff written to: " + savePath); ArffSaver saver = new ArffSaver(); saver.setInstances(rsltdata); saver.setFile(new File(savePath)); saver.writeBatch(); } catch (IOException e1) { e1.printStackTrace(); } catch (Exception e2) { e2.printStackTrace(); } } return rsltdata; } /** * Function fills the attribute vectors for the instances existing in the Conll tabulated formatted corpus given. * Attribute vectors contain the features loaded by the creatFeatureSet() function. * * @param boolean save : whether the Instances file should be saved to an arff file or not. * @return Weka Instances object containing the attribute vectors filled with the features specified * in the parameter file. */ public Instances loadInstancesTAB(boolean save, String prefix) { String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_" + prefix; HashMap<String, Opinion> trainExamples = corpus.getOpinions(); int trainExamplesNum = trainExamples.size(); int bowWin = 0; if (params.containsKey("window")) { bowWin = Integer.parseInt(params.getProperty("window")); savePath = savePath + "_w" + bowWin; } //System.out.println("train examples: "+trainExamplesNum); //Create the Weka object for the training set Instances rsltdata = new Instances("train", atts, trainExamplesNum); // setting class attribute (last attribute in train data. //traindata.setClassIndex(traindata.numAttributes() - 1); System.err.println("Features: loadInstancesTAB() - featNum: " + this.featNum + " - trainset attrib num -> " + rsltdata.numAttributes() + " - "); System.out.println("Features: loadInstancesTAB() - featNum: " + this.featNum + " - trainset attrib num -> " + rsltdata.numAttributes() + " - "); int instId = 1; // fill the vectors for each training example for (String oId : trainExamples.keySet()) { //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId())); //value vector double[] values = new double[featNum]; // first element is the instanceId values[rsltdata.attribute("instanceId").index()] = instId; LinkedList<String> ngrams = new LinkedList<String>(); int ngramDim; try { ngramDim = Integer.valueOf(params.getProperty("wfngrams")); } catch (Exception e) { ngramDim = 0; } boolean polNgrams = false; if (params.containsKey("polNgrams")) { polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes"); } String[] noWindow = corpus.getOpinionSentence(oId).split("\n"); //counter for opinion sentence token number. Used for computing relative values of the features int tokNum = noWindow.length; List<String> window = Arrays.asList(noWindow); Integer end = corpus.getOpinion(oId).getTo(); // apply window if window active (>0) and if the target is not null (to=0) if ((bowWin > 0) && (end > 0)) { Integer start = corpus.getOpinion(oId).getFrom(); Integer from = start - bowWin; if (from < 0) { from = 0; } Integer to = end + bowWin; if (to > noWindow.length - 1) { to = noWindow.length - 1; } window = Arrays.asList(Arrays.copyOfRange(noWindow, from, to)); } //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+ // "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n"); //System.err.println(Arrays.toString(window.toArray())); // word form ngram related features for (String wf : window) { String[] fields = wf.split("\t"); String wfStr = normalize(fields[0], params.getProperty("normalization", "none")); // blank line means we found a sentence end. Empty n-gram list and reiniciate. if (wf.equals("")) { // add ngrams to the feature vector checkNgramFeatures(ngrams, values, "", 1, true); //toknum // since wf is empty no need to check for clusters and other features. continue; } if (params.containsKey("wfngrams") && ngramDim > 0) { if (!savePath.contains("_wf" + ngramDim)) { savePath = savePath + "_wf" + ngramDim; } //if the current word form is in the ngram list activate the feature in the vector if (ngrams.size() >= ngramDim) { ngrams.removeFirst(); } ngrams.add(wfStr); // add ngrams to the feature vector checkNgramFeatures(ngrams, values, "", 1, false); //toknum } // Clark cluster info corresponding to the current word form if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) { if (!savePath.contains("_cl")) { savePath = savePath + "_cl"; } values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++; } // Clark cluster info corresponding to the current word form if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) { if (!savePath.contains("_br")) { savePath = savePath + "_br"; } values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++; } // Clark cluster info corresponding to the current word form if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) { if (!savePath.contains("_w2v")) { savePath = savePath + "_w2v"; } values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++; } } //empty ngram list and add remaining ngrams to the feature list checkNgramFeatures(ngrams, values, "", 1, true); //toknum // PoS tagger related attributes: lemmas and pos tags if (params.containsKey("lemmaNgrams") || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) { ngrams = new LinkedList<String>(); if (params.containsKey("lemmaNgrams") && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) { ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams")); } else { ngramDim = 3; } LinkedList<String> posNgrams = new LinkedList<String>(); int posNgramDim = 0; if (params.containsKey("pos")) { posNgramDim = Integer.valueOf(params.getProperty("pos")); } for (String t : window) { //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0")) if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) { if (!savePath.contains("_l" + ngramDim)) { savePath = savePath + "_l" + ngramDim; } //blank line means we found a sentence end. Empty n-gram list and reiniciate. if (t.equals("")) { // check both lemma n-grams and polarity lexicons, and add values to the feature vector checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, true, polNgrams); //toknum // since t is empty no need to check for clusters and other features. continue; } String[] fields = t.split("\t"); if (fields.length < 2) { continue; } String lemma = normalize(fields[1], params.getProperty("normalization", "none")); if (ngrams.size() >= ngramDim) { ngrams.removeFirst(); } ngrams.add(lemma); // check both lemma n-grams and polarity lexicons, and add values to the feature vector checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, false, polNgrams); } //pos tags if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) { if (!savePath.contains("_p")) { savePath = savePath + "_p"; } if (posNgrams.size() >= posNgramDim) { posNgrams.removeFirst(); } String[] fields = t.split("\t"); if (fields.length < 3) { continue; } String pos = fields[2]; posNgrams.add(pos); // add ngrams to the feature vector checkNgramFeatures(posNgrams, values, "pos", 1, false); } } //endFor //empty ngram list and add remaining ngrams to the feature list // check both lemma n-grams and polarity lexicons, and add values to the feature vector checkNgramsAndPolarLexicons(ngrams, values, "", 1, tokNum, true, polNgrams); //empty pos ngram list and add remaining pos ngrams to the feature list checkNgramFeatures(posNgrams, values, "pos", 1, true); } // add sentence length as a feature if (params.containsKey("sentenceLength") && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) { values[rsltdata.attribute("sentenceLength").index()] = tokNum; } // compute uppercase ratio before normalization (if needed) //double upRatio =0.0; //if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes")) //{ // String upper = opNormalized.replaceAll("[a-z]", ""); // upRatio = (double)upper.length() / (double)opNormalized.length(); // values[rsltdata.attribute("upperCaseRation").index()] = upRatio; //} //create object for the current instance and associate it with the current train dataset. Instance inst = new SparseInstance(1.0, values); inst.setDataset(rsltdata); // add category attributte values String cat = trainExamples.get(oId).getCategory(); if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) { if (cat.compareTo("NULL") == 0) { inst.setValue(rsltdata.attribute("entCat").index(), cat); inst.setValue(rsltdata.attribute("attCat").index(), cat); } else { String[] splitCat = cat.split("#"); inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]); inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]); } //inst.setValue(attIndexes.get("entAttCat"), cat); } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) { inst.setValue(rsltdata.attribute("entAttCat").index(), cat); } if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) { // add class value as a double (Weka stores all values as doubles ) String pol = normalizePolarity(trainExamples.get(oId).getPolarity()); if (pol != null && !pol.isEmpty()) { inst.setValue(rsltdata.attribute("polarityCat"), pol); } else { //System.err.println("polarity: _"+pol+"_"); inst.setMissing(rsltdata.attribute("polarityCat")); } } //add instance to train data rsltdata.add(inst); //store opinion Id and instance Id this.opInst.put(oId, instId); instId++; } System.err.println("Features : loadInstancesTAB() - training data ready total number of examples -> " + trainExamplesNum + " - " + rsltdata.numInstances()); if (save) { try { savePath = savePath + ".arff"; System.err.println("arff written to: " + savePath); ArffSaver saver = new ArffSaver(); saver.setInstances(rsltdata); saver.setFile(new File(savePath)); saver.writeBatch(); } catch (IOException e1) { e1.printStackTrace(); } catch (Exception e2) { e2.printStackTrace(); } } return rsltdata; } /** * Function fills the attribute vectors for the instances existing in the Conll tabulated formatted corpus given. * Attribute vectors contain the features loaded by the creatFeatureSet() function. * * @param boolean save : whether the Instances file should be saved to an arff file or not. * @return Weka Instances object containing the attribute vectors filled with the features specified * in the parameter file. */ public Instances loadInstancesConll(boolean save, String prefix) { String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_" + prefix; HashMap<String, Opinion> trainExamples = corpus.getOpinions(); String nafdir = params.getProperty("kafDir"); int trainExamplesNum = trainExamples.size(); int bowWin = 0; if (params.containsKey("window")) { bowWin = Integer.parseInt(params.getProperty("window")); savePath = savePath + "_w" + bowWin; } //System.out.println("train examples: "+trainExamplesNum); //Create the Weka object for the training set Instances rsltdata = new Instances("train", atts, trainExamplesNum); // setting class attribute (last attribute in train data. //traindata.setClassIndex(traindata.numAttributes() - 1); System.err.println("Features: loadInstancesConll() - featNum: " + this.featNum + " - trainset attrib num -> " + rsltdata.numAttributes() + " - "); System.out.println("Features: loadInstancesConll() - featNum: " + this.featNum + " - trainset attrib num -> " + rsltdata.numAttributes() + " - "); int instId = 1; // fill the vectors for each training example for (String oId : trainExamples.keySet()) { //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId())); //value vector double[] values = new double[featNum]; // first element is the instanceId values[rsltdata.attribute("instanceId").index()] = instId; LinkedList<String> ngrams = new LinkedList<String>(); int ngramDim; try { ngramDim = Integer.valueOf(params.getProperty("wfngrams")); } catch (Exception e) { ngramDim = 0; } boolean polNgrams = false; if (params.containsKey("polNgrams")) { polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes"); } String nafPath = nafdir + File.separator + trainExamples.get(oId).getsId().replace(':', '_'); String taggedFile = ""; try { if (!FileUtilsElh.checkFile(nafPath + ".kaf")) { nafPath = NLPpipelineWrapper.tagSentence(corpus.getOpinionSentence(oId), nafPath, corpus.getLang(), params.getProperty("pos-model"), postagger); } else { nafPath = nafPath + ".kaf"; } InputStream reader = new FileInputStream(new File(nafPath)); taggedFile = IOUtils.toString(reader); reader.close(); } catch (IOException | JDOMException fe) { // TODO Auto-generated catch block fe.printStackTrace(); } String[] noWindow = taggedFile.split("\n"); //counter for opinion sentence token number. Used for computing relative values of the features int tokNum = noWindow.length; //System.err.println("Features::loadInstancesConll - tagged File read lines:"+tokNum); List<String> window = Arrays.asList(noWindow); Integer end = corpus.getOpinion(oId).getTo(); // apply window if window active (>0) and if the target is not null (to=0) if ((bowWin > 0) && (end > 0)) { Integer start = corpus.getOpinion(oId).getFrom(); Integer from = start - bowWin; if (from < 0) { from = 0; } Integer to = end + bowWin; if (to > noWindow.length - 1) { to = noWindow.length - 1; } window = Arrays.asList(Arrays.copyOfRange(noWindow, from, to)); } //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+ // "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n"); //System.err.println(Arrays.toString(window.toArray())); // word form ngram related features for (String wf : window) { String[] fields = wf.split("\\s"); String wfStr = normalize(fields[0], params.getProperty("normalization", "none")); // blank line means we found a sentence end. Empty n-gram list and reiniciate. if (wf.equals("")) { // add ngrams to the feature vector checkNgramFeatures(ngrams, values, "", 1, true); //toknum // since wf is empty no need to check for clusters and other features. continue; } if (params.containsKey("wfngrams") && ngramDim > 0) { if (!savePath.contains("_wf" + ngramDim)) { savePath = savePath + "_wf" + ngramDim; } //if the current word form is in the ngram list activate the feature in the vector if (ngrams.size() >= ngramDim) { ngrams.removeFirst(); } ngrams.add(wfStr); // add ngrams to the feature vector checkNgramFeatures(ngrams, values, "", 1, false); //toknum } // Clark cluster info corresponding to the current word form if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) { if (!savePath.contains("_cl")) { savePath = savePath + "_cl"; } values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++; } // Clark cluster info corresponding to the current word form if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) { if (!savePath.contains("_br")) { savePath = savePath + "_br"; } values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++; } // Clark cluster info corresponding to the current word form if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) { if (!savePath.contains("_w2v")) { savePath = savePath + "_w2v"; } values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++; } } //empty ngram list and add remaining ngrams to the feature list checkNgramFeatures(ngrams, values, "", 1, true); //toknum // PoS tagger related attributes: lemmas and pos tags if (params.containsKey("lemmaNgrams") || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) { ngrams = new LinkedList<String>(); if (params.containsKey("lemmaNgrams") && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) { ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams")); } else { ngramDim = 3; } LinkedList<String> posNgrams = new LinkedList<String>(); int posNgramDim = 0; if (params.containsKey("pos")) { posNgramDim = Integer.valueOf(params.getProperty("pos")); } for (String t : window) { //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0")) if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) { if (!savePath.contains("_l" + ngramDim)) { savePath = savePath + "_l" + ngramDim; } //blank line means we found a sentence end. Empty n-gram list and reiniciate. if (t.equals("")) { // check both lemma n-grams and polarity lexicons, and add values to the feature vector checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, true, polNgrams); //toknum // since t is empty no need to check for clusters and other features. continue; } String[] fields = t.split("\\s"); if (fields.length < 2) { continue; } String lemma = normalize(fields[1], params.getProperty("normalization", "none")); if (ngrams.size() >= ngramDim) { ngrams.removeFirst(); } ngrams.add(lemma); // check both lemma n-grams and polarity lexicons, and add values to the feature vector checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, false, polNgrams); } //pos tags if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) { if (!savePath.contains("_p")) { savePath = savePath + "_p"; } if (posNgrams.size() >= posNgramDim) { posNgrams.removeFirst(); } String[] fields = t.split("\\s"); if (fields.length < 3) { continue; } String pos = fields[2]; posNgrams.add(pos); // add ngrams to the feature vector checkNgramFeatures(posNgrams, values, "pos", 1, false); } } //endFor //empty ngram list and add remaining ngrams to the feature list // check both lemma n-grams and polarity lexicons, and add values to the feature vector checkNgramsAndPolarLexicons(ngrams, values, "", 1, tokNum, true, polNgrams); //empty pos ngram list and add remaining pos ngrams to the feature list checkNgramFeatures(posNgrams, values, "pos", 1, true); } // add sentence length as a feature if (params.containsKey("sentenceLength") && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) { values[rsltdata.attribute("sentenceLength").index()] = tokNum; } // compute uppercase ratio before normalization (if needed) //double upRatio =0.0; //if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes")) //{ // String upper = opNormalized.replaceAll("[a-z]", ""); // upRatio = (double)upper.length() / (double)opNormalized.length(); // values[rsltdata.attribute("upperCaseRation").index()] = upRatio; //} //create object for the current instance and associate it with the current train dataset. Instance inst = new SparseInstance(1.0, values); inst.setDataset(rsltdata); // add category attributte values String cat = trainExamples.get(oId).getCategory(); if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) { if (cat.compareTo("NULL") == 0) { inst.setValue(rsltdata.attribute("entCat").index(), cat); inst.setValue(rsltdata.attribute("attCat").index(), cat); } else { String[] splitCat = cat.split("#"); inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]); inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]); } //inst.setValue(attIndexes.get("entAttCat"), cat); } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) { inst.setValue(rsltdata.attribute("entAttCat").index(), cat); } if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) { // add class value as a double (Weka stores all values as doubles ) String pol = normalizePolarity(trainExamples.get(oId).getPolarity()); if (pol != null && !pol.isEmpty()) { inst.setValue(rsltdata.attribute("polarityCat"), pol); } else { //System.err.println("polarity: _"+pol+"_"); inst.setMissing(rsltdata.attribute("polarityCat")); } } //add instance to train data rsltdata.add(inst); //store opinion Id and instance Id this.opInst.put(oId, instId); instId++; } System.err.println("Features : loadInstancesConll() - training data ready total number of examples -> " + trainExamplesNum + " - " + rsltdata.numInstances()); if (save) { try { savePath = savePath + ".arff"; System.err.println("arff written to: " + savePath); ArffSaver saver = new ArffSaver(); saver.setInstances(rsltdata); saver.setFile(new File(savePath)); saver.writeBatch(); } catch (IOException e1) { e1.printStackTrace(); } catch (Exception e2) { e2.printStackTrace(); } } return rsltdata; } /** * normalizePolarity maps polarity categories to the categories defined in this.classificationClasses; * * @param polarity * @return String : normalized polarity string */ private String normalizePolarity(String polarity) { if (polarity == null) { return null; } // normalize all polarity = polarity.replaceFirst("^(?i)pos(\\+)?$", "positive$1"); polarity = polarity.replaceFirst("^(?i)neg(\\+)?$", "negative$1"); polarity = polarity.replaceFirst("^(?i)p(\\+)?$", "positive$1"); polarity = polarity.replaceFirst("^(?i)n(\\+)?$", "negative$1"); polarity = polarity.replaceFirst("^(?i)neu$", "neutral"); polarity = polarity.replaceFirst("^\\+$", "positive"); polarity = polarity.replaceFirst("\\-$", "negative"); polarity = polarity.replaceFirst("^\\=$", "neutral"); polarity = polarity.replaceFirst("^(?i)none$", "none"); switch (this.ClassificationClasses.size()) { // binary (dummy,p,n) - 'none' and 'neutral' examples are discarded. case 3: polarity = polarity.replaceFirst("^neutral$", ""); polarity = polarity.replaceFirst("^none$", ""); polarity = polarity.replaceFirst("\\+$", ""); break; // 3 classes (dummy,p,n,neu) - 'none' examples are treated as neutral ones. case 4: polarity = polarity.replaceFirst("^none$", "neutral"); polarity = polarity.replaceFirst("\\+$", ""); break; // 3+ classes (dummy,p,n,neu,none) case 5: polarity = polarity.replaceFirst("\\+$", ""); break; // 5 classes (dummy,p,n,neu,p+,n+) case 6: // none examples are treated as neutral ones. polarity = polarity.replaceFirst("^none$", "neutral"); break; // 5+ classes (dummy,p,n,neu,p+,n+,none) nothing to do. case 7: break; } if (polarity.equalsIgnoreCase("")) { return null; } else { return polarity; } } /** * @param lemma * @return TreeSet<String> containing the unigrams extracted from the opinions. No NLP chain is used. * * @deprecated use {@extractWfNgrams(int length, KAFDocument kaf)} instead. */ @Deprecated public TreeSet<String> extract1gramsOldNoNLPchain(String lemma) { TreeSet<String> result = new TreeSet<String>(); System.err.println("unigram extraction: _" + lemma + "_"); // Word form unigrams are required if (lemma.equalsIgnoreCase("wform")) { for (String sent : corpus.getSentences().values()) { String[] split = sent.split(" "); for (String w : split) { String w_nopunct = w.replaceAll("[^\\p{L}\\p{M}\\p{Nd}]", ""); result.add(w_nopunct); } } } return result; } /** * Extract n-grams up to a certain length from an Conll tabulated format corpus. * * @param int length : which 'n' use for 'n-grams' * @param string type (wf|lemma|pos): what type of ngrams we want to extract. * @param boolean save : safe ngrams to file or not. * @return TreeSet<String> return word form ngrams of length length */ private int extractNgramsTAB(int length, String type, List<String> discardPos, boolean save) { //System.err.println("ngram extraction Tab: _"+length+"_"+type); if (length == 0) { return 0; } for (String sent : corpus.getSentences().keySet()) { //System.err.println("ngram extraction, corpus sentences: "+corpus.getSentences().get(sent)); String[] tokens = corpus.getSentences().get(sent).split("\n"); LinkedList<String> ngrams = new LinkedList<String>(); for (String row : tokens) { String ngram = ""; String[] fields = row.split("\t"); String pos = ""; switch (type) { case "wf": ngram = fields[0]; break; case "lemma": if (fields.length > 1) { ngram = fields[1]; } if (fields.length > 2) { pos = fields[2]; } break; case "pos": if (fields.length > 2) { ngram = fields[2]; switch (ngram.length()) { case 0: ngram = "-"; break; case 1: ngram = ngram.substring(0, 1); break; default: ngram = ngram.substring(0, 2); break; } } } //if the is a blank line we assume sentence has ended and we empty and re-initialize the n-gram list if (ngram.equals("")) { //empty n-gram list and add remaining n-grams to the feature list while (!ngrams.isEmpty()) { String ng = featureFromArray(ngrams, type); addNgram(type, ng); ngrams.removeFirst(); } continue; } if (ngrams.size() >= length) { ngrams.removeFirst(); } //if no alphanumeric char is present discard the element as invalid ngram. Or if it has a PoS tag that //should be discarded String lCurrent = ngram; if ((!discardPos.contains(pos)) && (!ngram.matches("^[^\\p{L}\\p{M}\\p{Nd}\\p{InEmoticons}]+$")) && (lCurrent.length() > 1)) { //standarize numeric values to NUMNUM lemma value //ngram.replaceFirst("^[0-9]$", "NUMNUM"); if (!type.equalsIgnoreCase("pos")) { ngrams.add(normalize(ngram, params.getProperty("normalization", "none"))); } else { ngrams.add(ngram); } } //certain punctuation marks are allowed as lemmas else if ((lCurrent.length() < 2) && (lCurrent.matches("[,;.?!]"))) { ngrams.add(lCurrent); } // add ngrams to the feature list for (int i = 0; i < ngrams.size(); i++) { String ng = featureFromArray(ngrams.subList(0, i + 1), type); addNgram(type, ng); } } //empty ngram list and add remaining ngrams to the feature list while (!ngrams.isEmpty()) { String ng = featureFromArray(ngrams, type); addNgram(type, ng); ngrams.removeFirst(); } } return 1; } /** * Extract n-grams up to a certain length from an Conll tabulated format string. * * @param String input : input tagged conll string * @param int length : which 'n' use for 'n-grams' * @param string type (wf|lemma|pos): what type of ngrams we want to extract. * @param boolean save : safe ngrams to file or not. * @return int success: return 1 if the process ended correctly */ private int extractNgramsTABString(InputStream input, int length, String type, List<String> discardPos, boolean save) { //System.err.println("ngram extraction Tab: _"+length+"_"+type); if (length == 0) { return 0; } //System.err.println("ngram extraction, corpus sentences: "+corpus.getSentences().get(sent)); //String[] tokens = input.split("\n"); BufferedReader reader = new BufferedReader(new InputStreamReader(input)); LinkedList<String> ngrams = new LinkedList<String>(); String line; try { while ((line = reader.readLine()) != null) { String ngram = ""; String[] fields = line.split("\\s"); String pos = ""; switch (type) { case "wf": ngram = fields[0]; break; case "lemma": if (fields.length > 1) { ngram = fields[1]; } if (fields.length > 2) { pos = fields[2]; } break; case "pos": if (fields.length > 2) { ngram = fields[2]; switch (ngram.length()) { case 0: ngram = "-"; break; case 1: ngram = ngram.substring(0, 1); break; default: ngram = ngram.substring(0, 2); break; } } } //if the is a blank line we assume sentence has ended and we empty and re-initialize the n-gram list if (ngram.equals("")) { //empty n-gram list and add remaining n-grams to the feature list while (!ngrams.isEmpty()) { String ng = featureFromArray(ngrams, type); addNgram(type, ng); ngrams.removeFirst(); } continue; } if (ngrams.size() >= length) { ngrams.removeFirst(); } //if no alphanumeric char is present discard the element as invalid ngram. Or if it has a PoS tag that //should be discarded String lCurrent = ngram; if ((!discardPos.contains(pos)) && (!ngram.matches("^[^\\p{L}\\p{M}\\p{Nd}\\p{InEmoticons}]+$")) && (lCurrent.length() > 1)) { //standarize numeric values to NUMNUM lemma value //ngram.replaceFirst("^[0-9]$", "NUMNUM"); if (!type.equalsIgnoreCase("pos")) { ngrams.add(normalize(ngram, params.getProperty("normalization", "none"))); } else { ngrams.add(ngram); } } //certain punctuation marks are allowed as lemmas else if ((lCurrent.length() < 2) && (lCurrent.matches("[,;.?!]"))) { ngrams.add(lCurrent); } // add ngrams to the feature list for (int i = 0; i < ngrams.size(); i++) { String ng = featureFromArray(ngrams.subList(0, i + 1), type); addNgram(type, ng); } } } catch (IOException e) { System.err.println("EliXa::Features::extractNgramsTABString - WARNING: Error reading tagged file, " + "ngram extraction may be only partial\n"); } //empty ngram list and add remaining ngrams to the feature list while (!ngrams.isEmpty()) { String ng = featureFromArray(ngrams, type); addNgram(type, ng); ngrams.removeFirst(); } return 1; } /** * Extract word form n-grams up to a certain length from a kaf/naf file * * @param int length : which 'n' use for 'n-grams' * @param KAFDocument kafDoc : postagged kaf document to extract ngrams from. * @param boolean save : safe ngrams to file or not. * @return TreeSet<String> return word form ngrams of length length */ private int extractWfNgramsKAF(int length, KAFDocument kafDoc, boolean save) { //System.err.println("ngram extraction: _"+length+"_"); if (length == 0) { return 0; } for (List<WF> sent : kafDoc.getSentences()) { LinkedList<String> ngrams = new LinkedList<String>(); for (WF wf : sent) { if (ngrams.size() >= length) { ngrams.removeFirst(); } ngrams.add(wf.getForm()); //ngrams.add(normalize(wf.getForm(), params.getProperty("normalization", "none"))); // add ngrams to the feature list for (int i = 0; i < ngrams.size(); i++) { String ng = featureFromArray(ngrams.subList(0, i + 1), "wf"); addNgram("wf", ng); } } //empty ngram list and add remaining ngrams to the feature list while (!ngrams.isEmpty()) { String ng = featureFromArray(ngrams, "wf"); addNgram("wf", ng); ngrams.removeFirst(); } } return 1; } /** * Lemma ngram extraction from a kaf document * * @param int length : which 'n' use for 'n-grams' * @param KAFDocument kafDoc : postagged kaf document to extract ngrams from. * @param boolean save : safe ngrams to file or not. * @return TreeSet<String> return lemma ngrams of length length */ private int extractLemmaNgrams(int length, KAFDocument kafDoc, List<String> discardPos, boolean save) { //System.err.println("lemma ngram extraction: _"+length+"_"); if (length == 0) { return 0; } int sentNum = kafDoc.getSentences().size(); for (int s = 0; s < sentNum; s++) { LinkedList<String> ngrams = new LinkedList<String>(); for (Term term : kafDoc.getTermsBySent(s)) { if (ngrams.size() >= length) { ngrams.removeFirst(); } //if no alphanumeric char is present discard the element as invalid ngram. Or if it has a PoS tag that //should be discarded String lCurrent = term.getLemma(); if ((!discardPos.contains(term.getPos())) && (!lCurrent.matches("[^\\p{L}\\p{M}\\p{Nd}\\p{InEmoticons}]+")) && (lCurrent.length() > 1)) { ngrams.add(lCurrent); //ngrams.add(normalize(term.getLemma(), params.getProperty("normalization", "none"))); } //certain punctuation marks and emoticons are allowed as lemmas else if ((lCurrent.length() <= 2) && (lCurrent.matches("[,;.?!]"))) { ngrams.add(lCurrent); } // add ngrams to the feature list for (int i = 0; i < ngrams.size(); i++) { String ng = featureFromArray(ngrams.subList(0, i + 1), "lemma"); addNgram("lemma", ng); } } //empty ngram list and add remaining ngrams to the feature list while (!ngrams.isEmpty()) { String ng = featureFromArray(ngrams, "lemma"); addNgram("lemma", ng); ngrams.removeFirst(); } } return 1; } /** * POS ngram extraction from a kaf document * * @param int length : which 'n' use for 'n-grams' * @param KAFDocument kafDoc : postagged kaf document to extract ngrams from. * @param boolean save : safe ngrams to file or not. * @return TreeSet<String> return lemma ngrams of length length */ public int extractPosNgrams(int length, KAFDocument kafDoc, List<String> discardPos, boolean save) { //System.err.println("POS ngram extraction: _"+length+"_"); if (length == 0) { return 0; } int sentNum = kafDoc.getSentences().size(); for (int s = 0; s < sentNum; s++) { LinkedList<String> ngrams = new LinkedList<String>(); for (Term term : kafDoc.getTermsBySent(s)) { if (ngrams.size() >= length) { ngrams.removeFirst(); } if (!discardPos.contains(term.getPos())) { ngrams.add(term.getPos()); } // add ngrams to the feature list for (int i = 0; i < ngrams.size(); i++) { String ng = featureFromArray(ngrams.subList(0, i + 1), "pos"); addNgram("pos", ng); } } //empty ngram list and add remaining ngrams to the feature list while (!ngrams.isEmpty()) { String ng = featureFromArray(ngrams, "pos"); addNgram("pos", ng); ngrams.removeFirst(); } } return 1; } /** * Help function to add one ngram and its frequence (current+1) to the corresponding structure depending * on the ngram type. * * @param type (wf|lemma|pos) * @param ngram */ private void addNgram(String type, String ngram) { int freq = 0; switch (type) { case "wf": freq = 0; if (wfNgrams.containsKey(ngram)) { freq = wfNgrams.get(ngram); } this.wfNgrams.put(ngram, freq + 1); break; case "lemma": freq = 0; if (lemmaNgrams.containsKey(ngram)) { freq = lemmaNgrams.get(ngram); } this.lemmaNgrams.put(ngram, freq + 1); break; case "pos": freq = 0; if (POSNgrams.containsKey(ngram)) { freq = POSNgrams.get(ngram); } this.POSNgrams.put(ngram, freq + 1); break; case "default": System.err.println("Features::addNgram - wrong type, no ngram added."); } } /** * POS tags ngram extraction from a kaf document * * @param int length : which 'n' use for 'n-grams' * @param KAFDocument kafDoc : postagged kaf document to extract ngrams from. * @param boolean save : safe ngrams to file or not. * @return TreeSet<String> return lemma ngrams of length length */ public TreeSet<String> extractPOStags(KAFDocument kafDoc, boolean save) { TreeSet<String> result = new TreeSet<String>(); for (Term term : kafDoc.getTerms()) { String pos = "POS_" + term.getPos(); // for good measure, test that the lemma is not already included as a feature if (!getAttIndexes().containsKey(pos)) { addNumericFeature(pos); result.add(pos); } } return result; } /** * Extract category information. * If the format "category#subcategory" is used in the annotation both 3 infos are extracted: * - category ; subcategory ; category#subcategory * otherwise only the annotation is considered as a single category. * * @return TreeSet<String>[3] structure with 3 lists, one containing E categories, other containing A * attributes, and the other containing E#A pairs. Those opinion containing "NULL" values are will have * "NULL#NULL" value in the third case. */ @SuppressWarnings("unchecked") public TreeSet<String>[] extractCategories() { TreeSet<String>[] result = new TreeSet[3]; result[0] = new TreeSet<String>(); result[1] = new TreeSet<String>(); result[2] = new TreeSet<String>(); for (Opinion op : corpus.getOpinions().values()) { String entAtt = op.getCategory(); try { String[] split = entAtt.split("#"); result[0].add(split[0]); result[1].add(split[1]); } catch (NullPointerException | IndexOutOfBoundsException npe) { result[0].add("NULL"); result[1].add("NULL"); } result[2].add(entAtt); } return result; } /** * Function reads an attribute map from a file (mainly word cluster files) and adds the * * @param fname : path to the file containing the feature information * @param attName : prefix for the feature name in the feature vector * @return HashMap<String,Integer> contains the elements and their respective attribute values, * in order to later fill the vectors. * * @throws IOException if the given file give reading problems. */ private HashMap<String, Integer> loadAttributeMapFromFile(String fname, String attName) throws IOException { HashMap<String, Integer> result = new HashMap<String, Integer>(); TreeSet<Integer> valueSet = new TreeSet<Integer>(); if (FileUtilsElh.checkFile(fname)) { BufferedReader breader = new BufferedReader(new FileReader(fname)); String line; while ((line = breader.readLine()) != null) { if (line.startsWith("#") || line.matches("^\\s*$")) { continue; } String[] fields = line.split(" "); Integer attValue; try { attValue = Integer.valueOf(fields[1]); } catch (NumberFormatException nfe) { attValue = Integer.parseInt(fields[1], 2); } result.put(fields[0], attValue); valueSet.add(attValue); } breader.close(); //add features to feature map addNumericFeatureSet(attName, valueSet); } return result; } /** * Function reads an attribute map from a file (mainly word cluster files) and adds the * * @param fname : path to the file containing the feature information * @param attName : prefix for the feature name in the feature vector * @return HashMap<String,HashMap<String, Double>> contains the elements and their respective attribute values, * in order to later fill the vectors. * * @throws IOException if the given file give reading problems. */ @Deprecated private HashMap<String, HashMap<String, Double>> loadPolarityLexiconFromFile(String fname, String attName) throws IOException { HashMap<String, HashMap<String, Double>> result = new HashMap<String, HashMap<String, Double>>(); if (FileUtilsElh.checkFile(fname)) { BufferedReader breader = new BufferedReader(new FileReader(fname)); String line; while ((line = breader.readLine()) != null) { if (line.startsWith("#") || line.matches("^\\s*$")) { continue; } HashMap<String, Double> values = new HashMap<String, Double>(); String[] fields = line.split("\t"); double pos = 0.0; double neg = 0.0; switch (fields.length) { // not valid entry case 1: break; // single score representation check for modifiers and shifters as well case 2: if (fields[1].matches("(?i:pos.*)")) { pos = 1.0; } //|| Double.valueOf(fields[2])>0 else if (fields[1].matches("(?i:neg.*)")) { neg = 1.0; } else { try { double sc = Double.valueOf(fields[1]); if (sc > 0) { pos = sc; } else { neg = sc; } } catch (NumberFormatException nfe) { System.err.println( "Warning, lexicon entry with strange format, it maybe a modifier/shifter :" + fields[0] + " -- " + fields[1]); } } break; case 3: pos = Double.valueOf(fields[1]); neg = Double.valueOf(fields[2]); break; default: System.err.println("format error in the polarity lexicon\n"); break; } // if both positive and negative scores are 0 we consider the word should not be in the lexicon. if (pos > 0 || neg > 0) { values.put("pos", pos); values.put("neg", neg); result.put(fields[0], values); } } breader.close(); //add features to feature map: two features, positive|negative scores addNumericFeature(attName + "posScore"); addNumericFeature(attName + "negScore"); } return result; } /** * Function reads an attribute list from a file (ngram/word list files) and adds the elements to the * attribute list * * @param fname : path to the file containing the feature information * @param attName : prefix for the feature name in the feature vector * @return HashMap<String,Integer> contains the elements and their respective attribute values, * in order to later fill the vectors. * * @throws IOException if the given file give reading problems. */ private TreeSet<String> loadAttributeListFromFile(File fname, String attName) throws IOException { TreeSet<String> valueSet = new TreeSet<String>(); if (FileUtilsElh.checkFile(fname)) { BufferedReader breader = new BufferedReader(new FileReader(fname)); String line; while ((line = breader.readLine()) != null) { // # starting lines are ignored, considered comments. Blank lines are ignored as well if (line.startsWith("#") || line.matches("^\\s*$")) { continue; } // for good measure, test that the lemma is not already included as a feature else if (!getAttIndexes().containsKey(line)) { addNumericFeature(line); valueSet.add(line); } } breader.close(); } return valueSet; } /** * addNumericFeature adds a numeric feature to the feature vector of the classifier * @param feat */ private void addNumericFeature(String feat) { this.atts.add(new Attribute(feat, this.featNum)); this.attIndexes.put(feat, this.featNum); this.featNum++; } /** * featureFromArray converts a list of ngram/words into a numeric feature to the feature vector of the classifier * @param feat * @param prefix */ private String featureFromArray(List<String> feat, String prefix) { Object[] currentNgram = feat.toArray(); String ng = Arrays.asList(currentNgram).toString().replaceAll("(^\\[|\\]$)", "").replace(", ", "_") .toLowerCase(); //.toLowerCase() // feature prefix switch (prefix) { case "wf": ng = "WF_" + ng; break; case "lemma": ng = "LEM_" + ng; break; case "pos": ng = "POS_" + ng; break; case "": break; default: ng = prefix + "_" + ng; break; } return ng; } /** * addNumericFeatureSet adds a set numeric features to the feature vector of the classifier * * @param feat * @param String prefix : prefix appended to each of the values to build the feature name * e.g. "attId_"+13 = "attId_13" */ private void addNumericFeatureSet(String prefix, TreeSet<Integer> featSet) { for (Integer i : featSet) { String attName = prefix + i.toString(); this.atts.add(new Attribute(attName, this.featNum)); this.attIndexes.put(attName, this.featNum); this.featNum++; } } /** * addNumericFeatureSet adds a set numeric features to the feature vector of the classifier * * @param feat * @param String prefix : prefix appended to each of the values to build the feature name * e.g. "attId_"+13 = "attId_13" */ private void addNumericFeatureSet(String prefix, HashMap<String, Integer> featSet, int threshold) { System.err.println("Features::addNumericFeatureSet - threshold: " + threshold); for (String s : featSet.keySet()) { if (featSet.get(s) >= threshold) { String attName = prefix + s; this.atts.add(new Attribute(attName, this.featNum)); this.attIndexes.put(attName, this.featNum); this.featNum++; } /*else { System.err.println("discarded ngram, freq="+featSet.get(s)); }*/ } } /** * addNominalFeature adds a nominal feature (feature with values in a range) * to the feature vector of the classifier * * @param feat * @param featValues */ private void addNominalFeature(String feat, List<String> featValues) { this.atts.add(new Attribute(feat, featValues)); this.attIndexes.put(feat, this.featNum); this.featNum++; } /** * Adds frequency attribute +1 to an attribute in the given feature vector * * @param String att attribute name to add a value to. * @param double[] fVector feature vector where the value should be added * */ private void addNumericToFeatureVector(String att, double[] fVector, int sentTokNum) { if (attIndexes.containsKey(att)) { int current_ind = attIndexes.get(att); //if the current word form is in the ngram list activate the feature in the vector fVector[current_ind] = fVector[current_ind] + (1 / (double) sentTokNum); //fVector[current_ind]++; } } /** * Adds frequency attribute +1 (or +(1/tokenNum) to an attribute in the given feature vector * * @param Attribute att to add a value to. * @param double[] fVector feature vector where the value should be added * */ private void addNumericToFeatureVector(Attribute att, double[] fVector, int sentTokNum) { if (!att.equals(null)) { int current_ind = att.index(); //update feature value in the feature vector fVector[current_ind] = fVector[current_ind] + (1 / (double) sentTokNum); } } /** * Given a window check if the ngrams inside (all of them) are present in the feature set, and if so, * update the feature vector accordingly * * @param ngrams * @param prefix String : possible prefix used to differentiate ngram groups in the attribute set. * @param double[] fVector : feature vector for the corresponding instance * @param int tokens : number of tokens in the sentence (in case we want to add not a frequency value * but a normalized value) * */ private void checkNgramFeatures(LinkedList<String> ngrams, double[] fVector, String prefix, int tokens, boolean empty) { //System.err.println("features::checkNgramFeatures ->"+Arrays.asList(ngrams).toString()); // if empty is active means that we are checking the end of the sentence and // the ngram list must be emptied if (empty) { while (!ngrams.isEmpty()) { String ng = featureFromArray(ngrams, prefix); //add occurrence to feature vector (the functions checks if the given ngram feature exists). addNumericToFeatureVector(ng, fVector, tokens); //tokNum ngrams.removeFirst(); } } // if empty is false search for all ngrams in the window else { // add ngrams to the feature list for (int i = 0; i < ngrams.size(); i++) { String ng = featureFromArray(ngrams.subList(0, i + 1), prefix); // add occurrence to feature vector (the functions checks if the given ngram feature exists). addNumericToFeatureVector(ng, fVector, tokens);//tokNum } } } /** * Check if the given word/lemma/ngram exists in the general or domain polarity lexicons, and if yes * updates the corresponding attributes in the feature vector * * @param String wrd : word/lemma/ngram to look for in the polarity lexicons * @param double[] fVector : feature vector that should be updated * */ private void checkPolarityLexicons(String wrd, double[] fVector, int tokNum, boolean ngrams) { // fill vector with general polarity scores if ((polarLexiconGen != null) && (polarLexiconGen.size() > 0)) //(!polarLexiconGen.isEmpty())) { if (polarLexiconGen.isInLexicon(wrd)) { //fVector[getAttIndexes().get("polLexGen_posScore")]+=(polarLexiconGen.get(wrd).get("pos")/(double)tokNum); //fVector[getAttIndexes().get("polLexGen_negScore")]+=(polarLexiconGen.get(wrd).get("neg")/(double)tokNum); fVector[getAttIndexes() .get("polLexGen_posScore")] += (polarLexiconGen.getPolarity(wrd).getPositiveScore() / (double) tokNum); fVector[getAttIndexes() .get("polLexGen_negScore")] += (polarLexiconGen.getPolarity(wrd).getNegativeScore() / (double) tokNum); if (ngrams) { fVector[getAttIndexes().get("polgen_" + wrd)]++; } } } // fill vector with domain polarity scores if ((polarLexiconDom != null) && (polarLexiconDom.size() > 0)) //(!polarLexiconDom.isEmpty())) { if (polarLexiconDom.isInLexicon(wrd)) { //fVector[getAttIndexes().get("polLexDom_posScore")]+=(polarLexiconDom.get(wrd).get("pos")/(double)tokNum); //fVector[getAttIndexes().get("polLexDom_negScore")]+=(polarLexiconDom.get(wrd).get("neg")/(double)tokNum); fVector[getAttIndexes() .get("polLexDom_posScore")] += (polarLexiconDom.getPolarity(wrd).getPositiveScore() / (double) tokNum); fVector[getAttIndexes() .get("polLexDom_negScore")] += (polarLexiconDom.getPolarity(wrd).getNegativeScore() / (double) tokNum); if (ngrams) { fVector[getAttIndexes().get("poldom_" + wrd)]++; } } } } /** * Check if the given word/lemma/ngram exists both in the ngram list and in the general or domain polarity * lexicons, and if yes updates the corresponding attributes in the feature vector * * @param ngrams * @param fVector * @param prefix * @param toknumNgram * @param toknumPol * @param empty * @param ngram */ private void checkNgramsAndPolarLexicons(LinkedList<String> ngrams, double[] fVector, String prefix, int toknumNgram, int toknumPol, boolean empty, boolean ngram) { //System.err.println(Arrays.asList(ngrams).toString()); // if empty is active means that we are checking the end of the sentence and // the ngram list must be emptied if (empty) { // add ngrams to the feature vector while (!ngrams.isEmpty()) { String ng = featureFromArray(ngrams, prefix); //if the current lemma is in the ngram list activate the feature in the vector if (params.containsKey("lemmaNgrams") && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) { // add occurrence to feature vector (the functions checks if the given ngram feature exists). addNumericToFeatureVector(ng, fVector, toknumNgram); //tokNum } ng = featureFromArray(ngrams, ""); if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) { checkPolarityLexicons(ng, fVector, toknumPol, ngram); } //end polarity ngram checker ngrams.removeFirst(); } //end ngram checking } // if empty is false search for all ngrams in the window else { // add ngrams to the feature vector for (int i = 0; i < ngrams.size(); i++) { String ng = featureFromArray(ngrams.subList(0, i + 1), prefix); //if the current lemma is in the ngram list activate the feature in the vector if (params.containsKey("lemmaNgrams") && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) { // add occurrence to feature vector (the functions checks if the given ngram feature exists). addNumericToFeatureVector(ng, fVector, toknumNgram); //tokNum } ng = featureFromArray(ngrams.subList(0, i + 1), ""); if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) { checkPolarityLexicons(ng, fVector, toknumPol, ngram); } //end polarity ngram checker } //end ngram checking } } public void setCorpus(CorpusReader corp) { this.corpus = corp; } /** * Normalize input String (urls -> URL) * * @param input : input string to normalize * @returns String */ private String normalize(String input, String normOpt) { //URL normalization if (normOpt.equalsIgnoreCase("all")) { return MicrotxtNormalizer.normalizeSentence(input, true, true, true); } else if (normOpt.equalsIgnoreCase("noHashtag")) { return MicrotxtNormalizer.normalizeSentence(input, true, true, false); } else if (normOpt.equalsIgnoreCase("noHashEmo")) { return MicrotxtNormalizer.normalizeSentence(input, true, true, false); } else if (normOpt.equalsIgnoreCase("noEmot")) { return MicrotxtNormalizer.normalizeSentence(input, true, true, true); } else if (normOpt.equalsIgnoreCase("url")) { return MicrotxtNormalizer.normalizeSentence(input, true, false, false); } else if (normOpt.equalsIgnoreCase("minimum")) { return MicrotxtNormalizer.normalizeSentence(input, false, false, true); } else if (normOpt.equalsIgnoreCase("old")) { return MicrotxtNormalizer.normalizeSentence(input, true, false, true); } else { return input; } } }