// Java tutorial
/* * This file is part of ALOE. * * ALOE is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * ALOE is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * You should have received a copy of the GNU General Public License * along with ALOE. If not, see <http://www.gnu.org/licenses/>. * * Copyright (c) 2012 SCCL, University of Washington (http://depts.washington.edu/sccl) */ package etc.aloe.oilspill2010; import etc.aloe.data.ExampleSet; import etc.aloe.data.FeatureSpecification; import etc.aloe.filters.SimpleStringToWordVector; import etc.aloe.filters.SimpleStringToWordVector.NoNonsenseStemmer; import java.util.List; import weka.filters.supervised.attribute.AttributeSelection; import weka.attributeSelection.Ranker; import weka.attributeSelection.ReliefFAttributeEval; import weka.core.Instances; import weka.core.SelectedTag; import weka.filters.Filter; import weka.filters.unsupervised.attribute.PrincipalComponents; import weka.filters.unsupervised.attribute.StringToWordVector; import weka.filters.unsupervised.instance.SparseToNonSparse; /** * Generates a set of filters that extract the desired features from message * texts. * * Features include words, emoticons, pronouns, punctuations, and other strings. * * @author Michael Brooks <mjbrooks@uw.edu> */ public class FeatureGenerationImpl extends etc.aloe.cscw2013.FeatureGenerationImpl { /** * Construct a new FeatureGeneration implementation. * * @param emoticonDictionary The list of emoticons to look for in the * messages. 
*/ public FeatureGenerationImpl(List<String> emoticonDictionary) { super(emoticonDictionary); //Change the default number of participant features this.participantFeatures = 20; } @Override public FeatureSpecification generateFeatures(ExampleSet basicExamples) { ExampleSet examples = basicExamples.copy(); FeatureSpecification spec = new FeatureSpecification(); System.out.print("Configuring features over " + examples.size() + " examples... "); try { spec.addFilter(getPronounsFilter(examples)); spec.addFilter(getPunctuationFilter(examples)); spec.addFilter(getSpecialWordsFilter(examples)); spec.addFilter(getSpellingFilter(examples)); spec.addFilter(getEmoticonsFilter(examples)); spec.addFilter(getBagOfWordsFilter(examples)); if (this.getParticipantFeatureCount() > 0) { spec.addFilter(getParticipantsFilter(examples)); } else { spec.addFilter(getRemoveParticipantFilter(examples)); } spec.addFilter(getRemoveIDFilter(examples)); //spec.addFilter(getSparseToNonsparseFilter(examples)); //spec.addFilter(getFeatureSelectionFilter(examples)); Instances output = spec.getOutputFormat(); int numAttrs = output.numAttributes(); System.out.println("generated " + (numAttrs - 1) + " features."); } catch (Exception e) { System.err.println("Error generating features."); System.err.println("\t" + e.getMessage()); } return spec; } protected Filter getSparseToNonsparseFilter(ExampleSet examples) throws Exception { SparseToNonSparse filter = new SparseToNonSparse(); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; } protected Filter getFeatureSelectionFilter(ExampleSet examples) throws Exception { AttributeSelection filter = new AttributeSelection(); // package weka.filters.supervised.attribute! 
//CfsSubsetEval eval = new CfsSubsetEval(); //CorrelationAttributeEval eval = new CorrelationAttributeEval(); //InfoGainAttributeEval eval = new InfoGainAttributeEval(); ReliefFAttributeEval eval = new ReliefFAttributeEval(); //GreedyStepwise search = new GreedyStepwise(); //search.setNumToSelect(980); //search.setSearchBackwards(true); Ranker search = new Ranker(); search.setNumToSelect(980); filter.setEvaluator(eval); filter.setSearch(search); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; } protected Filter getFeatureReductionFilter(ExampleSet examples) throws Exception { PrincipalComponents filter = new PrincipalComponents(); filter.setMaximumAttributes(10); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; } /** * Get a bag of words filter based on the provided examples. 
* * @param examples * @return * @throws Exception */ @Override protected Filter getBagOfWordsFilter(ExampleSet examples) throws Exception { SimpleStringToWordVector filter = new SimpleStringToWordVector(); filter.setAttributeNamePrefix(BAG_OF_WORDS_FEATURE_PREFIX); filter.setStringAttributeName(ExampleSet.MESSAGE_ATTR_NAME); //This is stupid because it depends on how much data you use //bagger.setMinTermFreq(20); filter.setDoNotOperateOnPerClassBasis(true); filter.setWordsToKeep(3000); filter.setLowerCaseTokens(true); //use stemming and remove "nonsense" filter.setStemmer(new NoNonsenseStemmer(true)); filter.setTFTransform(true); filter.setIDFTransform(true); filter.setNormalizeDocLength( new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER)); filter.setOutputWordCounts(true); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; } }