Java tutorial
/******************************************************************************* * Copyright (C) 2012 Constantine Lignos * * This file is a part of MORSEL. * * MORSEL is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * MORSEL is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with MORSEL. If not, see <http://www.gnu.org/licenses/>. ******************************************************************************/ package edu.upenn.ircs.lignos.morsel; import java.io.BufferedOutputStream; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.PrintStream; import java.io.PrintWriter; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.cli.PosixParser; import edu.upenn.ircs.lignos.morsel.compound.Compounding; import edu.upenn.ircs.lignos.morsel.lexicon.Lexicon; import edu.upenn.ircs.lignos.morsel.lexicon.Word; import edu.upenn.ircs.lignos.morsel.lexicon.WordSet; import edu.upenn.ircs.lignos.morsel.transform.Affix; import edu.upenn.ircs.lignos.morsel.transform.AffixType; import edu.upenn.ircs.lignos.morsel.transform.Transform; import edu.upenn.ircs.lignos.morsel.transform.WordPair; import gnu.trove.THashMap; import gnu.trove.THashSet; /** * Main class for the MORSEL unsupervised morphology learning * system. Provides main method and core learning loop. * */ public class MorphLearner { // I/O Variables String corpusPath; PrintWriter output; PrintStream log; /** The lexicon being learned over */ Lexicon lex; /* The following parameters are documented in the README. See the * implementation of setParams for the mapping between the names * of these members and the official parameter names. */ private int MAX_ITER; private int TOP_AFFIXES; private boolean REEVAL; private boolean SCORE_REEVAL; private boolean DOUBLING; private int TYPE_THRESHOLD; private int STEM_LENGTH; private double OVERLAP_THRESHOLD; private double PRECISION_THRESHOLD; private int WINDOW_SIZE; private boolean HYPHENATION; private boolean FINAL_COMPOUNDING; private boolean ITER_COMPOUNDING; private boolean AGGR_COMPOUNDING; private boolean BASE_INFERENCE; private boolean TRANSFORM_OPTIMIZATION; private boolean ITERATION_ANALYSIS; private boolean WEIGHTED_TRANSFORMS; private boolean WEIGHTED_AFFIXES; private boolean TRANSFORM_RELATIONS; private boolean ANALYZE_SIMPLEX_WORDS; // Output options protected String analysisBase; private boolean outputBaseInf; private boolean outputConflation; private boolean outputCompounds; /** * Create a new learner using the given paths for I/O. * @param corpusPath the path of the input corpus * @param outPath the path for the analysis output * @param logPath the path for the log * @param paramPath the path of the parameter file * @param encoding the encoding of the input corpus file * @param outputBaseInf Whether to output the examples of base inference * @param outputConflation Whether to output conflation sets * @param outputCompounds Whether to output an analysis before compounding * @throws FileNotFoundException if the corpus or parameter files do not exist. * @throws UnsupportedEncodingException if the corpus's encoding is not readable. */ public MorphLearner(String corpusPath, String outPath, String logPath, String paramPath, String encoding, boolean outputBaseInf, boolean outputConflation, boolean outputCompounds) throws FileNotFoundException, UnsupportedEncodingException { this.corpusPath = corpusPath; this.outputBaseInf = outputBaseInf; this.outputConflation = outputConflation; this.outputCompounds = outputCompounds; try { this.output = new PrintWriter( new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(outPath)), encoding)); } catch (FileNotFoundException e) { throw new FileNotFoundException("Cannot open output file: " + outPath); } // If log is "-", don't redirect if (!"-".equals(logPath)) { this.log = new PrintStream(new BufferedOutputStream(new FileOutputStream(logPath)), true, encoding); // Redirect output to log System.setOut(new PrintStream(logPath)); } System.out.println("Setting parameters from " + paramPath); this.setParams(paramPath); System.out.println("Loading corpus from " + corpusPath); loadCorpus(encoding); // Shorten the analysis path down to a base for use in analysisBase = outPath.contains(".") ? outPath.substring(0, outPath.lastIndexOf('.')) : outPath; } /** * Loads the corpus located at corpusPath. * @param encoding the encoding of the corpus * @throws FileNotFoundException if the file at corpusPath could not be read. */ public void loadCorpus(String encoding) throws FileNotFoundException { System.out.println("Loading words..."); lex = CorpusLoader.loadWordlist(corpusPath, encoding, true); if (lex == null) { throw new FileNotFoundException("Cannot open wordlist: " + corpusPath); } } /** * Clean up any open resources. */ protected void cleanUp() { output.close(); // Log can be null, so check for it if (log != null) { log.close(); } } /** * Learn from the loaded lexicon and output the results. */ public void learn() { List<Transform> learnedTransforms = new LinkedList<Transform>(); Set<Transform> badTransforms = new THashSet<Transform>(); Map<String, Transform> indexedTransforms = new THashMap<String, Transform>(); RuleInference ruleInf = new RuleInference(); TransformInference transInf = new TransformInference(); PrintWriter baseLog = null; // Print lexicon status System.out.println("Lexicon stats:"); System.out.println(lex.getStatus()); System.out.println(); // Pre-processing if (HYPHENATION) { System.out.println("Handling hyphenation..."); // Perform hyphenation lex.processHyphenation(); // Print lexicon status System.out.println("Lexicon stats:"); System.out.println(lex.getStatus()); } System.out.println(memoryStatus()); System.out.println(); // The main learning loop System.out.println("Starting learning..."); for (int i = 0; i < MAX_ITER; i++) { System.out.println("Iteration " + (i + 1)); // Print lexicon status System.out.println("Lexicon stats:"); System.out.println(lex.getStatus()); // Print set status System.out.println("Base size: " + lex.getSetWords(WordSet.BASE).size()); System.out.println("Derived size: " + lex.getSetWords(WordSet.DERIVED).size()); System.out.println("Unmodeled size: " + lex.getSetWords(WordSet.UNMODELED).size()); // Hypothesize transforms System.out.println("Hypothesizing and scoring transforms..."); ArrayList<Transform> hypTransforms = hypothesizeTransforms(learnedTransforms, badTransforms); // Score the transforms // Always pass false for doubling, this // is to ensure that rules cannot get high scores by using // orthographic accomodation. // Reeval is set by the SCORE_REEVAL parameter boolean doubling = false; boolean reeval = SCORE_REEVAL; if (TRANSFORM_OPTIMIZATION) { // If this is the first iteration, score from scratch if (i == 0) { for (Transform trans : hypTransforms) { Transform.scoreTransform(trans, lex, reeval, doubling); } } // Otherwise, incrementally score, using what we had from last round else { incrementalScoreTransforms(hypTransforms, indexedTransforms, lex, reeval, doubling); } // Update indexedTransforms for the next round indexedTransforms = indexTransforms(hypTransforms); } else { for (Transform trans : hypTransforms) { Transform.scoreTransform(trans, lex, reeval, doubling); } } // Rank the transforms sortTransforms(hypTransforms); // Trim them down, score them, and pick one List<Transform> topTransforms = Util.truncateCollection(hypTransforms, TOP_AFFIXES); Transform bestTransform = selectTransform(topTransforms, learnedTransforms, badTransforms); // Quit if no good transform was found if (bestTransform == null) { System.out.println("Out of good transforms to learn. Giving up."); break; } System.out.println("Selected " + bestTransform.toString()); // Re-evaluate the best transform if (REEVAL) { Transform reEvalTransform = new Transform(bestTransform.getAffix1(), bestTransform.getAffix2()); Transform.scoreTransform(reEvalTransform, lex, REEVAL, DOUBLING); bestTransform = reEvalTransform; } // Accept the best transform System.out.println("Learned " + bestTransform.toVerboseString()); // Reeval should be false for moving the words lex.moveTransformPairs(bestTransform, hypTransforms, TRANSFORM_OPTIMIZATION, false, doubling); // Mark learned after words have been moved bestTransform.markLearned(); learnedTransforms.add(bestTransform); // Check for relationships between transforms if (TRANSFORM_RELATIONS) { transInf.inferRelations(lex); System.out.println("Transform relationships:\n" + transInf.toString()); } // Handle the results of inference if (BASE_INFERENCE) { // Open the output file on the first iteration if we're outputting if (outputBaseInf && i == 0) { try { baseLog = new PrintWriter(analysisBase + "_baseinf.txt"); } catch (FileNotFoundException e) { System.err.println("Couldn't output base inference dump."); } } ruleInf.conservInference(lex, learnedTransforms, hypTransforms, REEVAL, DOUBLING, TRANSFORM_OPTIMIZATION, baseLog); } // Do iteration/aggressive compounding if (ITER_COMPOUNDING) { System.out.println("Handling iteration compounding..."); int nCompounds = Compounding.breakCompounds(lex, WordSet.BASE, learnedTransforms, hypTransforms, TRANSFORM_OPTIMIZATION, REEVAL, DOUBLING, transInf); System.out.println("Broke " + nCompounds + " compounds in base"); if (AGGR_COMPOUNDING) { nCompounds = Compounding.breakCompounds(lex, WordSet.UNMODELED, learnedTransforms, hypTransforms, TRANSFORM_OPTIMIZATION, REEVAL, DOUBLING, transInf); System.out.println("Broke " + nCompounds + " compounds in unmodeled"); } } // Add space between transforms System.out.println(memoryStatus()); System.out.println(); System.out.println(); // Output the analysis on the first and then every 5th iteration starting with 5 // Analyses should be one-indexed, so add one if (ITERATION_ANALYSIS && (i == 0 || (i + 1) % 5 == 0)) { outputIterAnalysis(Integer.toString(i + 1)); } } // End of main learning loop // Clean up for base inference logging if (baseLog != null) { baseLog.close(); } // After learning is complete, perform the last round of compounding if (FINAL_COMPOUNDING) { // Output an analysis pre-compounding if (outputCompounds) { outputIterAnalysis("precompound"); } // If ITER_COMPOUNDING was on, pass in the learned rules List<Transform> fillerRules = ITER_COMPOUNDING ? learnedTransforms : null; System.out.println("Handling final compounding..."); System.out.println(memoryStatus()); // Break compounds in base and unmodeled // We sneakily pass false for opt so we can just pass // null for hypTransforms- since we're done learning transforms, // we don't care that hypothesized transforms won't be updated int nCompounds = Compounding.breakCompounds(lex, WordSet.BASE, fillerRules, null, false, REEVAL, DOUBLING, transInf); System.out.println("Broke " + nCompounds + " compounds in base"); System.out.println(memoryStatus()); nCompounds = Compounding.breakCompounds(lex, WordSet.UNMODELED, fillerRules, null, false, REEVAL, DOUBLING, transInf); System.out.println("Broke " + nCompounds + " compounds in unmodeled"); } // Finally, analyze simplexes as the final step if (ANALYZE_SIMPLEX_WORDS) { System.out.println("Analyzing simplex words..."); int nAnalyzed = Compounding.analyzeSimplexWords(lex, WordSet.UNMODELED, learnedTransforms, DOUBLING, transInf); System.out.println("Analyzed " + nAnalyzed + " words in unmodeled"); } System.out.println("Learning complete. Analyzing..."); outputAnalysis(output); if (outputConflation) { outputConflationSets(); } } private String memoryStatus() { // Check the memory Runtime runtime = Runtime.getRuntime(); long usage = runtime.totalMemory() - runtime.freeMemory(); long remaining = runtime.maxMemory() - usage; // Conver to megabytes usage /= 1048576L; remaining /= 1048576L; return "Memory status: " + usage + "MB Used, " + remaining + "MB Remaining"; } private Map<String, Transform> indexTransforms(List<Transform> hypTransforms) { Map<String, Transform> index = new THashMap<String, Transform>(); for (Transform transform : hypTransforms) { index.put(transform.toKey(), transform); } return index; } private void incrementalScoreTransforms(ArrayList<Transform> hypTransforms, Map<String, Transform> indexedTransforms, Lexicon lex, boolean reEval, boolean doubling) { // If the transform is in the index, reuse it for (int i = 0; i < hypTransforms.size(); i++) { Transform hypTransform = hypTransforms.get(i); Transform scoredTransform = indexedTransforms.get(hypTransform.toKey()); // Reuse the old one if possible, otherwise score from scratch if (scoredTransform != null) { hypTransforms.set(i, scoredTransform); } else { Transform.scoreTransform(hypTransform, lex, reEval, doubling); } } } private Transform selectTransform(List<Transform> topTransforms, List<Transform> learnedTransforms, Set<Transform> badTransforms) { // Search through the transforms for the best one Transform bestTransform = null; Transform secondTransform = null; System.out.println("Selecting a transform..."); int nBadTransforms = 0; boolean isGood = false; for (int i = 0; i < topTransforms.size(); i++) { // If we've had too many bad transforms, give up // Note that only some bad transforms count against the bad transform // count if (nBadTransforms >= WINDOW_SIZE) { return null; } System.out.println(); bestTransform = topTransforms.get(i); System.out.println("Vetting transform " + bestTransform.toVerboseString()); // Keep going if the best doesn't have enough transforms if (bestTransform.getTypeCount() < TYPE_THRESHOLD) { System.out.println("Transform has too few types."); nBadTransforms++; // Add this to the set of bad transforms badTransforms.add(bestTransform); continue; } // Check for a conflict with existing transforms if (isConflict(bestTransform, learnedTransforms)) { // Add this to the set of bad transforms badTransforms.add(bestTransform); continue; } // Check the base-stem overlap int baseOverlap = baseOverlap(bestTransform); int stemOverlap = stemOverlap(bestTransform); double overlapRatio = 0; // If both ratios are zero count this as OK if (baseOverlap + stemOverlap == 0) { System.out.println("Overlap Ratio: 0/0"); overlapRatio = 0; } // If only the base is 0, make sure this is rejected else if (baseOverlap == 0) { System.out.println("Overlap ratio: " + baseOverlap + "/0"); overlapRatio = OVERLAP_THRESHOLD + 1.0; } else { overlapRatio = (double) stemOverlap / (double) baseOverlap; System.out.println("Overlap ratio: " + overlapRatio); } // Go to the next transform if the overlap ratio is too high if (overlapRatio > OVERLAP_THRESHOLD) { System.out.println("Overlap ratio too high."); continue; } // If there's still a second-best left, check it for tie-breaks if (i + 1 < topTransforms.size()) { secondTransform = topTransforms.get(i + 1); // Check for a tie and resolve it if (isTie(bestTransform, secondTransform)) { System.out.println("Breaking tie between " + bestTransform + " and " + secondTransform); bestTransform = breakTie(bestTransform, secondTransform); } } // Check that the segmentation precision is acceptable double segPrecision = Transform.calcSegPrecision(bestTransform); System.out.println("Seg. Precision: " + segPrecision); if (segPrecision < PRECISION_THRESHOLD) { System.out.println("Seg. Precision too low"); nBadTransforms++; // Add this to the set of bad transforms badTransforms.add(bestTransform); // Move on continue; } // If we didn't continue, that means the transform is good, break out isGood = true; break; } // Return the best transform if it is good return isGood ? bestTransform : null; } private boolean isConflict(Transform bestTransform, List<Transform> learnedTransforms) { // Already learned if (learnedTransforms.contains(bestTransform)) { System.out.println("Conflict: Transform already learned."); return true; } // Is inverse of it already learned? if (learnedTransforms.contains(new Transform(bestTransform.getAffix2(), bestTransform.getAffix1()))) { System.out.println("Conflict: Inverse of transform already learned."); return true; } return false; } private int baseOverlap(Transform transform) { int overlap = 0; for (WordPair pair : transform.getWordPairs()) { if (lex.isWordInSet(pair.getBase(), WordSet.BASE)) { overlap++; } } return overlap; } private int stemOverlap(Transform transform) { int overlap = 0; // Create a set of all stems in the base words Set<String> baseStems = new THashSet<String>(); for (Word word : lex.getSetWords(WordSet.BASE)) { if (word.length() > STEM_LENGTH) { baseStems.add(word.getText().substring(0, STEM_LENGTH)); } } // Count the overlap with the transform for (WordPair pair : transform.getWordPairs()) { Word base = pair.getBase(); // Skip the word if it's too short if (base.length() < STEM_LENGTH) { continue; } String stem = base.getText().substring(0, STEM_LENGTH); if (baseStems.contains(stem)) { overlap++; } } return overlap; } @SuppressWarnings("unused") private void printTransforms(List<Transform> transforms) { for (Transform transform : transforms) { System.out.println(transform.getPairsText()); } } private ArrayList<Transform> hypothesizeTransforms(List<Transform> learnedTransforms, Set<Transform> badTransforms) { List<Affix> topBUPrefixes = lex.topAffixes(TOP_AFFIXES, AffixType.PREFIX, Lexicon.AffixSet.BASEUNMOD, WEIGHTED_AFFIXES); List<Affix> topUPrefixes = lex.topAffixes(TOP_AFFIXES, AffixType.PREFIX, Lexicon.AffixSet.UNMOD, WEIGHTED_AFFIXES); List<Affix> topBUSuffixes = lex.topAffixes(TOP_AFFIXES, AffixType.SUFFIX, Lexicon.AffixSet.BASEUNMOD, WEIGHTED_AFFIXES); List<Affix> topUSuffixes = lex.topAffixes(TOP_AFFIXES, AffixType.SUFFIX, Lexicon.AffixSet.UNMOD, WEIGHTED_AFFIXES); ArrayList<Transform> hypTransforms = makeTransforms(topBUPrefixes, topUPrefixes, topBUSuffixes, topUSuffixes, badTransforms); return hypTransforms; } private static boolean isTie(Transform trans1, Transform trans2) { return (trans1.getAffix1() == trans2.getAffix2() && trans1.getAffix2() == trans2.getAffix1() && trans1.getTypeCount() == trans2.getTypeCount()); } private static Transform breakTie(Transform trans1, Transform trans2) { int baseScore = 0; int derivedScore = 0; // Give a point to whichever member of the pair is more frequent for (WordPair pair : trans1.getWordPairs()) { if (pair.getBase().getCount() > pair.getDerived().getCount()) { baseScore++; } else if (pair.getDerived().getCount() > pair.getBase().getCount()) { derivedScore++; } // Award no points in a tie } // Choose the transform with more frequent bases // Ties go to the first one if (baseScore >= derivedScore) { return trans1; } else { return trans2; } } private void outputAnalysis(PrintWriter analysisOutput) { // Output an analysis of all the words List<String> sortedWords = new ArrayList<String>(lex.getWordKeys()); Collections.sort(sortedWords); for (String wordKey : sortedWords) { Word w = lex.getWord(wordKey); if (w.shouldAnalyze()) { analysisOutput.println(wordKey + '\t' + w.analyze()); } } } private void outputIterAnalysis(String iter) { // Output the analysis for the current iteration try { PrintWriter out = new PrintWriter(analysisBase + "_" + iter + ".txt"); outputAnalysis(out); out.close(); } catch (FileNotFoundException e) { System.err.println("Couldn't output iteration analysis."); } } @SuppressWarnings("unused") private void outputTransforms(List<Transform> learnedTransforms) { try { PrintWriter out = new PrintWriter(analysisBase + "_transforms.txt"); for (Transform t : learnedTransforms) { out.println(t.toDumpString()); } out.close(); } catch (FileNotFoundException e) { System.err.println("Couldn't output transform dump."); } } private void outputConflationSets() { try { PrintWriter out = new PrintWriter(analysisBase + "_conflations.txt"); for (Word w : lex.getSetWords(WordSet.BASE)) { out.println(w.toDerivedWordsString()); } out.close(); } catch (FileNotFoundException e) { System.err.println("Couldn't output conflation sets."); } } private ArrayList<Transform> makeTransforms(List<Affix> baseUnmodPrefixes, List<Affix> unmodPrefixes, List<Affix> baseUnmodSuffixes, List<Affix> unmodSuffixes, Set<Transform> badTransforms) { ArrayList<Transform> transforms = new ArrayList<Transform>(); // Prefixes for (Affix affix1 : baseUnmodPrefixes) { for (Affix affix2 : unmodPrefixes) { // Skip if they're the same or if they // do not meet the characteristics of a good transform if (affix1 == affix2 || Affix.isBadAffixPair(affix1, affix2, AffixType.PREFIX)) { continue; } // Otherwise pair them up if the transform isn't one we've // marked as bad already Transform newTrans = new Transform(affix1, affix2); if (!badTransforms.contains(newTrans)) { transforms.add(newTrans); } } } // Suffixes for (Affix affix1 : baseUnmodSuffixes) { for (Affix affix2 : unmodSuffixes) { // Skip if they're the same or if they // do not meet the characteristics of a good transform if (affix1 == affix2 || Affix.isBadAffixPair(affix1, affix2, AffixType.SUFFIX)) { continue; } // Otherwise pair them up if the transform isn't one we've // marked as bad already Transform newTrans = new Transform(affix1, affix2); if (!badTransforms.contains(newTrans)) { transforms.add(newTrans); } } } return transforms; } /** * Sort transforms in-place according to the previously set sorting method. * Depends on the WEIGHTED_TRANSFORMS member for the sorting method. * @param transforms the transforms to be sorted */ public void sortTransforms(List<Transform> transforms) { if (WEIGHTED_TRANSFORMS) { Collections.sort(transforms, Collections.reverseOrder(new WeightedTypeCountComparator())); } else { Collections.sort(transforms, Collections.reverseOrder(new TypeCountComparator())); } } /** * Read in the properties at the specified path to set the parameters for learning. * The mapping between the names of the MorphLearner members and the official parameter * names used in the README is given in the implementation of this method. * @param paramPath the path for the properties file * @return true if the parameter file was successfully read * @throws FileNotFoundException if the properties file cannot be read */ public boolean setParams(String paramPath) throws FileNotFoundException { // Read in the params as properties Properties props = new Properties(); FileInputStream paramFile; try { paramFile = new FileInputStream(paramPath); } catch (FileNotFoundException e) { throw new FileNotFoundException("Cannot open parameter file: " + paramPath); } try { props.load(paramFile); } catch (IOException e) { throw new FileNotFoundException("Problem reading parameter file: " + paramPath); } // Iteration parameters MAX_ITER = Integer.parseInt(props.getProperty("max_iter")); TOP_AFFIXES = Integer.parseInt(props.getProperty("top_affixes")); WINDOW_SIZE = Integer.parseInt(props.getProperty("window_size")); // Word scoring parameters Word.FREQ_THRESHOLD = Double.parseDouble(props.getProperty("frequent_prob_threshold")); Word.COUNT_THRESHOLD = Integer.parseInt(props.getProperty("frequent_type_threshold")); // Transform scoring parameters REEVAL = Boolean.parseBoolean(props.getProperty("reeval")); SCORE_REEVAL = Boolean.parseBoolean(props.getProperty("score_reeval")); DOUBLING = Boolean.parseBoolean(props.getProperty("doubling")); // Transform selection parameters TYPE_THRESHOLD = Integer.parseInt(props.getProperty("type_threshold")); STEM_LENGTH = Integer.parseInt(props.getProperty("overlap_stem_length")); OVERLAP_THRESHOLD = Double.parseDouble(props.getProperty("overlap_threshold")); PRECISION_THRESHOLD = Double.parseDouble(props.getProperty("precision_threshold")); // Preprocessing flags HYPHENATION = Boolean.parseBoolean(props.getProperty("hyphenation")); FINAL_COMPOUNDING = Boolean.parseBoolean(props.getProperty("compounding")); ITER_COMPOUNDING = Boolean.parseBoolean(props.getProperty("iter_compounding")); AGGR_COMPOUNDING = Boolean.parseBoolean(props.getProperty("aggr_compounding")); // Rule inference flags BASE_INFERENCE = Boolean.parseBoolean(props.getProperty("rule_inference_conservative")); // Implementation details TRANSFORM_OPTIMIZATION = Boolean.parseBoolean(props.getProperty("transform_optimization")); ITERATION_ANALYSIS = Boolean.parseBoolean(props.getProperty("iteration_analysis")); WEIGHTED_TRANSFORMS = Boolean.parseBoolean(props.getProperty("weighted_transforms")); WEIGHTED_AFFIXES = Boolean.parseBoolean(props.getProperty("weighted_affixes")); // Compounding aggressiveness controls Compounding.TRANSFORM_RELATIONS = TRANSFORM_RELATIONS = Boolean .parseBoolean(props.getProperty("transform_relations")); ANALYZE_SIMPLEX_WORDS = Boolean.parseBoolean(props.getProperty("allow_unmod_simplex_word_analysis")); // Clean up try { paramFile.close(); } catch (IOException e) { // Ignore it } return true; } /** * Call the learner on the specified arguments. Exit using standard error codes * (sysexits.h) if an error is encountered. * @param args command line arguments */ public static void main(String[] args) { // create the command line parser CommandLineParser parser = new PosixParser(); // Set up command line arguments Options options = new Options(); options.addOption(new Option("h", "help", false, "dislay this help and exit")); Option encodingOption = new Option("e", "encoding", true, "input and output file encoding. Defaults to ISO8859_1."); encodingOption.setArgName("encoding"); options.addOption(encodingOption); options.addOption(new Option("b", "base-inf", false, "output the examples of base inference. This does not change whether base inference" + " is used; it simply outputs the examples that used it.")); options.addOption(new Option("s", "conflation-sets", false, "output the learner's conflation sets")); options.addOption(new Option("c", "compounds", false, "output the learner's analsyis before final compounding is used")); HelpFormatter formatter = new HelpFormatter(); String usage = "MorphLearner [options] wordlist outfile logfile paramfile"; CommandLine line = null; String[] otherArgs = null; try { // Parse the command line arguments line = parser.parse(options, args); otherArgs = line.getArgs(); // Handle help if (line.hasOption("help")) { formatter.printHelp(usage, options); System.exit(0); } if (otherArgs.length != 4) { throw new ParseException("Incorrect number of required arguments specified"); } } catch (ParseException exp) { System.out.println(exp.getMessage()); formatter.printHelp(usage, options); System.exit(64); } // Get options String encoding = line.getOptionValue("encoding", "ISO8859_1"); boolean outputBaseInf = line.hasOption("base-inf"); boolean outputConflation = line.hasOption("conflation-sets"); boolean outputCompounds = line.hasOption("compounds"); // Setup timing long start; float elapsedSeconds; start = System.currentTimeMillis(); // Initialize the learner MorphLearner learner = null; try { learner = new MorphLearner(otherArgs[0], otherArgs[1], otherArgs[2], otherArgs[3], encoding, outputBaseInf, outputConflation, outputCompounds); } catch (FileNotFoundException e) { System.err.println(e.getMessage()); System.exit(66); } catch (UnsupportedEncodingException e) { System.err.println("Unsupported file encoding: " + encoding); System.exit(74); } // Output time stats elapsedSeconds = (System.currentTimeMillis() - start) / 1000F; System.out.println("Init time: " + elapsedSeconds + "s\n"); // Time learning start = System.currentTimeMillis(); learner.learn(); learner.cleanUp(); // Output time stats elapsedSeconds = (System.currentTimeMillis() - start) / 1000F; System.out.println("\nLearning time: " + elapsedSeconds + "s"); } /** * Compare transforms by their weighted type count * */ public static class WeightedTypeCountComparator implements Comparator<Object> { @Override public int compare(Object transform1, Object transform2) { return Long.compare(((Transform) transform1).getWeightedTypeCount(), ((Transform) transform2).getWeightedTypeCount()); } } /** * Compare transforms by their type count * */ public static class TypeCountComparator implements Comparator<Object> { @Override public int compare(Object transform1, Object transform2) { return Long.compare(((Transform) transform1).getTypeCount(), ((Transform) transform2).getTypeCount()); } } /** * Compare transforms by their token count * */ public static class TokenCountComparator implements Comparator<Object> { @Override public int compare(Object transform1, Object transform2) { return Long.compare(((Transform) transform1).getTokenCount(), ((Transform) transform2).getTokenCount()); } } }