List of usage examples for java.util.Scanner.hasNextLine()
public boolean hasNextLine()
From source file:com.joliciel.talismane.TalismaneConfig.java
/** * The rules to apply when running the parser. * @return/*from ww w. j a v a2 s . com*/ */ public List<ParserRule> getParserRules() { try { if (parserRules == null) { parserRules = new ArrayList<ParserRule>(); if (parserRuleFilePath != null && parserRuleFilePath.equalsIgnoreCase("null")) { // add no rules! (not even built-in ones) } else { for (int i = 0; i <= 1; i++) { Scanner rulesScanner = null; if (i == 0) { if (parserRulesReplace) continue; rulesScanner = this.implementation.getDefaultParserRulesScanner(); } else { if (parserRuleFilePath != null && parserRuleFilePath.length() > 0) { File parserRuleFile = new File(parserRuleFilePath); rulesScanner = new Scanner(new BufferedReader(new InputStreamReader( new FileInputStream(parserRuleFile), this.getInputCharset().name()))); } } if (rulesScanner != null) { List<String> ruleDescriptors = new ArrayList<String>(); while (rulesScanner.hasNextLine()) { String ruleDescriptor = rulesScanner.nextLine(); if (ruleDescriptor.length() > 0) { ruleDescriptors.add(ruleDescriptor); LOG.trace(ruleDescriptor); } } List<ParserRule> rules = this.getParserFeatureService().getRules(ruleDescriptors, dynamiseFeatures); parserRules.addAll(rules); } } } } return parserRules; } catch (Exception e) { LogUtils.logError(LOG, e); throw new RuntimeException(e); } }
From source file:com.joliciel.talismane.extensions.Extensions.java
/** * To be called just before running the Talismane command, to * prepare anything specifically required for extensions to function correctly. * @param config/* www . j a va 2s .c om*/ * @param talismane */ public void prepareCommand(TalismaneConfig config, Talismane talismane) { try { if (command == null) return; TalismaneSession talismaneSession = config.getTalismaneService().getTalismaneSession(); switch (command) { case toStandoff: { StandoffWriter standoffWriter = new StandoffWriter(); talismane.setParseConfigurationProcessor(standoffWriter); break; } case toStandoffSentences: { InputStream inputStream = StandoffWriter.class.getResourceAsStream("standoffSentences.ftl"); Reader templateReader = new BufferedReader(new InputStreamReader(inputStream)); FreemarkerTemplateWriter templateWriter = new FreemarkerTemplateWriter(templateReader); talismane.setParseConfigurationProcessor(templateWriter); break; } case fromStandoff: { Scanner scanner = new Scanner(config.getReader()); StandoffReader standoffReader = new StandoffReader(talismaneSession, scanner); standoffReader.setParserService(config.getParserService()); standoffReader.setPosTaggerService(config.getPosTaggerService()); standoffReader.setTokeniserService(config.getTokeniserService()); standoffReader.setTokenFilterService(config.getTokenFilterService()); config.setParserCorpusReader(standoffReader); break; } case corpusStatistics: { CorpusStatistics stats = new CorpusStatistics(talismaneSession); if (referenceStatsPath != null) { File referenceStatsFile = new File(referenceStatsPath); CorpusStatistics referenceStats = CorpusStatistics.loadFromFile(referenceStatsFile); stats.setReferenceWords(referenceStats.getWords()); stats.setReferenceLowercaseWords(referenceStats.getLowerCaseWords()); } File csvFile = new File(config.getOutDir(), config.getBaseName() + "_stats.csv"); csvFile.delete(); csvFile.createNewFile(); Writer csvFileWriter = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(csvFile, 
false), "UTF8")); stats.setWriter(csvFileWriter); File serializationFile = new File(config.getOutDir(), config.getBaseName() + "_stats.zip"); serializationFile.delete(); stats.setSerializationFile(serializationFile); ParserRegexBasedCorpusReader corpusReader = (ParserRegexBasedCorpusReader) config .getParserCorpusReader(); corpusReader.setPredictTransitions(false); talismane.setParseConfigurationProcessor(stats); break; } case posTaggerStatistics: { PosTaggerStatistics stats = new PosTaggerStatistics(talismaneSession); if (referenceStatsPath != null) { File referenceStatsFile = new File(referenceStatsPath); PosTaggerStatistics referenceStats = PosTaggerStatistics.loadFromFile(referenceStatsFile); stats.setReferenceWords(referenceStats.getWords()); stats.setReferenceLowercaseWords(referenceStats.getLowerCaseWords()); } File csvFile = new File(config.getOutDir(), config.getBaseName() + "_stats.csv"); csvFile.delete(); csvFile.createNewFile(); Writer csvFileWriter = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(csvFile, false), "UTF8")); stats.setWriter(csvFileWriter); File serializationFile = new File(config.getOutDir(), config.getBaseName() + "_stats.zip"); serializationFile.delete(); stats.setSerializationFile(serializationFile); talismane.setPosTagSequenceProcessor(stats); break; } case modifyCorpus: { if (corpusRulesPath == null) throw new TalismaneException("corpusRules is required for modifyCorpus command"); List<String> corpusRules = new ArrayList<String>(); File corpusRulesFile = new File(corpusRulesPath); Scanner scanner = new Scanner( new BufferedReader(new InputStreamReader(new FileInputStream(corpusRulesFile), "UTF-8"))); while (scanner.hasNextLine()) { corpusRules.add(scanner.nextLine()); } CorpusModifier corpusModifier = new CorpusModifier(config.getParseConfigurationProcessor(), corpusRules); talismane.setParseConfigurationProcessor(corpusModifier); break; } case projectify: { CorpusProjectifier projectifier = new 
CorpusProjectifier(config.getParseConfigurationProcessor()); talismane.setParseConfigurationProcessor(projectifier); break; } default: { throw new RuntimeException("Unknown command: " + command); } } } catch (IOException e) { LogUtils.logError(LOG, e); throw new RuntimeException(e); } }
From source file:com.joliciel.talismane.TalismaneConfigImpl.java
public void loadParameters(Map<String, String> args) { try {/* w w w . j a va2 s . c om*/ if (args.size() == 0) { System.out.println("Talismane usage instructions: "); System.out.println("* indicates optional, + indicates default value"); System.out.println(""); System.out.println( "Usage: command=analyse *startModule=[sentence+|tokenise|postag|parse] *endModule=[sentence|tokenise|postag|parse+] *inFile=[inFilePath, stdin if missing] *outFile=[outFilePath, stdout if missing] *template=[outputTemplatePath]"); System.out.println(""); System.out.println("Additional optional parameters:"); System.out.println( " *encoding=[UTF-8, ...] *includeDetails=[true|false+] posTaggerRules*=[posTaggerRuleFilePath] textFilters*=[regexFilterFilePath] *sentenceModel=[path] *tokeniserModel=[path] *posTaggerModel=[path] *parserModel=[path] *inputPatternFile=[inputPatternFilePath] *posTagSet=[posTagSetPath]"); return; } String logConfigPath = args.get("logConfigFile"); if (logConfigPath != null) { args.remove("logConfigFile"); Properties props = new Properties(); props.load(new FileInputStream(logConfigPath)); PropertyConfigurator.configure(props); } String performanceConifPath = args.get("performanceConfigFile"); if (performanceConifPath != null) { args.remove("performanceConfigFile"); performanceConfigFile = this.getFile(performanceConifPath); } String encoding = null; String inputEncoding = null; String outputEncoding = null; String builtInTemplate = null; String posTagSetPath = null; String externalResourcePath = null; String transitionSystemStr = null; String languagePackPath = null; for (Entry<String, String> arg : args.entrySet()) { String argName = arg.getKey(); String argValue = arg.getValue(); if (argName.equals("command")) { String commandString = argValue; if (commandString.equals("analyze")) commandString = "analyse"; command = Command.valueOf(commandString); } else if (argName.equals("option")) { option = Option.valueOf(argValue); } else if (argName.equals("mode")) { mode 
= Mode.valueOf(argValue); } else if (argName.equals("module")) { if (argValue.equalsIgnoreCase("sentence") || argValue.equalsIgnoreCase("sentenceDetector")) module = Talismane.Module.SentenceDetector; else if (argValue.equalsIgnoreCase("tokenise") || argValue.equalsIgnoreCase("tokeniser")) module = Talismane.Module.Tokeniser; else if (argValue.equalsIgnoreCase("postag") || argValue.equalsIgnoreCase("posTagger")) module = Talismane.Module.PosTagger; else if (argValue.equalsIgnoreCase("parse") || argValue.equalsIgnoreCase("parser")) module = Talismane.Module.Parser; else if (argValue.equalsIgnoreCase("language") || argValue.equalsIgnoreCase("languageDetector")) module = Talismane.Module.LanguageDetector; else throw new TalismaneException("Unknown module: " + argValue); } else if (argName.equals("startModule")) { if (argValue.equalsIgnoreCase("sentence") || argValue.equalsIgnoreCase("sentenceDetector")) startModule = Talismane.Module.SentenceDetector; else if (argValue.equalsIgnoreCase("tokenise") || argValue.equalsIgnoreCase("tokeniser")) startModule = Talismane.Module.Tokeniser; else if (argValue.equalsIgnoreCase("postag") || argValue.equalsIgnoreCase("posTagger")) startModule = Talismane.Module.PosTagger; else if (argValue.equalsIgnoreCase("parse") || argValue.equalsIgnoreCase("parser")) startModule = Talismane.Module.Parser; else throw new TalismaneException("Unknown startModule: " + argValue); } else if (argName.equals("endModule")) { if (argValue.equalsIgnoreCase("sentence") || argValue.equalsIgnoreCase("sentenceDetector")) endModule = Talismane.Module.SentenceDetector; else if (argValue.equalsIgnoreCase("tokenise") || argValue.equalsIgnoreCase("tokeniser")) endModule = Talismane.Module.Tokeniser; else if (argValue.equalsIgnoreCase("postag") || argValue.equalsIgnoreCase("posTagger")) endModule = Talismane.Module.PosTagger; else if (argValue.equalsIgnoreCase("parse") || argValue.equalsIgnoreCase("parser")) endModule = Talismane.Module.Parser; else throw new 
TalismaneException("Unknown endModule: " + argValue); } else if (argName.equals("inFile")) inFilePath = argValue; else if (argName.equals("inDir")) inDirPath = argValue; else if (argName.equals("outFile")) outFilePath = argValue; else if (argName.equals("outDir")) outDirPath = argValue; else if (argName.equals("template")) templatePath = argValue; else if (argName.equals("builtInTemplate")) builtInTemplate = argValue; else if (argName.equals("encoding")) { if (inputEncoding != null || outputEncoding != null) throw new TalismaneException( "The parameter 'encoding' cannot be used with 'inputEncoding' or 'outputEncoding'"); encoding = argValue; } else if (argName.equals("inputEncoding")) { if (encoding != null) throw new TalismaneException( "The parameter 'encoding' cannot be used with 'inputEncoding' or 'outputEncoding'"); inputEncoding = argValue; } else if (argName.equals("outputEncoding")) { if (encoding != null) throw new TalismaneException( "The parameter 'encoding' cannot be used with 'inputEncoding' or 'outputEncoding'"); outputEncoding = argValue; } else if (argName.equals("includeDetails")) includeDetails = argValue.equalsIgnoreCase("true"); else if (argName.equals("propagateBeam")) propagateBeam = argValue.equalsIgnoreCase("true"); else if (argName.equals("beamWidth")) beamWidth = Integer.parseInt(argValue); else if (argName.equals("languageModel")) languageModelFilePath = argValue; else if (argName.equals("sentenceModel")) sentenceModelFilePath = argValue; else if (argName.equals("tokeniserModel")) tokeniserModelFilePath = argValue; else if (argName.equals("posTaggerModel")) posTaggerModelFilePath = argValue; else if (argName.equals("parserModel")) parserModelFilePath = argValue; else if (argName.equals("inputPatternFile")) inputPatternFilePath = argValue; else if (argName.equals("inputPattern")) inputRegex = argValue; else if (argName.equals("evaluationPatternFile")) evaluationPatternFilePath = argValue; else if (argName.equals("evaluationPattern")) 
evaluationRegex = argValue; else if (argName.equals("posTaggerRules")) { if (argValue.startsWith("replace:")) { posTaggerRulesReplace = true; posTaggerRuleFilePath = argValue.substring("replace:".length()); } else { posTaggerRuleFilePath = argValue; } } else if (argName.equals("parserRules")) { if (argValue.startsWith("replace:")) { parserRulesReplace = true; parserRuleFilePath = argValue.substring("replace:".length()); } else { parserRuleFilePath = argValue; } } else if (argName.equals("posTagSet")) posTagSetPath = argValue; else if (argName.equals("textFilters")) { if (argValue.startsWith("replace:")) { textFiltersReplace = true; textFiltersPath = argValue.substring("replace:".length()); } else { textFiltersPath = argValue; } } else if (argName.equals("tokenFilters")) { if (argValue.startsWith("replace:")) { tokenFiltersReplace = true; tokenFiltersPath = argValue.substring("replace:".length()); } else { tokenFiltersPath = argValue; } } else if (argName.equals("tokenSequenceFilters")) { if (argValue.startsWith("replace:")) { tokenSequenceFiltersReplace = true; tokenSequenceFilterPath = argValue.substring("replace:".length()); } else { tokenSequenceFilterPath = argValue; } } else if (argName.equals("posTagSequenceFilters")) posTagSequenceFilterPath = argValue; else if (argName.equals("logStats")) logStats = argValue.equalsIgnoreCase("true"); else if (argName.equals("newline")) newlineMarker = MarkerFilterType.valueOf(argValue); else if (argName.equals("fileName")) fileName = argValue; else if (argName.equals("processByDefault")) processByDefault = argValue.equalsIgnoreCase("true"); else if (argName.equals("maxParseAnalysisTime")) maxParseAnalysisTime = Integer.parseInt(argValue); else if (argName.equals("minFreeMemory")) minFreeMemory = Integer.parseInt(argValue); else if (argName.equals("transitionSystem")) transitionSystemStr = argValue; else if (argName.equals("sentenceCount")) maxSentenceCount = Integer.parseInt(argValue); else if 
(argName.equals("startSentence")) startSentence = Integer.parseInt(argValue); else if (argName.equals("endBlockCharCode")) endBlockCharacter = (char) Integer.parseInt(argValue); else if (argName.equals("outputGuesses")) outputGuesses = argValue.equalsIgnoreCase("true"); else if (argName.equals("outputGuessCount")) outputGuessCount = Integer.parseInt(argValue); else if (argName.equals("suffix")) suffix = argValue; else if (argName.equals("includeDistanceFScores")) includeDistanceFScores = argValue.equalsIgnoreCase("true"); else if (argName.equals("includeTransitionLog")) includeTransitionLog = argValue.equalsIgnoreCase("true"); else if (argName.equals("evaluationFile")) evaluationFilePath = argValue; else if (argName.equals("labeledEvaluation")) labeledEvaluation = argValue.equalsIgnoreCase("true"); else if (argName.equals("tokeniserBeamWidth")) tokeniserBeamWidth = Integer.parseInt(argValue); else if (argName.equals("posTaggerBeamWidth")) posTaggerBeamWidth = Integer.parseInt(argValue); else if (argName.equals("parserBeamWidth")) parserBeamWidth = Integer.parseInt(argValue); else if (argName.equals("propagateTokeniserBeam")) propagateTokeniserBeam = argValue.equalsIgnoreCase("true"); else if (argName.equals("blockSize")) blockSize = Integer.parseInt(argValue); else if (argName.equals("crossValidationSize")) crossValidationSize = Integer.parseInt(argValue); else if (argName.equals("includeIndex")) includeIndex = Integer.parseInt(argValue); else if (argName.equals("excludeIndex")) excludeIndex = Integer.parseInt(argValue); else if (argName.equals("dynamiseFeatures")) dynamiseFeatures = argValue.equalsIgnoreCase("true"); else if (argName.equals("predictTransitions")) predictTransitions = argValue.equalsIgnoreCase("true"); else if (argName.equals("lexicon")) { if (argValue.startsWith("replace:")) { replaceLexicon = true; lexiconPath = argValue.substring("replace:".length()); } else { lexiconPath = argValue; } } else if (argName.equals("perceptronScoring")) { 
PerceptronScoring perceptronScoring = PerceptronScoring.valueOf(argValue); MachineLearningSession.setPerceptronScoring(perceptronScoring); } else if (argName.equals("parseComparisonStrategy")) { parseComparisonStrategyType = ParseComparisonStrategyType.valueOf(argValue); } else if (argName.equals("sentenceReader")) { sentenceReaderPath = argValue; } else if (argName.equals("skipLabel")) { skipLabel = argValue; } else if (argName.equals("errorLabels")) { errorLabels = new HashSet<String>(); String[] labels = argValue.split(","); for (String label : labels) { errorLabels.add(label); } } else if (argName.equals("earlyStop")) { earlyStop = argValue.equalsIgnoreCase("true"); } else if (argName.equals("languageFeatures")) { languageFeaturePath = argValue; } else if (argName.equals("sentenceFeatures")) { sentenceFeaturePath = argValue; } else if (argName.equals("tokeniserFeatures")) { tokeniserFeaturePath = argValue; } else if (argName.equals("tokeniserPatterns")) { tokeniserPatternFilePath = argValue; } else if (argName.equals("posTaggerFeatures")) { posTaggerFeaturePath = argValue; } else if (argName.equals("parserFeatures")) { parserFeaturePath = argValue; } else if (argName.equals("externalResources")) { externalResourcePath = argValue; } else if (argName.equals("testWords")) { String[] parts = argValue.split(";"); testWords = new HashSet<String>(); for (String part : parts) testWords.add(part); } else if (argName.equals("includeLexiconCoverage")) { includeLexiconCoverage = argValue.equalsIgnoreCase("true"); } else if (argName.equals("includeUnknownWordResults")) { includeUnknownWordResults = argValue.equalsIgnoreCase("true"); } else if (argName.equals("iterations")) iterations = Integer.parseInt(argValue); else if (argName.equals("cutoff")) cutoff = Integer.parseInt(argValue); else if (argName.equals("dependencyLabels")) dependencyLabelPath = argValue; else if (argName.equals("parsingConstrainer")) parsingConstrainerPath = argValue; else if 
(argName.equals("algorithm")) algorithm = MachineLearningAlgorithm.valueOf(argValue); else if (argName.equals("linearSVMSolver")) solverType = LinearSVMSolverType.valueOf(argValue); else if (argName.equals("linearSVMCost")) constraintViolationCost = Double.parseDouble(argValue); else if (argName.equals("linearSVMEpsilon")) epsilon = Double.parseDouble(argValue); else if (argName.equals("perceptronTolerance")) perceptronTolerance = Double.parseDouble(argValue); else if (argName.equals("averageAtIntervals")) averageAtIntervals = argValue.equalsIgnoreCase("true"); else if (argName.equals("perceptronObservationPoints")) { String[] points = argValue.split(","); perceptronObservationPoints = new ArrayListNoNulls<Integer>(); for (String point : points) perceptronObservationPoints.add(Integer.parseInt(point)); } else if (argName.equals("tokeniserType")) { tokeniserType = TokeniserType.valueOf(argValue); } else if (argName.equals("patternTokeniser")) patternTokeniserType = PatternTokeniserType.valueOf(argValue); else if (argName.equals("excludeFile")) { excludeFileName = argValue; } else if (argName.equals("port")) { port = Integer.parseInt(argValue); } else if (argName.equals("preloadLexicon")) { preloadLexicon = argValue.equalsIgnoreCase("true"); } else if (argName.equals("locale")) { locale = Locale.forLanguageTag(argValue); } else if (argName.equals("languageCorpusMap")) { languageCorpusMapPath = argValue; } else if (argName.equals("corpusLexicalEntryRegex")) { corpusLexicalEntryRegexPath = argValue; } else if (argName.equals("languagePack")) { languagePackPath = argValue; } else { System.out.println("Unknown argument: " + argName); throw new RuntimeException("Unknown argument: " + argName); } } if (command == null) throw new TalismaneException("No command provided."); if (!(implementation instanceof LanguagePackImplementation) && languagePackPath != null) throw new TalismaneException("The implementation " + implementation.getClass().getSimpleName() + " does not accept 
language packs"); if (implementation instanceof LanguagePackImplementation) { if (languagePackPath != null) { File languagePackFile = this.getFile(languagePackPath); if (!languagePackFile.exists()) throw new TalismaneException( "languagePack: could not find file: " + languagePackFile.getPath()); LOG.debug("Setting language pack to " + languagePackFile.getPath()); ((LanguagePackImplementation) implementation).setLanguagePack(languagePackFile); } } if (command.equals(Command.evaluate)) { if (outDirPath.length() == 0) throw new TalismaneException("Missing argument: outdir"); } if (startModule == null) startModule = module; if (startModule == null) startModule = Module.SentenceDetector; if (endModule == null) endModule = module; if (endModule == null) endModule = Module.Parser; if (module == null) module = endModule; if (command == Command.train) { if (module == Module.LanguageDetector) { if (languageModelFilePath == null) throw new TalismaneException( "languageModel is required when training a language detector model"); if (languageCorpusMapPath == null) throw new TalismaneException( "languageCorpusMap is required when training a language detector model"); if (languageFeaturePath == null) throw new TalismaneException( "languageFeatures is required when training a language detector model"); } else if (module == Module.SentenceDetector) { if (sentenceModelFilePath == null) throw new TalismaneException( "sentenceModel is required when training a sentence detector model"); if (sentenceFeaturePath == null) throw new TalismaneException( "sentenceFeatures is required when training a sentence detector model"); } else if (module == Module.Tokeniser) { if (tokeniserModelFilePath == null) throw new TalismaneException("tokeniserModel is required when training a tokeniser model"); if (tokeniserFeaturePath == null) throw new TalismaneException( "tokeniserFeatures is required when training a tokeniser model"); } else if (module == Module.PosTagger) { if (posTaggerModelFilePath == 
null) throw new TalismaneException("posTaggerModel is required when training a posTagger model"); if (posTaggerFeaturePath == null) throw new TalismaneException( "posTaggerFeatures is required when training a posTagger model"); } else if (module == Module.Parser) { this.predictTransitions = true; if (parserModelFilePath == null) throw new TalismaneException("parserModel is required when training a parser model"); if (parserFeaturePath == null) throw new TalismaneException("parserFeatures is required when training a parser model"); } } if (builtInTemplate != null) { if (builtInTemplate.equalsIgnoreCase("with_location")) { tokeniserTemplateName = "tokeniser_template_with_location.ftl"; posTaggerTemplateName = "posTagger_template_with_location.ftl"; parserTemplateName = "parser_conll_template_with_location.ftl"; } else if (builtInTemplate.equalsIgnoreCase("with_prob")) { tokeniserTemplateName = "tokeniser_template_with_prob.ftl"; posTaggerTemplateName = "posTagger_template_with_prob.ftl"; parserTemplateName = "parser_conll_template_with_prob.ftl"; } else if (builtInTemplate.equalsIgnoreCase("with_comments")) { posTaggerTemplateName = "posTagger_template_with_comments.ftl"; parserTemplateName = "parser_conll_template_with_comments.ftl"; } else { throw new TalismaneException("Unknown builtInTemplate: " + builtInTemplate); } } if (posTaggerBeamWidth < 0) posTaggerBeamWidth = beamWidth; if (parserBeamWidth < 0) parserBeamWidth = beamWidth; inputCharset = Charset.defaultCharset(); outputCharset = Charset.defaultCharset(); if (encoding != null) { inputCharset = Charset.forName(encoding); outputCharset = Charset.forName(encoding); } else { if (inputEncoding != null) inputCharset = Charset.forName(inputEncoding); if (outputEncoding != null) outputCharset = Charset.forName(outputEncoding); } if (fileName == null && inFilePath != null) { fileName = inFilePath; } if (posTagSetPath != null) { File posTagSetFile = this.getFile(posTagSetPath); Scanner posTagSetScanner = new 
Scanner(new BufferedReader( new InputStreamReader(new FileInputStream(posTagSetFile), this.getInputCharset().name()))); PosTagSet posTagSet = this.getPosTaggerService().getPosTagSet(posTagSetScanner); talismaneSession.setPosTagSet(posTagSet); } if (transitionSystemStr != null) { TransitionSystem transitionSystem = null; if (transitionSystemStr.equalsIgnoreCase("ShiftReduce")) { transitionSystem = this.getParserService().getShiftReduceTransitionSystem(); } else if (transitionSystemStr.equalsIgnoreCase("ArcEager")) { transitionSystem = this.getParserService().getArcEagerTransitionSystem(); } else { throw new TalismaneException("Unknown transition system: " + transitionSystemStr); } if (dependencyLabelPath != null) { File dependencyLabelFile = this.getFile(dependencyLabelPath); Scanner depLabelScanner = new Scanner(new BufferedReader( new InputStreamReader(new FileInputStream(dependencyLabelFile), "UTF-8"))); List<String> dependencyLabels = new ArrayListNoNulls<String>(); while (depLabelScanner.hasNextLine()) { String dependencyLabel = depLabelScanner.nextLine(); if (!dependencyLabel.startsWith("#")) dependencyLabels.add(dependencyLabel); } transitionSystem.setDependencyLabels(dependencyLabels); } talismaneSession.setTransitionSystem(transitionSystem); } if (this.lexiconPath != null) { File lexiconFile = this.getFile(lexiconPath); if (!lexiconFile.exists()) throw new TalismaneException("lexicon: File " + lexiconPath + " does not exist"); LexiconDeserializer lexiconDeserializer = new LexiconDeserializer(talismaneSession); List<PosTaggerLexicon> lexicons = lexiconDeserializer.deserializeLexicons(lexiconFile); for (PosTaggerLexicon oneLexicon : lexicons) { talismaneSession.addLexicon(oneLexicon); } if (!replaceLexicon) { List<PosTaggerLexicon> defaultLexicons = this.implementation.getDefaultLexicons(); if (defaultLexicons != null) { for (PosTaggerLexicon oneLexicon : defaultLexicons) { talismaneSession.addLexicon(oneLexicon); } } } } if (externalResourcePath != null) { 
externalResourceFinder = this.getMachineLearningService().getExternalResourceFinder(); List<String> paths = new ArrayListNoNulls<String>(); if (externalResourcePath != null && externalResourcePath.length() > 0) { LOG.info("externalResourcePath: " + externalResourcePath); String[] parts = externalResourcePath.split(";"); for (String part : parts) paths.add(part); } for (String path : paths) { LOG.info("Reading external resources from " + path); if (path.length() > 0) { File externalResourceFile = this.getFile(path); externalResourceFinder.addExternalResources(externalResourceFile); } } ExternalResourceFinder parserResourceFinder = this.getParserFeatureService() .getExternalResourceFinder(); ExternalResourceFinder posTaggerResourceFinder = this.getPosTaggerFeatureService() .getExternalResourceFinder(); ExternalResourceFinder tokeniserResourceFinder = this.getTokenFeatureService() .getExternalResourceFinder(); ExternalResourceFinder sentenceResourceFinder = this.getSentenceDetectorFeatureService() .getExternalResourceFinder(); for (ExternalResource<?> externalResource : externalResourceFinder.getExternalResources()) { parserResourceFinder.addExternalResource(externalResource); posTaggerResourceFinder.addExternalResource(externalResource); tokeniserResourceFinder.addExternalResource(externalResource); sentenceResourceFinder.addExternalResource(externalResource); } ExternalResourceFinder tokenFilterResourceFinder = this.getTokenFilterService() .getExternalResourceFinder(); for (ExternalWordList externalWordList : externalResourceFinder.getExternalWordLists()) { tokenFilterResourceFinder.addExternalWordList(externalWordList); } } } catch (IOException e) { LogUtils.logError(LOG, e); throw new RuntimeException(e); } }
From source file:com.joliciel.talismane.TalismaneConfigImpl.java
/** * Text marker filters are applied to raw text segments extracted from the stream, 3 segments at a time. * This means that if a particular marker crosses segment borders, it is handled correctly. * @return//from w w w .j a v a 2 s . co m */ @Override public List<TextMarkerFilter> getTextMarkerFilters() { try { if (textMarkerFilters == null) { textMarkerFilters = new ArrayListNoNulls<TextMarkerFilter>(); // insert sentence breaks at end of block this.addTextMarkerFilter(this.getFilterService().getRegexMarkerFilter( new MarkerFilterType[] { MarkerFilterType.SENTENCE_BREAK }, "" + endBlockCharacter, blockSize)); // handle newline as requested if (newlineMarker.equals(MarkerFilterType.SENTENCE_BREAK)) this.addTextMarkerFilter(this.getFilterService().getNewlineEndOfSentenceMarker()); else if (newlineMarker.equals(MarkerFilterType.SPACE)) this.addTextMarkerFilter(this.getFilterService().getNewlineSpaceMarker()); // get rid of duplicate white-space always this.addTextMarkerFilter(this.getFilterService().getDuplicateWhiteSpaceFilter()); List<String> paths = new ArrayListNoNulls<String>(); if (textFiltersPath != null && textFiltersPath.length() > 0) { LOG.debug("textFiltersPath: " + textFiltersPath); String[] parts = textFiltersPath.split(";"); for (String part : parts) paths.add(part); } if (!textFiltersReplace) { // default text filter path paths.add(""); } for (String path : paths) { LOG.debug("Text marker filters"); Scanner textFilterScanner = null; if (path.length() > 0) { LOG.debug("From: " + path); File textFilterFile = this.getFile(path); if (!textFilterFile.exists()) { throw new TalismaneException("textFilters: File " + path + " does not exist"); } textFilterScanner = new Scanner(new BufferedReader(new InputStreamReader( new FileInputStream(textFilterFile), this.getInputCharset().name()))); } else { LOG.debug("From default"); textFilterScanner = this.implementation.getDefaultTextMarkerFiltersScanner(); } if (textFilterScanner != null) { while 
(textFilterScanner.hasNextLine()) { String descriptor = textFilterScanner.nextLine(); LOG.debug(descriptor); if (descriptor.length() > 0 && !descriptor.startsWith("#")) { TextMarkerFilter textMarkerFilter = this.getFilterService() .getTextMarkerFilter(descriptor, blockSize); this.addTextMarkerFilter(textMarkerFilter); } } } } } return textMarkerFilters; } catch (Exception e) { LogUtils.logError(LOG, e); throw new RuntimeException(e); } }
From source file:au.org.emii.portal.composer.MapComposer.java
public void loadUserSession(String sessionid) { Scanner scanner = null; try {// ww w .j a va2 s.co m String sfld = getSettingsSupplementary().getProperty(StringConstants.ANALYSIS_OUTPUT_DIR) + "session/" + sessionid; File sessfolder = new File(sfld); if (!sessfolder.exists()) { showMessage("Session information does not exist. Please provide a valid session id"); return; } scanner = new Scanner(new File(sfld + "/details.txt")); // first grab the zoom level and bounding box String[] mapdetails = scanner.nextLine().split(","); BoundingBox bb = new BoundingBox(); bb.setMinLongitude(Float.parseFloat(mapdetails[1])); bb.setMinLatitude(Float.parseFloat(mapdetails[2])); bb.setMaxLongitude(Float.parseFloat(mapdetails[3])); bb.setMaxLatitude(Float.parseFloat(mapdetails[4])); openLayersJavascript.setAdditionalScript(openLayersJavascript.zoomToBoundingBox(bb, true)); String[] scatterplotNames = null; while (scanner.hasNextLine()) { String line = scanner.nextLine(); if (line.startsWith("scatterplotNames")) { scatterplotNames = line.substring(17).split("___"); } } ArrayUtils.reverse(scatterplotNames); // ignore fields not found XStream xstream = new XStream(new DomDriver()) { protected MapperWrapper wrapMapper(MapperWrapper next) { return new MapperWrapper(next) { public boolean shouldSerializeMember(Class definedIn, String fieldName) { if (definedIn == Object.class || !super.shouldSerializeMember(definedIn, fieldName)) System.out.println("faled to read: " + definedIn + ", " + fieldName); return definedIn != Object.class ? 
super.shouldSerializeMember(definedIn, fieldName) : false; } }; } @Override public Object unmarshal(HierarchicalStreamReader reader) { Object o = super.unmarshal(reader); if (o instanceof BiocacheQuery) ((BiocacheQuery) o).getFullQ(false); return o; } @Override public Object unmarshal(HierarchicalStreamReader reader, Object root) { Object o = super.unmarshal(reader, root); if (o instanceof BiocacheQuery) ((BiocacheQuery) o).getFullQ(false); return o; } @Override public Object unmarshal(HierarchicalStreamReader reader, Object root, DataHolder dataHolder) { Object o = super.unmarshal(reader, root, dataHolder); if (o instanceof BiocacheQuery) ((BiocacheQuery) o).getFullQ(false); return o; } }; PersistenceStrategy strategy = new FilePersistenceStrategy(new File(sfld), xstream); List list = new XmlArrayList(strategy); ListIterator it = list.listIterator(list.size()); int scatterplotIndex = 0; while (it.hasPrevious()) { Object o = it.previous(); MapLayer ml = null; if (o instanceof MapLayer) { ml = (MapLayer) o; LOGGER.debug("Loading " + ml.getName() + " -> " + ml.isDisplayed()); addUserDefinedLayerToMenu(ml, false); } else if (o instanceof ScatterplotDataDTO) { ScatterplotDataDTO spdata = (ScatterplotDataDTO) o; loadScatterplot(spdata, "My Scatterplot " + scatterplotIndex++); } if (ml != null) { addUserDefinedLayerToMenu(ml, true); } } } catch (Exception e) { try { File f = new File("/data/sessions/" + sessionid + ".txt"); PrintWriter pw = new PrintWriter(f); e.printStackTrace(pw); pw.close(); } catch (Exception ex) { } LOGGER.error("Unable to load session data", e); showMessage("Unable to load session data"); } finally { if (scanner != null) { scanner.close(); } try { File f = new File("/data/sessions/ok/" + sessionid + ".txt"); FileUtils.writeStringToFile(f, "ok"); } catch (Exception ex) { } } }
From source file:com.joliciel.talismane.TalismaneConfigImpl.java
/**
 * Returns the pos-tag sequence filters to apply after pos-tagging, loading
 * them on first use.
 *
 * <p>Descriptors are read from the file at {@code posTagSequenceFilterPath}
 * when that path is set; otherwise from the descriptors stored in the model
 * under {@code POSTAG_POSTPROCESSING_FILTER_DESCRIPTOR_KEY}, if any. Every
 * line read is recorded in this configuration's descriptors; lines that are
 * empty or start with {@code #} do not produce a filter.
 *
 * @param model the model whose stored descriptors are used when no filter
 *              file is configured; may be {@code null}
 * @return the cached list of filters
 * @throws RuntimeException wrapping any failure while loading
 */
private List<PosTagSequenceFilter> getPosTagSequenceFilters(MachineLearningModel model) {
    try {
        if (posTaggerPostProcessingFilters == null) {
            List<String> filterDescriptors = new ArrayListNoNulls<String>();
            // Built locally and published only after a complete load, so a failure
            // cannot leave a partially filled list cached for later calls.
            List<PosTagSequenceFilter> loadedFilters = new ArrayListNoNulls<PosTagSequenceFilter>();
            List<Scanner> scanners = new ArrayListNoNulls<Scanner>();

            try {
                if (posTagSequenceFilterPath != null) {
                    File filterFile = this.getFile(posTagSequenceFilterPath);
                    scanners.add(new Scanner(new BufferedReader(
                            new InputStreamReader(new FileInputStream(filterFile), this.getInputCharset()))));
                } else if (model != null) {
                    List<String> modelDescriptors = model.getDescriptors()
                            .get(PosTagFilterService.POSTAG_POSTPROCESSING_FILTER_DESCRIPTOR_KEY);
                    if (modelDescriptors != null) {
                        StringBuilder modelDescriptorText = new StringBuilder();
                        for (String descriptor : modelDescriptors) {
                            modelDescriptorText.append(descriptor).append("\n");
                        }
                        scanners.add(new Scanner(modelDescriptorText.toString()));
                    }
                }

                for (Scanner scanner : scanners) {
                    while (scanner.hasNextLine()) {
                        String descriptor = scanner.nextLine();
                        LOG.debug(descriptor);
                        filterDescriptors.add(descriptor);
                        if (descriptor.length() > 0 && !descriptor.startsWith("#")) {
                            PosTagSequenceFilter filter = this.getPosTagFilterService()
                                    .getPosTagSequenceFilter(descriptor);
                            loadedFilters.add(filter);
                        }
                    }
                }
            } finally {
                // Release the underlying file handle even when loading fails.
                for (Scanner scanner : scanners) {
                    if (scanner != null) {
                        scanner.close();
                    }
                }
            }

            this.getDescriptors().put(PosTagFilterService.POSTAG_POSTPROCESSING_FILTER_DESCRIPTOR_KEY,
                    filterDescriptors);
            posTaggerPostProcessingFilters = loadedFilters;
        }
        return posTaggerPostProcessingFilters;
    } catch (Exception e) {
        LogUtils.logError(LOG, e);
        throw new RuntimeException(e);
    }
}
From source file:com.joliciel.talismane.TalismaneConfigImpl.java
/**
 * The rules to apply when running the pos-tagger.
 *
 * <p>Rules are loaded on first use from up to two sources, in this order:
 * the implementation's default rules (skipped when
 * {@code posTaggerRulesReplace} is set) and the file at
 * {@code posTaggerRuleFilePath} (when that path is non-empty). Empty lines
 * are ignored.
 *
 * @return the cached list of pos-tagger rules
 * @throws RuntimeException wrapping any failure while loading, including a
 *         {@code TalismaneException} when the configured file does not exist
 */
@Override
public List<PosTaggerRule> getPosTaggerRules() {
    try {
        if (posTaggerRules == null) {
            // Built locally and published only after a complete load, so a failure
            // cannot leave a partially filled list cached for later calls.
            List<PosTaggerRule> loadedRules = new ArrayListNoNulls<PosTaggerRule>();

            // Pass 0: built-in default rules. Pass 1: user-supplied rule file.
            for (int i = 0; i <= 1; i++) {
                Scanner rulesScanner = null;
                if (i == 0) {
                    if (!posTaggerRulesReplace) {
                        rulesScanner = this.implementation.getDefaultPosTaggerRulesScanner();
                    }
                } else if (posTaggerRuleFilePath != null && posTaggerRuleFilePath.length() > 0) {
                    File posTaggerRuleFile = this.getFile(posTaggerRuleFilePath);
                    if (!posTaggerRuleFile.exists()) {
                        throw new TalismaneException(
                                "posTaggerRules: File " + posTaggerRuleFilePath + " does not exist");
                    }
                    rulesScanner = new Scanner(new BufferedReader(new InputStreamReader(
                            new FileInputStream(posTaggerRuleFile), this.getInputCharset().name())));
                }

                if (rulesScanner == null) {
                    continue;
                }

                List<String> ruleDescriptors = new ArrayListNoNulls<String>();
                try {
                    while (rulesScanner.hasNextLine()) {
                        String ruleDescriptor = rulesScanner.nextLine();
                        if (ruleDescriptor.length() > 0) {
                            ruleDescriptors.add(ruleDescriptor);
                            LOG.trace(ruleDescriptor);
                        }
                    }
                } finally {
                    // Release the underlying stream once this source has been read.
                    rulesScanner.close();
                }

                List<PosTaggerRule> rules = this.getPosTaggerFeatureService().getRules(ruleDescriptors);
                loadedRules.addAll(rules);
            }

            posTaggerRules = loadedRules;
        }
        return posTaggerRules;
    } catch (Exception e) {
        LogUtils.logError(LOG, e);
        throw new RuntimeException(e);
    }
}
From source file:com.joliciel.talismane.TalismaneConfigImpl.java
/** * The rules to apply when running the parser. * @return/*from w w w . java 2s.co m*/ */ @Override public List<ParserRule> getParserRules() { try { if (parserRules == null) { parserRules = new ArrayListNoNulls<ParserRule>(); if (parserRuleFilePath != null && parserRuleFilePath.equalsIgnoreCase("null")) { // add no rules! (not even built-in ones) } else { for (int i = 0; i <= 1; i++) { Scanner rulesScanner = null; if (i == 0) { if (parserRulesReplace) continue; rulesScanner = this.implementation.getDefaultParserRulesScanner(); } else { if (parserRuleFilePath != null && parserRuleFilePath.length() > 0) { File parserRuleFile = this.getFile(parserRuleFilePath); if (!parserRuleFile.exists()) { throw new TalismaneException( "parserRules: File " + parserRuleFilePath + " does not exist"); } rulesScanner = new Scanner(new BufferedReader(new InputStreamReader( new FileInputStream(parserRuleFile), this.getInputCharset().name()))); } } if (rulesScanner != null) { List<String> ruleDescriptors = new ArrayListNoNulls<String>(); while (rulesScanner.hasNextLine()) { String ruleDescriptor = rulesScanner.nextLine(); if (ruleDescriptor.length() > 0) { ruleDescriptors.add(ruleDescriptor); LOG.trace(ruleDescriptor); } } List<ParserRule> rules = this.getParserFeatureService().getRules(ruleDescriptors, dynamiseFeatures); parserRules.addAll(rules); } } } } return parserRules; } catch (Exception e) { LogUtils.logError(LOG, e); throw new RuntimeException(e); } }
From source file:com.joliciel.talismane.TalismaneConfigImpl.java
/** * TokenFilters to be applied during analysis. * @return//from ww w . j a va2 s . com */ private List<TokenSequenceFilter> getTokenSequenceFilters(MachineLearningModel model) { try { if (tokenSequenceFilters == null) { List<String> tokenSequenceFilterDescriptors = new ArrayListNoNulls<String>(); tokenSequenceFilters = new ArrayListNoNulls<TokenSequenceFilter>(); LOG.debug("Token sequence filters"); List<Scanner> scanners = new ArrayListNoNulls<Scanner>(); if (tokenSequenceFilterPath != null && tokenSequenceFilterPath.length() > 0) { LOG.debug("tokenSequenceFilterPath: " + tokenSequenceFilterPath); String[] parts = tokenSequenceFilterPath.split(";"); for (String part : parts) { if (part.length() > 0) { LOG.debug("From: " + part); File tokenSequenceFilterFile = this.getFile(part); if (!tokenSequenceFilterFile.exists()) { throw new TalismaneException( "tokenSequenceFilters: File " + part + " does not exist"); } Scanner scanner = new Scanner(new BufferedReader(new InputStreamReader( new FileInputStream(tokenSequenceFilterFile), this.getInputCharset()))); scanners.add(scanner); } } } if (!tokenSequenceFiltersReplace) { if (model != null) { LOG.debug("From model"); List<String> modelDescriptors = model.getDescriptors() .get(PosTagFilterService.POSTAG_PREPROCESSING_FILTER_DESCRIPTOR_KEY); String modelDescriptorString = ""; if (modelDescriptors != null) { for (String descriptor : modelDescriptors) { modelDescriptorString += descriptor + "\n"; } } Scanner scanner = new Scanner(modelDescriptorString); scanners.add(scanner); } else { // default token filters LOG.debug("From default"); Scanner scanner = this.implementation.getDefaultTokenSequenceFiltersScanner(); scanners.add(scanner); } } for (Scanner scanner : scanners) { while (scanner.hasNextLine()) { String descriptor = scanner.nextLine(); LOG.debug(descriptor); tokenSequenceFilterDescriptors.add(descriptor); if (descriptor.length() > 0 && !descriptor.startsWith("#")) { TokenSequenceFilter tokenSequenceFilter = 
this.getTokenFilterService() .getTokenSequenceFilter(descriptor); if (tokenSequenceFilter instanceof NeedsTalismaneSession) ((NeedsTalismaneSession) tokenSequenceFilter).setTalismaneSession(talismaneSession); tokenSequenceFilters.add(tokenSequenceFilter); } } } this.getDescriptors().put(PosTagFilterService.POSTAG_PREPROCESSING_FILTER_DESCRIPTOR_KEY, tokenSequenceFilterDescriptors); } return tokenSequenceFilters; } catch (Exception e) { LogUtils.logError(LOG, e); throw new RuntimeException(e); } }