Java tutorial
/////////////////////////////////////////////////////////////////////////////// //Copyright (C) 2014 Joliciel Informatique // //This file is part of Talismane. // //Talismane is free software: you can redistribute it and/or modify //it under the terms of the GNU Affero General Public License as published by //the Free Software Foundation, either version 3 of the License, or //(at your option) any later version. // //Talismane is distributed in the hope that it will be useful, //but WITHOUT ANY WARRANTY; without even the implied warranty of //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //GNU Affero General Public License for more details. // //You should have received a copy of the GNU Affero General Public License //along with Talismane. If not, see <http://www.gnu.org/licenses/>. ////////////////////////////////////////////////////////////////////////////// package com.joliciel.talismane.terminology; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.charset.Charset; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Properties; import java.util.Scanner; import java.util.Map.Entry; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.log4j.PropertyConfigurator; import com.joliciel.talismane.Talismane; import com.joliciel.talismane.TalismaneConfig; import com.joliciel.talismane.TalismaneException; import com.joliciel.talismane.TalismaneService; import com.joliciel.talismane.TalismaneServiceLocator; import com.joliciel.talismane.TalismaneSession; import com.joliciel.talismane.terminology.TermExtractor.TerminologyProperty; import com.joliciel.talismane.utils.StringUtils; public class TalismaneTermExtractorMain { private static final Log LOG = LogFactory.getLog(TalismaneTermExtractorMain.class); private enum Command { analyse, extract, list } public static void main(String[] args) throws Exception { String termFilePath = null; String outFilePath = null; Command command = Command.extract; int depth = -1; String databasePropertiesPath = null; String projectCode = null; String terminologyPropertiesPath = null; Map<String, String> argMap = StringUtils.convertArgs(args); String logConfigPath = argMap.get("logConfigFile"); if (logConfigPath != null) { argMap.remove("logConfigFile"); Properties props = new Properties(); props.load(new FileInputStream(logConfigPath)); PropertyConfigurator.configure(props); } Map<String, String> innerArgs = new HashMap<String, String>(); for (Entry<String, String> argEntry : argMap.entrySet()) { String argName = argEntry.getKey(); String argValue = argEntry.getValue(); if (argName.equals("command")) command = Command.valueOf(argValue); else if (argName.equals("termFile")) termFilePath = argValue; else if (argName.equals("outFile")) outFilePath = argValue; else if (argName.equals("depth")) depth = Integer.parseInt(argValue); else if (argName.equals("databaseProperties")) databasePropertiesPath = argValue; else if (argName.equals("terminologyProperties")) terminologyPropertiesPath = argValue; else if (argName.equals("projectCode")) projectCode = argValue; else innerArgs.put(argName, argValue); } if (termFilePath == null && databasePropertiesPath == null) throw new TalismaneException("Required argument: termFile or databasePropertiesPath"); if (termFilePath != null) { String currentDirPath = System.getProperty("user.dir"); File termFileDir = new File(currentDirPath); if (termFilePath.lastIndexOf("/") >= 0) { String termFileDirPath = termFilePath.substring(0, termFilePath.lastIndexOf("/")); termFileDir = new File(termFileDirPath); termFileDir.mkdirs(); } } long startTime = new Date().getTime(); try { if (command.equals(Command.analyse)) { innerArgs.put("command", "analyse"); } else { innerArgs.put("command", "process"); } String sessionId = ""; TalismaneServiceLocator locator = TalismaneServiceLocator.getInstance(sessionId); TalismaneService talismaneService = locator.getTalismaneService(); TalismaneConfig config = talismaneService.getTalismaneConfig(innerArgs, sessionId); TerminologyServiceLocator terminologyServiceLocator = TerminologyServiceLocator.getInstance(locator); TerminologyService terminologyService = terminologyServiceLocator.getTerminologyService(); TerminologyBase terminologyBase = null; if (projectCode == null) throw new TalismaneException("Required argument: projectCode"); File file = new File(databasePropertiesPath); FileInputStream fis = new FileInputStream(file); Properties dataSourceProperties = new Properties(); dataSourceProperties.load(fis); terminologyBase = terminologyService.getPostGresTerminologyBase(projectCode, dataSourceProperties); TalismaneSession talismaneSession = talismaneService.getTalismaneSession(); if (command.equals(Command.analyse) || command.equals(Command.extract)) { Locale locale = talismaneSession.getLocale(); Map<TerminologyProperty, String> terminologyProperties = new HashMap<TerminologyProperty, String>(); if (terminologyPropertiesPath != null) { Map<String, String> terminologyPropertiesStr = StringUtils.getArgMap(terminologyPropertiesPath); for (String key : terminologyPropertiesStr.keySet()) { try { TerminologyProperty property = TerminologyProperty.valueOf(key); terminologyProperties.put(property, terminologyPropertiesStr.get(key)); } catch (IllegalArgumentException e) { throw new TalismaneException("Unknown terminology property: " + key); } } } else { terminologyProperties = getDefaultTerminologyProperties(locale); } if (depth <= 0 && !terminologyProperties.containsKey(TerminologyProperty.maxDepth)) throw new TalismaneException("Required argument: depth"); InputStream regexInputStream = getInputStreamFromResource( "parser_conll_with_location_input_regex.txt"); Scanner regexScanner = new Scanner(regexInputStream, "UTF-8"); String inputRegex = regexScanner.nextLine(); regexScanner.close(); config.setInputRegex(inputRegex); Charset outputCharset = config.getOutputCharset(); TermExtractor termExtractor = terminologyService.getTermExtractor(terminologyBase, terminologyProperties); if (depth > 0) termExtractor.setMaxDepth(depth); termExtractor.setOutFilePath(termFilePath); if (outFilePath != null) { if (outFilePath.lastIndexOf("/") >= 0) { String outFileDirPath = outFilePath.substring(0, outFilePath.lastIndexOf("/")); File outFileDir = new File(outFileDirPath); outFileDir.mkdirs(); } File outFile = new File(outFilePath); outFile.delete(); outFile.createNewFile(); Writer writer = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(outFilePath), outputCharset)); TermAnalysisWriter termAnalysisWriter = new TermAnalysisWriter(writer); termExtractor.addTermObserver(termAnalysisWriter); } Talismane talismane = config.getTalismane(); talismane.setParseConfigurationProcessor(termExtractor); talismane.process(); } else if (command.equals(Command.list)) { List<Term> terms = terminologyBase.findTerms(2, null, 0, null, null); for (Term term : terms) { LOG.debug("Term: " + term.getText()); LOG.debug("Frequency: " + term.getFrequency()); LOG.debug("Heads: " + term.getHeads()); LOG.debug("Expansions: " + term.getExpansions()); LOG.debug("Contexts: " + term.getContexts()); } } } finally { long endTime = new Date().getTime(); long totalTime = endTime - startTime; LOG.info("Total time: " + totalTime); } } public static Map<TerminologyProperty, String> getDefaultTerminologyProperties(Locale locale) { Map<TerminologyProperty, String> terminologyProperties = new HashMap<TerminologyProperty, String>(); if (locale.getLanguage().equals("fr")) { terminologyProperties.put(TerminologyProperty.adjectivalTags, "ADJ,VPP"); terminologyProperties.put(TerminologyProperty.coordinationLabels, "coord,dep_coord"); terminologyProperties.put(TerminologyProperty.determinentTags, "DET"); terminologyProperties.put(TerminologyProperty.nominalTags, "NC,NPP"); terminologyProperties.put(TerminologyProperty.nonStandaloneIfHasDependents, "VPR"); terminologyProperties.put(TerminologyProperty.nonStandaloneTags, "P,CC,CS,PONCT,P+D"); terminologyProperties.put(TerminologyProperty.nonTopLevelLabels, "det,coord,dep_coord"); terminologyProperties.put(TerminologyProperty.prepositionalTags, "P,P+D"); terminologyProperties.put(TerminologyProperty.termStopTags, "V,VS,VIMP,PRO,P+PRO,PROREL,PROWH,PONCT"); terminologyProperties.put(TerminologyProperty.zeroDepthLabels, "prep,det,coord,dep_coord"); terminologyProperties.put(TerminologyProperty.lemmaGender, "m"); terminologyProperties.put(TerminologyProperty.lemmaNumber, "s"); } else { throw new TalismaneException( "Terminology extraction properties unknown for language: " + locale.getLanguage()); } return terminologyProperties; } private static InputStream getInputStreamFromResource(String resource) { String path = "/com/joliciel/talismane/terminology/resources/" + resource; LOG.debug("Getting " + path); InputStream inputStream = TalismaneTermExtractorMain.class.getResourceAsStream(path); return inputStream; } }