Java tutorial
/******************************************************************************* * Copyright 2014 * FG Language Technology * Technische Universitt Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package de.tu.darmstadt.lt.ner.preprocessing; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import java.util.List; import java.util.Properties; import org.apache.commons.io.IOUtils; import org.apache.log4j.Logger; import org.apache.uima.UIMAException; import org.apache.uima.UIMAFramework; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Level; import org.cleartk.ml.CleartkSequenceAnnotator; import org.cleartk.ml.crfsuite.CrfSuiteStringOutcomeDataWriter; import org.cleartk.ml.jar.DefaultSequenceDataWriterFactory; import org.cleartk.ml.jar.DirectoryDataWriterFactory; import org.cleartk.ml.jar.GenericJarClassifierFactory; import org.cleartk.util.cr.FilesCollectionReader; import de.tu.darmstadt.lt.ner.annotator.NERAnnotator; import de.tu.darmstadt.lt.ner.reader.NERReader; import de.tu.darmstadt.lt.ner.writer.EvaluatedNERWriter; import de.tu.darmstadt.lt.ner.writer.SentenceToCRFTestFileWriter; public class GermaNERMain { private static final Logger LOG = Logger.getLogger(GermaNERMain.class.getName()); static File modelDirectory; static InputStream configFile = null; static Properties prop; public static Properties getPropFile() { return prop; } public static void initNERModel() throws IOException { configFile = configFile == null ? ClassLoader.getSystemResourceAsStream("config.properties") : configFile; prop = new Properties(); loadConfig(); } /** * * @param NER_TagFile * @param modelDirectory * = the directory where the training model will be saved/or found * @param language * = the language of the document, de for German and en for English * @param createPos * = if bulletin MatePOS tagger is to be used * @param freebaseListFile * = use freebase lists as a feature * @param usePosition * = use the position of the token as a feature * @param suffixCLass * = if a file to match common suffixes to a given class is given * @throws ResourceInitializationException * @throws UIMAException * @throws IOException */ public static void writeModel(File NER_TagFile, File modelDirectory) throws UIMAException, IOException { runPipeline( FilesCollectionReader.getCollectionReaderWithSuffixes( NER_TagFile.getAbsolutePath(), NERReader.CONLL_VIEW, NER_TagFile.getName()), createEngine(NERReader.class), createEngine(NERAnnotator.class, NERAnnotator.PARAM_FEATURE_EXTRACTION_FILE, modelDirectory.getAbsolutePath() + "/feature.xml", CleartkSequenceAnnotator.PARAM_IS_TRAINING, true, DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY, modelDirectory.getAbsolutePath(), DefaultSequenceDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME, CrfSuiteStringOutcomeDataWriter.class)); } public static void trainModel(File modelDirectory) throws Exception { org.cleartk.ml.jar.Train.main(modelDirectory.getAbsolutePath()); } public static void classifyTestFile(File aClassifierJarPath, File testPosFile, File outputFile, File aNodeResultFile, List<Integer> aSentencesIds) throws UIMAException, IOException { runPipeline( FilesCollectionReader.getCollectionReaderWithSuffixes( testPosFile.getAbsolutePath(), NERReader.CONLL_VIEW, testPosFile.getName()), createEngine(NERReader.class), createEngine(NERAnnotator.class, NERAnnotator.PARAM_FEATURE_EXTRACTION_FILE, aClassifierJarPath.getAbsolutePath() + "/feature.xml", NERAnnotator.FEATURE_FILE, aClassifierJarPath.getAbsolutePath(), GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, aClassifierJarPath.getAbsolutePath() + "/model.jar"), createEngine(EvaluatedNERWriter.class, EvaluatedNERWriter.OUTPUT_FILE, outputFile, EvaluatedNERWriter.IS_GOLD, false, EvaluatedNERWriter.NOD_OUTPUT_FILE, aNodeResultFile, EvaluatedNERWriter.SENTENCES_ID, aSentencesIds)); } public static void classifyTestFile(File testPosFile, File outputFile, File aNodeResultFile, List<Integer> aSentencesIds) throws UIMAException, IOException { initNERModel(); setModelDir(); runPipeline( FilesCollectionReader.getCollectionReaderWithSuffixes( testPosFile.getAbsolutePath(), NERReader.CONLL_VIEW, testPosFile.getName()), createEngine(NERReader.class), createEngine(NERAnnotator.class, NERAnnotator.PARAM_FEATURE_EXTRACTION_FILE, modelDirectory.getAbsolutePath() + "/feature.xml", NERAnnotator.FEATURE_FILE, modelDirectory.getAbsolutePath(), GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, modelDirectory.getAbsolutePath() + "/model.jar"), createEngine(EvaluatedNERWriter.class, EvaluatedNERWriter.OUTPUT_FILE, outputFile, EvaluatedNERWriter.IS_GOLD, false, EvaluatedNERWriter.NOD_OUTPUT_FILE, aNodeResultFile, EvaluatedNERWriter.SENTENCES_ID, aSentencesIds)); } /** * This is a helper method, can be called from NoD. If you use a DKPro tokenizer during * training, this mehtod use the same tokenizer available in DKPro, * * @param sentences * pure sentences * @return * @throws UIMAException * @throws IllegalArgumentException * @throws IOException */ public static void sentenceToCRFFormat(List<String> sentences, String aCRFFileName, String aLanguage) throws UIMAException, IllegalArgumentException, IOException { SimplePipeline.runPipeline(JCasFactory.createJCas(), createEngine(SentenceToCRFTestFileWriter.class, SentenceToCRFTestFileWriter.SENTENCE_ITERATOR, sentences, SentenceToCRFTestFileWriter.CRF_TEST_FILE_NAME, aCRFFileName, SentenceToCRFTestFileWriter.CRF_TEST_FILE_LANG, aLanguage)); } public static void main(String[] arg) throws Exception { long startTime = System.currentTimeMillis(); String usage = "USAGE: java -jar germanner.jar [-c config.properties] \n" + " [-f trainingFileName] -t testFileName -d ModelOutputDirectory-o outputFile"; long start = System.currentTimeMillis(); ChangeColon c = new ChangeColon(); List<String> argList = Arrays.asList(arg); try { if (argList.contains("-c") && argList.get(argList.indexOf("-c") + 1) != null) { if (!new File(argList.get(argList.indexOf("-c") + 1)).exists()) { LOG.error("Default configuration is read from the system\n"); } else { configFile = new FileInputStream(argList.get(argList.indexOf("-c") + 1)); } } if (argList.contains("-t") && argList.get(argList.indexOf("-t") + 1) != null) { if (!new File(argList.get(argList.indexOf("-t") + 1)).exists()) { LOG.error("There is no test file to tag"); System.exit(1); } Configuration.testFileName = argList.get(argList.indexOf("-t") + 1); } if (argList.contains("-f") && argList.get(argList.indexOf("-f") + 1) != null) { if (!new File(argList.get(argList.indexOf("-f") + 1)).exists()) { LOG.error("The system is running in tagging mode. No training data provided"); } else { Configuration.trainFileName = argList.get(argList.indexOf("-f") + 1); } } if (argList.contains("-d") && argList.get(argList.indexOf("-d") + 1) != null) { if (new File(argList.get(argList.indexOf("-d") + 1)).exists()) { Configuration.modelDir = argList.get(argList.indexOf("-d") + 1); } else { File dir = new File(argList.get(argList.indexOf("-d") + 1)); dir.mkdirs(); Configuration.modelDir = dir.getAbsolutePath(); } } // load a properties file initNERModel(); } catch (IOException ex) { ex.printStackTrace(); } try { setModelDir(); File outputtmpFile = new File(modelDirectory, "result.tmp"); File outputFile = null; if (argList.contains("-o") && argList.get(argList.indexOf("-o") + 1) != null) { outputFile = new File(argList.get(argList.indexOf("-o") + 1)); } else { LOG.error("The directory for this output file does not exist. Output file " + "will be found in the current directury under folder \"output\""); outputFile = new File(modelDirectory, "result.tsv"); } if (Configuration.mode.equals("ft") && (Configuration.trainFileName == null || Configuration.testFileName == null)) { LOG.error(usage); System.exit(1); } if (Configuration.mode.equals("f") && Configuration.trainFileName == null) { LOG.error(usage); System.exit(1); } if (Configuration.mode.equals("t") && Configuration.testFileName == null) { LOG.error(usage); System.exit(1); } if (Configuration.mode.equals("f") && Configuration.trainFileName != null) { c.normalize(Configuration.trainFileName, Configuration.trainFileName + ".normalized"); System.out.println("Start model generation"); writeModel(new File(Configuration.trainFileName + ".normalized"), modelDirectory); System.out.println("Start model generation -- done"); System.out.println("Start training"); trainModel(modelDirectory); System.out.println("Start training ---done"); } else if (Configuration.mode.equals("ft") && Configuration.trainFileName != null && Configuration.testFileName != null) { c.normalize(Configuration.trainFileName, Configuration.trainFileName + ".normalized"); c.normalize(Configuration.testFileName, Configuration.testFileName + ".normalized"); System.out.println("Start model generation"); writeModel(new File(Configuration.trainFileName + ".normalized"), modelDirectory); System.out.println("Start model generation -- done"); System.out.println("Start training"); trainModel(modelDirectory); System.out.println("Start training ---done"); System.out.println("Start tagging"); classifyTestFile(modelDirectory, new File(Configuration.testFileName + ".normalized"), outputtmpFile, null, null); System.out.println("Start tagging ---done"); // re-normalized the colon changed text c.deNormalize(outputtmpFile.getAbsolutePath(), outputFile.getAbsolutePath()); } else { c.normalize(Configuration.testFileName, Configuration.testFileName + ".normalized"); System.out.println("Start tagging"); classifyTestFile(modelDirectory, new File(Configuration.testFileName + ".normalized"), outputtmpFile, null, null); // re-normalized the colon changed text c.deNormalize(outputtmpFile.getAbsolutePath(), outputFile.getAbsolutePath()); System.out.println("Start tagging ---done"); } long now = System.currentTimeMillis(); UIMAFramework.getLogger().log(Level.INFO, "Time: " + (now - start) + "ms"); } catch (Exception e) { LOG.error(usage); e.printStackTrace(); } long endTime = System.currentTimeMillis(); long totalTime = endTime - startTime; System.out.println("NER tarin/test done in " + totalTime / 1000 + " seconds"); } private static void setModelDir() throws IOException, FileNotFoundException { modelDirectory = (Configuration.modelDir == null || Configuration.modelDir.isEmpty()) ? new File("output") : new File(Configuration.modelDir); modelDirectory.mkdirs(); if (!new File(modelDirectory, "model.jar").exists()) { IOUtils.copyLarge(ClassLoader.getSystemResourceAsStream("model/model.jar"), new FileOutputStream(new File(modelDirectory, "model.jar"))); } if (!new File(modelDirectory, "MANIFEST.MF").exists()) { IOUtils.copyLarge(ClassLoader.getSystemResourceAsStream("model/MANIFEST.MF"), new FileOutputStream(new File(modelDirectory, "MANIFEST.MF"))); } if (!new File(modelDirectory, "feature.xml").exists()) { IOUtils.copyLarge(ClassLoader.getSystemResourceAsStream("feature/feature.xml"), new FileOutputStream(new File(modelDirectory, "feature.xml"))); } } public static void loadConfig() throws IOException { prop.load(configFile); if (Configuration.testFileName != null) { if (Configuration.trainFileName != null) { Configuration.mode = "ft"; } else { Configuration.mode = "t"; } } if (Configuration.trainFileName != null) { if (Configuration.testFileName != null) { Configuration.mode = "ft"; } else { Configuration.mode = "f"; } } Configuration.useClarkPosInduction = prop.getProperty("useClarkPosInduction").equals("1") ? true : false; Configuration.usePosition = prop.getProperty("usePosition").equals("1") ? true : false; Configuration.useFreeBase = prop.getProperty("useFreeBase").equals("1") ? true : false; Configuration.modelDir = prop.getProperty("modelDir"); } }