Java tutorial
/*
 * Copyright 2009-2013 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * **************************************************************************
 * NOTICE
 * This software was produced for the U. S. Government under Contract No.
 * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
 * Software and Noncommercial Computer Software Documentation Clause
 * 252.227-7014 (JUN 1995)
 *
 * (c) 2012 The MITRE Corporation. All Rights Reserved.
 * **************************************************************************
 */
package org.opensextant.examples;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.opensextant.regex.RegexAnnotation;
import org.opensextant.regex.RegexMatcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A simple example of using the RegexMatcher which is used to tag text based on
 * regular expressions defined in a config file. This example reads a tab
 * separated file containing text to be tagged. That file should be in the form:
 * EntityType _TAB_ POS/NEG _TAB_ TestPhrase where
 * <ol>
 * <li>EntityType is the type of entity</li>
 * <li>POS/NEG indicates whether this is a positive (should be tagged) or
 * negative example (should not be tagged)</li>
 * <li>TestPhrase is the text which should be examined</li>
 * </ol>
 * <p>
 * See LanguageResources/TestData/RegexTestData for example test inputs<br>
 * See LanguageResources/resources/patterns for regex pattern definitions
 */
public class RegexTest {

    /** Log object. */
    private static final Logger LOGGER = LoggerFactory.getLogger(RegexTest.class);

    /** Character encoding used for both the test input and the results file. */
    private static final String ENCODING = "UTF-8";

    /** Number of fields expected on every test line. */
    private static final int FIELD_COUNT = 3;

    /** Column header written as the first line of the results file. */
    private static final String RESULTS_HEADER =
            "Entity Type\tPos\\Neg\tTest Input\tScore\tComplete\tCount\tTypes Found\tRules Matched\tAnnotations Found";

    /**
     * Not instantiable: this class only exposes static entry points.
     */
    private RegexTest() {
    }

    /**
     * Runs every test line of the input file through the regex matcher and
     * writes one scored line per input line to the results file.
     *
     * @param args
     *            exactly three values are used: {@code args[0]} the tab
     *            separated test data file, {@code args[1]} the regex pattern
     *            definition file, {@code args[2]} the results file to write
     */
    public static void main(String[] args) {

        // Without all three arguments there is nothing sensible to do; the
        // original indexed args blindly and died with an
        // ArrayIndexOutOfBoundsException.
        if (args == null || args.length < FIELD_COUNT) {
            LOGGER.error("Usage: RegexTest <testFile> <patternFile> <resultFile>");
            return;
        }

        // file containing the tab separated test data
        File testFile = new File(args[0]);

        // the file containing the regex definitions
        String patternFileName = args[1];

        // file into which we will write the results
        File resFile = new File(args[2]);

        // setup the output
        BufferedWriter resWriter;
        try {
            resWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(resFile), ENCODING));
        } catch (UnsupportedEncodingException e) {
            LOGGER.error("Couldnt write to " + resFile.getName() + ":" + e.getMessage(), e);
            return;
        } catch (FileNotFoundException e) {
            LOGGER.error("Couldnt write to " + resFile.getName() + ":" + e.getMessage(), e);
            return;
        }

        // From here on the writer is open, so every exit path (including
        // runtime exceptions thrown by the matcher) must go through the
        // finally block that closes it.
        try {
            runTests(testFile, patternFileName, resFile, resWriter);
        } finally {
            try {
                // close() flushes any buffered output first
                resWriter.close();
            } catch (IOException e) {
                LOGGER.error("Couldn't close " + resFile.getName() + ":" + e.getMessage(), e);
            }
        }
    }

    /**
     * Writes the header, loads the patterns, then tags and scores every
     * non-comment, non-blank line of the test file.
     *
     * @param testFile
     *            the test data file to read
     * @param patternFileName
     *            path of the regex pattern definition file
     * @param resFile
     *            the results file (used for log messages only)
     * @param resWriter
     *            open writer onto the results file; not closed by this method
     */
    private static void runTests(File testFile, String patternFileName, File resFile, BufferedWriter resWriter) {

        // write the results header (best effort: a failure is logged and the
        // run carries on, as before)
        try {
            resWriter.write(RESULTS_HEADER);
            resWriter.newLine();
        } catch (IOException e) {
            LOGGER.error("Couldnt write to " + resFile.getName() + ":" + e.getMessage(), e);
        }

        // get the pattern file as a URL
        URL patternFile;
        try {
            patternFile = new File(patternFileName).toURI().toURL();
        } catch (MalformedURLException e) {
            LOGGER.error("Couldn't use pattern file " + patternFileName + ":" + e.getMessage(), e);
            // Do not hand a null URL to the matcher.
            return;
        }

        // initialize the regex matcher
        RegexMatcher reger = new RegexMatcher(patternFile);
        LOGGER.info("Loaded " + reger.getRules().size() + " rules " + " for types " + reger.getTypes());
        LOGGER.info("Writing results to " + resFile.getAbsolutePath());

        // open the test file
        LineIterator iter;
        try {
            iter = FileUtils.lineIterator(testFile, ENCODING);
        } catch (IOException e) {
            LOGGER.error("Couldnt read from " + testFile.getName() + ":" + e.getMessage(), e);
            // Nothing to iterate over; previously this fell through and
            // dereferenced a null iterator.
            return;
        }

        int lineCount = 0;
        try {
            // loop over the lines of the test file
            while (iter.hasNext()) {
                String line = iter.next();

                // skip comments and blank lines
                if (line == null || line.startsWith("#") || line.trim().length() == 0) {
                    continue;
                }

                // get the fields of the line; the limit keeps any whitespace
                // inside the test phrase intact
                String[] pieces = line.split("[\t\\s]+", FIELD_COUNT);
                if (pieces.length < FIELD_COUNT) {
                    // One bad line should not abort the whole run.
                    LOGGER.warn("Skipping malformed test line (expected " + FIELD_COUNT + " fields): " + line);
                    continue;
                }
                lineCount++;

                String entityType = pieces[0];
                String posOrNeg = pieces[1];
                String testText = pieces[2];

                // send the test text to regex matcher
                List<RegexAnnotation> annos = reger.match(testText);

                // examine the results and build the line to be sent to the
                // results file
                String results = score(entityType, posOrNeg, testText, annos);
                annos.clear();

                try {
                    // write the original line and the results to the results file
                    resWriter.write(line + "\t" + results);
                    resWriter.newLine();
                } catch (IOException e) {
                    LOGGER.error("Couldn't write to " + resFile.getName() + ":" + e.getMessage(), e);
                }
            }
        } finally {
            // release the underlying reader even if tagging or scoring threw
            iter.close();
        }

        LOGGER.info("Tagged and scored " + lineCount + " test lines");
    }

    /**
     * Compares the annotations found for one test phrase against the expected
     * outcome and formats the verdict as tab separated fields.
     * <p>
     * For a positive example ({@code posOrNeg} equals "POS", ignoring case) the
     * assessment is one of:
     * <ul>
     * <li>{@code TP} - exactly one annotation, of the correct type</li>
     * <li>{@code TP+FP} - the correct type plus at least one other type</li>
     * <li>{@code TP+DUPE} - only the correct type, but more than once</li>
     * <li>{@code FN+FP} - annotations found, none of the correct type</li>
     * <li>{@code FN} - no annotations at all</li>
     * </ul>
     * Any other {@code posOrNeg} value is treated as a negative example:
     * {@code TN} if the given type was not found (whatever else was found),
     * {@code FP} if it was.
     *
     * @param correctType
     *            the entity type the test line is about
     * @param posOrNeg
     *            "POS" for a positive example; anything else is scored as
     *            negative
     * @param testText
     *            the phrase that was tagged
     * @param annos
     *            the annotations the matcher produced for {@code testText}
     * @return six tab separated fields: assessment, whether the longest match
     *         has the same (trimmed) length as the test text, annotation
     *         count, set of types found, set of rules fired, and the
     *         annotations themselves, each followed by a comma
     */
    public static String score(String correctType, String posOrNeg, String testText, List<RegexAnnotation> annos) {

        int annoCount = annos.size();
        int correctMatchLength = testText.trim().length();

        // TreeSets so the types and rules print in a stable, sorted order
        Set<String> typesFound = new TreeSet<String>();
        Set<String> rulesFired = new TreeSet<String>();
        StringBuilder annoText = new StringBuilder();
        int maxMatchedLength = 0;

        for (RegexAnnotation a : annos) {
            typesFound.add(a.getType());
            rulesFired.add(a.getRule());
            maxMatchedLength = Math.max(maxMatchedLength, a.getMatchText().trim().length());
            annoText.append(a).append(',');
        }

        boolean foundCorrect = typesFound.contains(correctType);

        String assessment;
        if ("POS".equalsIgnoreCase(posOrNeg)) {
            if (annoCount == 0) {
                // got nothing
                assessment = "FN";
            } else if (!foundCorrect) {
                // got wrong type
                assessment = "FN+FP";
            } else if (typesFound.size() > 1) {
                // got it plus some other type(s)
                assessment = "TP+FP";
            } else if (annoCount > 1) {
                // got it but more than once
                assessment = "TP+DUPE";
            } else {
                // perfect result
                assessment = "TP";
            }
        } else if (!foundCorrect) {
            // perfect result (doesnt matter what else is found)
            assessment = "TN";
        } else {
            assessment = "FP";
        }

        // check the match lengths to see if we got the whole thing (this
        // compares lengths only, not the matched text itself)
        boolean whole = correctMatchLength == maxMatchedLength;

        return assessment + "\t" + whole + "\t" + annoCount + "\t" + typesFound + "\t" + rulesFired + "\t"
                + annoText.toString();
    }
}