Java tutorial
/////////////////////////////////////////////////////////////////////////////// //Copyright (C) 2012 Assaf Urieli // //This file is part of Jochre. // //Jochre is free software: you can redistribute it and/or modify //it under the terms of the GNU Affero General Public License as published by //the Free Software Foundation, either version 3 of the License, or //(at your option) any later version. // //Jochre is distributed in the hope that it will be useful, //but WITHOUT ANY WARRANTY; without even the implied warranty of //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //GNU Affero General Public License for more details. // //You should have received a copy of the GNU Affero General Public License //along with Jochre. If not, see <http://www.gnu.org/licenses/>. ////////////////////////////////////////////////////////////////////////////// package com.joliciel.jochre; import java.awt.image.BufferedImage; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Properties; import java.util.Map.Entry; import java.util.Scanner; import java.util.Set; import java.util.TreeSet; import java.util.zip.ZipInputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.log4j.PropertyConfigurator; import com.joliciel.jochre.analyser.AnalyserService; import com.joliciel.jochre.analyser.ErrorLogger; import com.joliciel.jochre.analyser.FScoreObserver; import com.joliciel.jochre.analyser.ImageAnalyser; import 
com.joliciel.jochre.analyser.LetterAssigner; import com.joliciel.jochre.analyser.OriginalShapeLetterAssigner; import com.joliciel.jochre.analyser.SimpleLetterFScoreObserver; import com.joliciel.jochre.boundaries.BoundaryDetector; import com.joliciel.jochre.boundaries.BoundaryService; import com.joliciel.jochre.boundaries.MergeEvaluator; import com.joliciel.jochre.boundaries.MergeOutcome; import com.joliciel.jochre.boundaries.ShapeMerger; import com.joliciel.jochre.boundaries.ShapeSplitter; import com.joliciel.jochre.boundaries.SplitCandidateFinder; import com.joliciel.jochre.boundaries.SplitEvaluator; import com.joliciel.jochre.boundaries.SplitOutcome; import com.joliciel.jochre.boundaries.features.BoundaryFeatureService; import com.joliciel.jochre.boundaries.features.MergeFeature; import com.joliciel.jochre.boundaries.features.SplitFeature; import com.joliciel.jochre.doc.DocumentObserver; import com.joliciel.jochre.doc.DocumentService; import com.joliciel.jochre.doc.ImageDocumentExtractor; import com.joliciel.jochre.doc.JochreDocument; import com.joliciel.jochre.doc.JochreDocumentGenerator; import com.joliciel.jochre.doc.SourceFileProcessor; import com.joliciel.jochre.doc.JochrePage; import com.joliciel.jochre.graphics.CorpusSelectionCriteria; import com.joliciel.jochre.graphics.GraphicsService; import com.joliciel.jochre.graphics.ImageStatus; import com.joliciel.jochre.graphics.JochreCorpusGroupReader; import com.joliciel.jochre.graphics.JochreCorpusImageReader; import com.joliciel.jochre.graphics.JochreCorpusShapeReader; import com.joliciel.jochre.graphics.JochreImage; import com.joliciel.jochre.graphics.Shape; import com.joliciel.jochre.graphics.features.ShapeFeature; import com.joliciel.jochre.graphics.features.VerticalElongationFeature; import com.joliciel.jochre.letterGuesser.ComponentCharacterValidator; import com.joliciel.jochre.letterGuesser.Letter; import com.joliciel.jochre.letterGuesser.LetterGuesser; import 
com.joliciel.jochre.letterGuesser.LetterGuesserService; import com.joliciel.jochre.letterGuesser.LetterValidator; import com.joliciel.jochre.letterGuesser.features.LetterFeature; import com.joliciel.jochre.letterGuesser.features.LetterFeatureService; import com.joliciel.jochre.letterGuesser.features.LetterFeatureTester; import com.joliciel.jochre.lexicon.CorpusLexiconBuilder; import com.joliciel.jochre.lexicon.DefaultLexiconWrapper; import com.joliciel.jochre.lexicon.DefaultWordSplitter; import com.joliciel.jochre.lexicon.FakeLexicon; import com.joliciel.jochre.lexicon.Lexicon; import com.joliciel.jochre.lexicon.LexiconErrorWriter; import com.joliciel.jochre.lexicon.LexiconMerger; import com.joliciel.jochre.lexicon.LexiconService; import com.joliciel.jochre.lexicon.LocaleSpecificLexiconService; import com.joliciel.jochre.lexicon.MostLikelyWordChooser; import com.joliciel.jochre.lexicon.TextFileLexicon; import com.joliciel.jochre.lexicon.UnknownWordListWriter; import com.joliciel.jochre.lexicon.WordSplitter; import com.joliciel.jochre.output.OutputService; import com.joliciel.jochre.output.TextFormat; import com.joliciel.jochre.pdf.PdfImageVisitor; import com.joliciel.jochre.pdf.PdfImageSaver; import com.joliciel.jochre.pdf.PdfService; import com.joliciel.jochre.security.SecurityService; import com.joliciel.jochre.security.User; import com.joliciel.jochre.stats.FScoreCalculator; import com.joliciel.talismane.machineLearning.ClassificationEventStream; import com.joliciel.talismane.machineLearning.ClassificationModel; import com.joliciel.talismane.machineLearning.DecisionFactory; import com.joliciel.talismane.machineLearning.MachineLearningAlgorithm; import com.joliciel.talismane.machineLearning.MachineLearningService; import com.joliciel.talismane.machineLearning.ClassificationModelTrainer; import com.joliciel.talismane.machineLearning.OutcomeEqualiserEventStream; import com.joliciel.talismane.machineLearning.features.FeatureResult; import 
com.joliciel.talismane.machineLearning.features.FeatureService; import com.joliciel.talismane.machineLearning.features.RuntimeEnvironment; import com.joliciel.talismane.machineLearning.maxent.MaxentModelTrainer; import com.joliciel.talismane.utils.LogUtils; import com.joliciel.talismane.utils.PerformanceMonitor; /** * Class encapsulating the various top-level Jochre commands and command-line interface. * @author Assaf Urieli * */ public class Jochre implements LocaleSpecificLexiconService { private static final Log LOG = LogFactory.getLog(Jochre.class); public enum BoundaryDetectorType { LetterByLetter, Deterministic } public enum OutputFormat { Jochre, JochrePageByPage, AbbyyFineReader8, HTML, UnknownWords } GraphicsService graphicsService; DocumentService documentService; AnalyserService analyserService; LexiconService lexiconService; LetterGuesserService letterGuesserService; BoundaryService boundaryService; SecurityService securityService; PdfService pdfService; LetterFeatureService letterFeatureService; BoundaryFeatureService boundaryFeatureService; MachineLearningService machineLearningService; FeatureService featureService; Locale locale = null; int userId = -1; String dataSourcePropertiesPath; String encoding = null; String lexiconPath = null; WordSplitter wordSplitter = null; Lexicon lexicon = null; Map<String, Set<Integer>> documentGroups = new LinkedHashMap<String, Set<Integer>>(); public Jochre() { } private void initialise() { JochreServiceLocator locator = JochreServiceLocator.getInstance(); graphicsService = locator.getGraphicsServiceLocator().getGraphicsService(); documentService = locator.getDocumentServiceLocator().getDocumentService(); analyserService = locator.getAnalyserServiceLocator().getAnalyserService(); lexiconService = locator.getLexiconServiceLocator().getLexiconService(); letterGuesserService = locator.getLetterGuesserServiceLocator().getLetterGuesserService(); boundaryService = locator.getBoundaryServiceLocator().getBoundaryService(); 
securityService = locator.getSecurityServiceLocator().getSecurityService(); pdfService = locator.getPdfServiceLocator().getPdfService(); letterFeatureService = locator.getLetterFeatureServiceLocator().getLetterFeatureService(); boundaryFeatureService = locator.getBoundaryFeatureServiceLocator().getBoundaryFeatureService(); machineLearningService = locator.getMachineLearningServiceLocator().getMachineLearningService(); featureService = locator.getFeatureService(); } public static void main(String[] args) throws Exception { Map<String, String> argMap = new HashMap<String, String>(); for (String arg : args) { int equalsPos = arg.indexOf('='); String argName = arg.substring(0, equalsPos); String argValue = arg.substring(equalsPos + 1); argMap.put(argName, argValue); } Jochre jochre = new Jochre(); jochre.execute(argMap); } /** * Usage (* indicates optional):<br/> * Jochre load [filename] [isoLanguageCode] [firstPage]* [lastPage]*<br/> * Loads a file (pdf or image) and segments it into letters. * The analysed version is stored in the persistent store. 
* Writes [filename].xml to the same location, to enable the user to indicate the text * to associate with this file.<br/> * Jochre extract [filename] [outputDirectory] [firstPage]* [lastPage]*<br/> * Extracts images form a pdf file.<br/> * @param args */ public void execute(Map<String, String> argMap) throws Exception { if (argMap.size() == 0) { System.out.println("Usage (* indicates optional):"); System.out.println( "Jochre command=load file=[filename] name=[userFriendlyName] lang=[isoLanguageCode] first=[firstPage]* last=[lastPage]* outputDir=[outputDirectory]* showSeg=[true/false]"); System.out.println( "Jochre command=extract file=[filename] outputDir=[outputDirectory] first=[firstPage]* last=[lastPage]*"); System.out.println("Jochre command=analyse"); System.out.println( "Jochre command=train file=[filename] outputDir=[outputDirectory] iterations=[iterations] cutoff=[cutoff]"); return; } String logConfigPath = argMap.get("logConfigFile"); if (logConfigPath != null) { argMap.remove("logConfigFile"); Properties props = new Properties(); props.load(new FileInputStream(logConfigPath)); PropertyConfigurator.configure(props); } File performanceConfigFile = null; String command = ""; String filename = ""; String userFriendlyName = ""; String outputDirPath = null; int firstPage = -1; int lastPage = -1; int shapeId = -1; int docId = -1; int imageId = 0; int iterations = 100; int cutoff = 0; int userId = -1; int imageCount = 0; int multiplier = 0; int beamWidth = 5; boolean showSegmentation = false; boolean drawPixelSpread = false; boolean save = false; String letterModelPath = ""; String splitModelPath = ""; String mergeModelPath = ""; ImageStatus[] imageSet = new ImageStatus[] { ImageStatus.TRAINING_HELD_OUT }; String letterFeatureFilePath = ""; String splitFeatureFilePath = ""; String mergeFeatureFilePath = ""; boolean reconstructLetters = false; double minProbForDecision = 0.5; double junkThreshold = 0.0; BoundaryDetectorType boundaryDetectorType = 
BoundaryDetectorType.LetterByLetter; int excludeImageId = 0; int crossValidationSize = -1; int includeIndex = -1; int excludeIndex = -1; Set<Integer> documentSet = null; boolean frequencyAdjusted = false; double smoothing = 0.3; double frequencyLogBase = 10.0; String suffix = ""; String dataSourcePath = null; String docGroupPath = null; boolean includeBeam = false; List<OutputFormat> outputFormats = new ArrayList<Jochre.OutputFormat>(); outputFormats.add(OutputFormat.Jochre); outputFormats.add(OutputFormat.HTML); for (Entry<String, String> argMapEntry : argMap.entrySet()) { String argName = argMapEntry.getKey(); String argValue = argMapEntry.getValue(); if (argName.equals("command")) command = argValue; else if (argName.equals("file")) filename = argValue; else if (argName.equals("name")) userFriendlyName = argValue; else if (argName.equals("lang")) locale = new Locale(argValue); else if (argName.equals("first")) firstPage = Integer.parseInt(argValue); else if (argName.equals("last")) lastPage = Integer.parseInt(argValue); else if (argName.equals("outputDir")) outputDirPath = argValue; else if (argName.equals("showSeg")) showSegmentation = (argValue.equals("true")); else if (argName.equals("drawPixelSpread")) drawPixelSpread = (argValue.equals("true")); else if (argName.equals("save")) save = (argValue.equals("true")); else if (argName.equals("shapeId")) shapeId = Integer.parseInt(argValue); else if (argName.equals("imageId")) imageId = Integer.parseInt(argValue); else if (argName.equals("docId")) docId = Integer.parseInt(argValue); else if (argName.equals("userId")) userId = Integer.parseInt(argValue); else if (argName.equals("iterations")) iterations = Integer.parseInt(argValue); else if (argName.equals("cutoff")) cutoff = Integer.parseInt(argValue); else if (argName.equals("imageCount")) imageCount = Integer.parseInt(argValue); else if (argName.equals("beamWidth")) beamWidth = Integer.parseInt(argValue); else if (argName.equals("multiplier")) multiplier = 
Integer.parseInt(argValue); else if (argName.equals("letterModel")) letterModelPath = argValue; else if (argName.equals("splitModel")) splitModelPath = argValue; else if (argName.equals("mergeModel")) mergeModelPath = argValue; else if (argName.equals("letterFeatures")) letterFeatureFilePath = argValue; else if (argName.equals("splitFeatures")) splitFeatureFilePath = argValue; else if (argName.equals("mergeFeatures")) mergeFeatureFilePath = argValue; else if (argName.equals("imageStatus")) { if (argValue.equals("heldOut")) imageSet = new ImageStatus[] { ImageStatus.TRAINING_HELD_OUT }; else if (argValue.equals("test")) imageSet = new ImageStatus[] { ImageStatus.TRAINING_TEST }; else if (argValue.equals("training")) imageSet = new ImageStatus[] { ImageStatus.TRAINING_VALIDATED }; else if (argValue.equals("all")) imageSet = new ImageStatus[] { ImageStatus.TRAINING_VALIDATED, ImageStatus.TRAINING_HELD_OUT, ImageStatus.TRAINING_TEST }; else throw new RuntimeException("Unknown imageSet: " + argValue); } else if (argName.equals("reconstructLetters")) reconstructLetters = (argValue.equals("true")); else if (argName.equals("minProbForDecision")) minProbForDecision = Double.parseDouble(argValue); else if (argName.equals("junkThreshold")) junkThreshold = Double.parseDouble(argValue); else if (argName.equals("boundaryDetector")) boundaryDetectorType = BoundaryDetectorType.valueOf(argValue); else if (argName.equals("lexicon")) lexiconPath = argValue; else if (argName.equals("freqLogBase")) { frequencyLogBase = Double.parseDouble(argValue); frequencyAdjusted = true; } else if (argName.equals("smoothing")) smoothing = Double.parseDouble(argValue); else if (argName.equals("excludeImageId")) excludeImageId = Integer.parseInt(argValue); else if (argName.equals("crossValidationSize")) crossValidationSize = Integer.parseInt(argValue); else if (argName.equals("includeIndex")) includeIndex = Integer.parseInt(argValue); else if (argName.equals("excludeIndex")) excludeIndex = 
Integer.parseInt(argValue); else if (argName.equals("docSet")) { String[] docIdArray = argValue.split(","); documentSet = new HashSet<Integer>(); for (String docIdString : docIdArray) { int oneId = Integer.parseInt(docIdString); documentSet.add(oneId); } } else if (argName.equals("docGroupFile")) { docGroupPath = argValue; } else if (argName.equals("frequencyAdjusted")) frequencyAdjusted = argValue.equalsIgnoreCase("true"); else if (argName.equals("suffix")) suffix = argValue; else if (argName.equals("dataSource")) dataSourcePath = argValue; else if (argName.equals("encoding")) encoding = argValue; else if (argName.equals("performanceConfigFile")) performanceConfigFile = new File(argValue); else if (argName.equals("includeBeam")) includeBeam = argValue.equalsIgnoreCase("true"); else if (argName.equals("outputFormat")) { outputFormats = new ArrayList<Jochre.OutputFormat>(); String[] outputFormatStrings = argValue.split(","); for (String outputFormatString : outputFormatStrings) { outputFormats.add(OutputFormat.valueOf(outputFormatString)); } if (outputFormats.size() == 0) throw new JochreException("At least one outputFormat required."); } else throw new RuntimeException("Unknown argument: " + argName); } PerformanceMonitor.start(performanceConfigFile); try { if (userFriendlyName.length() == 0) userFriendlyName = filename; if (locale == null) { throw new JochreException("Argument lang is required"); } if (encoding == null) encoding = Charset.defaultCharset().name(); JochreServiceLocator locator = JochreServiceLocator.getInstance(); if (dataSourcePath != null) locator.setDataSourcePropertiesFile(dataSourcePath); this.initialise(); this.setUserId(userId); CorpusSelectionCriteria criteria = this.getGraphicsService().getCorpusSelectionCriteria(); criteria.setImageId(imageId); criteria.setImageCount(imageCount); criteria.setImageStatusesToInclude(imageSet); criteria.setExcludeImageId(excludeImageId); criteria.setCrossValidationSize(crossValidationSize); 
criteria.setIncludeIndex(includeIndex); criteria.setExcludeIndex(excludeIndex); criteria.setDocumentId(docId); criteria.setDocumentIds(documentSet); if (docGroupPath != null) { File docGroupFile = new File(docGroupPath); Scanner scanner = new Scanner( new BufferedReader(new InputStreamReader(new FileInputStream(docGroupFile), encoding))); while (scanner.hasNextLine()) { String line = scanner.nextLine(); int equalsPos = line.indexOf('='); String groupName = line.substring(0, equalsPos); String[] ids = line.substring(equalsPos + 1).split(","); Set<Integer> idSet = new HashSet<Integer>(); for (String idString : ids) { idSet.add(Integer.parseInt(idString)); } documentGroups.put(groupName, idSet); } } OutputService outputService = locator.getTextServiceLocator().getTextService(); MostLikelyWordChooser wordChooser = null; LexiconService lexiconService = locator.getLexiconServiceLocator().getLexiconService(); wordChooser = lexiconService.getMostLikelyWordChooser(this.getLexicon(), this.getWordSplitter()); wordChooser.setAdditiveSmoothing(smoothing); wordChooser.setFrequencyLogBase(frequencyLogBase); wordChooser.setFrequencyAdjusted(frequencyAdjusted); JochreSession.setJunkConfidenceThreshold(junkThreshold); if (command.equals("segment")) { this.doCommandSegment(filename, userFriendlyName, showSegmentation, drawPixelSpread, outputDirPath, save, firstPage, lastPage); } else if (command.equals("extract")) { this.doCommandExtractImages(filename, outputDirPath, firstPage, lastPage); } else if (command.equals("updateImages")) { this.doCommandUpdateImages(filename, docId, firstPage, lastPage); } else if (command.equals("applyFeatures")) { this.doCommandApplyFeatures(imageId, shapeId, letterFeatureFilePath); } else if (command.equals("train")) { this.doCommandTrain(letterModelPath, letterFeatureFilePath, iterations, cutoff, criteria, reconstructLetters); } else if (command.equals("evaluate") || command.equals("evaluateComplex")) { this.doCommandEvaluate(letterModelPath, criteria, 
outputDirPath, wordChooser, beamWidth, reconstructLetters, save, suffix, includeBeam); } else if (command.equals("evaluateFull")) { this.doCommandEvaluateFull(letterModelPath, splitModelPath, mergeModelPath, criteria, save, outputDirPath, wordChooser, beamWidth, boundaryDetectorType, minProbForDecision, suffix); } else if (command.equals("analyse")) { this.doCommandAnalyse(letterModelPath, docId, criteria, wordChooser); } else if (command.equals("trainSplits")) { this.doCommandTrainSplits(splitModelPath, splitFeatureFilePath, iterations, cutoff, criteria); } else if (command.equals("evaluateSplits")) { this.doCommandEvaluateSplits(splitModelPath, criteria, beamWidth, minProbForDecision); } else if (command.equals("trainMerge")) { this.doCommandTrainMerge(mergeModelPath, mergeFeatureFilePath, multiplier, iterations, cutoff, criteria); } else if (command.equals("evaluateMerge")) { this.doCommandEvaluateMerge(mergeModelPath, criteria, minProbForDecision); } else if (command.equals("logImage")) { this.doCommandLogImage(shapeId); } else if (command.equals("testFeature")) { this.doCommandTestFeature(shapeId); } else if (command.equals("serializeLexicon")) { File outputDir = new File(outputDirPath); outputDir.mkdirs(); File inputFile = new File(filename); if (inputFile.isDirectory()) { File[] lexiconFiles = inputFile.listFiles(); for (File oneLexFile : lexiconFiles) { LOG.debug(oneLexFile.getName() + ": " + ", size: " + oneLexFile.length()); TextFileLexicon lexicon = new TextFileLexicon(oneLexFile, encoding); String baseName = oneLexFile.getName().substring(0, oneLexFile.getName().indexOf(".")); if (baseName.lastIndexOf("/") > 0) baseName = baseName.substring(baseName.lastIndexOf("/") + 1); File lexiconFile = new File(outputDir, baseName + ".obj"); lexicon.serialize(lexiconFile); } } else { LOG.debug(filename + ": " + inputFile.exists() + ", size: " + inputFile.length()); TextFileLexicon lexicon = new TextFileLexicon(inputFile, encoding); String baseName = 
filename.substring(0, filename.indexOf(".")); if (baseName.lastIndexOf("/") > 0) baseName = baseName.substring(baseName.lastIndexOf("/") + 1); File lexiconFile = new File(outputDir, baseName + ".obj"); lexicon.serialize(lexiconFile); } } else if (command.equals("analyseFile")) { File pdfFile = new File(filename); File letterModelFile = new File(letterModelPath); File splitModelFile = null; File mergeModelFile = null; if (splitModelPath.length() > 0) splitModelFile = new File(splitModelPath); if (mergeModelPath.length() > 0) mergeModelFile = new File(mergeModelPath); File outputDir = new File(outputDirPath); outputDir.mkdirs(); String baseName = filename; if (baseName.lastIndexOf('.') > 0) baseName = filename.substring(0, filename.lastIndexOf('.')); if (baseName.lastIndexOf('/') > 0) baseName = baseName.substring(baseName.lastIndexOf('/') + 1); if (baseName.lastIndexOf('\\') > 0) baseName = baseName.substring(baseName.lastIndexOf('\\') + 1); List<DocumentObserver> observers = new ArrayList<DocumentObserver>(); for (OutputFormat outputFormat : outputFormats) { switch (outputFormat) { case AbbyyFineReader8: { Writer analysisFileWriter = null; String outputFileName = baseName + "_abbyy8.xml"; File analysisFile = new File(outputDir, outputFileName); analysisFile.delete(); analysisFileWriter = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(analysisFile, true), "UTF8")); DocumentObserver observer = outputService.getAbbyyFineReader8Exporter(analysisFileWriter); observers.add(observer); break; } case HTML: { Writer htmlWriter = null; String htmlFileName = baseName + ".html"; File htmlFile = new File(outputDir, htmlFileName); htmlFile.delete(); htmlWriter = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(htmlFile, true), "UTF8")); DocumentObserver textGetter = outputService.getTextGetter(htmlWriter, TextFormat.XHTML, this.getLexicon()); observers.add(textGetter); break; } case Jochre: { Writer analysisFileWriter = null; String outputFileName 
= baseName + ".xml"; File analysisFile = new File(outputDir, outputFileName); analysisFile.delete(); analysisFileWriter = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(analysisFile, true), "UTF8")); DocumentObserver observer = outputService.getJochreXMLExporter(analysisFileWriter); observers.add(observer); break; } case JochrePageByPage: { DocumentObserver observer = outputService.getJochrePageByPageExporter(outputDir, baseName); observers.add(observer); break; } case UnknownWords: { if (this.getLexicon() != null) { File unknownWordFile = new File(outputDir, "unknownWords.txt"); unknownWordFile.delete(); Writer unknownWordWriter = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(unknownWordFile, true), "UTF8")); UnknownWordListWriter unknownWordListWriter = new UnknownWordListWriter( unknownWordWriter); observers.add(unknownWordListWriter); } break; } } } this.doCommandAnalyse(pdfFile, letterModelFile, splitModelFile, mergeModelFile, wordChooser, observers, firstPage, lastPage); } else { throw new RuntimeException("Unknown command: " + command); } } catch (Exception e) { LogUtils.logError(LOG, e); throw e; } finally { PerformanceMonitor.end(); } LOG.debug("#### finished #####"); } /** * Test a feature on a particular shape. 
* @param shapeId */ public void doCommandTestFeature(int shapeId) { // just a utility for testing a feature on a particular shape ShapeFeature<?> feature = new VerticalElongationFeature(); if (shapeId > 0) { Shape shape = graphicsService.loadShape(shapeId); shape.writeImageToLog(); RuntimeEnvironment env = this.featureService.getRuntimeEnvironment(); feature.check(shape, env); } else { // String result = "false"; // TrainingServiceLocator trainingServiceLocator = locator.getTrainingServiceLocator(); // TrainingService trainingService = trainingServiceLocator.getTrainingService(); // List<Integer> shapeIds = trainingService.findShapesForFeature("", feature, result); List<Integer> shapeIds = graphicsService.findShapeIds(""); Map<Object, Integer> outcomeMap = new HashMap<Object, Integer>(); for (int oneShapeId : shapeIds) { Shape shape = graphicsService.loadShape(oneShapeId); shape.writeImageToLog(); RuntimeEnvironment env = this.featureService.getRuntimeEnvironment(); FeatureResult<?> weightedOutcome = feature.check(shape, env); Object outcome = weightedOutcome.getOutcome(); Integer count = outcomeMap.get(outcome); if (count == null) outcomeMap.put(outcome, 1); else outcomeMap.put(outcome, count.intValue() + 1); } LOG.debug("Outcomes:"); for (Object outcome : outcomeMap.keySet()) { LOG.debug("Outcome " + outcome.toString() + ", count " + outcomeMap.get(outcome)); } } } /** * Rebuild the training corpus lexicon. 
*/ public void doCommandBuildLexicon(String outputDirPath, WordSplitter wordSplitter, CorpusSelectionCriteria criteria) { try { CorpusLexiconBuilder builder = lexiconService.getCorpusLexiconBuilder(wordSplitter); builder.setCriteria(criteria); TextFileLexicon lexicon = builder.buildLexicon(); File outputDir = new File(outputDirPath); outputDir.mkdirs(); File textFile = new File(outputDir, "jochreCorpusLexicon.txt"); textFile.delete(); Writer textFileWriter = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(textFile, true), "UTF8")); try { lexicon.writeFile(textFileWriter); } finally { textFileWriter.flush(); textFileWriter.close(); } File lexiconFile = new File(outputDir, "jochreCorpusLexicon.zip"); lexicon.serialize(lexiconFile); } catch (IOException e) { throw new RuntimeException(e); } } /** * Log a shape's image to the log file, to make sure it got segmented and stored correctly. * @param shapeId */ public void doCommandLogImage(int shapeId) { // just a utility for making sure images got segmented and stored correctly if (shapeId > 0) { Shape shape = graphicsService.loadShape(shapeId); shape.writeImageToLog(); LOG.debug("Letter: " + shape.getLetter()); } } /** * Train the letter merging model. * @param mergeModelPath the path where the model should be saved * @param iterations the number of training iterations * @param cutoff the feature count cutoff * @param imageCount the maximum number of training corpus images to use when training - if <= 0 will use all. 
*/
    public void doCommandTrainMerge(String mergeModelPath, String mergeFeatureFilePath, int multiplier,
            int iterations, int cutoff, CorpusSelectionCriteria criteria) {
        try {
            if (mergeModelPath.length() == 0)
                throw new RuntimeException("Missing argument: mergeModel");
            if (!mergeModelPath.endsWith(".zip"))
                throw new RuntimeException("mergeModel must end with .zip");
            if (mergeFeatureFilePath.length() == 0)
                throw new RuntimeException("Missing argument: mergeFeatures");

            // Create the model's parent directory if the path has one. The previous
            // substring(0, lastIndexOf('/')) threw when the path contained no '/'.
            File file = new File(mergeModelPath);
            File modelDir = file.getParentFile();
            if (modelDir != null)
                modelDir.mkdirs();

            // Read the feature descriptors, one per line.
            File mergeFeatureFile = new File(mergeFeatureFilePath);
            List<String> mergeFeatureDescriptors = new ArrayList<String>();
            Scanner scanner = new Scanner(mergeFeatureFile);
            try {
                while (scanner.hasNextLine()) {
                    String descriptor = scanner.nextLine();
                    mergeFeatureDescriptors.add(descriptor);
                    LOG.debug(descriptor);
                }
            } finally {
                // The scanner (and its file handle) used to be leaked.
                scanner.close();
            }
            Set<MergeFeature<?>> mergeFeatures = this.boundaryFeatureService
                    .getMergeFeatureSet(mergeFeatureDescriptors);
            double maxWidthRatio = 1.2;
            double maxDistanceRatio = 0.15;
            ClassificationEventStream corpusEventStream = boundaryService.getJochreMergeEventStream(criteria,
                    mergeFeatures, maxWidthRatio, maxDistanceRatio);
            if (multiplier > 0) {
                corpusEventStream = new OutcomeEqualiserEventStream(corpusEventStream, multiplier);
            }

            Map<String, Object> trainParameters = new HashMap<String, Object>();
            trainParameters.put(MaxentModelTrainer.MaxentModelParameter.Iterations.name(), iterations);
            trainParameters.put(MaxentModelTrainer.MaxentModelParameter.Cutoff.name(), cutoff);

            ClassificationModelTrainer<MergeOutcome> trainer = machineLearningService
                    .getClassificationModelTrainer(MachineLearningAlgorithm.MaxEnt, trainParameters);

            DecisionFactory<MergeOutcome> mergeDecisionFactory = boundaryService.getMergeDecisionFactory();
            ClassificationModel<MergeOutcome> mergeModel = trainer.trainModel(corpusEventStream,
                    mergeDecisionFactory, mergeFeatureDescriptors);
            mergeModel.persist(file);
        } catch (IOException e) {
            LogUtils.logError(LOG, e);
            throw new RuntimeException(e);
        }
    }

    /**
     * Evaluate the letter merging model on its own, logging the total f-score.
     * @param mergeModelPath the path of the model to be evaluated (must end with .zip)
     * @param criteria the corpus selection criteria identifying the groups to evaluate on
     * @param minProbForDecision if >= 0, set on the evaluator as the minimum probability for a decision
     * @throws IOException if the model file cannot be opened or closed
     */
    public void doCommandEvaluateMerge(String mergeModelPath, CorpusSelectionCriteria criteria,
            double minProbForDecision) throws IOException {
        if (mergeModelPath.length() == 0)
            throw new RuntimeException("Missing argument: mergeModel");
        if (!mergeModelPath.endsWith(".zip"))
            throw new RuntimeException("mergeModel must end with .zip");

        ZipInputStream zis = new ZipInputStream(new FileInputStream(mergeModelPath));
        try {
            ClassificationModel<MergeOutcome> mergeModel = machineLearningService.getClassificationModel(zis);

            List<String> mergeFeatureDescriptors = mergeModel.getFeatureDescriptors();
            Set<MergeFeature<?>> mergeFeatures = boundaryFeatureService.getMergeFeatureSet(mergeFeatureDescriptors);

            JochreCorpusGroupReader groupReader = graphicsService.getJochreCorpusGroupReader();
            groupReader.setSelectionCriteria(criteria);

            double maxWidthRatio = 1.2;
            double maxDistanceRatio = 0.15;

            ShapeMerger merger = boundaryService.getShapeMerger(mergeFeatures, mergeModel.getDecisionMaker());
            MergeEvaluator evaluator = boundaryService.getMergeEvaluator(maxWidthRatio, maxDistanceRatio);
            if (minProbForDecision >= 0)
                evaluator.setMinProbabilityForDecision(minProbForDecision);
            FScoreCalculator<String> fScoreCalculator = evaluator.evaluate(groupReader, merger);
            LOG.debug(fScoreCalculator.getTotalFScore());
        } finally {
            // Closed only once evaluation has finished, so it is safe however the model reads the stream.
            zis.close();
        }
    }

    /**
     * Train the letter splitting model.
* @param splitModelPath the path where the model should be saved * @param iterations the number of training iterations * @param cutoff the feature count cutoff * @param imageCount the maximum number of training corpus images to use when training - if <= 0 will use all. */ public void doCommandTrainSplits(String splitModelPath, String splitFeatureFilePath, int iterations, int cutoff, CorpusSelectionCriteria criteria) { try { if (splitModelPath.length() == 0) throw new RuntimeException("Missing argument: splitModel"); if (!splitModelPath.endsWith(".zip")) throw new RuntimeException("splitModel must end with .zip"); if (splitFeatureFilePath.length() == 0) throw new RuntimeException("Missing argument: splitFeatures"); String modelDirPath = splitModelPath.substring(0, splitModelPath.lastIndexOf('/')); File modelDir = new File(modelDirPath); modelDir.mkdirs(); File splitFeatureFile = new File(splitFeatureFilePath); Scanner scanner = new Scanner(splitFeatureFile); List<String> splitFeatureDescriptors = new ArrayList<String>(); while (scanner.hasNextLine()) { String descriptor = scanner.nextLine(); splitFeatureDescriptors.add(descriptor); LOG.debug(descriptor); } Set<SplitFeature<?>> splitFeatures = this.boundaryFeatureService .getSplitFeatureSet(splitFeatureDescriptors); double minWidthRatio = 1.1; double minHeightRatio = 1.0; ClassificationEventStream corpusEventStream = boundaryService.getJochreSplitEventStream(criteria, splitFeatures, minWidthRatio, minHeightRatio); File splitModelFile = new File(splitModelPath); Map<String, Object> trainParameters = new HashMap<String, Object>(); trainParameters.put(MaxentModelTrainer.MaxentModelParameter.Iterations.name(), iterations); trainParameters.put(MaxentModelTrainer.MaxentModelParameter.Cutoff.name(), cutoff); ClassificationModelTrainer<SplitOutcome> trainer = machineLearningService .getClassificationModelTrainer(MachineLearningAlgorithm.MaxEnt, trainParameters); DecisionFactory<SplitOutcome> splitDecisionFactory = 
boundaryService.getSplitDecisionFactory(); ClassificationModel<SplitOutcome> splitModel = trainer.trainModel(corpusEventStream, splitDecisionFactory, splitFeatureDescriptors); splitModel.persist(splitModelFile); } catch (IOException e) { LogUtils.logError(LOG, e); throw new RuntimeException(e); } } /** * Evaluate the letter splitting model on its own. * @param splitModelPath the path of the model to be evaluated. * @param testSet the test set to be evaluated * @param imageCount the maximum number of corpus images to use when testing - if <= 0 will use all. * @param minProbForDecision * @throws IOException */ public void doCommandEvaluateSplits(String splitModelPath, CorpusSelectionCriteria criteria, int beamWidth, double minProbForDecision) throws IOException { if (splitModelPath.length() == 0) throw new RuntimeException("Missing argument: splitModel"); if (!splitModelPath.endsWith(".zip")) throw new RuntimeException("splitModel must end with .zip"); ZipInputStream zis = new ZipInputStream(new FileInputStream(splitModelPath)); ClassificationModel<SplitOutcome> splitModel = machineLearningService.getClassificationModel(zis); List<String> splitFeatureDescriptors = splitModel.getFeatureDescriptors(); Set<SplitFeature<?>> splitFeatures = boundaryFeatureService.getSplitFeatureSet(splitFeatureDescriptors); double minWidthRatio = 1.1; double minHeightRatio = 1.0; int maxDepth = 2; SplitCandidateFinder splitCandidateFinder = boundaryService.getSplitCandidateFinder(); splitCandidateFinder.setMinDistanceBetweenSplits(5); ShapeSplitter shapeSplitter = boundaryService.getShapeSplitter(splitCandidateFinder, splitFeatures, splitModel.getDecisionMaker(), minWidthRatio, beamWidth, maxDepth); JochreCorpusShapeReader shapeReader = graphicsService.getJochreCorpusShapeReader(); shapeReader.setSelectionCriteria(criteria); SplitEvaluator splitEvaluator = boundaryService.getSplitEvaluator(5, minWidthRatio, minHeightRatio); if (minProbForDecision >= 0) 
splitEvaluator.setMinProbabilityForDecision(minProbForDecision); FScoreCalculator<String> fScoreCalculator = splitEvaluator.evaluate(shapeReader, shapeSplitter); LOG.debug(fScoreCalculator.getTotalFScore()); } /** * Train a letter guessing model. * @param letterModelPath the path where the model should be saved * @param iterations the number of training iterations * @param cutoff the feature count cutoff * @param imageCount the maximum number of training corpus images to use when training - if <= 0 will use all. */ public void doCommandTrain(String letterModelPath, String letterFeatureFilePath, int iterations, int cutoff, CorpusSelectionCriteria criteria, boolean reconstructLetters) { try { if (letterModelPath.length() == 0) throw new RuntimeException("Missing argument: letterModel"); if (!letterModelPath.endsWith(".zip")) throw new RuntimeException("letterModel must end with .zip"); if (letterFeatureFilePath.length() == 0) throw new RuntimeException("Missing argument: letterFeatures"); String modelDirPath = letterModelPath.substring(0, letterModelPath.lastIndexOf('/')); File modelDir = new File(modelDirPath); modelDir.mkdirs(); File letterFeatureFile = new File(letterFeatureFilePath); Scanner scanner = new Scanner(letterFeatureFile); List<String> descriptors = new ArrayList<String>(); while (scanner.hasNextLine()) { String descriptor = scanner.nextLine(); descriptors.add(descriptor); LOG.debug(descriptor); } Set<LetterFeature<?>> features = letterFeatureService.getLetterFeatureSet(descriptors); BoundaryDetector boundaryDetector = null; if (reconstructLetters) { ShapeSplitter splitter = boundaryService.getTrainingCorpusShapeSplitter(); ShapeMerger merger = boundaryService.getTrainingCorpusShapeMerger(); boundaryDetector = boundaryService.getLetterByLetterBoundaryDetector(splitter, merger, 1); } else { boundaryDetector = boundaryService.getOriginalBoundaryDetector(); } LetterValidator letterValidator = new ComponentCharacterValidator(locale); 
ClassificationEventStream corpusEventStream = letterGuesserService.getJochreLetterEventStream(criteria, features, boundaryDetector, letterValidator); File letterModelFile = new File(letterModelPath); DecisionFactory<Letter> letterDecisionFactory = letterGuesserService.getLetterDecisionFactory(); Map<String, Object> trainParameters = new HashMap<String, Object>(); trainParameters.put(MaxentModelTrainer.MaxentModelParameter.Iterations.name(), iterations); trainParameters.put(MaxentModelTrainer.MaxentModelParameter.Cutoff.name(), cutoff); ClassificationModelTrainer<Letter> trainer = machineLearningService .getClassificationModelTrainer(MachineLearningAlgorithm.MaxEnt, trainParameters); ClassificationModel<Letter> letterModel = trainer.trainModel(corpusEventStream, letterDecisionFactory, descriptors); letterModel.persist(letterModelFile); } catch (IOException e) { LogUtils.logError(LOG, e); throw new RuntimeException(e); } } /** * Evaluate a given letter guessing model. * @param letterModelPath the path to the model * @param testSet the set of images to be evaluated * @param imageId the single image to be evaluated * @param imageId2 * @param outputDirPath the directory to which we write the evaluation files * @param includeBeam * @param lexicon the lexicon to use for word correction * @throws IOException */ public void doCommandEvaluate(String letterModelPath, CorpusSelectionCriteria criteria, String outputDirPath, MostLikelyWordChooser wordChooser, int beamWidth, boolean reconstructLetters, boolean save, String suffix, boolean includeBeam) throws IOException { if (letterModelPath.length() == 0) throw new RuntimeException("Missing argument: letterModel"); if (!letterModelPath.endsWith(".zip")) throw new RuntimeException("letterModel must end with .zip"); if (outputDirPath == null || outputDirPath.length() == 0) throw new RuntimeException("Missing argument: outputDir"); File outputDir = new File(outputDirPath); outputDir.mkdirs(); ZipInputStream zis = new 
ZipInputStream(new FileInputStream(letterModelPath)); ClassificationModel<Letter> letterModel = machineLearningService.getClassificationModel(zis); List<String> letterFeatureDescriptors = letterModel.getFeatureDescriptors(); Set<LetterFeature<?>> letterFeatures = letterFeatureService.getLetterFeatureSet(letterFeatureDescriptors); LetterGuesser letterGuesser = letterGuesserService.getLetterGuesser(letterFeatures, letterModel.getDecisionMaker()); String baseName = letterModelPath.substring(0, letterModelPath.indexOf(".")); if (baseName.lastIndexOf("/") > 0) baseName = baseName.substring(baseName.lastIndexOf("/") + 1); baseName += suffix; BoundaryDetector boundaryDetector = null; if (reconstructLetters) { ShapeSplitter splitter = boundaryService.getTrainingCorpusShapeSplitter(); ShapeMerger merger = boundaryService.getTrainingCorpusShapeMerger(); boundaryDetector = boundaryService.getLetterByLetterBoundaryDetector(splitter, merger, 1); boundaryDetector.setMinWidthRatioForSplit(0.0); boundaryDetector.setMinHeightRatioForSplit(0.0); boundaryDetector.setMaxWidthRatioForMerge(100.0); boundaryDetector.setMaxDistanceRatioForMerge(100.0); } else { boundaryDetector = boundaryService.getOriginalBoundaryDetector(); } ImageAnalyser evaluator = analyserService.getBeamSearchImageAnalyser(beamWidth, 0.01); evaluator.setBoundaryDetector(boundaryDetector); evaluator.setLetterGuesser(letterGuesser); evaluator.setMostLikelyWordChooser(wordChooser); FScoreObserver fScoreObserver = null; LetterValidator letterValidator = new ComponentCharacterValidator(locale); if (reconstructLetters) { OriginalShapeLetterAssigner originalShapeLetterAssigner = new OriginalShapeLetterAssigner(); originalShapeLetterAssigner.setEvaluate(true); originalShapeLetterAssigner.setSave(save); originalShapeLetterAssigner.setLetterValidator(letterValidator); fScoreObserver = originalShapeLetterAssigner; } else { LetterAssigner letterAssigner = new LetterAssigner(); letterAssigner.setSave(save); 
evaluator.addObserver(letterAssigner); fScoreObserver = new SimpleLetterFScoreObserver(letterValidator); } evaluator.addObserver(fScoreObserver); ErrorLogger errorLogger = new ErrorLogger(); if (wordChooser != null) { errorLogger.setLexicon(wordChooser.getLexicon()); errorLogger.setWordSplitter(wordChooser.getWordSplitter()); } Writer errorWriter = null; File errorFile = new File(outputDir, baseName + "_errors.txt"); errorFile.delete(); errorWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(errorFile, true), "UTF8")); errorLogger.setErrorWriter(errorWriter); evaluator.addObserver(errorLogger); LexiconErrorWriter lexiconErrorWriter = new LexiconErrorWriter(outputDir, baseName, wordChooser); if (documentGroups != null) lexiconErrorWriter.setDocumentGroups(documentGroups); lexiconErrorWriter.setIncludeBeam(includeBeam); // find all document names (alphabetical ordering) Set<String> documentNameSet = new TreeSet<String>(); JochreCorpusImageReader imageReader1 = graphicsService.getJochreCorpusImageReader(); CorpusSelectionCriteria docCriteria = graphicsService.getCorpusSelectionCriteria(); docCriteria.setImageStatusesToInclude(criteria.getImageStatusesToInclude()); docCriteria.setImageId(criteria.getImageId()); docCriteria.setDocumentId(criteria.getDocumentId()); docCriteria.setDocumentIds(criteria.getDocumentIds()); imageReader1.setSelectionCriteria(docCriteria); JochreDocument currentDoc = null; while (imageReader1.hasNext()) { JochreImage image = imageReader1.next(); if (!image.getPage().getDocument().equals(currentDoc)) { currentDoc = image.getPage().getDocument(); documentNameSet.add(currentDoc.getName()); } } List<String> documentNames = new ArrayList<String>(documentNameSet); lexiconErrorWriter.setDocumentNames(documentNames); evaluator.addObserver(lexiconErrorWriter); JochreCorpusImageReader imageReader = graphicsService.getJochreCorpusImageReader(); imageReader.setSelectionCriteria(criteria); // evaluator.setOutcomesToAnalyse(new String[] 
{""}); try { evaluator.analyse(imageReader); } finally { if (errorWriter != null) errorWriter.close(); } LOG.debug("F-score for " + letterModelPath + ": " + fScoreObserver.getFScoreCalculator().getTotalFScore()); String modelFileName = baseName; if (reconstructLetters) modelFileName += "_Reconstruct"; File fscoreFile = new File(outputDir, modelFileName + "_fscores.csv"); fScoreObserver.getFScoreCalculator().writeScoresToCSVFile(fscoreFile); } /** * Analyse a document or image or test set based on a given letter-guessing model. * @param letterModelPath the path to the letter-guessing model. * @param docId the document to be analysed * @param imageId the image to be analysed * @param testSet the test set to be analysed * @throws IOException */ public void doCommandAnalyse(String letterModelPath, int docId, CorpusSelectionCriteria criteria, MostLikelyWordChooser wordChooser) throws IOException { if (letterModelPath.length() == 0) throw new RuntimeException("Missing argument: letterModel"); if (!letterModelPath.endsWith(".zip")) throw new RuntimeException("letterModel must end with .zip"); ZipInputStream zis = new ZipInputStream(new FileInputStream(letterModelPath)); ClassificationModel<Letter> letterModel = machineLearningService.getClassificationModel(zis); List<String> letterFeatureDescriptors = letterModel.getFeatureDescriptors(); Set<LetterFeature<?>> letterFeatures = letterFeatureService.getLetterFeatureSet(letterFeatureDescriptors); LetterGuesser letterGuesser = letterGuesserService.getLetterGuesser(letterFeatures, letterModel.getDecisionMaker()); ImageAnalyser analyser = analyserService.getBeamSearchImageAnalyser(5, 0.01); analyser.setLetterGuesser(letterGuesser); analyser.setMostLikelyWordChooser(wordChooser); JochreCorpusImageReader imageReader = graphicsService.getJochreCorpusImageReader(); imageReader.setSelectionCriteria(criteria); LetterAssigner letterAssigner = new LetterAssigner(); analyser.addObserver(letterAssigner); if (docId > 0) { JochreDocument 
doc = documentService.loadJochreDocument(docId); for (JochrePage page : doc.getPages()) { for (JochreImage image : page.getImages()) { if (image.getImageStatus().equals(ImageStatus.AUTO_NEW)) { analyser.analyse(image); } image.clearMemory(); } } } else { analyser.analyse(imageReader); } } public void doCommandAnalyse(File sourceFile, File letterModelFile, File splitModelFile, File mergeModelFile, MostLikelyWordChooser wordChooser, List<DocumentObserver> observers, int firstPage, int lastPage) throws IOException { ZipInputStream zis = new ZipInputStream(new FileInputStream(letterModelFile)); ClassificationModel<Letter> letterModel = machineLearningService.getClassificationModel(zis); List<String> letterFeatureDescriptors = letterModel.getFeatureDescriptors(); Set<LetterFeature<?>> letterFeatures = letterFeatureService.getLetterFeatureSet(letterFeatureDescriptors); LetterGuesser letterGuesser = letterGuesserService.getLetterGuesser(letterFeatures, letterModel.getDecisionMaker()); ImageAnalyser analyser = analyserService.getBeamSearchImageAnalyser(5, 0.01); analyser.setLetterGuesser(letterGuesser); analyser.setMostLikelyWordChooser(wordChooser); BoundaryDetector boundaryDetector = null; if (splitModelFile != null && mergeModelFile != null) { double minWidthRatioForSplit = 1.1; double minHeightRatioForSplit = 1.0; int splitBeamWidth = 5; int maxSplitDepth = 2; SplitCandidateFinder splitCandidateFinder = boundaryService.getSplitCandidateFinder(); splitCandidateFinder.setMinDistanceBetweenSplits(5); ZipInputStream splitZis = new ZipInputStream(new FileInputStream(splitModelFile)); ClassificationModel<SplitOutcome> splitModel = machineLearningService.getClassificationModel(splitZis); List<String> splitFeatureDescriptors = splitModel.getFeatureDescriptors(); Set<SplitFeature<?>> splitFeatures = boundaryFeatureService.getSplitFeatureSet(splitFeatureDescriptors); ShapeSplitter shapeSplitter = boundaryService.getShapeSplitter(splitCandidateFinder, splitFeatures, 
splitModel.getDecisionMaker(), minWidthRatioForSplit, splitBeamWidth, maxSplitDepth); ZipInputStream mergeZis = new ZipInputStream(new FileInputStream(splitModelFile)); ClassificationModel<MergeOutcome> mergeModel = machineLearningService.getClassificationModel(mergeZis); List<String> mergeFeatureDescriptors = mergeModel.getFeatureDescriptors(); Set<MergeFeature<?>> mergeFeatures = boundaryFeatureService.getMergeFeatureSet(mergeFeatureDescriptors); double maxWidthRatioForMerge = 1.2; double maxDistanceRatioForMerge = 0.15; double minProbForDecision = 0.5; ShapeMerger shapeMerger = boundaryService.getShapeMerger(mergeFeatures, mergeModel.getDecisionMaker()); boundaryDetector = boundaryService.getDeterministicBoundaryDetector(shapeSplitter, shapeMerger, minProbForDecision); boundaryDetector.setMinWidthRatioForSplit(minWidthRatioForSplit); boundaryDetector.setMinHeightRatioForSplit(minHeightRatioForSplit); boundaryDetector.setMaxWidthRatioForMerge(maxWidthRatioForMerge); boundaryDetector.setMaxDistanceRatioForMerge(maxDistanceRatioForMerge); analyser.setBoundaryDetector(boundaryDetector); OriginalShapeLetterAssigner shapeLetterAssigner = new OriginalShapeLetterAssigner(); shapeLetterAssigner.setEvaluate(false); shapeLetterAssigner.setSingleLetterMethod(false); analyser.addObserver(shapeLetterAssigner); } else { boundaryDetector = boundaryService.getOriginalBoundaryDetector(); analyser.setBoundaryDetector(boundaryDetector); LetterAssigner letterAssigner = new LetterAssigner(); analyser.addObserver(letterAssigner); } JochreDocumentGenerator documentGenerator = documentService.getJochreDocumentGenerator(sourceFile.getName(), "", locale); documentGenerator.requestAnalysis(analyser); for (DocumentObserver observer : observers) documentGenerator.addDocumentObserver(observer); if (!sourceFile.exists()) throw new JochreException("The file " + sourceFile.getPath() + " does not exist"); if (sourceFile.getName().toLowerCase().endsWith(".pdf")) { PdfImageVisitor pdfImageVisitor = 
pdfService.getPdfImageVisitor(sourceFile, firstPage, lastPage, documentGenerator); pdfImageVisitor.visitImages(); } else if (sourceFile.getName().toLowerCase().endsWith(".png") || sourceFile.getName().toLowerCase().endsWith(".jpg") || sourceFile.getName().toLowerCase().endsWith(".jpeg") || sourceFile.getName().toLowerCase().endsWith(".gif")) { ImageDocumentExtractor extractor = documentService.getImageDocumentExtractor(sourceFile, documentGenerator); extractor.extractDocument(); } else if (sourceFile.isDirectory()) { ImageDocumentExtractor extractor = documentService.getImageDocumentExtractor(sourceFile, documentGenerator); extractor.extractDocument(); } else { throw new RuntimeException("Unrecognised file extension"); } } /** * Evaluate a suite of split/merge models and letter guessing model. * @param letterModelPath the path to the letter-guessing model * @param splitModelPath the path to the splitting model * @param mergeModelPath the path to the merging model * @param testSet the set of images to evaluate in the saved corpus * @param imageId the single image to evaluate in the saved corpus * @param imageId2 * @param save whether or not the letter guesses should be saved * @param outputDirPath the output directory where we write the evaluation results * @param boundaryDetectorType * @param minProbForDecision * @throws IOException */ public void doCommandEvaluateFull(String letterModelPath, String splitModelPath, String mergeModelPath, CorpusSelectionCriteria criteria, boolean save, String outputDirPath, MostLikelyWordChooser wordChooser, int beamWidth, BoundaryDetectorType boundaryDetectorType, double minProbForDecision, String suffix) throws IOException { if (letterModelPath.length() == 0) throw new RuntimeException("Missing argument: letterModel"); if (outputDirPath == null || outputDirPath.length() == 0) throw new RuntimeException("Missing argument: outputDir"); File outputDir = new File(outputDirPath); outputDir.mkdirs(); String baseName = 
letterModelPath.substring(0, letterModelPath.indexOf(".")); if (baseName.lastIndexOf("/") > 0) baseName = baseName.substring(baseName.lastIndexOf("/") + 1); ZipInputStream zis = new ZipInputStream(new FileInputStream(letterModelPath)); ClassificationModel<Letter> letterModel = machineLearningService.getClassificationModel(zis); List<String> letterFeatureDescriptors = letterModel.getFeatureDescriptors(); Set<LetterFeature<?>> letterFeatures = letterFeatureService.getLetterFeatureSet(letterFeatureDescriptors); LetterGuesser letterGuesser = letterGuesserService.getLetterGuesser(letterFeatures, letterModel.getDecisionMaker()); if (splitModelPath.length() == 0) throw new RuntimeException("Missing argument: splitModel"); if (!splitModelPath.endsWith(".zip")) throw new RuntimeException("splitModel must end with .zip"); ZipInputStream splitZis = new ZipInputStream(new FileInputStream(splitModelPath)); ClassificationModel<SplitOutcome> splitModel = machineLearningService.getClassificationModel(splitZis); List<String> splitFeatureDescriptors = splitModel.getFeatureDescriptors(); Set<SplitFeature<?>> splitFeatures = boundaryFeatureService.getSplitFeatureSet(splitFeatureDescriptors); double minWidthRatioForSplit = 1.1; double minHeightRatioForSplit = 1.0; int maxSplitDepth = 2; SplitCandidateFinder splitCandidateFinder = boundaryService.getSplitCandidateFinder(); splitCandidateFinder.setMinDistanceBetweenSplits(5); ShapeSplitter shapeSplitter = boundaryService.getShapeSplitter(splitCandidateFinder, splitFeatures, splitModel.getDecisionMaker(), minWidthRatioForSplit, beamWidth, maxSplitDepth); if (mergeModelPath.length() == 0) throw new RuntimeException("Missing argument: mergeModel"); if (!mergeModelPath.endsWith(".zip")) throw new RuntimeException("mergeModel must end with .zip"); ZipInputStream mergeZis = new ZipInputStream(new FileInputStream(mergeModelPath)); ClassificationModel<MergeOutcome> mergeModel = machineLearningService.getClassificationModel(mergeZis); 
List<String> mergeFeatureDescriptors = mergeModel.getFeatureDescriptors(); Set<MergeFeature<?>> mergeFeatures = boundaryFeatureService.getMergeFeatureSet(mergeFeatureDescriptors); double maxWidthRatioForMerge = 1.2; double maxDistanceRatioForMerge = 0.15; ShapeMerger shapeMerger = boundaryService.getShapeMerger(mergeFeatures, mergeModel.getDecisionMaker()); JochreCorpusImageReader imageReader = graphicsService.getJochreCorpusImageReader(); imageReader.setSelectionCriteria(criteria); BoundaryDetector boundaryDetector = null; switch (boundaryDetectorType) { case LetterByLetter: boundaryDetector = boundaryService.getLetterByLetterBoundaryDetector(shapeSplitter, shapeMerger, 5); break; case Deterministic: boundaryDetector = boundaryService.getDeterministicBoundaryDetector(shapeSplitter, shapeMerger, minProbForDecision); break; } boundaryDetector.setMinWidthRatioForSplit(minWidthRatioForSplit); boundaryDetector.setMinHeightRatioForSplit(minHeightRatioForSplit); boundaryDetector.setMaxWidthRatioForMerge(maxWidthRatioForMerge); boundaryDetector.setMaxDistanceRatioForMerge(maxDistanceRatioForMerge); ImageAnalyser evaluator = analyserService.getBeamSearchImageAnalyser(beamWidth, 0.01); evaluator.setLetterGuesser(letterGuesser); evaluator.setMostLikelyWordChooser(wordChooser); evaluator.setBoundaryDetector(boundaryDetector); LetterValidator letterValidator = new ComponentCharacterValidator(locale); OriginalShapeLetterAssigner shapeLetterAssigner = new OriginalShapeLetterAssigner(); shapeLetterAssigner.setEvaluate(true); shapeLetterAssigner.setSave(save); shapeLetterAssigner.setLetterValidator(letterValidator); shapeLetterAssigner.setSingleLetterMethod(false); evaluator.addObserver(shapeLetterAssigner); ErrorLogger errorLogger = new ErrorLogger(); if (wordChooser != null) { errorLogger.setLexicon(wordChooser.getLexicon()); errorLogger.setWordSplitter(wordChooser.getWordSplitter()); } Writer errorWriter = null; File errorFile = new File(outputDir, baseName + suffix + 
"errors.txt"); errorFile.delete(); errorWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(errorFile, true), "UTF8")); errorLogger.setErrorWriter(errorWriter); evaluator.addObserver(errorLogger); // evaluator.setOutcomesToAnalyse(new String[] {""}); try { evaluator.analyse(imageReader); } finally { if (errorWriter != null) errorWriter.close(); } LOG.debug("F-score for " + letterModelPath + ": " + shapeLetterAssigner.getFScoreCalculator().getTotalFScore()); String modelFileName = baseName + suffix + "_full"; File fscoreFile = new File(outputDir, modelFileName + "_fscores.csv"); shapeLetterAssigner.getFScoreCalculator().writeScoresToCSVFile(fscoreFile); } /** * Apply a set of images to a given image or a given shape. * @param imageId * @param shapeId */ public void doCommandApplyFeatures(int imageId, int shapeId, String letterFeatureFilePath) { try { if (letterFeatureFilePath.length() == 0) throw new RuntimeException("Missing argument: letterFeatures"); LetterFeatureTester featureTester = letterFeatureService.getFeatureTester(); File letterFeatureFile = new File(letterFeatureFilePath); Scanner scanner = new Scanner(letterFeatureFile); List<String> descriptors = new ArrayList<String>(); while (scanner.hasNextLine()) { String descriptor = scanner.nextLine(); descriptors.add(descriptor); LOG.debug(descriptor); } Set<LetterFeature<?>> features = letterFeatureService.getLetterFeatureSet(descriptors); Set<String> letters = new HashSet<String>(); // letters.add(""); // letters.add(""); featureTester.applyFeatures(features, letters, imageId, shapeId); } catch (FileNotFoundException e) { LogUtils.logError(LOG, e); throw new RuntimeException(e); } } /** * Update the images in an existing Jochre document. 
* @param filename the PDF file containing the images * @param docId the id of the document ot update * @param firstPage the first page to segment, if <=0 will start with first document page * @param lastPage the last page to segment, if <=0 will segment until last document page */ public void doCommandUpdateImages(String filename, int docId, int firstPage, int lastPage) { if (filename.length() == 0) throw new RuntimeException("Missing argument: file"); if (docId < 0) throw new RuntimeException("Missing argument: docId"); JochreDocument doc = documentService.loadJochreDocument(docId); if (filename.toLowerCase().endsWith(".pdf")) { File pdfFile = new File(filename); PdfImageVisitor pdfImageVisitor = pdfService.getPdfImageVisitor(pdfFile, firstPage, lastPage, new PdfImageUpdater(doc)); pdfImageVisitor.visitImages(); } else { throw new RuntimeException("Unrecognised file extension"); } } /** * Extract the images from a PDF file. * @param filename the path to the PDF file * @param outputDirPath the directory where to store the images extracted. * @param firstPage the first page to segment, if <=0 will start with first document page * @param lastPage the last page to segment, if <=0 will segment until last document page */ public void doCommandExtractImages(String filename, String outputDirPath, int firstPage, int lastPage) { if (filename.length() == 0) throw new RuntimeException("Missing argument: file"); if (outputDirPath.length() == 0) throw new RuntimeException("Missing argument: outputDir"); if (filename.toLowerCase().endsWith(".pdf")) { File pdfFile = new File(filename); PdfImageSaver pdfImageSaver = pdfService.getPdfImageSaver(pdfFile); pdfImageSaver.saveImages(outputDirPath, firstPage, lastPage); } else { throw new RuntimeException("Unrecognised file extension"); } } /** * Segment a file, without analysing it. 
* @param filename the path of the file to load * @param userFriendlyName a name to store against this file in the database * @param showSegmentation whether or not to output the graphical segmentation files * @param outputDirPath an output directory for the graphical segmentation files * @param save should we save this file to the database? * @param firstPage the first page to segment, if <=0 will start with first document page * @param lastPage the last page to segment, if <=0 will segment until last document page */ public void doCommandSegment(String filename, String userFriendlyName, boolean showSegmentation, boolean drawPixelSpread, String outputDirPath, boolean save, int firstPage, int lastPage) { if (filename.length() == 0) throw new RuntimeException("Missing argument: file"); if (userId < 0 && save) throw new RuntimeException("Missing argument (for save=true): userId"); User user = null; if (userId >= 0) { user = securityService.loadUser(userId); } File file = new File(filename); JochreDocumentGenerator sourceFileProcessor = this.documentService .getJochreDocumentGenerator(file.getName(), userFriendlyName, locale); sourceFileProcessor.setDrawPixelSpread(drawPixelSpread); if (save) sourceFileProcessor.requestSave(user); if (showSegmentation) { if (outputDirPath != null && outputDirPath.length() > 0) { File outputDir = new File(outputDirPath); outputDir.mkdirs(); sourceFileProcessor.requestSegmentation(outputDir); } } if (filename.toLowerCase().endsWith(".pdf")) { PdfImageVisitor pdfImageVisitor = pdfService.getPdfImageVisitor(file, firstPage, lastPage, sourceFileProcessor); pdfImageVisitor.visitImages(); } else if (filename.toLowerCase().endsWith(".png") || filename.toLowerCase().endsWith(".jpg") || filename.toLowerCase().endsWith(".jpeg") || filename.toLowerCase().endsWith(".gif")) { ImageDocumentExtractor extractor = documentService.getImageDocumentExtractor(file, sourceFileProcessor); extractor.extractDocument(); } else { throw new 
RuntimeException("Unrecognised file extension"); } } static class PdfImageUpdater implements SourceFileProcessor { JochreDocument doc = null; public PdfImageUpdater(JochreDocument document) { this.doc = document; } @Override public JochreDocument onDocumentStart() { return this.doc; } @Override public void onDocumentComplete(JochreDocument doc) { // nothing to do here } @Override public JochreDocument getDocument() { return this.doc; } @Override public JochrePage onPageStart(int pageIndex) { JochrePage thePage = null; for (JochrePage page : this.doc.getPages()) { if (page.getIndex() == pageIndex) { thePage = page; break; } } return thePage; } @Override public void onPageComplete(JochrePage jochrePage) { // nothing here. } @Override public JochreImage onImageFound(JochrePage jochrePage, BufferedImage image, String imageName, int imageIndex) { JochreImage theImage = jochrePage.getImages().get(0); theImage.setOriginalImage(image); theImage.save(); return theImage; } } public int getUserId() { return userId; } public void setUserId(int userId) { this.userId = userId; } public Locale getLocale() { return locale; } public void setLocale(Locale locale) { this.locale = locale; JochreSession.setLocale(locale); } public GraphicsService getGraphicsService() { return graphicsService; } public void setGraphicsService(GraphicsService graphicsService) { this.graphicsService = graphicsService; } protected Lexicon readLexicon(File lexiconDir) { Lexicon myLexicon = null; if (lexiconDir.isDirectory()) { LexiconMerger lexiconMerger = new LexiconMerger(); File[] lexiconFiles = lexiconDir.listFiles(); for (File lexiconFile : lexiconFiles) { if (lexiconFile.getName().endsWith(".txt")) { TextFileLexicon textFileLexicon = new TextFileLexicon(lexiconFile, encoding); lexiconMerger.addLexicon(textFileLexicon); } else { TextFileLexicon textFileLexicon = TextFileLexicon.deserialize(lexiconFile); lexiconMerger.addLexicon(textFileLexicon); } } myLexicon = lexiconMerger; } else { if 
(lexiconDir.getName().endsWith(".txt")) { TextFileLexicon textFileLexicon = new TextFileLexicon(lexiconDir, encoding); myLexicon = textFileLexicon; } else { TextFileLexicon textFileLexicon = TextFileLexicon.deserialize(lexiconDir); myLexicon = textFileLexicon; } } return myLexicon; } @Override public Lexicon getLexicon() { if (lexicon == null) { if (lexiconPath != null && lexiconPath.length() > 0) { File lexiconDir = new File(lexiconPath); Lexicon myLexicon = this.readLexicon(lexiconDir); this.lexicon = new DefaultLexiconWrapper(myLexicon); } else { this.lexicon = new FakeLexicon(); } } return this.lexicon; } @Override public WordSplitter getWordSplitter() { if (wordSplitter == null) wordSplitter = new DefaultWordSplitter(); return wordSplitter; } public String getLexiconPath() { return lexiconPath; } public void setLexiconPath(String lexiconPath) { this.lexiconPath = lexiconPath; } }