Java tutorial
/**
 * This file is part of CERMINE project.
 * Copyright (c) 2011-2013 ICM-UW
 *
 * CERMINE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CERMINE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with CERMINE. If not, see <http://www.gnu.org/licenses/>.
 */

package pl.edu.icm.cermine.evaluation;

import com.google.common.collect.Lists;
import java.io.*;
import java.util.*;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.structure.DocstrumSegmenter;
import pl.edu.icm.cermine.structure.DocumentSegmenter;
import pl.edu.icm.cermine.structure.HierarchicalReadingOrderResolver;
import pl.edu.icm.cermine.structure.ReadingOrderResolver;
import pl.edu.icm.cermine.structure.model.*;
import pl.edu.icm.cermine.structure.tools.BxModelUtils;
import pl.edu.icm.cermine.structure.tools.UnsegmentedPagesFlattener;
import pl.edu.icm.cermine.structure.transformers.BxDocumentToTrueVizWriter;
import pl.edu.icm.cermine.structure.transformers.TrueVizToBxDocumentReader;

/**
 * Evaluates page segmentation: an expected document is compared, page by page,
 * with the document produced by a {@link DocumentSegmenter}, and the number of
 * exactly reproduced zones, lines and words is counted.
 *
 * @author krusek
 */
public class SegmentationEvaluator
        extends AbstractSingleInputEvaluator<BxDocument, BxDocument, BxDocument, SegmentationEvaluator.Results> {

    private static final Pattern FILENAME_PATTERN = Pattern.compile("(.+)\\.xml");

    /** Segmenter under evaluation; replaceable through {@link #setPageSegmenter}. */
    private DocumentSegmenter pageSegmenter = new DocstrumSegmenter();

    /** Expected zones carrying one of these labels are skipped by all comparisons. */
    private final Set<BxZoneLabel> ignoredLabels = EnumSet.noneOf(BxZoneLabel.class);

    private final UnsegmentedPagesFlattener flattener = new UnsegmentedPagesFlattener();

    private final ReadingOrderResolver resolver = new HierarchicalReadingOrderResolver();

    private final TrueVizToBxDocumentReader reader = new TrueVizToBxDocumentReader();

    private final BxDocumentToTrueVizWriter writer = new BxDocumentToTrueVizWriter();

    @Override
    protected Pattern getFilenamePattern() {
        return FILENAME_PATTERN;
    }

    /**
     * Replaces the segmenter whose output is evaluated.
     *
     * @param pageSegmenter segmenter used by {@link #processDocument}
     */
    public void setPageSegmenter(DocumentSegmenter pageSegmenter) {
        this.pageSegmenter = pageSegmenter;
    }

    /**
     * Replaces the set of ignored zone labels with the given ones.
     *
     * @param labels labels of expected zones to skip during comparison
     */
    public void setIgnoredLabels(Collection<BxZoneLabel> labels) {
        ignoredLabels.clear();
        ignoredLabels.addAll(labels);
    }

    /**
     * Restricts the evaluation to the given labels: every label that is not
     * listed becomes ignored.
     *
     * @param labels labels of expected zones that should be evaluated
     */
    public void setLabels(Collection<BxZoneLabel> labels) {
        ignoredLabels.addAll(EnumSet.allOf(BxZoneLabel.class));
        ignoredLabels.removeAll(labels);
    }

    /** Runs the {@link UnsegmentedPagesFlattener} on the document. */
    @Override
    protected void preprocessDocument(BxDocument document) {
        flattener.process(document);
    }

    /** Segments the document with the configured page segmenter. */
    @Override
    protected BxDocument processDocument(BxDocument document) throws AnalysisException {
        return pageSegmenter.segmentDocument(document);
    }

    /**
     * Compares both documents page by page and accumulates zone, line and
     * word level results. Pages are paired by index, iterating over the pages
     * of the expected document.
     */
    @Override
    protected Results compareItems(BxDocument expected, BxDocument actual) {
        Results results = new Results();
        for (int i = 0; i < expected.childrenCount(); i++) {
            BxPage expPage = expected.getChild(i);
            BxPage actPage = actual.getChild(i);
            results.zoneLevel.add(compareZones(expPage, actPage));
            results.lineLevel.add(compareLines(expPage, actPage));
            results.wordLevel.add(compareWords(expPage, actPage));
        }
        return results;
    }

    private void printSeparator() {
        System.out.print(" +----------+");
        Results.printSeparator();
    }

    @Override
    protected void printDocumentStart() {
        System.out.print(" | Page |");
        Results.printLevelHeader();
        System.out.print(" | |");
        Results.printColumnHeader();
        printSeparator();
    }

    @Override
    protected void printItemResults(BxDocument expected, BxDocument actual, int idx, Results results) {
        printItemResults(idx, results);
    }

    /**
     * Prints one table row for a page.
     *
     * @param pageIndex zero-based page index; printed one-based
     * @param results results to print in the row
     */
    protected void printItemResults(int pageIndex, Results results) {
        // The formatter wraps System.out and therefore must not be closed.
        Formatter formatter = new Formatter(System.out, Locale.US);
        formatter.format(" | %8d |", pageIndex + 1);
        results.printResults(formatter);
    }

    @Override
    protected void printDocumentResults(Results results) {
        printSeparator();
        // The formatter wraps System.out and therefore must not be closed.
        Formatter formatter = new Formatter(System.out, Locale.US);
        formatter.format(" | Total: |");
        results.printResults(formatter);
    }

    @Override
    protected Results newResults() {
        return new Results();
    }

    @Override
    protected void printFinalResults(Results results) {
        results.printSummary();
    }

    /**
     * Counts expected words that were reproduced exactly. A word is matched
     * when all of its chunks map to one actual word and that word has the same
     * number of children as the expected word.
     */
    private LevelResults compareWords(BxPage expected, BxPage actual) {
        Map<BxChunk, BxWord> map = BxModelUtils.mapChunksToWords(actual);
        LevelResults results = new LevelResults();
        for (BxZone expectedZone : expected) {
            if (ignoredLabels.contains(expectedZone.getLabel())) {
                continue;
            }
            for (BxLine expectedLine : expectedZone) {
                for (BxWord expectedWord : expectedLine) {
                    Set<BxWord> actualWords = new HashSet<BxWord>();
                    for (BxChunk chunk : expectedWord) {
                        actualWords.add(map.get(chunk));
                    }
                    if (actualWords.size() == 1) {
                        BxWord actualWord = actualWords.iterator().next();
                        if (actualWord.childrenCount() == expectedWord.childrenCount()) {
                            results.matched++;
                        }
                    }
                    results.all++;
                }
            }
        }
        return results;
    }

    /**
     * Counts expected lines that were reproduced exactly. A line is matched
     * when all of its chunks map to one actual line and both lines contain the
     * same number of chunks.
     */
    private LevelResults compareLines(BxPage expected, BxPage actual) {
        Map<BxChunk, BxLine> map = BxModelUtils.mapChunksToLines(actual);
        LevelResults results = new LevelResults();
        for (BxZone expectedZone : expected) {
            if (ignoredLabels.contains(expectedZone.getLabel())) {
                continue;
            }
            for (BxLine expectedLine : expectedZone) {
                Set<BxLine> actualLines = new HashSet<BxLine>();
                for (BxWord word : expectedLine) {
                    for (BxChunk chunk : word) {
                        actualLines.add(map.get(chunk));
                    }
                }
                if (actualLines.size() == 1) {
                    BxLine actualLine = actualLines.iterator().next();
                    if (BxModelUtils.countChunks(actualLine) == BxModelUtils.countChunks(expectedLine)) {
                        results.matched++;
                    }
                }
                results.all++;
            }
        }
        return results;
    }

    /**
     * Counts expected zones that were reproduced exactly. A zone is matched
     * when all of its chunks map to one actual zone and both zones contain the
     * same number of chunks.
     */
    private LevelResults compareZones(BxPage expected, BxPage actual) {
        Map<BxChunk, BxZone> map = BxModelUtils.mapChunksToZones(actual);
        LevelResults results = new LevelResults();
        for (BxZone expectedZone : expected) {
            if (ignoredLabels.contains(expectedZone.getLabel())) {
                continue;
            }
            Set<BxZone> actualZones = new HashSet<BxZone>();
            for (BxLine line : expectedZone) {
                for (BxWord word : line) {
                    for (BxChunk chunk : word) {
                        actualZones.add(map.get(chunk));
                    }
                }
            }
            if (actualZones.size() == 1) {
                BxZone actualZone = actualZones.iterator().next();
                if (BxModelUtils.countChunks(actualZone) == BxModelUtils.countChunks(expectedZone)) {
                    results.matched++;
                }
            }
            results.all++;
        }
        return results;
    }

    /**
     * Builds the document under test: a deep clone of the given document is
     * preprocessed and then segmented, so the argument itself is not passed to
     * the segmenter.
     */
    @Override
    protected BxDocument prepareActualDocument(BxDocument document) throws AnalysisException {
        document = BxModelUtils.deepClone(document);
        preprocessDocument(document);
        return processDocument(document);
    }

    /** Runs the reading order resolver on the given document and returns that same document. */
    @Override
    protected BxDocument prepareExpectedDocument(BxDocument document) throws AnalysisException {
        resolver.resolve(document);
        return document;
    }

    @Override
    protected BxDocument readDocument(Reader input) throws TransformationException {
        return new BxDocument().setPages(reader.read(input));
    }

    @Override
    protected void writeDocument(BxDocument document, Writer output) throws TransformationException {
        writer.write(output, Lists.newArrayList(document));
    }

    /** Returns an iterator that yields the given document exactly once. */
    @Override
    protected Iterator<BxDocument> iterateItems(final BxDocument document) {
        return new Iterator<BxDocument>() {

            private boolean used = false;

            @Override
            public boolean hasNext() {
                return !used;
            }

            @Override
            public BxDocument next() {
                // Honour the Iterator contract instead of yielding the document again.
                if (used) {
                    throw new NoSuchElementException();
                }
                used = true;
                return document;
            }

            @Override
            public void remove() {
                // Nothing to remove from; the single item is just marked as consumed.
                used = true;
            }
        };
    }

    /** Zone, line and word level results of a segmentation comparison. */
    public static class Results implements AbstractEvaluator.Results<Results> {

        private final LevelResults zoneLevel = new LevelResults();
        private final LevelResults lineLevel = new LevelResults();
        private final LevelResults wordLevel = new LevelResults();

        /** Adds the counters of the given results to this object, level by level. */
        @Override
        public void add(Results results) {
            zoneLevel.add(results.zoneLevel);
            lineLevel.add(results.lineLevel);
            wordLevel.add(results.wordLevel);
        }

        /** Writes the three level cells followed by a line break. */
        public void printResults(Formatter formatter) {
            zoneLevel.printResults(formatter);
            lineLevel.printResults(formatter);
            wordLevel.printResults(formatter);
            formatter.format("%n");
        }

        public static void printLevelHeader() {
            System.out.print(" Zones |");
            System.out.print(" Lines |");
            System.out.print(" Words |");
            System.out.println();
        }

        public static void printColumnHeader() {
            LevelResults.printHeader();
            LevelResults.printHeader();
            LevelResults.printHeader();
            System.out.println();
        }

        public static void printSeparator() {
            LevelResults.printSeparator();
            LevelResults.printSeparator();
            LevelResults.printSeparator();
            System.out.println();
        }

        /** Prints the per-level summaries to standard output. */
        public void printSummary() {
            // The formatter wraps System.out and therefore must not be closed.
            Formatter formatter = new Formatter(System.out, Locale.US);
            System.out.println(" * zones");
            zoneLevel.printSummary(formatter);
            System.out.println(" * lines");
            lineLevel.printSummary(formatter);
            System.out.println(" * words");
            wordLevel.printSummary(formatter);
        }
    }

    /** Counters of all and matched items for a single segmentation level. */
    public static class LevelResults {

        private int all = 0;
        private int matched = 0;

        /** Adds the counters of the given results to this object. */
        public void add(LevelResults results) {
            all += results.all;
            matched += results.matched;
        }

        public void printResults(Formatter formatter) {
            formatter.format(" %8d %8d %7.2f%% |", all, matched, getScore() * 100);
        }

        public static void printHeader() {
            System.out.print(" All Matched Score |");
        }

        public static void printSeparator() {
            System.out.print("----------------------------------------------+");
        }

        public void printSummary(Formatter formatter) {
            formatter.format(" * all : %8d%n", all);
            formatter.format(" * matched : %8d%n", matched);
            formatter.format(" * score : %7.2f%%%n", getScore() * 100);
        }

        /**
         * Returns the fraction of matched items.
         *
         * @return {@code matched / all}, or {@code 1.0} when nothing was counted
         */
        public double getScore() {
            if (all == 0) {
                return 1.0;
            } else {
                return ((double) matched) / all;
            }
        }
    }

    /**
     * Evaluates the default segmenter on every {@code xml} file found
     * recursively under the directory given as the first argument. Table,
     * figure and equation zones are ignored.
     *
     * @param args {@code args[0]} is the directory with the input files
     */
    public static void main(String[] args) throws AnalysisException, IOException, TransformationException {
        SegmentationEvaluator evaluator = new SegmentationEvaluator();
        evaluator.ignoredLabels.add(BxZoneLabel.BODY_TABLE);
        evaluator.ignoredLabels.add(BxZoneLabel.BODY_FIGURE);
        evaluator.ignoredLabels.add(BxZoneLabel.BODY_EQUATION);

        File file = new File(args[0]);
        Collection<File> files = FileUtils.listFiles(file, new String[] { "xml" }, true);
        Results results = evaluator.newResults();
        int i = 0;

        double zoneScores = 0;
        double lineScores = 0;
        double wordScores = 0;
        for (File filee : files) {
            System.out.println(new Date());
            System.out.println(filee.getName());

            Results docRes;
            FileReader input = new FileReader(filee);
            try {
                BxDocument origDoc = evaluator.prepareExpectedDocument(evaluator.readDocument(input));
                BxDocument testDoc = evaluator.prepareActualDocument(origDoc);
                docRes = evaluator.compareDocuments(origDoc, testDoc);
            } finally {
                // Release the file handle even when reading or evaluation fails.
                input.close();
            }
            results.add(docRes);

            // Average the scores of the individual documents, not of the running total.
            zoneScores += docRes.zoneLevel.getScore();
            lineScores += docRes.lineLevel.getScore();
            wordScores += docRes.wordLevel.getScore();
            System.out.println(++i);
        }
        zoneScores /= i;
        lineScores /= i;
        wordScores /= i;
        System.out.println("Documents: " + i);
        System.out.println("Average zone score: " + zoneScores);
        System.out.println("Average line score: " + lineScores);
        System.out.println("Average word score: " + wordScores);
        results.printSummary();
    }
}