Java tutorial
package com.zilbo.flamingSailor.TE.model;

import com.scottlogic.util.SortedList;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;

import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.util.*;

/*
 * Copyright 2012 Zilbo.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * A single page of extracted text.
 *
 * <p>The page is built in stages: {@link #processPage(List, Map)} merges raw
 * {@link TextPiece}s and groups them into {@link TextLine}s, the static
 * {@code removeBoilerplate} methods move repeated top/bottom lines into the
 * header/footer blocks, and
 * {@link #constructPageComponents} turns the remaining lines into the final
 * component list (single lines and multi-line blocks).
 *
 * <p>{@code lines} and {@code pagePieces} are only created by
 * {@link #processPage(List, Map)}, and {@code components} only by
 * {@link #constructPageComponents}; methods that read them must not be called
 * before the corresponding stage has run.
 */
public class TextPage {
    private static final Logger logger = Logger.getLogger(TextPage.class);

    private static final float LINE_HEIGHT_SPACE_BETWEEN_MLB = 2.0f;
    public static final float MINIMUM_FONT_HEIGHT = 6.0f;

    // basically distance should just be the number of digits in the page# (+1) * 2
    // to take into account when even/odd pages have numbers on outside pages.
    public static final int LEVENSHTEIN_DISTANCE = 6;

    // this is kind of low, but we have documents that switch between landscape & portrait
    private static final double TEMPLATE_MATCH_PERCENT = 0.30;

    /** Selector values for the {@code isHeader} / {@code headerFooter} parameters. */
    public final static boolean BOILERPLATE_HEADER = true;
    public final static boolean BOILERPLATE_FOOTER = false;

    long componentID;
    float minimumHeight;
    List<TextLine> lines;
    List<TextPiece> pagePieces;
    List<Component> components;
    List<PDLink> PDLinks;
    List<BufferedImage> images;
    int pageNum;
    Component header;
    Component footer;
    protected String text;
    Map<String, Map<Integer, Long>> fontCounts;

    // per-page line statistics, filled in by calcLineStats()
    double avgLeft = 0.0;
    double avgRight = 0.0;
    double avgWidth = 0.0;
    long lineCount = 0;
    double charDensity = 0.0;
    long[] histogram = null;

    /**
     * Creates an empty page.
     *
     * @param pageNum       the page number reported by {@link #getPageNum()}
     * @param minimumHeight pieces whose height is below this value are dropped by
     *                      {@link #shrinkPieces(List)}
     */
    public TextPage(int pageNum, float minimumHeight) {
        this.pageNum = pageNum;
        this.minimumHeight = minimumHeight;
        this.PDLinks = new ArrayList<>();
        this.images = new ArrayList<>();
        this.fontCounts = new HashMap<>();
        header = new MultiPartBlock(-1);
        footer = new MultiPartBlock(-2);
    }

    /**
     * Classifies every line of the page and rebuilds {@code components} from them.
     *
     * <p>Each line is first categorized, possibly flagged as a heading, and then
     * either emitted on its own or collected into a pending {@link MultiPartBlock}
     * ("table"). Afterwards blocks that contain several Table/Figure captions are
     * split at those captions.
     *
     * <p>Note that {@code avgLeft}, {@code avgRight}, {@code avgWidth} and
     * {@code charDensity} here are the <em>parameters</em> (they shadow the
     * per-page fields of the same name).
     *
     * @param highestFreqSize       the most frequent font size; used as the "regular" size
     * @param minFontSize           forwarded to {@code TextLine.categorizeLine}
     * @param maxFontSize           forwarded to {@code TextLine.categorizeLine}
     * @param normalizedFontCounts  forwarded to {@code TextLine.categorizeLine}
     * @param normalizedFonts       forwarded to {@code TextLine.categorizeLine}
     * @param normalizedSizes       forwarded to {@code TextLine.categorizeLine}
     * @param avgLeft               reference left edge used for the position tests
     * @param avgRight              reference right edge; forwarded to {@code categorizeLine}
     * @param avgWidth              reference line width used for the width tests
     * @param charDensity           reference character density used for the density tests
     * @param linesPerPage          forwarded to {@code TextLine.categorizeLine}
     * @param normalizedHistogram   currently unused
     */
    public void constructPageComponents(double highestFreqSize,
                                        double minFontSize,
                                        double maxFontSize,
                                        Map<String, Map<Integer, Double>> normalizedFontCounts,
                                        Map<String, Double> normalizedFonts,
                                        Map<Integer, Double> normalizedSizes,
                                        double avgLeft,
                                        double avgRight,
                                        double avgWidth,
                                        double charDensity,
                                        double linesPerPage,
                                        Double[] normalizedHistogram) {
        components = new ArrayList<>();
        MultiPartBlock currentTable = null;
        boolean first = true;
        double previousEndY = -1;

        for (TextLine l : lines) {
            l.categorizeLine(highestFreqSize, minFontSize, maxFontSize, normalizedFontCounts,
                    normalizedFonts, normalizedSizes, avgLeft, avgRight, avgWidth, charDensity,
                    linesPerPage);
            // NOTE(review): the result was never used by the original code; the call is kept
            // in case the getter computes/caches the histogram - confirm and remove if not.
            l.getNormalizedHistogram();

            double thisHeight = l.height();
            double thisDensity = l.density();

            // for regular lines we would be expecting a p() of over 40%
            boolean unlikelyRegular = l.getLineIsRegularProbability() < 0.3
                    && Math.round(thisHeight) >= highestFreqSize;
            if (unlikelyRegular
                    && (thisDensity >= (charDensity - 0.02) || thisHeight - highestFreqSize > 2.0)) {
                // headings usually start at the left
                if (l.getGeom().getMinX() < (avgLeft + (avgWidth / 10))) {
                    l.setIsHeading(true);
                }
                // or they are short lines (e.g. centred on the page)
                if (l.width() < avgWidth / 3) {
                    l.setIsHeading(true);
                }
            }

            // gaps of 4 lines usually mean tables have ended. possible short page
            if (!first && (l.getGeom().getMinY() - previousEndY) > (4 * l.height())) {
                flushTable(currentTable);
                currentTable = null;
            }
            previousEndY = l.getGeom().getMaxY();
            first = false;

            if (l.isHeading()) {
                flushTable(currentTable);
                currentTable = null;
                components.add(l);
                continue;
            }

            // a regular line.
            if (l.getLineIsRegularProbability() > 0.4
                    && l.width() > avgWidth
                    && l.density() > charDensity) {
                flushTable(currentTable);
                currentTable = null;
                components.add(l);
                continue;
            }

            // big font is usually a section heading, not in a table
            if (l.height() > (highestFreqSize * 1.25)) {
                flushTable(currentTable);
                currentTable = null;
                components.add(l);
                continue;
            }

            // a "Note:" / "Notes:" line ends whatever table came before it
            String lowerText = l.getText().toLowerCase(Locale.ROOT);
            if (lowerText.startsWith("notes:") || lowerText.startsWith("note:")) {
                flushTable(currentTable);
                currentTable = null;
                components.add(l);
                continue;
            }

            // low density means lots of spaces between words.
            if (l.density() < charDensity) {
                if (currentTable == null) {
                    currentTable = new MultiPartBlock(getNextComponentID());
                }
                currentTable.addChild(l);
                continue;
            }

            // long line, regular size font, good density
            if (l.getGeom().getWidth() >= avgWidth) {
                flushTable(currentTable);
                currentTable = null;
                components.add(l);
                continue;
            }

            // starts to the right of the left margin: either fully inside the margins or
            // just a bit on the right hand side - both cases go into the table
            if (l.getGeom().getMinX() > avgLeft) {
                if (currentTable == null) {
                    currentTable = new MultiPartBlock(getNextComponentID());
                }
                currentTable.addChild(l);
                continue;
            }

            // if we are here, we have a short line, average density, and slightly larger? fontsize.
            // so just copy what is above us
            if (currentTable != null) {
                currentTable.addChild(l);
            } else {
                components.add(l);
            }
        }

        // a table still pending at the end of the page is always kept as a block,
        // even when it holds fewer than two entries
        if (currentTable != null) {
            components.add(currentTable);
        }

        splitTablesOnCaptions();
    }

    /**
     * Emits a pending table into {@code components}: blocks of size below two are
     * dissolved into their children, anything else is added as one block.
     * Does nothing when {@code table} is {@code null}.
     */
    private void flushTable(MultiPartBlock table) {
        if (table == null) {
            return;
        }
        if (table.size() < 2) {
            for (Component child : table.getChildren()) {
                components.add(child);
            }
        } else {
            components.add(table);
        }
    }

    /** Returns true when the text mentions a table or figure caption keyword. */
    private static boolean mentionsCaption(String text) {
        return text.contains("Table") || text.contains("Figure")
                || text.contains("TABLE") || text.contains("FIGURE");
    }

    /**
     * Splits every {@link MultiPartBlock} in {@code components} that contains more than one
     * Table/Figure caption into one block per caption, replacing the original block in place.
     */
    private void splitTablesOnCaptions() {
        // iterate over a snapshot because components is modified while splitting
        List<Component> snapshot = new ArrayList<>(components);
        for (Component c : snapshot) {
            if (!(c instanceof MultiPartBlock)) {
                continue;
            }
            MultiPartBlock block = (MultiPartBlock) c;
            List<Component> children = c.getChildren();

            // prevent looking at 'list of figures / list of tables'
            if (block.linesStartWith("figure", false) > (0.5 * children.size())) {
                continue;
            }
            if (block.linesStartWith("table", false) > (0.5 * children.size())) {
                continue;
            }

            boolean seenTableCaption = false;
            boolean haveSplit = false;
            int lastSplit = 0;
            for (int i = 0; i < children.size(); i++) {
                if (!mentionsCaption(children.get(i).getText())) {
                    continue;
                }
                // only split on a second (or later) caption, and only when the
                // preceding part is more than two entries long
                if (seenTableCaption && (i - lastSplit) > 2) {
                    MultiPartBlock part = new MultiPartBlock(getNextComponentID());
                    for (int j = lastSplit; j < i; j++) {
                        part.addChild(children.get(j));
                    }
                    lastSplit = i;
                    haveSplit = true;
                    components.add(components.indexOf(c), part);
                }
                seenTableCaption = true;
            }

            if (haveSplit) {
                // whatever follows the last split point becomes the final part
                MultiPartBlock tail = new MultiPartBlock(getNextComponentID());
                for (int j = lastSplit; j < children.size(); j++) {
                    tail.addChild(children.get(j));
                }
                if (!tail.isEmpty()) {
                    components.add(components.indexOf(c), tail);
                }
                components.remove(c);
            }
        }
    }

    /**
     * Builds the page's pieces and lines from raw text pieces.
     *
     * <p>Resets {@code pagePieces}, {@code lines} and the component id counter (which is
     * set past the largest id found in {@code pieces}), then merges the pieces and groups
     * them into lines.
     *
     * @param pieces     the raw pieces of this page
     * @param fontCounts font name to (height to count) map; stored by reference and later
     *                   modified when boilerplate lines are removed
     */
    public void processPage(List<TextPiece> pieces, Map<String, Map<Integer, Long>> fontCounts) {
        pagePieces = new SortedList<>(new Component.topleft_comparator());
        this.fontCounts = fontCounts;
        lines = new ArrayList<>();

        this.componentID = 0;
        for (TextPiece bit : pieces) {
            componentID = Math.max(componentID, bit.getID());
        }
        // step past the largest id already in use
        getNextComponentID();

        shrinkPieces(pieces);
        // eventually we want to go back to JDK-lists.. and we will need
        // Collections.sort(pagePieces) here then
        constructLines();
    }

    /**
     * Recomputes {@code lineCount}, the average left/right/width/density values and the
     * merged histogram from the current {@code lines}.
     *
     * <p>The accumulators are reset first, so calling this more than once does not add
     * to earlier results. For a page without lines all averages are {@code 0.0}
     * (instead of NaN from a division by zero) and the histogram is {@code null}.
     */
    protected void calcLineStats() {
        lineCount = lines.size();
        histogram = null;
        avgLeft = 0.0;
        avgRight = 0.0;
        avgWidth = 0.0;
        charDensity = 0.0;
        if (lineCount == 0) {
            return;
        }

        for (TextLine l : lines) {
            avgLeft += l.getGeom().getMinX();
            avgRight += l.getGeom().getMaxX();
            charDensity += l.density();
            avgWidth += l.getGeom().getWidth();
            histogram = l.mergeHistogram(histogram);
        }
        avgLeft /= lineCount;
        avgRight /= lineCount;
        charDensity /= lineCount;
        avgWidth /= lineCount;
    }

    /** @return the histogram merged from all lines, or {@code null} if none was computed */
    public long[] getHistogram() {
        return histogram;
    }

    /**
     * Writes a banner with the page number followed by the text of each component,
     * one per line, and a trailing blank line.
     *
     * @param pw the writer to print to
     */
    public void dumpPage(PrintWriter pw) {
        pw.printf("--------\n");
        pw.printf("PAGE:%d\n", pageNum);
        pw.printf("--------\n");
        for (Component c : components) {
            pw.print(c.getText());
            pw.print("\n");
        }
        pw.print('\n');
    }

    /**
     * Merges horizontally adjacent pieces and stores the results in {@code pagePieces}.
     *
     * <p>Pieces lower than {@code minimumHeight} are skipped. A piece is appended to the
     * current one when it is next to it, unless the pair looks like a table-of-contents
     * entry, in which case the page number is kept as a separate piece.
     *
     * @param pieces the raw pieces, in reading order
     */
    protected void shrinkPieces(List<TextPiece> pieces) {
        TextPiece currentPiece = null;
        for (TextPiece bit : pieces) {
            if (bit.getHeight() < this.minimumHeight) {
                continue;
            }
            if (currentPiece == null) {
                currentPiece = bit;
                continue;
            }
            // periods like this are in a Table of Contents. we want to split out the
            // page number from the other part of the heading.
            if (currentPiece.isNextToX(bit) && !currentPiece.isTOCPart(bit)) {
                currentPiece.appendX(bit);
            } else {
                currentPiece.categorize();
                pagePieces.add(currentPiece);
                currentPiece = bit;
            }
        }
        if (currentPiece != null) {
            currentPiece.categorize();
            pagePieces.add(currentPiece);
        }
    }

    /**
     * Groups {@code pagePieces} into lines (a new line starts whenever a piece is not on
     * the same line as the current one), stores the non-empty lines and refreshes the
     * line statistics.
     */
    protected void constructLines() {
        TextLine currentLine = null;
        for (TextPiece piece : pagePieces) {
            if (currentLine != null && currentLine.onSameLine(piece)) {
                currentLine.addChild(piece);
                continue;
            }
            if (currentLine != null && !currentLine.isEmpty()) {
                lines.add(currentLine);
            }
            currentLine = new TextLine(getNextComponentID(), piece);
        }
        if (currentLine != null && !currentLine.isEmpty()) {
            lines.add(currentLine);
        }
        calcLineStats();
    }

    public int getPageNum() {
        return pageNum;
    }

    /**
     * @return the text of all components joined with {@code '\n'}; the value stored by
     *         {@link #setText(String)} is not consulted
     */
    public String getText() {
        List<Component> parts = this.getComponents();
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < parts.size(); i++) {
            if (i != 0) {
                sb.append('\n');
            }
            sb.append(parts.get(i).getText());
        }
        return sb.toString();
    }

    /** Stores {@code s} in the {@code text} field. */
    public void setText(String s) {
        this.text = s;
    }

    @Override
    public String toString() {
        return "TextPage{"
                + "pageNum=" + pageNum
                + ",components=\n\t" + components
                + '}';
    }

    public List<Component> getComponents() {
        return this.components;
    }

    public void addLink(PDLink l) {
        this.PDLinks.add(l);
    }

    public void addImage(BufferedImage i) {
        this.images.add(i);
    }

    /**
     * Logs, for every link on this page, its source rectangle, target page/point and the
     * number of components found at the source rectangle. Nothing is modified.
     *
     * @param pages currently unused
     */
    public void resolveLinks(List<TextPage> pages) {
        for (PDLink link : PDLinks) {
            List<Component> matching = findByGeom(link.geom);
            logger.info("Link from" + GeomUtil.getRectangleDebug(link.geom)
                    + " -> " + link.pageTo + " " + GeomUtil.getPointDebug(link.to)
                    + "-" + matching.size());
        }
    }

    /**
     * @param box the area to look in
     * @return everything the page's components report via their own {@code findByGeom(box)}
     */
    public List<Component> findByGeom(Rectangle2D box) {
        List<Component> ret = new ArrayList<>();
        for (Component c : getComponents()) {
            ret.addAll(c.findByGeom(box));
        }
        return ret;
    }

    // TODO move a 'splitComponent(old, p1, p2)' operation to the 'component' class
    // (replace 'old' by 'p1','p2' at the same index in components / lines).

    /**
     * Prints a page banner and then lets every component dump itself at depth 0.
     *
     * @param out the stream to print to
     */
    public void dumpChildren(PrintStream out) {
        out.println("** Page:" + pageNum);
        for (Component component : getComponents()) {
            component.dumpChildren(out, 0);
        }
    }

    /** Concatenates the text of the given components without any separator. */
    private String componentListToString(List<Component> parts) {
        StringBuilder joined = new StringBuilder();
        for (Component p : parts) {
            joined.append(p.getText());
        }
        return joined.toString();
    }

    /** Returns the lines of this page that are on the same line as {@code box}. */
    private List<Component> findByGeomByLines(Rectangle2D box) {
        List<Component> ret = new ArrayList<>();
        for (TextLine c : lines) {
            if (c.onSameLine(box)) {
                ret.add(c);
            }
        }
        return ret;
    }

    /**
     * Subtracts the piece's text length from the tally for its font name and rounded
     * height, removing the height entry (and the font entry once it is empty) when the
     * tally drops to zero or below. Unknown fonts are ignored.
     */
    private void adjustFontTally(TextPiece tp) {
        String fontName = tp.getFontName();
        Map<Integer, Long> fontTally = fontCounts.get(fontName);
        if (fontTally == null) {
            return;
        }
        int tpHeight = (int) Math.round(tp.getHeight());
        Long previous = fontTally.get(tpHeight);
        long tally = (previous == null ? 0L : previous) - tp.getText().length();
        if (tally <= 0) {
            fontTally.remove(tpHeight);
        } else {
            fontTally.put(tpHeight, tally);
        }
        if (fontTally.isEmpty()) {
            fontCounts.remove(fontName);
        }
    }

    /**
     * Try and remove boilerplate text from the page.
     * this was designed to be called just after the pieces have been converted to lines,
     * and before higher order structures (tables/headings) have been determined.
     *
     * <p>The candidate lines are those on the same line as {@code boundingBox}; when there
     * are none and the page has at least two lines, the top (header) or bottom (footer)
     * line is used instead.
     *
     * @param headerTemplate         potential template text (in components); currently unused
     * @param maxLevenshteinDistance max distance to allow. (to take into account page-numbers)
     * @param headerTemplateString   the template text (in a string)
     * @param boundingBox            the bounding box of the template text
     * @param isHeader               is this at the top of the page (true) or bottom.
     * @param doUpdate               actually modify the header/footer.
     * @return true if matched the boilerplate
     */
    public boolean removeBoilerPlateComponent(Component headerTemplate,
                                              int maxLevenshteinDistance,
                                              String headerTemplateString,
                                              Rectangle2D boundingBox,
                                              boolean isHeader,
                                              boolean doUpdate) {
        List<Component> topC = this.findByGeomByLines(boundingBox);
        if (topC.size() == 0) {
            if (lines.size() < 2) {
                return false;
            }
            topC.add(isHeader ? this.getTopLine() : this.getBottomLine());
        }

        String topCAsString = componentListToString(topC);
        int distance = StringUtils.getLevenshteinDistance(headerTemplateString, topCAsString);
        if (distance > maxLevenshteinDistance) {
            return false;
        }
        if (!doUpdate) {
            return true;
        }

        for (Component c : topC) {
            assert (c instanceof TextLine);
            if (isHeader) {
                header.addChild(c);
            } else {
                footer.addChildAtTop(c);
            }
            // the removed text no longer counts towards the font statistics
            for (Component tpC : c.getChildren()) {
                assert (tpC instanceof TextPiece);
                adjustFontTally((TextPiece) tpC);
            }
            if (c instanceof TextLine) {
                int index = lines.indexOf(c);
                if (index >= 0) {
                    lines.remove(index);
                } else {
                    logger.error("Component not found in lines?");
                }
            } else {
                logger.error("BoilerPlate! need to remove other components than lines:");
            }
        }
        return true;
    }

    /** @return the first line of the page, or {@code null} when there are no lines */
    protected TextLine getTopLine() {
        return lines.size() > 0 ? lines.get(0) : null;
    }

    /** @return the last line of the page, or {@code null} when there are no lines */
    protected TextLine getBottomLine() {
        int size = lines.size();
        return size > 0 ? lines.get(size - 1) : null;
    }

    /**
     * Uses the top or bottom line of {@code templatePage} as a template, counts how many
     * pages match it, and removes it from all matching pages when the matching fraction
     * exceeds the template threshold.
     *
     * @param pages                 all pages of the document
     * @param templatePage          the page whose top/bottom line is the template
     * @param distance              maximum Levenshtein distance for a match
     * @param headerFooter          {@link #BOILERPLATE_HEADER} or {@link #BOILERPLATE_FOOTER}
     * @param pageTemplatePossibles receives the indexes of the pages that did not match
     * @return the fraction of pages that matched; {@code 0.0} when the template page has
     *         no lines or {@code pages} is empty
     */
    protected static double removeBoilerplateLine(List<TextPage> pages,
                                                  TextPage templatePage,
                                                  int distance,
                                                  boolean headerFooter,
                                                  Set<Integer> pageTemplatePossibles) {
        Component t = headerFooter ? templatePage.getTopLine() : templatePage.getBottomLine();
        if (t == null || pages.isEmpty()) {
            return 0.0;
        }
        Rectangle2D boundary = t.getGeom();
        String templateText = t.getText();

        // dry run: only count the matches
        int matchedTemplate = 0;
        for (int i = 0; i < pages.size(); i++) {
            TextPage p = pages.get(i);
            if (p.removeBoilerPlateComponent(t, distance, templateText, boundary, headerFooter, false)) {
                matchedTemplate++;
            } else {
                pageTemplatePossibles.add(i);
            }
        }

        double matched = 1.0 * matchedTemplate / pages.size();
        if (matched > TEMPLATE_MATCH_PERCENT) {
            for (TextPage p : pages) {
                p.removeBoilerPlateComponent(t, distance, templateText, boundary, headerFooter, true);
            }
        }
        return matched;
    }

    /**
     * Repeatedly strips the top (or bottom) line of {@code templatePage} from all pages,
     * for at most ten rounds, stopping as soon as a round matches no more than the
     * template threshold. Only the first round records its non-matching pages in
     * {@code firstRoundNonMatches}.
     */
    private static void stripRepeatedLines(List<TextPage> pages,
                                           TextPage templatePage,
                                           int distance,
                                           boolean isHeader,
                                           Set<Integer> firstRoundNonMatches) {
        double matched = 1.0;
        for (int round = 1; round <= 10 && matched > TEMPLATE_MATCH_PERCENT; round++) {
            Set<Integer> nonMatches;
            if (round == 1) {
                nonMatches = firstRoundNonMatches;
            } else {
                nonMatches = new HashSet<>();
            }
            matched = removeBoilerplateLine(pages, templatePage, distance, isHeader, nonMatches);
        }
    }

    /**
     * Removes header and then footer boilerplate using page {@code pageNum} as template.
     *
     * @param pages                all pages of the document
     * @param distance             maximum Levenshtein distance for a match
     * @param pageNum              index into {@code pages} of the template page
     * @param pageTemplatePossible receives the indexes of pages that did not match the
     *                             first header line or the first footer line tried
     */
    protected static void removeBoilerplate(List<TextPage> pages,
                                            int distance,
                                            int pageNum,
                                            Set<Integer> pageTemplatePossible) {
        // this won't work easily on small documents, so just don't do them
        if (pages.size() < 4) {
            return;
        }
        TextPage page = pages.get(pageNum);
        stripRepeatedLines(pages, page, distance, BOILERPLATE_HEADER, pageTemplatePossible);
        stripRepeatedLines(pages, page, distance, BOILERPLATE_FOOTER, pageTemplatePossible);
    }

    /**
     * Removes repeated header/footer lines from a document. Documents with fewer than
     * four pages are left untouched.
     *
     * @param pages    all pages of the document
     * @param distance maximum Levenshtein distance for a match
     */
    public static void removeBoilerplate(List<TextPage> pages, int distance) {
        if (pages.size() < 4) {
            return;
        }
        Set<Integer> pageTemplatePossibles = new HashSet<>();
        // pick a page about a third of the way in to start with. ideally it will match a lot.
        removeBoilerplate(pages, distance, (int) (pages.size() * 0.33), pageTemplatePossibles);

        // nothing but the template page itself matched: retry with a page two thirds in
        if (pageTemplatePossibles.size() == (pages.size() - 1)) {
            pageTemplatePossibles = new HashSet<>();
            removeBoilerplate(pages, distance, (int) (pages.size() * 0.66), pageTemplatePossibles);
        }
        // TODO find a better candidate to get some more boilerplate, e.g. work through the
        // pages that didn't match, as the first choice might not have been a good candidate.
    }

    public Component getFooter() {
        return footer;
    }

    public Component getHeader() {
        return header;
    }

    public Map<String, Map<Integer, Long>> getFontCounts() {
        return fontCounts;
    }

    public double getAvgLeft() {
        return avgLeft;
    }

    public double getAvgWidth() {
        return avgWidth;
    }

    public double getAvgRight() {
        return avgRight;
    }

    public double getCharDensity() {
        return charDensity;
    }

    public long getLineCount() {
        return lineCount;
    }

    /** @return the current component id; the counter is incremented afterwards */
    public long getNextComponentID() {
        return componentID++;
    }

    // used for testcases
    int getLineSize() {
        return this.lines.size();
    }

    // also used for testcases
    List<TextLine> getLines() {
        return this.lines;
    }
}