Java tutorial
/////////////////////////////////////////////////////////////////////////////// //Copyright (C) 2012 Assaf Urieli // //This file is part of Jochre. // //Jochre is free software: you can redistribute it and/or modify //it under the terms of the GNU Affero General Public License as published by //the Free Software Foundation, either version 3 of the License, or //(at your option) any later version. // //Jochre is distributed in the hope that it will be useful, //but WITHOUT ANY WARRANTY; without even the implied warranty of //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //GNU Affero General Public License for more details. // //You should have received a copy of the GNU Affero General Public License //along with Jochre. If not, see <http://www.gnu.org/licenses/>. ////////////////////////////////////////////////////////////////////////////// package com.joliciel.jochre.graphics; import java.awt.BasicStroke; import java.awt.Color; import java.awt.Graphics2D; import java.awt.geom.Point2D; import java.awt.image.BufferedImage; import java.util.ArrayList; import java.util.BitSet; import java.util.Collection; import java.util.HashSet; import java.util.Hashtable; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.Stack; import java.util.TreeSet; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.commons.math.stat.descriptive.DescriptiveStatistics; import org.apache.commons.math.stat.descriptive.moment.Mean; import org.apache.commons.math.stat.descriptive.moment.StandardDeviation; import org.apache.commons.math.stat.regression.SimpleRegression; import com.joliciel.jochre.stats.CardinalityComparator; import com.joliciel.jochre.stats.DBSCANClusterer; import com.joliciel.jochre.stats.MeanAbsoluteDeviation; import com.joliciel.talismane.utils.ProgressMonitor; import com.joliciel.talismane.utils.SimpleProgressMonitor; class SegmenterImpl implements 
Segmenter { private static final Log LOG = LogFactory.getLog(SegmenterImpl.class); private GraphicsServiceInternal graphicsService; private boolean drawSegmentation = false; private SourceImage sourceImage = null; private BufferedImage segmentedImage = null; private Graphics2D graphics2D = null; private SimpleProgressMonitor currentMonitor; private boolean splitAndJoin = false; public SegmenterImpl(SourceImage sourceImage) { this.sourceImage = sourceImage; } /* (non-Javadoc) * @see com.joliciel.ochre.graphics.ShapeExtractor#getLines(com.joliciel.ochre.graphics.ImageGrid) */ public void segment() { LOG.debug("########## segment #########"); if (currentMonitor != null) { currentMonitor.setCurrentAction("imageMonitor.findingShapes"); } List<Shape> shapes = this.findContiguousShapes(sourceImage); if (this.isDrawSegmentation()) { segmentedImage = new BufferedImage(sourceImage.getWidth(), sourceImage.getHeight(), BufferedImage.TYPE_INT_ARGB); graphics2D = segmentedImage.createGraphics(); graphics2D.drawImage(sourceImage.getOriginalImage(), 0, 0, sourceImage.getWidth(), sourceImage.getHeight(), null); } this.removeSpecks(sourceImage, shapes); this.removeOversizedShapes(shapes); if (currentMonitor != null) { currentMonitor.setCurrentAction("imageMonitor.groupingShapesIntoRows"); currentMonitor.setPercentComplete(0.2); } List<Rectangle> whiteAreas = sourceImage.getWhiteAreas(shapes); // if (this.drawSegmentation) { // graphics2D.setStroke(new BasicStroke(1)); // graphics2D.setPaint(Color.ORANGE); // for (Rectangle whiteArea : whiteAreas) { // graphics2D.drawRect(whiteArea.getLeft(), whiteArea.getTop(), whiteArea.getRight() - whiteArea.getLeft(), whiteArea.getBottom()-whiteArea.getTop()); // } // } // first we group shapes into rows based on white areas which don't rely on knowledge of page slope // having the rows allows us to estimate page slope List<RowOfShapes> rows = this.groupShapesIntoRows(sourceImage, shapes, whiteAreas, false); this.addRowsToJochreImage(sourceImage, 
rows); this.findGuideLines(sourceImage); List<Rectangle> columnSeparators = sourceImage.findColumnSeparators(); if (this.drawSegmentation) { graphics2D.setStroke(new BasicStroke(3)); graphics2D.setPaint(Color.ORANGE); for (Rectangle whiteArea : columnSeparators) { int topLeft = (int) Math.round(whiteArea.getLeft() + sourceImage.getXAdjustment(whiteArea.getTop())) + 3; int bottomLeft = (int) Math .round(whiteArea.getLeft() + sourceImage.getXAdjustment(whiteArea.getBottom())) + 3; int topRight = (int) Math .round(whiteArea.getRight() + sourceImage.getXAdjustment(whiteArea.getTop())) - 3; int bottomRight = (int) Math .round(whiteArea.getRight() + sourceImage.getXAdjustment(whiteArea.getBottom())) - 3; graphics2D.drawLine(topLeft, whiteArea.getTop() + 3, bottomLeft, whiteArea.getBottom() - 3); graphics2D.drawLine(topRight, whiteArea.getTop() + 3, bottomRight, whiteArea.getBottom() - 3); graphics2D.drawLine(topLeft, whiteArea.getTop() + 3, topRight, whiteArea.getTop() + 3); graphics2D.drawLine(bottomLeft, whiteArea.getBottom() - 3, bottomRight, whiteArea.getBottom() - 3); } } // now we re-do the grouping of shapes into rows, this time with proper column breaks to avoid // rows that cross-over columns rows = this.groupShapesIntoRows(sourceImage, shapes, columnSeparators, true); this.addRowsToJochreImage(sourceImage, rows); this.findGuideLines(sourceImage); this.splitRows(sourceImage); if (this.splitAndJoin) { // figure out if the shapes contain a lot of "holes" // if they do, join them together // if they don't, try to split them int fillFactor = this.getFillFactor(sourceImage); if (fillFactor >= 2) { this.joinShapesHorizontally(sourceImage); } if (currentMonitor != null) { currentMonitor.setCurrentAction("imageMonitor.splittingShapes"); currentMonitor.setPercentComplete(0.4); } this.splitShapes(sourceImage, fillFactor); } //this.removeSpecks(sourceImage); this.joinShapesVertically(sourceImage); this.findGuideLines(sourceImage); this.combineRows(sourceImage); 
this.removeOrphans(sourceImage, false); if (currentMonitor != null) { currentMonitor.setCurrentAction("imageMonitor.groupingShapesIntoWords"); currentMonitor.setPercentComplete(0.6); } this.groupShapesIntoWords(sourceImage); this.removeOrphans(sourceImage, true); this.cleanMargins(sourceImage); if (currentMonitor != null) { currentMonitor.setCurrentAction("imageMonitor.analysingFontSize"); currentMonitor.setPercentComplete(0.7); } this.splitRowsByFontSize(sourceImage); if (currentMonitor != null) { currentMonitor.setCurrentAction("imageMonitor.groupingRowsIntoParagraphs"); currentMonitor.setPercentComplete(0.9); } this.groupRowsIntoParagraphs(sourceImage); sourceImage.setShapeCount(this.getShapeCount(sourceImage)); if (this.isDrawSegmentation()) { this.drawSegmentation(sourceImage); } if (currentMonitor != null) { currentMonitor.setFinished(true); } } /** * Split rows if they're particularly high, and contain considerable white space in the middle. * Shapes causing the join will be removed if too high, or attached to the closest row otherwise. * @param sourceImage * @param regressions * @return */ void splitRows(SourceImage sourceImage) { LOG.debug("########## splitRows #########"); // Calculate the min row height to be considered for splitting double minHeightForSplit = sourceImage.getAverageShapeHeight(); LOG.debug("minHeightForSplit: " + minHeightForSplit); double slopeMean = sourceImage.getMeanHorizontalSlope(); List<RowOfShapes> candidateRows = new ArrayList<RowOfShapes>(); for (RowOfShapes row : sourceImage.getRows()) { if (row.getRight() == row.getLeft()) continue; int height = row.getBottom() - row.getTop(); if (height >= minHeightForSplit) { LOG.debug("Adding candidate " + row.toString()); candidateRows.add(row); } } // For each row to be considered for splitting, see if there are lines of white space inside it. 
Hashtable<RowOfShapes, List<RowOfShapes>> splitRows = new Hashtable<RowOfShapes, List<RowOfShapes>>(); for (RowOfShapes row : candidateRows) { SimpleRegression regression = new SimpleRegression(); // y = intercept + slope * x LOG.debug("Left point: (" + row.getLeft() + " , " + row.getTop() + ")"); regression.addData(row.getLeft(), row.getTop()); double rightHandY = row.getTop() + ((double) (row.getRight() - row.getLeft()) * slopeMean); LOG.debug("Right point: (" + row.getRight() + " , " + rightHandY + ")"); regression.addData(row.getRight(), rightHandY); int yDelta = (int) Math.ceil(Math.abs(rightHandY - (double) row.getTop())); int yInterval = yDelta + (row.getBottom() - row.getTop() + 1) + yDelta; LOG.debug("yDelta: " + yDelta); LOG.debug("yInterval: " + yInterval); // let's get pixel counts shape by shape, and leave out the rest (in case rows overlap vertically) int[] pixelCounts = new int[yInterval]; for (Shape shape : row.getShapes()) { LOG.trace("Shape " + shape); int yDeltaAtLeft = (int) Math.round(regression.predict(shape.getLeft())); LOG.trace("yDeltaAtLeft: " + yDeltaAtLeft); // the shape offset + the offset between the regression line and the row top // + the delta we left at the start in case the line slopes upwards to the right int topIndex = (shape.getTop() - row.getTop()) + (row.getTop() - yDeltaAtLeft) + yDelta; LOG.trace("topIndex: (" + shape.getTop() + " - " + row.getTop() + ") + (" + row.getTop() + " - " + yDeltaAtLeft + ") + " + yDelta + " = " + topIndex); for (int x = 0; x < shape.getWidth(); x++) { for (int y = 0; y < shape.getHeight(); y++) { if (shape.isPixelBlack(x, y, sourceImage.getBlackThreshold())) { pixelCounts[topIndex + y]++; } } } } Mean pixelCountMean = new Mean(); StandardDeviation pixelCountStdDev = new StandardDeviation(); for (int i = 0; i < yInterval; i++) { LOG.debug("Pixel count " + i + ": " + pixelCounts[i]); pixelCountMean.increment(pixelCounts[i]); pixelCountStdDev.increment(pixelCounts[i]); } LOG.debug("pixel count mean: 
" + pixelCountMean.getResult() + ", std dev: " + pixelCountStdDev.getResult()); // If there's a split required, we're going to go considerably above and below the mean several times double lowThreshold = pixelCountMean.getResult() / 2.0; double highThreshold = pixelCountMean.getResult() * 2.0; boolean inRow = false; List<Integer> switches = new ArrayList<Integer>(); for (int i = 0; i < yInterval; i++) { if (!inRow && pixelCounts[i] > highThreshold) { LOG.debug("In row at " + i + ", pixel count " + pixelCounts[i]); inRow = true; switches.add(i); } else if (inRow && pixelCounts[i] < lowThreshold) { LOG.debug("Out of row at " + i + ", pixel count " + pixelCounts[i]); inRow = false; switches.add(i); } } if (switches.size() > 2) { // we have more than one row List<Integer> rowSeparations = new ArrayList<Integer>(); // find the row separators for (int switchIndex = 1; switchIndex < switches.size() - 2; switchIndex = switchIndex + 2) { int outOfRow = switches.get(switchIndex); int intoRow = switches.get(switchIndex + 1); int minPixelCount = (int) Math.ceil(highThreshold); int minIndex = -1; // find the row with the lowest pixel count for (int i = outOfRow; i <= intoRow; i++) { if (pixelCounts[i] < minPixelCount) { minPixelCount = pixelCounts[i]; minIndex = i; } } rowSeparations.add(minIndex); } // separate the shapes among the rows List<RowOfShapes> newRows = new ArrayList<RowOfShapes>(rowSeparations.size() + 1); for (int i = 0; i <= rowSeparations.size(); i++) { newRows.add(graphicsService.getEmptyRow(sourceImage)); } // add a separator at the beginning and end rowSeparations.add(0, 0); rowSeparations.add(yInterval + 1); for (Shape shape : row.getShapes()) { int yDeltaAtLeft = (int) Math.round(regression.predict(shape.getLeft())); int topIndex = (shape.getTop() - row.getTop()) + (row.getTop() - yDeltaAtLeft) + yDelta; int firstSepAfterShapeBottom = rowSeparations.size(); int lastSepBeforeShapeTop = -1; for (int i = rowSeparations.size() - 1; i >= 0; i--) { int 
rowSeparation = rowSeparations.get(i); if (rowSeparation <= topIndex) { lastSepBeforeShapeTop = i; break; } } for (int i = 0; i < rowSeparations.size(); i++) { int rowSeparation = rowSeparations.get(i); if (rowSeparation >= topIndex + shape.getHeight()) { firstSepAfterShapeBottom = i; break; } } if (lastSepBeforeShapeTop == firstSepAfterShapeBottom - 1) { // shape clearly belongs to one row RowOfShapes newRow = newRows.get(lastSepBeforeShapeTop); newRow.addShape(shape); } else { // is the shape much closer to one row than another? // if yes, add it to then add it to this row int[] yPixelsPerRow = new int[newRows.size()]; for (int i = 0; i < newRows.size(); i++) { int separatorTop = rowSeparations.get(i); int separatorBottom = rowSeparations.get(i + 1); int top = topIndex < separatorTop ? separatorTop : topIndex; int bottom = topIndex + shape.getHeight() < separatorBottom ? topIndex + shape.getHeight() : separatorBottom; yPixelsPerRow[i] = bottom - top; } int pixelsInMaxRow = 0; int maxPixelRowIndex = -1; for (int i = 0; i < newRows.size(); i++) { if (yPixelsPerRow[i] > pixelsInMaxRow) { pixelsInMaxRow = yPixelsPerRow[i]; maxPixelRowIndex = i; } } double minPercentage = 0.8; if (((double) pixelsInMaxRow / (double) shape.getHeight()) >= minPercentage) { RowOfShapes newRow = newRows.get(maxPixelRowIndex); newRow.addShape(shape); } else { // otherwise, the shape needs to be got rid of // as it's causing massive confusion // do this by simply not adding it anywhere } } // is the shape in one row exactly? } // next shape splitRows.put(row, newRows); } // do we have more than one row? 
} // next row for (RowOfShapes row : splitRows.keySet()) { List<RowOfShapes> newRows = splitRows.get(row); sourceImage.replaceRow(row, newRows); } } void removeOversizedShapes(List<Shape> shapes) { LOG.debug("########## removeOversizedShapes #########"); Mean shapeHeightMean = new Mean(); Mean shapeWidthMean = new Mean(); for (Shape shape : shapes) { shapeHeightMean.increment(shape.getHeight()); shapeWidthMean.increment(shape.getWidth()); } double heightMean = shapeHeightMean.getResult(); double widthMean = shapeWidthMean.getResult(); LOG.debug("heightMean: " + heightMean); LOG.debug("widthMean: " + widthMean); shapeHeightMean = new Mean(); shapeWidthMean = new Mean(); StandardDeviation shapeHeightStdDev = new StandardDeviation(); for (Shape shape : shapes) { if (shape.getHeight() > heightMean && shape.getHeight() < (heightMean * 2.0) && shape.getWidth() > widthMean && shape.getWidth() < (widthMean * 2.0)) { shapeHeightMean.increment(shape.getHeight()); shapeHeightStdDev.increment(shape.getHeight()); shapeWidthMean.increment(shape.getWidth()); } } heightMean = shapeHeightMean.getResult(); widthMean = shapeWidthMean.getResult(); LOG.debug("average shape heightMean: " + heightMean); LOG.debug("average shape widthMean: " + widthMean); double minHeightBigShape = heightMean * 6; double minWidthWideShape = widthMean * 6; double minHeightWideShape = heightMean * 1.5; double minHeightTallShape = heightMean * 2.5; double maxWidthTallShape = widthMean / 2; LOG.debug("minHeightBigShape: " + minHeightBigShape); LOG.debug("minWidthWideShape: " + minWidthWideShape); LOG.debug("minHeightWideShape: " + minHeightWideShape); LOG.debug("minHeightTallShape: " + minHeightTallShape); LOG.debug("maxWidthTallShape: " + maxWidthTallShape); List<Shape> largeShapes = new ArrayList<Shape>(); List<Shape> horizontalRules = new ArrayList<Shape>(); for (Shape shape : shapes) { if (shape.getHeight() > minHeightBigShape) { LOG.debug("Removing " + shape + " (height)"); largeShapes.add(shape); } else 
if (shape.getWidth() > minWidthWideShape && shape.getHeight() > minHeightWideShape) { // we don't want to remove horizontal bars, but we do want to remove other shapes. // why not? I suppose horizontal bars are easily represented as characters? LOG.debug("Removing " + shape + " (width)"); largeShapes.add(shape); } else if (shape.getWidth() > minWidthWideShape) { // ok, we will remove horizontal rules after all LOG.debug("Removing " + shape + " (horizontal rule)"); largeShapes.add(shape); horizontalRules.add(shape); } else if (shape.getWidth() <= maxWidthTallShape && shape.getHeight() > minHeightTallShape) { LOG.debug("Removing " + shape + " (narrow)"); largeShapes.add(shape); } } // Only want to remove enclosed shapes if the large shape isn't a frame/grid // A) first reduce the shape by 5 percent and see it's cardinality reduces vastly (in which case it's a frame) // if so, don't remove enclosed shapes // B) next, detect white rectangles within the shape - if they're big enough, don't remove enclosed shapes LOG.debug("Are large shapes frames or illustrations?"); double maxFrameCardinalityRatio = 0.5; double minFrameWhiteAreaSizeRatio = 0.9; List<Shape> illustrations = new ArrayList<Shape>(largeShapes); for (Shape largeShape : largeShapes) { LOG.debug(largeShape.toString()); int xOrigin = largeShape.getStartingPoint()[0] - largeShape.getLeft(); int yOrigin = largeShape.getStartingPoint()[1] - largeShape.getTop(); Shape dummyShape = graphicsService.getDot(sourceImage, xOrigin, yOrigin); // We want to fill up a mirror of the contiguous pixels within this shape, // which is what we'll use for further analysis to know // if it's a frame or not. 
WritableImageGrid mirror = graphicsService.getEmptyMirror(largeShape); this.findContiguousPixels(largeShape, mirror, dummyShape, xOrigin, yOrigin, sourceImage.getSeparationThreshold()); int adjustedLeft = (int) Math.round((double) mirror.getWidth() * 0.05); int adjustedRight = (int) Math.round((double) mirror.getWidth() * 0.95); int adjustedTop = (int) Math.round((double) mirror.getHeight() * 0.05); int adjustedBottom = (int) Math.round((double) mirror.getHeight() * 0.95); int cardinality = 0; int innerCardinality = 0; for (int x = 0; x < mirror.getWidth(); x++) { for (int y = 0; y < mirror.getHeight(); y++) { if (mirror.getPixel(x, y) > 0) { cardinality++; if (x >= adjustedLeft && x <= adjustedRight && y >= adjustedTop && y <= adjustedBottom) innerCardinality++; } } } LOG.debug("cardinality: " + cardinality); LOG.debug("innerCardinality: " + innerCardinality); double ratio = (double) innerCardinality / (double) cardinality; LOG.debug("ratio: " + ratio); if (ratio <= maxFrameCardinalityRatio) { LOG.debug("maxFrameCardinalityRatio: " + maxFrameCardinalityRatio); LOG.debug("Frame by cardinality! Removing from illustrations"); illustrations.remove(largeShape); } else { // Now, it could still be a grid // to find this out we need to detect white areas inside the shape. 
WhiteAreaFinder whiteAreaFinder = new WhiteAreaFinder(); double minWhiteAreaWidth = widthMean * 10; double minWhiteAreaHeight = heightMean * 4; List<Rectangle> whiteAreas = whiteAreaFinder.getWhiteAreas(mirror, 0, 0, 0, mirror.getWidth() - 1, mirror.getHeight() - 1, minWhiteAreaWidth, minWhiteAreaHeight); int whiteAreaSize = 0; for (Rectangle whiteArea : whiteAreas) { whiteAreaSize += (whiteArea.getWidth() * whiteArea.getHeight()); } int totalSize = mirror.getWidth() * mirror.getHeight(); LOG.debug("whiteAreaSize: " + whiteAreaSize); LOG.debug("totalSize: " + totalSize); double sizeRatio = (double) whiteAreaSize / (double) totalSize; LOG.debug("sizeRatio: " + sizeRatio); if (sizeRatio >= minFrameWhiteAreaSizeRatio) { LOG.debug("minFrameWhiteAreaSizeRatio: " + minFrameWhiteAreaSizeRatio); LOG.debug("Frame by white area size! Removing from illustrations"); illustrations.remove(largeShape); } } } for (Shape largeShape : illustrations) { // Add this to large shapes if it's not a "frame" // large shapes are used for paragraph detection sourceImage.getLargeShapes().add(largeShape); } // remove shapes that are enclosed inside illustrations List<Shape> enclosedShapesToDelete = new ArrayList<Shape>(); int extension = 5; for (Shape shape : shapes) { for (Shape shapeToDelete : illustrations) { if (shape.getLeft() >= shapeToDelete.getLeft() - extension && shape.getRight() <= shapeToDelete.getRight() + extension && shape.getTop() >= shapeToDelete.getTop() - extension && shape.getBottom() <= shapeToDelete.getBottom() + extension) { LOG.debug("Enclosed shape: " + shape); LOG.debug(" enclosed by " + shapeToDelete); enclosedShapesToDelete.add(shape); } } } shapes.removeAll(largeShapes); shapes.removeAll(enclosedShapesToDelete); // remove shapes that are practically touching horizontal rules (probably segments of the rule that got split) extension = 3; List<Shape> listToTestAgainst = horizontalRules; for (int i = 0; i < 3; i++) { List<Shape> horizontalRuleSegments = new 
ArrayList<Shape>(); for (Shape horizontalRule : listToTestAgainst) { for (Shape shape : shapes) { if ((shape.getLeft() <= horizontalRule.getRight() + extension || shape.getRight() >= horizontalRule.getLeft() - extension) && shape.getTop() >= horizontalRule.getTop() - extension && shape.getBottom() <= horizontalRule.getBottom() + extension) { LOG.debug("Horizontal rule segment: " + shape); LOG.debug(" touching " + horizontalRule); horizontalRuleSegments.add(shape); enclosedShapesToDelete.add(shape); } } } shapes.removeAll(horizontalRuleSegments); listToTestAgainst = horizontalRuleSegments; if (listToTestAgainst.size() == 0) break; } } /** * If any two shapes in the same line are only separated by a thin line, * join them together * @param sourceImage */ void joinShapesHorizontally(SourceImage sourceImage) { LOG.debug("########## joinShapesHorizontally #########"); for (RowOfShapes row : sourceImage.getRows()) { this.joinShapesHorizontally(row); } // next row } void joinShapesHorizontally(RowOfShapes row) { LOG.debug("joinShapesHorizontally Row " + row.getIndex()); List<Shape> shapesToDelete = new ArrayList<Shape>(); int threshold = 2; int maxPreviousShapes = 4; List<Shape> previousShapes = new ArrayList<Shape>(); for (Shape shape : row.getShapes()) { LOG.trace(shape); for (Shape previousShape : previousShapes) { int space = 0; if (sourceImage.isLeftToRight()) space = shape.getLeft() - previousShape.getRight(); else space = previousShape.getLeft() - shape.getRight(); LOG.trace("previousShape: " + previousShape); LOG.trace("Space : " + space); int singleShapeThresholdWidth = (int) Math.round(sourceImage.getAverageShapeWidth() * 1.5); if (space <= threshold && previousShape.getTop() <= shape.getBottom() && previousShape.getBottom() >= shape.getTop() && (shape.getWidth() + previousShape.getWidth() <= singleShapeThresholdWidth)) { // check that the two shapes have dark areas near each other LOG.trace("Candidate."); List<Integer> shape1BorderPoints = new 
ArrayList<Integer>(); int shape1MinBorder = sourceImage.isLeftToRight() ? previousShape.getWidth() - threshold : 0; int shape1MaxBorder = sourceImage.isLeftToRight() ? previousShape.getWidth() : threshold; LOG.trace("shape1MinBorder" + shape1MinBorder); LOG.trace("shape1MaxBorder" + shape1MaxBorder); StringBuilder sb = new StringBuilder(); for (int x = shape1MinBorder; x < shape1MaxBorder; x++) { for (int y = 0; y < previousShape.getHeight(); y++) { if (previousShape.isPixelBlack(x, y, sourceImage.getBlackThreshold())) { shape1BorderPoints.add(previousShape.getTop() + y); sb.append(previousShape.getTop() + y); sb.append(','); } } } LOG.trace(sb.toString()); List<Integer> shape2BorderPoints = new ArrayList<Integer>(); sb = new StringBuilder(); int shape2MinBorder = sourceImage.isLeftToRight() ? 0 : shape.getWidth() - threshold; int shape2MaxBorder = sourceImage.isLeftToRight() ? threshold : shape.getWidth(); LOG.trace("shape2MinBorder" + shape2MinBorder); LOG.trace("shape2MaxBorder" + shape2MaxBorder); for (int x = shape2MinBorder; x < shape2MaxBorder; x++) { for (int y = 0; y < shape.getHeight(); y++) { if (shape.isPixelBlack(x, y, sourceImage.getBlackThreshold())) { shape2BorderPoints.add(shape.getTop() + y); sb.append(shape.getTop() + y); sb.append(','); } } } LOG.trace(sb.toString()); boolean haveNeighbour = false; for (int shape1BorderPoint : shape1BorderPoints) { for (int shape2BorderPoint : shape2BorderPoints) { if (Math.abs(shape2BorderPoint - shape1BorderPoint) <= threshold) { LOG.trace("haveNeighbour"); haveNeighbour = true; break; } } if (haveNeighbour) break; } if (haveNeighbour) { LOG.debug("Combining " + shape); LOG.debug(" with " + previousShape); int minLeft = previousShape.getLeft() <= shape.getLeft() ? previousShape.getLeft() : shape.getLeft(); int maxRight = previousShape.getRight() >= shape.getRight() ? previousShape.getRight() : shape.getRight(); int minTop = previousShape.getTop() <= shape.getTop() ? 
previousShape.getTop() : shape.getTop(); int maxBottom = previousShape.getBottom() >= shape.getBottom() ? previousShape.getBottom() : shape.getBottom(); shape.setLeft(minLeft); shape.setTop(minTop); shape.setRight(maxRight); shape.setBottom(maxBottom); shapesToDelete.add(previousShape); } } } previousShapes.add(shape); if (previousShapes.size() > maxPreviousShapes) previousShapes.remove(0); } // next shape if (shapesToDelete.size() > 0) { for (Shape shapeToDelete : shapesToDelete) row.removeShape(shapeToDelete); row.recalculate(); } } private int getFillFactor(SourceImage sourceImage) { LOG.debug("########## getFillFactor #########"); List<Shape> sample = this.getSample(sourceImage.getRows(), 40, true); Mean mean = new Mean(); ShapeFiller shapeFiller = this.graphicsService.getShapeFiller(); for (Shape shape : sample) { LOG.debug("Shape: " + shape); int fillFactor = shapeFiller.getFillFactor(shape, sourceImage.getBlackThreshold()); LOG.debug("fillFactor: " + fillFactor); mean.increment(fillFactor); } double meanFillFactor = mean.getResult(); LOG.debug("meanFillFactor: " + meanFillFactor); int imageFillFactor = (int) Math.round(mean.getResult()); LOG.debug("imageFillFactor: " + imageFillFactor); return imageFillFactor; } /** * Get all contiguous shapes out of the image grid. 
*/ List<Shape> findContiguousShapes(SourceImage sourceImage) { LOG.debug("########## findContiguousShapes #########"); //As we get them out of the image grid, we write them to a writeable grid so as to avoid duplicate extraction WritableImageGrid mirror = this.graphicsService.getEmptyMirror(sourceImage); List<Shape> shapes = new ArrayList<Shape>(); for (int y = 0; y < sourceImage.getHeight(); y++) { for (int x = 0; x < sourceImage.getWidth(); x++) { if (sourceImage.isPixelBlack(x, y, sourceImage.getSeparationThreshold())) { // if this pixel has already been found, ignore it if (mirror.getPixel(x, y) > 0) continue; // get the shape surrounding this pixel Shape shape = this.getShape(sourceImage, mirror, x, y); shapes.add(shape); //List<Shape> splitShapes = this.getShapes(sourceImage, mirror, x, y); //shapes.addAll(splitShapes); } } } return shapes; } List<RowOfShapes> groupShapesIntoRows(SourceImage sourceImage, List<Shape> shapes, List<Rectangle> whiteAreas, boolean useSlope) { LOG.debug("########## groupShapesIntoRows #########"); LOG.debug("useSlope? 
" + useSlope); List<RowOfShapes> rows = new ArrayList<RowOfShapes>(); for (Shape shape : shapes) shape.setRow(null); List<Shape> shapesToRemove = new ArrayList<Shape>(); for (Shape shape : shapes) { for (Rectangle whiteArea : whiteAreas) { double whiteAreaRight = whiteArea.getRight(); double whiteAreaLeft = whiteArea.getLeft(); if (useSlope) { double xAdjustment = sourceImage.getXAdjustment(shape.getTop()); whiteAreaRight += xAdjustment; whiteAreaLeft += xAdjustment; } if (whiteAreaRight > shape.getRight() && whiteAreaLeft < shape.getLeft() && whiteArea.getTop() < shape.getTop() && whiteArea.getBottom() > shape.getBottom()) { // shape is surrounded shapesToRemove.add(shape); LOG.debug("Removing shape " + shape); LOG.debug("Surrounded by white area: " + whiteArea); } } } shapes.removeAll(shapesToRemove); // calculate the means // get average shape width & height DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics(); for (Shape shape : shapes) { shapeWidthStats.addValue(shape.getWidth()); } double averageShapeWidth = shapeWidthStats.getPercentile(50); LOG.debug("averageShapeWidth: " + averageShapeWidth); // now, arrange the shapes in rows // we're guaranteed that no two shapes overlap at this point. // Now, it's possible that two shapes in the same line have no vertical overlap (e.g. a comma and an apostrophe) // so we have to go searching a bit further afield, say five shapes in each direction // but if we go too far, we may end up joining two lines together if the page isn't quite straight // let's begin with any old shape and find the shapes closest to it horizontally // e.g. 
up to 8 horizontal means to the right and left // as we find shapes that go with it, we add them to the same line int i = 0; int j = 0; int numberOfMeanWidthsForSearch = 8; LOG.debug("numberOfMeanWidthsForSearch: " + numberOfMeanWidthsForSearch); LOG.debug("search distance: " + averageShapeWidth * numberOfMeanWidthsForSearch); for (Shape shape : shapes) { if (shape.getRow() == null) { RowOfShapes row = graphicsService.getEmptyRow(sourceImage); row.addShape(shape); row.setIndex(j++); rows.add(row); LOG.trace("========= New row " + row.getIndex() + "============"); LOG.trace("Adding " + shape + " to row " + row.getIndex()); } int searchLeft = (int) ((double) shape.getLeft() - (numberOfMeanWidthsForSearch * averageShapeWidth)); int searchRight = (int) ((double) shape.getRight() + (numberOfMeanWidthsForSearch * averageShapeWidth)); LOG.trace("Shape " + i++ + ": " + shape + "(row " + shape.getRow().getIndex() + ")"); LOG.trace("searchLeft: " + searchLeft); LOG.trace("searchRight: " + searchRight); // construct an array to represent where white areas overlap with the search area int[][] leftSearchArea = new int[shape.getLeft() - searchLeft][2]; int[][] rightSearchArea = new int[searchRight - shape.getRight()][2]; for (int k = 0; k < leftSearchArea.length; k++) { leftSearchArea[k][0] = shape.getTop(); leftSearchArea[k][1] = shape.getBottom(); } for (int k = 0; k < rightSearchArea.length; k++) { rightSearchArea[k][0] = shape.getTop(); rightSearchArea[k][1] = shape.getBottom(); } int newSearchLeft = searchLeft; int newSearchRight = searchRight; for (Rectangle whiteArea : whiteAreas) { double whiteAreaRight = whiteArea.getRight(); double whiteAreaLeft = whiteArea.getLeft(); if (useSlope) { double xAdjustment = sourceImage.getXAdjustment(shape.getTop()); whiteAreaRight += xAdjustment; whiteAreaLeft += xAdjustment; LOG.trace(whiteArea + ", xAdjustment=" + xAdjustment + " , whiteAreaLeft=" + whiteAreaLeft + " , whiteAreaRight=" + whiteAreaRight); } if (whiteAreaRight > 
newSearchLeft && whiteAreaLeft < shape.getLeft() && whiteArea.getTop() <= shape.getBottom() && whiteArea.getBottom() >= shape.getTop()) { LOG.trace("overlap on left with: " + whiteArea.toString()); if (whiteArea.getTop() <= shape.getTop() && whiteArea.getBottom() >= shape.getBottom() && whiteAreaRight > newSearchLeft) { newSearchLeft = (int) Math.round(whiteAreaRight); LOG.trace("Complete, newSearchLeft = " + newSearchLeft); } else { LOG.trace("Partial, starting at " + whiteArea.getRight()); for (int k = whiteArea.getRight() - searchLeft; k >= 0; k--) { if (k < leftSearchArea.length) { if (whiteArea.getBottom() < shape.getBottom() && leftSearchArea[k][0] < whiteArea.getBottom()) leftSearchArea[k][0] = whiteArea.getBottom() + 1; else if (whiteArea.getTop() > shape.getTop() && leftSearchArea[k][1] > whiteArea.getTop()) leftSearchArea[k][1] = whiteArea.getTop() - 1; if (leftSearchArea[k][0] >= leftSearchArea[k][1] && searchLeft + k > newSearchLeft) { newSearchLeft = searchLeft + k; LOG.trace("Complete from " + newSearchLeft); break; } } } // if (LOG.isTraceEnabled()) { // StringBuilder sb = new StringBuilder(); // for (int k=0;k<leftSearchArea.length;k++) { // String top = "" + (leftSearchArea[k][0]-shape.getTop()); // sb.append(String.format("%1$#" + 3 + "s", top)+ ","); // } // LOG.trace(sb.toString()); // sb = new StringBuilder(); // for (int k=0;k<leftSearchArea.length;k++) { // String bottom = "" + (leftSearchArea[k][1]-shape.getTop()); // sb.append(String.format("%1$#" + 3 + "s", bottom)+ ","); // } // LOG.trace(sb.toString()); // } } } else if (whiteAreaLeft < newSearchRight && whiteAreaRight > shape.getRight() && whiteArea.getTop() <= shape.getBottom() && whiteArea.getBottom() >= shape.getTop()) { LOG.trace("overlap on right with: " + whiteArea.toString()); if (whiteArea.getTop() <= shape.getTop() && whiteArea.getBottom() >= shape.getBottom() && newSearchRight > whiteAreaLeft) { newSearchRight = (int) Math.round(whiteAreaLeft); LOG.trace("Complete, 
newSearchRight = " + newSearchRight); } else { LOG.trace("Partial, starting at " + whiteArea.getLeft()); for (int k = whiteArea.getLeft() - shape.getRight(); k < rightSearchArea.length; k++) { if (k > 0 && k < leftSearchArea.length && k < rightSearchArea.length) { if (whiteArea.getBottom() < shape.getBottom() && leftSearchArea[k][0] < whiteArea.getBottom()) rightSearchArea[k][0] = whiteArea.getBottom() + 1; else if (whiteArea.getTop() > shape.getTop() && leftSearchArea[k][1] > whiteArea.getTop()) rightSearchArea[k][1] = whiteArea.getTop() - 1; if (rightSearchArea[k][0] >= rightSearchArea[k][1] && newSearchRight > shape.getRight() + k) { newSearchRight = shape.getRight() + k; LOG.trace("Complete from " + newSearchRight); break; } } } // if (LOG.isTraceEnabled()) { // StringBuilder sb = new StringBuilder(); // for (int k=0;k<rightSearchArea.length;k++) { // String top = "" + (rightSearchArea[k][0]-shape.getTop()); // sb.append(String.format("%1$#" + 3 + "s", top)+ ","); // } // LOG.trace(sb.toString()); // sb = new StringBuilder(); // for (int k=0;k<rightSearchArea.length;k++) { // String bottom = "" + (rightSearchArea[k][1]-shape.getTop()); // sb.append(String.format("%1$#" + 3 + "s", bottom)+ ","); // } // LOG.trace(sb.toString()); // } } } } LOG.trace("searchLeft adjusted for white columns: " + newSearchLeft); LOG.trace("searchRight adjusted for white columns: " + newSearchRight); // min 10% overlap to assume same row double minOverlap = 0.10; for (Shape otherShape : shapes) { boolean haveSomeOverlap = false; if (!shape.getRow().equals(otherShape.getRow()) && !otherShape.equals(shape)) { // shapes are arranged from the top down if (otherShape.getTop() > shape.getBottom()) { break; } if (otherShape.getRight() > newSearchLeft && otherShape.getRight() < shape.getLeft() && otherShape.getTop() <= shape.getBottom() && otherShape.getBottom() >= shape.getTop()) { int k = otherShape.getRight() - searchLeft; if (otherShape.getTop() <= leftSearchArea[k][1] && 
otherShape.getBottom() >= leftSearchArea[k][0]) haveSomeOverlap = true; } else if (otherShape.getLeft() < newSearchRight && otherShape.getLeft() > shape.getRight() && otherShape.getTop() <= shape.getBottom() && otherShape.getBottom() >= shape.getTop()) { int k = otherShape.getLeft() - shape.getRight(); if (otherShape.getTop() <= rightSearchArea[k][1] && otherShape.getBottom() >= rightSearchArea[k][0]) haveSomeOverlap = true; } if (haveSomeOverlap) { int overlap1 = shape.getBottom() - otherShape.getTop() + 1; int overlap2 = otherShape.getBottom() - shape.getTop() + 1; int overlap = overlap1 < overlap2 ? overlap1 : overlap2; boolean addShapeToRow = false; if ((((double) overlap / (double) shape.getHeight()) > minOverlap) || (((double) overlap / (double) otherShape.getHeight()) > minOverlap)) { addShapeToRow = true; } if (addShapeToRow) { LOG.debug("Adding " + otherShape + " to row " + shape.getRow().getIndex()); if (otherShape.getRow() == null) { shape.getRow().addShape(otherShape); } else { // two rows need to be merged LOG.debug("========= Merge rows " + shape.getRow().getIndex() + " with " + otherShape.getRow().getIndex() + "=========="); RowOfShapes otherRow = otherShape.getRow(); shape.getRow().addShapes(otherRow.getShapes()); rows.remove(otherRow); } } } // add shape to row ? } // should shape be considered? 
} // next other shape } // next shape return rows; } void addRowsToJochreImage(SourceImage sourceImage, List<RowOfShapes> rows) { LOG.debug("########## addRowsToJochreImage #########"); sourceImage.getRows().clear(); TreeSet<RowOfShapes> rowSet = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator()); rowSet.addAll(rows); int i = 0; LOG.debug("====== Row list ========"); for (RowOfShapes row : rowSet) { // order the shapes within the rows // here is where left-to-right or right-to-left matters row.reorderShapes(); sourceImage.addRow(row); int oldIndex = row.getIndex(); row.setIndex(i++); LOG.debug(row.toString() + " (old index = " + oldIndex + ")"); } } /** * We attempt to remove specks, where a speck is defined as * a relatively small shape at a relatively large distance from other shapes. * @param sourceImage */ void removeSpecks(SourceImage sourceImage, List<Shape> shapes) { LOG.debug("########## removeSpecks #########"); DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics(); DescriptiveStatistics shapeHeightStats = new DescriptiveStatistics(); for (Shape shape : shapes) { shapeWidthStats.addValue(shape.getWidth()); shapeHeightStats.addValue(shape.getHeight()); } double shapeWidthMedian = shapeWidthStats.getPercentile(65); double shapeHeightMedian = shapeHeightStats.getPercentile(65); LOG.debug("meanShapeWidth: " + shapeWidthMedian); LOG.debug("meanShapeHeight: " + shapeHeightMedian); int maxSpeckHeightFloor = (int) Math.ceil(shapeHeightMedian / 6.0); int maxSpeckWidthFloor = (int) Math.ceil(shapeWidthMedian / 6.0); int maxSpeckHeightCeiling = maxSpeckHeightFloor * 2; int maxSpeckWidthCeiling = maxSpeckWidthFloor * 2; int speckXDistanceThresholdFloor = (int) Math.floor(shapeWidthMedian); int speckYDistanceThresholdFloor = (int) Math.floor(shapeHeightMedian / 4.0); int speckXDistanceThresholdCeiling = speckXDistanceThresholdFloor * 2; int speckYDistanceThresholdCeiling = speckYDistanceThresholdFloor * 2; 
LOG.debug("maxSpeckHeightFloor=" + maxSpeckHeightFloor); LOG.debug("maxSpeckWidthFloor=" + maxSpeckWidthFloor); LOG.debug("speckXDistanceThresholdFloor=" + speckXDistanceThresholdFloor); LOG.debug("speckYDistanceThresholdFloor=" + speckYDistanceThresholdFloor); LOG.debug("maxSpeckHeightCeiling=" + maxSpeckHeightCeiling); LOG.debug("maxSpeckWidthCeiling=" + maxSpeckWidthCeiling); LOG.debug("speckXDistanceThresholdCeiling=" + speckXDistanceThresholdCeiling); LOG.debug("speckYDistanceThresholdCeiling=" + speckYDistanceThresholdCeiling); List<Shape> specks = new ArrayList<Shape>(); List<double[]> speckCoordinates = new ArrayList<double[]>(); for (Shape shape : shapes) { if (shape.getHeight() < maxSpeckHeightCeiling && shape.getWidth() < maxSpeckWidthCeiling) { specks.add(shape); speckCoordinates.add(shape.getCentrePoint()); } } // group the specks into clusters, which will be added or removed as a whole // Note that a cluster could be a valid diacritic that's split into a few specks // or just a bunch of specks off on their own DBSCANClusterer<Shape> clusterer = new DBSCANClusterer<Shape>(specks, speckCoordinates); Set<Set<Shape>> speckClusters = clusterer.cluster(speckXDistanceThresholdFloor, 2, true); List<Shape> specksToRemove = new ArrayList<Shape>(); for (Set<Shape> speckCluster : speckClusters) { int speckHeight = 0; int speckWidth = 0; int clusterTop = -1; int clusterBottom = -1; int clusterRight = -1; int clusterLeft = -1; for (Shape speck : speckCluster) { LOG.debug("Speck?, " + speck); if (speck.getWidth() > speckWidth) speckWidth = speck.getWidth(); if (speck.getHeight() > speckHeight) speckHeight = speck.getHeight(); if (clusterTop < 0 || speck.getTop() < clusterTop) clusterTop = speck.getTop(); if (clusterLeft < 0 || speck.getLeft() < clusterLeft) clusterLeft = speck.getLeft(); if (speck.getBottom() > clusterBottom) clusterBottom = speck.getBottom(); if (speck.getRight() > clusterRight) clusterRight = speck.getRight(); } boolean useWidth = speckWidth > 
speckHeight; double scale = 1.0; if (useWidth) scale = speckWidth < maxSpeckWidthFloor ? 0.0 : (speckWidth > maxSpeckWidthCeiling ? 1.0 : ((double) speckWidth - maxSpeckWidthFloor) / (maxSpeckWidthCeiling - maxSpeckWidthFloor)); else scale = speckHeight < maxSpeckHeightFloor ? 0.0 : (speckHeight > maxSpeckHeightCeiling ? 1.0 : ((double) speckHeight - maxSpeckHeightFloor) / (maxSpeckHeightCeiling - maxSpeckHeightFloor)); int speckXDistanceThreshold = (int) Math.ceil(speckXDistanceThresholdFloor + scale * (speckXDistanceThresholdCeiling - speckXDistanceThresholdFloor)); int speckYDistanceThreshold = (int) Math.ceil(speckYDistanceThresholdFloor + scale * (speckYDistanceThresholdCeiling - speckYDistanceThresholdFloor)); LOG.debug("speckHeight=" + speckHeight); LOG.debug("speckWidth=" + speckWidth); LOG.debug("speckXDistanceThreshold=" + speckXDistanceThreshold); LOG.debug("speckYDistanceThreshold=" + speckYDistanceThreshold); Shape nearestShape = null; double minDistance = 0.0; int nearestShapeXDiff = 0; int nearestShapeYDiff = 0; for (Shape otherShape : shapes) { // limit to nearby shapes if (otherShape.getTop() > clusterBottom + speckYDistanceThreshold + 1) break; if (otherShape.getBottom() < clusterTop - speckYDistanceThreshold - 1) continue; if (otherShape.getRight() < clusterLeft - speckXDistanceThreshold - 1) continue; if (otherShape.getLeft() > clusterRight + speckXDistanceThreshold + 1) continue; // Note: tried !specks.contains(otherShape), but sometimes we have a valid case // where a diacritic is "split" into two specks if (!specks.contains(otherShape)) { int xDiff = 0; int yDiff = 0; int leftDiff = 0; int rightDiff = 0; int topDiff = 0; int botDiff = 0; if (otherShape.getLeft() <= clusterRight && otherShape.getRight() >= clusterLeft) { xDiff = 0; } else { leftDiff = Math.abs(clusterLeft - otherShape.getRight()); rightDiff = Math.abs(clusterRight - otherShape.getLeft()); xDiff = (leftDiff < rightDiff) ? 
leftDiff : rightDiff; } if (otherShape.getTop() <= clusterBottom && otherShape.getBottom() >= clusterTop) { yDiff = 0; } else { int nearestTop = (otherShape.getTop() > otherShape.getTop() + otherShape.getMeanLine()) ? otherShape.getTop() + otherShape.getMeanLine() : otherShape.getTop(); int nearestBot = (otherShape.getBottom() < otherShape.getTop() + otherShape.getBaseLine()) ? otherShape.getTop() + otherShape.getBaseLine() : otherShape.getBottom(); topDiff = Math.abs(clusterTop - nearestBot); botDiff = Math.abs(clusterBottom - nearestTop); yDiff = (topDiff < botDiff) ? topDiff : botDiff; } double distance = Math.sqrt((xDiff * xDiff) + (yDiff * yDiff)); if (nearestShape == null || distance < minDistance) { nearestShape = otherShape; minDistance = distance; nearestShapeXDiff = xDiff; nearestShapeYDiff = yDiff; LOG.trace("leftDiff=" + leftDiff + ", rightDiff=" + rightDiff); LOG.trace("topDiff=" + topDiff + ", botDiff=" + botDiff); } // found closer shape? } // is this the speck? } // loop shapes around the reference shape if (nearestShape != null) { LOG.trace("Nearest shape, top(" + nearestShape.getTop() + ") " + "left(" + nearestShape.getLeft() + ") " + "bot(" + nearestShape.getBottom() + ") " + "right(" + nearestShape.getRight() + ")"); LOG.trace("Distance=" + minDistance + ", xDiff=" + nearestShapeXDiff + ", yDiff=" + nearestShapeYDiff); } boolean removeSpecks = false; if (nearestShape == null) removeSpecks = true; else { // calculate the shortest distance from the nearest shape to the speck cluster for (Shape speck : speckCluster) { int xDiff = 0; int yDiff = 0; int leftDiff = 0; int rightDiff = 0; int topDiff = 0; int botDiff = 0; if (nearestShape.getLeft() <= speck.getRight() && nearestShape.getRight() >= speck.getLeft()) { xDiff = 0; } else { leftDiff = Math.abs(speck.getLeft() - nearestShape.getRight()); rightDiff = Math.abs(speck.getRight() - nearestShape.getLeft()); xDiff = (leftDiff < rightDiff) ? 
leftDiff : rightDiff; } if (nearestShape.getTop() <= speck.getBottom() && nearestShape.getBottom() >= speck.getTop()) { yDiff = 0; } else { int nearestTop = (nearestShape.getTop() > nearestShape.getTop() + nearestShape.getMeanLine()) ? nearestShape.getTop() + nearestShape.getMeanLine() : nearestShape.getTop(); int nearestBot = (nearestShape.getBottom() < nearestShape.getTop() + nearestShape.getBaseLine()) ? nearestShape.getTop() + nearestShape.getBaseLine() : nearestShape.getBottom(); topDiff = Math.abs(speck.getTop() - nearestBot); botDiff = Math.abs(speck.getBottom() - nearestTop); yDiff = (topDiff < botDiff) ? topDiff : botDiff; } double distance = Math.sqrt((xDiff * xDiff) + (yDiff * yDiff)); if (distance < minDistance) { minDistance = distance; nearestShapeXDiff = xDiff; nearestShapeYDiff = yDiff; LOG.debug("Found closer speck:"); LOG.debug("leftDiff=" + leftDiff + ", rightDiff=" + rightDiff); LOG.debug("topDiff=" + topDiff + ", botDiff=" + botDiff); } // found closer shape? } // Then, for all of these specks, find the one that's closest to the nearest non-speck // if this distance > threshold, get rid of all of 'em // otherwise, keep 'em all if (nearestShapeXDiff > speckXDistanceThreshold || nearestShapeYDiff > speckYDistanceThreshold) removeSpecks = true; } if (removeSpecks) { for (Shape otherSpeck : speckCluster) { LOG.debug("Removing speck " + otherSpeck); specksToRemove.add(otherSpeck); } } } // next speck shapes.removeAll(specksToRemove); } void removeOrphans(SourceImage sourceImage, boolean hasGroups) { LOG.debug("########## removeOrphans #########"); LOG.debug("Average shape width" + sourceImage.getAverageShapeWidth()); LOG.debug("Average shape height" + sourceImage.getAverageShapeHeight()); int maxSpeckHeight = (int) Math.ceil(sourceImage.getAverageShapeHeight() / 6.0); int maxSpeckWidth = (int) Math.ceil(sourceImage.getAverageShapeWidth() / 6.0); LOG.debug("maxSpeckHeight: " + maxSpeckHeight); LOG.debug("maxSpeckWidth: " + maxSpeckWidth); int 
maxSpeckWidthAlone = (int) Math.ceil(sourceImage.getAverageShapeWidth() / 8.0); LOG.debug("maxSpeckWidthAlone: " + maxSpeckWidthAlone); Set<RowOfShapes> alteredRows = new HashSet<RowOfShapes>(); Set<GroupOfShapes> alteredGroups = new HashSet<GroupOfShapes>(); List<Shape> shapesToDelete = new ArrayList<Shape>(); List<GroupOfShapes> groupsToDelete = new ArrayList<GroupOfShapes>(); List<RowOfShapes> rowsToDelete = new ArrayList<RowOfShapes>(); if (hasGroups) { for (RowOfShapes row : sourceImage.getRows()) { for (GroupOfShapes group : row.getGroups()) { for (Shape shape : group.getShapes()) { if ((shape.getWidth() < maxSpeckWidth && shape.getHeight() < maxSpeckHeight) || (shape.getWidth() < maxSpeckWidthAlone)) { LOG.debug("Removing shape: " + shape); shapesToDelete.add(shape); alteredRows.add(row); alteredGroups.add(group); } } } } } else { for (RowOfShapes row : sourceImage.getRows()) { for (Shape shape : row.getShapes()) { if ((shape.getWidth() < maxSpeckWidth && shape.getHeight() < maxSpeckHeight) || (shape.getWidth() < maxSpeckWidthAlone)) { LOG.debug("Removing shape: " + shape); shapesToDelete.add(shape); alteredRows.add(row); } } } } for (Shape shape : shapesToDelete) { if (!hasGroups) { RowOfShapes row = shape.getRow(); row.getShapes().remove(shape); if (row.getShapes().size() == 0) rowsToDelete.add(row); } else { GroupOfShapes group = shape.getGroup(); group.getShapes().remove(shape); if (group.getShapes().size() == 0) groupsToDelete.add(group); } } if (hasGroups) { int maxGroupSpeckHeight = (int) Math.ceil(sourceImage.getAverageShapeHeight() / 4.0); int maxGroupSpeckWidth = (int) Math.ceil(sourceImage.getAverageShapeWidth() / 4.0); LOG.debug("maxGroupSpeckHeight: " + maxGroupSpeckHeight); LOG.debug("maxGroupSpeckWidth: " + maxGroupSpeckHeight); for (RowOfShapes row : sourceImage.getRows()) { for (GroupOfShapes group : row.getGroups()) { boolean hasNonSpeck = false; for (Shape shape : group.getShapes()) { if (shape.getHeight() > maxGroupSpeckHeight || 
shape.getWidth() > maxGroupSpeckWidth) { hasNonSpeck = true; break; } } if (!hasNonSpeck) { LOG.debug("Removing group with shapes:"); for (Shape shape : group.getShapes()) LOG.debug("Shape: " + shape); group.getShapes().clear(); groupsToDelete.add(group); alteredRows.add(row); } } } for (GroupOfShapes group : groupsToDelete) { RowOfShapes row = group.getRow(); row.getGroups().remove(group); if (row.getGroups().size() == 0) rowsToDelete.add(row); } int minRowHeight = (int) Math.ceil(sourceImage.getAverageShapeHeight()); int minRowWidth = (int) Math.ceil(sourceImage.getAverageShapeWidth()); LOG.debug("minRowHeight: " + minRowHeight); LOG.debug("minRowWidth: " + minRowWidth); int minWideRowHeight = (int) Math.ceil(sourceImage.getAverageShapeHeight() * 0.75); int minWideRowWidth = (int) Math.ceil(sourceImage.getAverageShapeWidth() * 2.0); LOG.debug("minWideRowHeight: " + minWideRowHeight); LOG.debug("minWideRowWidth: " + minWideRowWidth); int maxRowSpeckHeight = (int) Math.ceil(sourceImage.getAverageShapeHeight() / 2.0); int maxRowSpeckWidth = (int) Math.ceil(sourceImage.getAverageShapeWidth() / 2.0); LOG.debug("maxRowSpeckHeight: " + maxGroupSpeckHeight); LOG.debug("maxRowSpeckWidth: " + maxGroupSpeckHeight); for (RowOfShapes row : sourceImage.getRows()) { if (row.getBottom() - row.getTop() < minRowHeight && row.getRight() - row.getLeft() < minRowWidth) { rowsToDelete.add(row); } else if (row.getBottom() - row.getTop() < minWideRowHeight && row.getRight() - row.getLeft() < minWideRowWidth) { rowsToDelete.add(row); } else { boolean hasNonSpeck = false; for (GroupOfShapes group : row.getGroups()) { for (Shape shape : group.getShapes()) { if (shape.getHeight() > maxRowSpeckHeight || shape.getWidth() > maxRowSpeckWidth) { hasNonSpeck = true; break; } } if (hasNonSpeck) break; } if (!hasNonSpeck) { rowsToDelete.add(row); } } } } for (RowOfShapes row : sourceImage.getRows()) { if (!hasGroups && row.getShapes().size() == 0) { rowsToDelete.add(row); } } for (RowOfShapes row : 
rowsToDelete) { LOG.debug("Removing row with shapes:"); for (GroupOfShapes group : row.getGroups()) for (Shape shape : group.getShapes()) LOG.debug("Shape: " + shape); row.getGroups().clear(); sourceImage.getRows().remove(row); } for (GroupOfShapes group : alteredGroups) group.recalculate(); for (RowOfShapes row : alteredRows) row.recalculate(); if (alteredRows.size() > 0 || alteredGroups.size() > 0) sourceImage.recalculate(); } /** * If any two shapes in the same line take up the same horizontal space, we can join them vertically * @param sourceImage */ void joinShapesVertically(SourceImage sourceImage) { LOG.debug("########## joinShapesVertically #########"); for (RowOfShapes row : sourceImage.getRows()) { this.joinShapesVertically(row); } // next row } /** * If any two shapes in this row take up the same horizontal space, we can join them vertically */ void joinShapesVertically(RowOfShapes row) { LOG.debug("joinShapesVertically Row " + row.getIndex()); LOG.debug("Shape height: mean=" + sourceImage.getAverageShapeHeight() + ", stddev=" + sourceImage.getAverageShapeHeightMargin()); LOG.debug("Shape width: mean=" + sourceImage.getAverageShapeWidth() + ", stddev=" + sourceImage.getAverageShapeWidthMargin()); int maxSpeckHeight = (int) Math.ceil(sourceImage.getAverageShapeHeight() / 6.0); int maxSpeckWidth = (int) Math.ceil(sourceImage.getAverageShapeWidth() / 6.0); LOG.debug("maxSpeckHeight: " + maxSpeckHeight); LOG.debug("maxSpeckWidth: " + maxSpeckWidth); // remove shapes 5 times, in case we have multiple vertical overlaps for (int k = 0; k < 5; k++) { LOG.debug("k=" + k); List<Shape> shapesToDelete = new ArrayList<Shape>(); int i = 0; for (Shape shape : row.getShapes()) { // LOG.debug("Checking " + shape); if (shape.getHeight() < maxSpeckHeight && shape.getWidth() < maxSpeckWidth) { // only join other shapes to normal height shapes, not to specks i++; continue; } int j = 0; for (Shape otherShape : row.getShapes()) { if (j <= i) { j++; continue; } if (j > i + 6) 
break; // LOG.debug("Comparing to " + otherShape); if (otherShape.getLeft() <= shape.getRight() && otherShape.getRight() >= shape.getLeft()) { // LOG.debug("Found overlap between " + shape + " and " + otherShape); // there is some overlap... how much? int maxLeft = otherShape.getLeft() >= shape.getLeft() ? otherShape.getLeft() : shape.getLeft(); int minRight = otherShape.getRight() <= shape.getRight() ? otherShape.getRight() : shape.getRight(); int intersection = (minRight - maxLeft + 1); int intersectionMultiplied = intersection * 4; if (intersectionMultiplied >= shape.getWidth() || intersectionMultiplied >= otherShape.getWidth()) { LOG.debug("Combining " + shape); LOG.debug(" with " + otherShape); int minLeft = otherShape.getLeft() <= shape.getLeft() ? otherShape.getLeft() : shape.getLeft(); int maxRight = otherShape.getRight() >= shape.getRight() ? otherShape.getRight() : shape.getRight(); int minTop = otherShape.getTop() <= shape.getTop() ? otherShape.getTop() : shape.getTop(); int maxBottom = otherShape.getBottom() >= shape.getBottom() ? otherShape.getBottom() : shape.getBottom(); shape.setLeft(minLeft); shape.setTop(minTop); shape.setRight(maxRight); shape.setBottom(maxBottom); shape.recalculate(); shapesToDelete.add(otherShape); } } // there is a horizontal overlap j++; } // check following few shapes i++; } // next shape if (shapesToDelete.size() == 0) break; for (Shape shapeToDelete : shapesToDelete) row.removeShape(shapeToDelete); } // do the whole thing several times in a row } /** * Find the baseline, meanline and capline for each shape, based on other shapes on the same row * this is likely to depend on the alphabet, e.g. the hebrew alphabet has no capline as such. * Returns a List of SimpleRegression representing the centerline for each of the rows. 
*/
    void findGuideLines(SourceImage sourceImage) {
        LOG.debug("########## findGuideLines #########");
        // the actual computation is delegated to each row
        for (RowOfShapes row : sourceImage.getRows()) {
            row.assignGuideLines();
        }
    }

    /**
     * If a row begins with a larger font and then suddenly reduces (e.g. dictionary entries),
     * we split it into two separate rows and recalculate guidelines for each.
     * The splitting itself is delegated to each row.
     *
     * @param sourceImage the image whose rows are examined
     */
    void splitRowsByFontSize(SourceImage sourceImage) {
        LOG.debug("########## splitRowsByFontSize #########");
        for (RowOfShapes row : sourceImage.getRows()) {
            row.splitByFontSize();
        }
    }

    /**
     * Combine rows that represent thin lines directly above or below another row
     * (e.g. diacritics).
     * <p>
     * For each row, the nearest horizontally overlapping row is sought among those whose
     * x-height is at most 60% of (or at least 1/0.6 times) its own. If that row lies closer
     * than a third of the taller x-height, its shapes are moved into the current row and the
     * row is removed from the image.
     *
     * @param sourceImage the image whose rows are combined in place
     */
    void combineRows(SourceImage sourceImage) {
        LOG.debug("########## combineRows #########");

        // We thought of using row height, but mean row height is not a good enough
        // indicator when there are title rows with very big characters.
        // Instead, we need to go with Distance between rows when compared to mean - baseline
        // where distance between rows is measured between the tops and bottoms of nearby shapes.
        int maxRowHeight = 0;
        for (RowOfShapes row : sourceImage.getRows()) {
            int rowHeight = row.getXHeightMax();
            if (rowHeight > maxRowHeight)
                maxRowHeight = rowHeight;
        }
        LOG.debug("maxRowHeight: " + maxRowHeight);

        // work on a snapshot of the rows, sorted by the vertical-location comparator
        TreeSet<RowOfShapes> rowSet = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator());
        rowSet.addAll(sourceImage.getRows());
        List<RowOfShapes> rows = new ArrayList<RowOfShapes>(rowSet);
        List<RowOfShapes> rowsToDelete = new ArrayList<RowOfShapes>();

        double maxShapeWidth = sourceImage.getAverageShapeWidth() * 8.0;
        LOG.debug("maxShapeWidth: " + maxShapeWidth);
        double maxRatioForCombine = 0.6;
        LOG.debug("maxRatioForCombine: " + maxRatioForCombine);

        int i = 0;
        while (i < rows.size()) {
            RowOfShapes currentRow = rows.get(i);
            boolean rowsCombined = false;
            if (!rowsToDelete.contains(currentRow)) {
                LOG.trace("Checking " + currentRow.toString());
                int currentRowHeight = currentRow.getXHeightMax();
                LOG.trace("xHeightMax = " + currentRowHeight);

                RowOfShapes nearestRow = null;
                double shortestDistance = Double.MAX_VALUE;
                int tallerRowHeight = -1;
                for (RowOfShapes otherRow : rows) {
                    if (rowsToDelete.contains(otherRow) || currentRow.equals(otherRow))
                        continue;

                    // limit our search to nearby rows
                    boolean nearby = Math.abs(
                            currentRow.getBaseLineMiddlePoint() - otherRow.getBaseLineMiddlePoint()) < (2.0 * maxRowHeight)
                            && (currentRow.getRight() >= otherRow.getLeft())
                            && (otherRow.getRight() >= currentRow.getLeft());
                    if (!nearby)
                        continue;

                    LOG.trace("Comparing to " + otherRow.toString());
                    int otherRowHeight = otherRow.getXHeightMax();
                    LOG.trace("xHeightMax = " + otherRowHeight);

                    // only a markedly shorter row is a candidate for being absorbed
                    int tallerHeight = Math.max(currentRowHeight, otherRowHeight);
                    int shorterHeight = Math.min(currentRowHeight, otherRowHeight);
                    double heightRatio = ((double) shorterHeight / (double) tallerHeight);
                    LOG.trace("height ratio (" + shorterHeight + " / " + tallerHeight + "): " + heightRatio);
                    if (heightRatio > maxRatioForCombine)
                        continue;

                    // avoid combining very long horizontal rules with other rows
                    // their top gives a false impression of being closer to the other row's bottom.
                    if (currentRow.getMaxShapeWidth() > maxShapeWidth || otherRow.getMaxShapeWidth() > maxShapeWidth)
                        continue;

                    // distance from the upper row's baseline to the lower row's baseline minus its x-height
                    double distance = 0;
                    if (currentRow.getBaseLineMiddlePoint() < otherRow.getBaseLineMiddlePoint()) {
                        distance = (otherRow.getBaseLineMiddlePoint() - otherRow.getXHeightMax())
                                - currentRow.getBaseLineMiddlePoint();
                        LOG.trace("(otherRow.baseLineMiddlePoint() " + otherRow.getBaseLineMiddlePoint()
                                + " - otherRow.getXHeightMax() " + otherRow.getXHeightMax()
                                + ") - currentRow.baseLineMiddlePoint() " + currentRow.getBaseLineMiddlePoint());
                    } else {
                        distance = (currentRow.getBaseLineMiddlePoint() - currentRow.getXHeightMax())
                                - otherRow.getBaseLineMiddlePoint();
                        LOG.trace("(currentRow.baseLineMiddlePoint() " + currentRow.getBaseLineMiddlePoint()
                                + " - currentRow.getXHeightMax() " + currentRow.getXHeightMax()
                                + ") - otherRow.baseLineMiddlePoint() " + otherRow.getBaseLineMiddlePoint());
                    }
                    LOG.debug("Distance between rows: " + distance);

                    if (distance < shortestDistance) {
                        LOG.trace("Found new closest row: " + otherRow);
                        nearestRow = otherRow;
                        shortestDistance = distance;
                        tallerRowHeight = tallerHeight;
                    }
                }

                if (nearestRow != null) {
                    // The number 3 below is chosen arbitrarily - basically we want a
                    // relative way of indicating that the rows are very near to each other.
                    double minDistanceForCombine = ((double) tallerRowHeight / 3);
                    LOG.trace("minDistanceForCombine: " + minDistanceForCombine);
                    if (shortestDistance < minDistanceForCombine) {
                        LOG.debug("Combining the two rows");
                        LOG.debug(currentRow.toString());
                        LOG.debug(nearestRow.toString());
                        rowsToDelete.add(nearestRow);
                        currentRow.addShapes(nearestRow.getShapes());
                        currentRow.reorderShapes();
                        currentRow.recalculate();
                        this.joinShapesVertically(currentRow);
                        currentRow.assignGuideLines();
                        LOG.debug("Resulting row: " + currentRow.toString());
                        rowsCombined = true;
                    }
                }
            }

            // We may need to combine multiple rows
            // so we only advance if no combination has taken place
            if (!rowsCombined)
                i++;
        }

        // actually delete the rows
        for (RowOfShapes rowToDelete : rowsToDelete) {
            sourceImage.getRows().remove(rowToDelete);
        }
        LOG.debug("########## end combineRows #########");
    }

    /**
     * Group the shapes into words, one row cluster of the image at a time.
     *
     * @param sourceImage the image providing the row clusters
     */
    void groupShapesIntoWords(SourceImage sourceImage) {
        LOG.debug("########## groupShapesIntoWords #########");
        for (Set<RowOfShapes> rowCluster : sourceImage.getRowClusters()) {
            this.groupShapesIntoWords(rowCluster);
        }
    }

    /**
     * Groups the shapes of every row in the cluster into words, using a single
     * letter-space threshold computed over the whole cluster.
     * <p>
     * The threshold is the mean of the below-average spaces plus four standard deviations.
     * When the spaces vary little (2 * std dev &lt; mean) each row is assumed to hold a single
     * word and the threshold is {@code Integer.MAX_VALUE}. The reading direction is taken from
     * this segmenter's {@code sourceImage} field.
     *
     * @param rowCluster the rows sharing one threshold
     */
    void groupShapesIntoWords(Set<RowOfShapes> rowCluster) {
        LOG.debug("Next row cluster of size " + rowCluster.size());

        // collect the non-negative spaces between consecutive shapes
        Mean spaceMean = new Mean();
        StandardDeviation spaceStdDev = new StandardDeviation();
        List<Integer> spaces = new ArrayList<Integer>();
        for (RowOfShapes row : rowCluster) {
            Shape previousShape = null;
            for (Shape shape : row.getShapes()) {
                if (previousShape != null) {
                    int space = 0;
                    if (sourceImage.isLeftToRight())
                        space = shape.getLeft() - previousShape.getRight();
                    else
                        space = previousShape.getLeft() - shape.getRight();
                    LOG.trace(shape);
                    LOG.trace("Space : " + space);

                    // negative spaces (overlapping shapes) are ignored
                    if (space >= 0) {
                        spaces.add(space);
                        spaceMean.increment(space);
                        spaceStdDev.increment(space);
                    }
                }
                previousShape = shape;
            } // next shape
        }

        double spaceMeanVal = spaceMean.getResult();
        double spaceStdDevVal = spaceStdDev.getResult();
        LOG.debug("Space mean: " + spaceMeanVal);
        LOG.debug("Space std dev: " + spaceStdDevVal);

        // If however there is only a single word on the row, the
        // standard deviation will be very low.
        boolean singleWord = false;
        if (spaceStdDevVal * 2 < spaceMeanVal) {
            LOG.debug("Assuming a single word per row");
            singleWord = true;
        }

        // Since there should be two groups, one for letters and one for words,
        // the mean should be somewhere in between. We now look for the mean on the
        // lesser group and will use it as the basis for comparison.
        Mean letterSpaceMean = new Mean();
        StandardDeviation letterSpaceStdDev = new StandardDeviation();
        for (int space : spaces) {
            if (space < spaceMeanVal) {
                letterSpaceMean.increment(space);
                letterSpaceStdDev.increment(space);
            }
        }
        spaceMeanVal = letterSpaceMean.getResult();
        spaceStdDevVal = letterSpaceStdDev.getResult();
        LOG.debug("Letter space mean: " + spaceMeanVal);
        LOG.debug("Letter space std dev: " + spaceStdDevVal);

        int letterSpaceThreshold = 0;
        if (singleWord)
            letterSpaceThreshold = Integer.MAX_VALUE;
        else
            letterSpaceThreshold = (int) Math.round(spaceMeanVal + (4.0 * spaceStdDevVal));

        for (RowOfShapes row : rowCluster) {
            LOG.debug(row.toString());
            row.organiseShapesInGroups(letterSpaceThreshold);
        } // next row
    }

    /**
     * Clear out anything found in the right & left margins
     * @param sourceImage
     */
    void cleanMargins(SourceImage sourceImage) {
        LOG.debug("########## cleanMargins #########");
        int minCardinalityForMargin = 8;
        double averageShapeWidth = sourceImage.getAverageShapeWidth();
        LOG.debug("Finding right margin");
        double rightLimit = (double) sourceImage.getWidth() * 0.67;
        // first, create a DBScan cluster of all rows near the right-hand side
        List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>();
        List<double[]> rightCoordinates = new
ArrayList<double[]>(); for (RowOfShapes row : sourceImage.getRows()) { double right = row.getRight(); if (right >= rightLimit) { LOG.trace(row.toString()); LOG.trace( "Right: " + right + " + " + row.getXAdjustment() + " = " + (right - row.getXAdjustment())); right -= row.getXAdjustment(); rightHandRows.add(row); rightCoordinates.add(new double[] { right }); } } DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows, rightCoordinates); Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(averageShapeWidth, minCardinalityForMargin, true); TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>( new CardinalityComparator<RowOfShapes>()); orderedRowClusters.addAll(rowClusters); int i = 0; // find the right-most cluster with sufficient cardinality, and assume it's the right margin DescriptiveStatistics rightMarginStats = null; for (Set<RowOfShapes> cluster : orderedRowClusters) { DescriptiveStatistics rightStats = new DescriptiveStatistics(); for (RowOfShapes row : cluster) rightStats.addValue(row.getRight() - row.getXAdjustment()); LOG.debug("Cluster " + i + ". 
Cardinality=" + cluster.size()); LOG.debug("Right mean : " + rightStats.getMean()); LOG.debug("Right std dev: " + rightStats.getStandardDeviation()); if (cluster.size() >= minCardinalityForMargin && (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean())) { rightMarginStats = rightStats; } i++; } // see how many rows would violate this margin - if too many, assume no margin // these rows are only rows which extend across the margin if (rightMarginStats != null) { LOG.debug("Right margin mean : " + rightMarginStats.getMean()); LOG.debug("Right margin std dev: " + rightMarginStats.getStandardDeviation()); double rightMarginLimit = rightMarginStats.getMean() + sourceImage.getAverageShapeWidth(); LOG.debug("rightMarginLimit: " + rightMarginLimit); int numRowsToChop = 0; for (RowOfShapes row : sourceImage.getRows()) { if (row.getRight() >= rightLimit) { if (row.getRight() - row.getXAdjustment() >= rightMarginLimit && row.getLeft() - row.getXAdjustment() <= rightMarginLimit) { LOG.debug("Found overlapping row : " + row); LOG.debug("Adjusted right : " + (row.getRight() - row.getXAdjustment())); numRowsToChop++; } } } if (numRowsToChop >= 3) { LOG.debug("Too many overlapping rows - ignoring margin"); rightMarginStats = null; } } if (rightMarginStats != null) { double rightMarginLimit = rightMarginStats.getMean() + sourceImage.getAverageShapeWidth(); List<RowOfShapes> rowsToRemove = new ArrayList<RowOfShapes>(); for (RowOfShapes row : sourceImage.getRows()) { double right = row.getRight() - row.getXAdjustment(); LOG.trace(row.toString()); LOG.trace("Adjusted right: " + right); if (right >= rightMarginLimit) { LOG.trace("Has out-of-margin stuff!"); // need to chop off groups to the right of this threshold List<GroupOfShapes> groupsToChop = new ArrayList<GroupOfShapes>(); for (GroupOfShapes group : row.getGroups()) { if (group.getLeft() - row.getXAdjustment() > rightMarginLimit) { groupsToChop.add(group); LOG.debug("Chopping group outside of right 
margin: " + group); } } for (GroupOfShapes group : groupsToChop) { row.getShapes().removeAll(group.getShapes()); } row.getGroups().removeAll(groupsToChop); if (row.getGroups().size() == 0) { LOG.debug("Removing empty " + row); rowsToRemove.add(row); } else { row.recalculate(); row.assignGuideLines(); } } // does this row extend beyond the margin? } // next row sourceImage.getRows().removeAll(rowsToRemove); } // have a right margin LOG.debug("Finding left margin"); double leftLimit = (double) sourceImage.getWidth() * 0.33; // first, create a DBScan cluster of all rows near the left-hand side List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>(); List<double[]> leftCoordinates = new ArrayList<double[]>(); for (RowOfShapes row : sourceImage.getRows()) { double left = row.getLeft(); if (left <= leftLimit) { LOG.trace(row.toString()); LOG.trace("Left: " + left + " - " + row.getXAdjustment() + " = " + (left - row.getXAdjustment())); left -= row.getXAdjustment(); leftHandRows.add(row); leftCoordinates.add(new double[] { left }); } } DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows, leftCoordinates); Set<Set<RowOfShapes>> rowClustersLeft = leftMarginClusterer.cluster(averageShapeWidth, minCardinalityForMargin, true); TreeSet<Set<RowOfShapes>> orderedRowClustersLeft = new TreeSet<Set<RowOfShapes>>( new CardinalityComparator<RowOfShapes>()); orderedRowClustersLeft.addAll(rowClustersLeft); i = 0; // find the left-most cluster with sufficient cardinality, and assume it's the left margin DescriptiveStatistics leftMarginStats = null; for (Set<RowOfShapes> cluster : orderedRowClustersLeft) { DescriptiveStatistics leftStats = new DescriptiveStatistics(); for (RowOfShapes row : cluster) leftStats.addValue(row.getLeft() - row.getXAdjustment()); LOG.debug("Cluster " + i + ". 
Cardinality=" + cluster.size()); LOG.debug("Left mean : " + leftStats.getMean()); LOG.debug("Left std dev: " + leftStats.getStandardDeviation()); if (cluster.size() >= minCardinalityForMargin && (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean())) { leftMarginStats = leftStats; } i++; } // see how many rows would violate this margin - if too many, assume no margin // these rows are only rows which extend across the margin if (leftMarginStats != null) { LOG.debug("Left margin mean : " + leftMarginStats.getMean()); LOG.debug("Left margin std dev: " + leftMarginStats.getStandardDeviation()); double leftMarginLimit = leftMarginStats.getMean() - sourceImage.getAverageShapeWidth(); LOG.debug("leftMarginLimit: " + leftMarginLimit); int numRowsToChop = 0; for (RowOfShapes row : sourceImage.getRows()) { if (row.getLeft() <= leftLimit) { if (row.getLeft() - row.getXAdjustment() <= leftMarginLimit && row.getRight() - row.getXAdjustment() >= leftMarginLimit) { LOG.debug("Found overlapping row : " + row); LOG.debug("Adjusted left : " + (row.getLeft() - row.getXAdjustment())); numRowsToChop++; } } } if (numRowsToChop >= 3) { LOG.debug("Too many overlapping rows - ignoring margin"); leftMarginStats = null; } } if (leftMarginStats != null) { double leftMarginLimit = leftMarginStats.getMean() - sourceImage.getAverageShapeWidth(); List<RowOfShapes> rowsToRemove = new ArrayList<RowOfShapes>(); for (RowOfShapes row : sourceImage.getRows()) { double left = row.getLeft() - row.getXAdjustment(); LOG.trace(row.toString()); LOG.trace("Adjusted left: " + left); if (left <= leftMarginLimit) { LOG.trace("Has out-of-margin stuff!"); // need to chop off groups to the left of this threshold List<GroupOfShapes> groupsToChop = new ArrayList<GroupOfShapes>(); for (GroupOfShapes group : row.getGroups()) { if (group.getRight() - row.getXAdjustment() < leftMarginLimit) { groupsToChop.add(group); LOG.debug("Chopping group outside of left margin: " + group); } } for 
(GroupOfShapes group : groupsToChop) {
            row.getShapes().removeAll(group.getShapes());
          }
          row.getGroups().removeAll(groupsToChop);
          if (row.getGroups().size() == 0) {
            LOG.debug("Removing empty " + row);
            rowsToRemove.add(row);
          } else {
            row.recalculate();
            row.assignGuideLines();
          }
        } // does this row extend beyond the margin?
      } // next row
      sourceImage.getRows().removeAll(rowsToRemove);
    } // have a left margin
  }

  /**
   * Detects paragraph splits and assigns rows to the correct paragraphs.
   * <p>
   * The method works in the following stages:
   * <ol>
   * <li>Find horizontal white bands in the page (image rows not covered by any
   * row of shapes) that are taller than 1.3 times the median row x-height, and
   * use them to split the rows into "areas".</li>
   * <li>Within each area, walk the rows from top to bottom and stack them into
   * {@link Column}s, based on horizontal overlap with the last row of each open column.</li>
   * <li>Link columns of consecutive areas into "column groups" when they overlap
   * one-to-one horizontally.</li>
   * <li>For each column group, cluster the adjusted right and left row coordinates
   * (coordinate minus {@code row.getXAdjustment()}) with DBSCAN and derive the
   * start margin, start tab and end margin of every column in the group.</li>
   * <li>Count indents vs. outdents following partial rows to decide whether the
   * page is indented or outdented.</li>
   * <li>Order the columns and walk each one, opening a new paragraph via
   * {@code sourceImage.newParagraph()} on the first row, after a partial row,
   * or on an indent/outdent. Rows with at most two shapes become standalone
   * paragraphs of their own.</li>
   * </ol>
   * When the {@code drawSegmentation} flag is set, the margin and tab thresholds
   * are also drawn onto {@code graphics2D}.
   *
   * @param sourceImage the image whose rows are to be grouped; paragraphs are
   *        created on it as a side effect
   */
  void groupRowsIntoParagraphs(SourceImage sourceImage) {
    LOG.debug("########## groupRowsIntoParagraphs #########");
    // We'll use various possible indicators, including
    // indented start, indented end, and spacing between rows.

    // On pages with a single big paragraph makes it hypersensitive to differences in row-start/row-end
    // This means we cannot use deviation. Instead, we use the average shape width on the page.
    // We also adjust maxLeft & minRight to match the vertical line slope

    // This is now complicated by the possibility of multiple columns

    // Need to take into account a big horizontal space - Pietrushka page 14
    // Find horizontal spaces that go all the way across and are wider than a certain threshold
    // simply do a boolean column and black out everything in a row, then see if there are any remaining spaces above a certain threshold
    // Columns are thus arranged into "areas", separated by white-space.

    // fullRows[y] is true when image row y is covered by at least one row of shapes
    boolean[] fullRows = new boolean[sourceImage.getHeight()];
    for (RowOfShapes row : sourceImage.getRows()) {
      for (int y = row.getTop(); y <= row.getBottom(); y++) {
        fullRows[y] = true;
      }
    }

    DescriptiveStatistics rowHeightStats = new DescriptiveStatistics();
    for (RowOfShapes row : sourceImage.getRows()) {
      int height = row.getXHeight();
      rowHeightStats.addValue(height);
    }
    // despite the variable name and the log label, this is the 50th percentile
    // (median) of the rows' x-heights, not the mean
    double avgRowHeight = rowHeightStats.getPercentile(50);
    LOG.debug("meanRowHeight: " + avgRowHeight);
    double minHeightForWhiteSpace = avgRowHeight * 1.3;
    LOG.debug("minHeightForWhiteSpace: " + minHeightForWhiteSpace);

    // find the "white rows" - any horizontal white space
    // in the page which is sufficiently high
    // each entry is {first white y, last white y}
    List<int[]> whiteRows = new ArrayList<int[]>();
    boolean inWhite = false;
    int startWhite = 0;
    for (int y = 0; y < sourceImage.getHeight(); y++) {
      if (!inWhite && !fullRows[y]) {
        inWhite = true;
        startWhite = y;
      } else if (inWhite && fullRows[y]) {
        int length = y - startWhite;
        if (length > minHeightForWhiteSpace) {
          LOG.debug("Adding whiteRow " + startWhite + "," + (y - 1));
          whiteRows.add(new int[] { startWhite, y - 1 });
        }
        inWhite = false;
      }
    }
    // white space running to the bottom of the page is added whatever its height
    if (inWhite)
      whiteRows.add(new int[] { startWhite, sourceImage.getHeight() - 1 });
    // sentinel just past the last image row, so that the final area is always closed
    whiteRows.add(new int[] { sourceImage.getHeight(), sourceImage.getHeight() });

    // place rows in "areas" defined by the "white rows" found above
    List<List<RowOfShapes>> areas = new ArrayList<List<RowOfShapes>>();
    int startY = -1;
    for (int[] whiteRow : whiteRows) {
      List<RowOfShapes> area = new ArrayList<RowOfShapes>();
      for (RowOfShapes row : sourceImage.getRows()) {
        if (row.getTop() >= startY && row.getBottom() <= whiteRow[0]) {
          area.add(row);
        }
      }
      if (area.size() > 0) {
        areas.add(area);
      }
      startY = whiteRow[1];
    }

    // break up each area into vertical columns
    LOG.debug("break up each area into vertical columns");
    // all columns on the page, and the same columns bucketed per area
    List<Column> columns = new ArrayList<Column>();
    List<List<Column>> columnsPerAreaList = new ArrayList<List<Column>>();
    for (List<RowOfShapes> area : areas) {
      LOG.debug("Next area");
      List<Column> columnsPerArea = new ArrayList<SegmenterImpl.Column>();
      columnsPerAreaList.add(columnsPerArea);
      // iterate the area's rows in the order defined by RowOfShapesVerticalLocationComparator
      TreeSet<RowOfShapes> rows = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator());
      rows.addAll(area);
      for (RowOfShapes row : rows) {
        // try to place this row in one of the columns directly above it.
        // this means that a row which overlaps more than one column has to "close" this column, so it is no longer considered
        List<Column> overlappingColumns = new ArrayList<Column>();
        for (Column column : columnsPerArea) {
          if (!column.closed) {
            RowOfShapes lastRowInColumn = column.get(column.size() - 1);
            // horizontal overlap test on adjusted coordinates
            if (row.getRight() - row.getXAdjustment() >= lastRowInColumn.getLeft() - lastRowInColumn.getXAdjustment()
                && row.getLeft() - row.getXAdjustment() <= lastRowInColumn.getRight() - lastRowInColumn.getXAdjustment()) {
              overlappingColumns.add(column);
            }
          }
        }
        if (overlappingColumns.size() == 1) {
          Column myColumn = overlappingColumns.get(0);
          // NOTE(review): despite its name this is the FIRST row of the column (index 0),
          // whereas every other "lastRowIn..." variable here uses size() - 1 - confirm this is intended
          RowOfShapes lastRowInMyColumn = myColumn.get(0);

          // close any columns that are now at a distance of more than one row
          for (Column column : columnsPerArea) {
            if (!column.closed && !column.equals(myColumn)) {
              RowOfShapes lastRowInColumn = column.get(column.size() - 1);
              if (lastRowInMyColumn.getTop() > lastRowInColumn.getBottom()) {
                column.closed = true;
                LOG.debug("Closing distant column " + lastRowInColumn);
              }
            }
          }

          myColumn.add(row);
          LOG.debug(row.toString());
          LOG.debug(" added to column " + lastRowInMyColumn);
        } else {
          // zero or several overlapping columns: close them all and start a new column
          for (Column overlappingColumn : overlappingColumns) {
            overlappingColumn.closed = true;
            RowOfShapes lastRowInColumn = overlappingColumn.get(overlappingColumn.size() - 1);
            LOG.debug("Closing overlapping column " + lastRowInColumn);
          }
          Column myColumn = new Column(sourceImage);
          myColumn.add(row);
          LOG.debug("Found new column");
          LOG.debug(row.toString());
          columns.add(myColumn);
          columnsPerArea.add(myColumn);
        }
      }
    } // next area

    // compute each column's bounding coordinates now that all rows are assigned
    for (Column column : columns)
      column.recalculate();

    // Intermediate step to reform the vertical columns, if they exist
    // basically the idea is that if the columns are aligned vertically, then the thresholds for paragraph indents
    // should be shared, to increase the statistical sample size and reduce anomalies.
    // We'll assume that two columns from two consecutive areas are in the same vertical group if they overlap with each other horizontally
    // and don't overlap with any other column in the other column's area.
    List<List<Column>> columnGroups = new ArrayList<List<Column>>();
    List<Column> columnsInPrevArea = null;
    for (List<Column> columnsPerArea : columnsPerAreaList) {
      if (columnsInPrevArea != null) {
        for (Column prevColumn : columnsInPrevArea) {
          LOG.debug("Checking " + prevColumn);
          // find the column group containing the previous column
          List<Column> myColumnGroup = null;
          for (List<Column> columnGroup : columnGroups) {
            if (columnGroup.contains(prevColumn)) {
              myColumnGroup = columnGroup;
              break;
            }
          }
          if (myColumnGroup == null) {
            myColumnGroup = new ArrayList<SegmenterImpl.Column>();
            LOG.debug("Creating column group for column " + prevColumn.toString());
            columnGroups.add(myColumnGroup);
            myColumnGroup.add(prevColumn);
          }

          // does only one column overlap with this one?
          Column overlappingColumn = null;
          for (Column column : columnsPerArea) {
            if (column.adjustedRight >= prevColumn.adjustedLeft && column.adjustedLeft <= prevColumn.adjustedRight) {
              if (overlappingColumn == null) {
                LOG.debug("I overlap with " + column);
                overlappingColumn = column;
              } else {
                LOG.debug("But I overlap also with " + column);
                overlappingColumn = null;
                break;
              }
            }
          }
          if (overlappingColumn != null) {
            // does it overlap with only me?
            for (Column otherPrevColumn : columnsInPrevArea) {
              if (otherPrevColumn.equals(prevColumn))
                continue;
              if (overlappingColumn.adjustedRight >= otherPrevColumn.adjustedLeft
                  && overlappingColumn.adjustedLeft <= otherPrevColumn.adjustedRight) {
                LOG.debug("But it overlaps also with " + otherPrevColumn);
                overlappingColumn = null;
                break;
              }
            }
          }
          if (overlappingColumn != null) {
            myColumnGroup.add(overlappingColumn);
            LOG.debug("Adding " + overlappingColumn);
            LOG.debug(" to group with " + prevColumn);
          }
        } // next previous column
      } // have previous columns
      columnsInPrevArea = columnsPerArea;
    } // next area
    // the columns of the last area were never visited as "previous" columns above:
    // make sure each of them belongs to a group as well
    if (columnsInPrevArea != null) {
      for (Column prevColumn : columnsInPrevArea) {
        // find the column group containing the previous column
        List<Column> myColumnGroup = null;
        for (List<Column> columnGroup : columnGroups) {
          if (columnGroup.contains(prevColumn)) {
            myColumnGroup = columnGroup;
            break;
          }
        }
        if (myColumnGroup == null) {
          myColumnGroup = new ArrayList<SegmenterImpl.Column>();
          LOG.debug("Creating column group for column " + prevColumn.toString());
          columnGroups.add(myColumnGroup);
          myColumnGroup.add(prevColumn);
        }
      }
    }

    // What we really want here is, for each column (in the case of right-to-left),
    // two clusters on the right
    // and one relatively big cluster on the left.
    // anything outside of the cluster on the left is an EOP.
    // NOTE(review): this local is never assigned again in this method, so the
    // "hasTab: " log line below always prints false; the per-column flag is column.hasTab
    boolean hasTab = false;
    for (List<Column> columnGroup : columnGroups) {
      LOG.debug("Next column group");
      double averageShapeWidth = sourceImage.getAverageShapeWidth();
      LOG.debug("averageShapeWidth: " + averageShapeWidth);
      // clustering radius passed to DBSCAN: half an average shape width
      double epsilon = averageShapeWidth / 2.0;
      LOG.debug("epsilon: " + epsilon);

      // bounding box of the whole column group, rounded to ints
      int columnGroupTop = sourceImage.getHeight();
      int columnGroupBottom = 0;
      int columnGroupLeft = sourceImage.getWidth();
      int columnGroupRight = 0;
      for (Column column : columnGroup) {
        if (column.top < columnGroupTop)
          columnGroupTop = (int) Math.round(column.top);
        if (column.bottom > columnGroupBottom)
          columnGroupBottom = (int) Math.round(column.bottom);
        if (column.adjustedLeft < columnGroupLeft)
          columnGroupLeft = (int) Math.round(column.adjustedLeft);
        if (column.adjustedRight > columnGroupRight)
          columnGroupRight = (int) Math.round(column.adjustedRight);
      }

      // right thresholds
      LOG.debug("Calculating right thresholds");

      // first, create a DBScan cluster of all rows by their adjusted right coordinate
      // (rightHandRows and rightCoordinates are parallel lists: same index, same row)
      List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>();
      List<double[]> rightCoordinates = new ArrayList<double[]>();

      for (Column column : columnGroup) {
        for (RowOfShapes row : column) {
          double right = row.getRight() - row.getXAdjustment();
          // double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
          // if (rightOverlap==0) {
          // // leave out any right-overlapping rows here
          // // since we need accurate statistics for margin detection
          // // This is questionable - especially since a long vertical bar (see Petriushka)
          // // tends to give all rows a left overlap. Also, because the overlap is calculated based
          // // on the mean right & mean left, not based on any sort of margin clusters.
          // rightHandRows.add(row);
          // rightCoordinates.add(new double[] {right});
          // }
          rightHandRows.add(row);
          rightCoordinates.add(new double[] { right });
        }
      }

      int minCardinalityForRightMargin = 5;
      DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows,
          rightCoordinates);
      Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(epsilon, minCardinalityForRightMargin, true);

      // iterate the clusters in the order defined by CardinalityComparator
      TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>(
          new CardinalityComparator<RowOfShapes>());
      orderedRowClusters.addAll(rowClusters);

      int i = 0;

      // find the two right-most clusters, and assume they are the margin & the tab
      DescriptiveStatistics rightMarginStats = null;
      DescriptiveStatistics rightTabStats = null;
      for (Set<RowOfShapes> cluster : orderedRowClusters) {
        DescriptiveStatistics rightStats = new DescriptiveStatistics();
        MeanAbsoluteDeviation rightDev = new MeanAbsoluteDeviation();
        for (RowOfShapes row : cluster) {
          // look the coordinate up again through the parallel list (linear search per row)
          int rowIndex = rightHandRows.indexOf(row);
          double right = rightCoordinates.get(rowIndex)[0];
          rightStats.addValue(right);
          rightDev.increment(right);
        }

        LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
        LOG.debug("Right mean : " + rightStats.getMean());
        LOG.debug("Right dev: " + rightDev.getResult());

        if (cluster.size() >= minCardinalityForRightMargin) {
          if (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean()) {
            // a cluster further right becomes the margin; the old margin is demoted to tab
            if (rightMarginStats != null)
              rightTabStats = rightMarginStats;
            rightMarginStats = rightStats;
          } else if (rightTabStats == null || rightTabStats.getMean() < rightStats.getMean()) {
            rightTabStats = rightStats;
          }
        } else {
          // stop at the first cluster that is too small
          // (presumably the clusters are iterated by decreasing cardinality - confirm against CardinalityComparator)
          break;
        }
        i++;
      } // next right-coordinate cluster

      // defaults when no cluster qualifies: the right edge of the image
      double rightMargin = sourceImage.getWidth();
      double rightTab = sourceImage.getWidth();
      if (rightMarginStats != null) {
        rightMargin = rightMarginStats.getMean();
      } else {
        // no margin cluster: fall back to the left edge of the nearest column separator
        // that spans the group vertically and lies to its right
        List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
        for (Rectangle columnSeparator : columnSeparators) {
          if (columnSeparator.getTop() <= columnGroupTop && columnSeparator.getBottom() >= columnGroupBottom
              && columnSeparator.getLeft() >= columnGroupRight) {
            if (columnSeparator.getLeft() < rightMargin)
              rightMargin = columnSeparator.getLeft();
          }
        }
      }
      if (rightTabStats != null) {
        rightTab = rightTabStats.getMean();
      }

      LOG.debug("rightMargin: " + rightMargin);
      LOG.debug("rightTab: " + rightTab);

      // left thresholds
      LOG.debug("Calculating left thresholds");

      // first, create a DBScan cluster of all rows by their adjusted left coordinate
      // (leftHandRows and leftCoordinates are parallel lists: same index, same row)
      List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>();
      List<double[]> leftCoordinates = new ArrayList<double[]>();

      for (Column column : columnGroup) {
        for (RowOfShapes row : column) {
          double left = row.getLeft() - row.getXAdjustment();
          // double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);
          // if (leftOverlap == 0) {
          // // leave out any overlapping rows from margin calcs,
          // // since we need accurate statistics here
          // leftHandRows.add(row);
          // leftCoordinates.add(new double[] {left});
          // }
          leftHandRows.add(row);
          leftCoordinates.add(new double[] { left });
        }
      }

      int minCardinalityForLeftMargin = 5;
      DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows,
          leftCoordinates);
      Set<Set<RowOfShapes>> leftRowClusters = leftMarginClusterer.cluster(epsilon, minCardinalityForLeftMargin, true);

      TreeSet<Set<RowOfShapes>> orderedLeftRowClusters = new TreeSet<Set<RowOfShapes>>(
          new CardinalityComparator<RowOfShapes>());
      orderedLeftRowClusters.addAll(leftRowClusters);

      i = 0;

      // find the two left-most clusters, and assume they are the margin & the tab
      // (mirror image of the right-hand logic above, with the comparisons reversed)
      DescriptiveStatistics leftMarginStats = null;
      DescriptiveStatistics leftTabStats = null;
      for (Set<RowOfShapes> cluster : orderedLeftRowClusters) {
        DescriptiveStatistics leftStats = new DescriptiveStatistics();
        MeanAbsoluteDeviation leftDev = new MeanAbsoluteDeviation();
        for (RowOfShapes row : cluster) {
          int rowIndex = leftHandRows.indexOf(row);
          double left = leftCoordinates.get(rowIndex)[0];
          leftStats.addValue(left);
          leftDev.increment(left);
        }

        LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
        LOG.debug("Left mean : " + leftStats.getMean());
        LOG.debug("Left dev: " + leftDev.getResult());

        if (cluster.size() >= minCardinalityForLeftMargin) {
          if (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean()) {
            if (leftMarginStats != null)
              leftTabStats = leftMarginStats;
            leftMarginStats = leftStats;
          } else if (leftTabStats == null || leftTabStats.getMean() > leftStats.getMean()) {
            leftTabStats = leftStats;
          }
        } else {
          break;
        }
        i++;
      } // next left-coordinate cluster

      // defaults when no cluster qualifies: the left edge of the image
      double leftMargin = 0;
      double leftTab = 0;
      if (leftMarginStats != null) {
        leftMargin = leftMarginStats.getMean();
      } else {
        // no margin cluster: fall back to the right edge of the nearest column separator
        // that spans the group vertically and lies to its left
        List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
        for (Rectangle columnSeparator : columnSeparators) {
          if (columnSeparator.getTop() <= columnGroupTop && columnSeparator.getBottom() >= columnGroupBottom
              && columnSeparator.getRight() <= columnGroupLeft) {
            if (columnSeparator.getRight() > leftMargin)
              leftMargin = columnSeparator.getRight();
          }
        }
      }
      if (leftTabStats != null) {
        leftTab = leftTabStats.getMean();
      }

      LOG.debug("leftMargin: " + leftMargin);
      LOG.debug("leftTab: " + leftTab);

      // translate left/right into start/end according to the reading direction;
      // without a tab cluster the tab is placed 5 average shape widths inside the start margin
      for (Column column : columnGroup) {
        if (sourceImage.isLeftToRight()) {
          column.startMargin = leftMargin;
          if (leftTabStats != null) {
            column.startTab = leftTab;
            column.hasTab = true;
          } else {
            LOG.debug("No left tab - setting based on left margin");
            column.startTab = leftMargin + (5.0 * sourceImage.getAverageShapeWidth());
            column.hasTab = false;
          }

          column.endMargin = rightMargin;
        } else {
          column.startMargin = rightMargin;
          if (rightTabStats != null) {
            column.startTab = rightTab;
            column.hasTab = true;
          } else {
            LOG.debug("No right tab - setting based on right margin");
            column.startTab = rightMargin - (5.0 * sourceImage.getAverageShapeWidth());
            column.hasTab = false;
          }

          column.endMargin = leftMargin;
        }
        LOG.debug("Margins for " + column);
        LOG.debug("startMargin: " + column.startMargin);
        LOG.debug("startTab: " + column.startTab);
        LOG.debug("endMargin: " + column.endMargin);
      } // next column
    } // next column group
    LOG.debug("hasTab: " + hasTab);

    // tolerance applied to every margin/tab comparison below
    double safetyMargin = 1.5 * sourceImage.getAverageShapeWidth();

    // Now, paragraphs are either "indented", "outdented" or not "dented" at all (no tabs).
    // This applies to the entire page.
    // To recognise indenting vs. outdenting, we have to see if the row preceding each
    // indent/outdent is full or partial. In the case of indentation, partial rows will
    // typically be followed by an indent. In the case of outdentation, partial rows will
    // typically be followed by an outdent.
    boolean isIndented = true;

    int indentCount = 0;
    int outdentCount = 0;
    for (List<Column> columnGroup : columnGroups) {
      LOG.debug("Next column group");
      // carried across the columns of a group, reset only per group
      boolean prevRowPartial = false;
      for (Column column : columnGroup) {
        if (column.hasTab) {
          for (RowOfShapes row : column) {
            if (sourceImage.isLeftToRight()) {
              if (prevRowPartial) {
                if (row.getLeft() - row.getXAdjustment() > column.startTab - safetyMargin) {
                  indentCount++;
                } else if (row.getLeft() - row.getXAdjustment() < column.startMargin + safetyMargin) {
                  outdentCount++;
                }
              }
              // a row is "partial" when it stops short of the end margin
              if (row.getRight() - row.getXAdjustment() < column.endMargin - safetyMargin) {
                prevRowPartial = true;
              } else {
                prevRowPartial = false;
              }
            } else {
              if (prevRowPartial) {
                if (row.getRight() - row.getXAdjustment() < column.startTab + safetyMargin) {
                  indentCount++;
                } else if (row.getRight() - row.getXAdjustment() > column.startMargin - safetyMargin) {
                  outdentCount++;
                }
              }
              if (row.getLeft() - row.getXAdjustment() > column.endMargin + safetyMargin) {
                prevRowPartial = true;
              } else {
                prevRowPartial = false;
              }
            } // left-to-right?
          } // next row
        } // column has tab
      } // next column
    } // next column group
    // biased towards "indented": outdents must outnumber indents by more than 2
    isIndented = (indentCount + 2 >= outdentCount);
    LOG.debug("indentCount: " + indentCount);
    LOG.debug("outdentCount: " + outdentCount);
    LOG.debug("isIndented: " + isIndented);

    // order the columns
    // (uses Column.compareTo; the sorted set then replaces the content of the list)
    TreeSet<Column> orderedColumns = new TreeSet<SegmenterImpl.Column>(columns);
    columns.clear();
    columns.addAll(orderedColumns);

    // find the paragraphs found in each column
    for (Column column : columns) {
      LOG.debug("--- Next column ---");

      // break up the column into paragraphs
      Paragraph paragraph = null;
      RowOfShapes previousRow = null;
      // rows with this many shapes or fewer are treated as standalone paragraphs
      int maxShapesForStandaloneParagraph = 2;
      // standalone rows are buffered here until the next paragraph start or the end of the column
      List<RowOfShapes> rowsForStandaloneParagraphs = new ArrayList<RowOfShapes>();
      // end points of the previously drawn threshold segments (only used when drawing)
      Point2D previousPointStartMargin = null;
      Point2D previousPointStartTab = null;
      Point2D previousPointEndMargin = null;

      for (RowOfShapes row : column) {
        boolean rowForStandaloneParagraph = false;
        boolean newParagraph = false;
        if (row.getShapes().size() <= maxShapesForStandaloneParagraph) {
          rowsForStandaloneParagraphs.add(row);
          rowForStandaloneParagraph = true;
        } else {
          double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
          double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);

          if (drawSegmentation) {
            // convert the column thresholds back to image coordinates for this row
            double rowVerticalMidPoint = row.getBaseLineMiddlePoint();
            double startMarginX = column.startMargin + row.getXAdjustment();
            double startTabX = column.startTab + row.getXAdjustment();
            double endMarginX = column.endMargin + row.getXAdjustment();

            if (sourceImage.isLeftToRight()) {
              startMarginX += safetyMargin;
              startTabX -= safetyMargin;
              endMarginX -= safetyMargin;

              startMarginX += leftOverlap;
              startTabX += leftOverlap;
              endMarginX -= rightOverlap;
            } else {
              startMarginX -= safetyMargin;
              startTabX += safetyMargin;
              endMarginX += safetyMargin;

              startMarginX -= rightOverlap;
              startTabX -= rightOverlap;
              endMarginX += leftOverlap;
            }

            Point2D.Double currentPointStartMargin = new Point2D.Double(startMarginX, rowVerticalMidPoint);
            Point2D.Double currentPointStartTab = new Point2D.Double(startTabX, rowVerticalMidPoint);
            Point2D.Double currentPointEndMargin = new Point2D.Double(endMarginX, rowVerticalMidPoint);

            if (previousPointStartMargin != null) {
              // start margin in blue
              graphics2D.setStroke(new BasicStroke(1));
              graphics2D.setPaint(Color.BLUE);
              graphics2D.drawLine((int) Math.round(previousPointStartMargin.getX()),
                  (int) Math.round(previousPointStartMargin.getY()),
                  (int) Math.round(currentPointStartMargin.getX()),
                  (int) Math.round(currentPointStartMargin.getY()));
              graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                  (int) Math.round(previousPointEndMargin.getY()),
                  (int) Math.round(currentPointEndMargin.getX()),
                  (int) Math.round(currentPointEndMargin.getY()));

              // start tab in red
              graphics2D.setPaint(Color.RED);
              graphics2D.drawLine((int) Math.round(previousPointStartTab.getX()),
                  (int) Math.round(previousPointStartTab.getY()),
                  (int) Math.round(currentPointStartTab.getX()),
                  (int) Math.round(currentPointStartTab.getY()));

              // NOTE(review): the end-margin segment was already drawn in blue above and is
              // drawn again here in red over the same coordinates, so it ends up red
              graphics2D.setPaint(Color.RED);
              graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                  (int) Math.round(previousPointEndMargin.getY()),
                  (int) Math.round(currentPointEndMargin.getX()),
                  (int) Math.round(currentPointEndMargin.getY()));
            }
            previousPointStartMargin = currentPointStartMargin;
            previousPointStartTab = currentPointStartTab;
            previousPointEndMargin = currentPointEndMargin;
          }

          if (previousRow == null) {
            LOG.debug("New paragraph (first)");
            newParagraph = true;
          } else {
            // NOTE(review): rightOverlap/leftOverlap were computed for the CURRENT row but the
            // "previous EOP" tests below apply them to previousRow - confirm this is intended
            if (sourceImage.isLeftToRight()) {
              if (previousRow.getRight() - previousRow.getXAdjustment() - rightOverlap < column.endMargin
                  - safetyMargin) {
                LOG.debug("New paragraph (previous EOP)");
                newParagraph = true;
              } else if (column.hasTab && isIndented
                  && row.getLeft() - row.getXAdjustment() + leftOverlap > column.startTab - safetyMargin) {
                LOG.debug("New paragraph (indent)");
                newParagraph = true;
              } else if (column.hasTab && !isIndented
                  && row.getLeft() - row.getXAdjustment() + leftOverlap < column.startMargin + safetyMargin) {
                LOG.debug("New paragraph (outdent)");
                newParagraph = true;
              }
            } else {
              if (previousRow.getLeft() - previousRow.getXAdjustment() + leftOverlap > column.endMargin
                  + safetyMargin) {
                LOG.debug("New paragraph (previous EOP)");
                newParagraph = true;
              } else if (column.hasTab && isIndented
                  && row.getRight() - row.getXAdjustment() - rightOverlap < column.startTab + safetyMargin) {
                LOG.debug("New paragraph (indent)");
                newParagraph = true;
              } else if (column.hasTab && !isIndented
                  && row.getRight() - row.getXAdjustment() - rightOverlap > column.startMargin - safetyMargin) {
                LOG.debug("New paragraph (outdent)");
                newParagraph = true;
              }
            } // left-to-right?
          } // have previous row
        } // standalone paragraph?

        if (!rowForStandaloneParagraph)
          LOG.debug(row.toString());

        if (newParagraph) {
          // flush the buffered standalone rows first, one paragraph per row,
          // so that they are created before the paragraph that follows them
          if (rowsForStandaloneParagraphs.size() > 0) {
            for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
              LOG.debug("Standalone paragraph");
              LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop() + "), right("
                  + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
              Paragraph standaloneParagraph = sourceImage.newParagraph();
              standaloneParagraph.getRows().add(oneRow);
            }
            rowsForStandaloneParagraphs.clear();
          }
          paragraph = sourceImage.newParagraph();
        }
        //LOG.debug("Row: left(" + row.getLeft() + "), right(" + row.getRight() + "), width(" + (row.getRight() - row.getLeft() + 1) + ")");

        // paragraph is non-null here: the first non-standalone row of a column
        // always has previousRow == null and therefore opens a new paragraph
        if (!rowForStandaloneParagraph) {
          paragraph.getRows().add(row);
          previousRow = row;
        }
      } // next row in column
      // flush any standalone rows left over at the end of the column
      if (rowsForStandaloneParagraphs.size() > 0) {
        for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
          LOG.debug("Standalone paragraph");
          LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop() + "), right("
              + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
          Paragraph standaloneParagraph = sourceImage.newParagraph();
          standaloneParagraph.getRows().add(oneRow);
        }
        rowsForStandaloneParagraphs.clear();
      }
    } // next column
} private double findLargeShapeOverlapOnLeft(RowOfShapes row, Column column, SourceImage sourceImage) { double overlap = 0; double leftMargin = 0; if (sourceImage.isLeftToRight()) leftMargin = column.startMargin; else leftMargin = column.endMargin; for (Rectangle whiteArea : sourceImage.getWhiteAreasAroundLargeShapes()) { if (whiteArea.getTop() <= row.getBottom() && whiteArea.getBottom() >= row.getTop()) { if (whiteArea.getLeft() - row.getXAdjustment() < leftMargin && whiteArea.getRight() - row.getXAdjustment() > leftMargin) { overlap = (whiteArea.getRight() - row.getXAdjustment()) - leftMargin; LOG.debug("Overlaps large shape (" + whiteArea.getLeft() + "," + whiteArea.getTop() + "," + whiteArea.getRight() + "," + whiteArea.getBottom() + ")" + " on left by " + overlap); } } } return overlap; } private double findLargeShapeOverlapOnRight(RowOfShapes row, Column column, SourceImage sourceImage) { double overlap = 0; double rightMargin = 0; if (sourceImage.isLeftToRight()) rightMargin = column.endMargin; else rightMargin = column.startMargin; for (Rectangle whiteArea : sourceImage.getWhiteAreasAroundLargeShapes()) { if (whiteArea.getTop() <= row.getBottom() && whiteArea.getBottom() >= row.getTop()) { if (whiteArea.getLeft() - row.getXAdjustment() < rightMargin && whiteArea.getRight() - row.getXAdjustment() > rightMargin) { overlap = rightMargin - (whiteArea.getLeft() - row.getXAdjustment()); LOG.debug("Overlaps large shape (" + whiteArea.getLeft() + "," + whiteArea.getTop() + "," + whiteArea.getRight() + "," + whiteArea.getBottom() + ")" + " on right by " + overlap); } } } return overlap; } /** * A vertical group of rows which will be analysed together for paragraphs * @author Assaf Urieli * */ @SuppressWarnings("serial") private static final class Column extends ArrayList<RowOfShapes> implements Comparable<Column> { private SourceImage sourceImage; public double startMargin; public double startTab; public double endMargin; public double adjustedLeft; public double 
adjustedRight; public double top; public double bottom; public boolean closed = false; public boolean hasTab = false; private Column(SourceImage sourceImage) { super(); this.sourceImage = sourceImage; } public void recalculate() { adjustedLeft = sourceImage.getWidth(); adjustedRight = 0; top = sourceImage.getHeight(); bottom = 0; for (RowOfShapes row : this) { double left = row.getLeft() - row.getXAdjustment(); double right = row.getRight() - row.getXAdjustment(); if (left < adjustedLeft) { adjustedLeft = left; } if (right > adjustedRight) { adjustedRight = right; } if (row.getTop() < top) top = row.getTop(); if (row.getBottom() > bottom) bottom = row.getBottom(); } if (sourceImage.isLeftToRight()) { this.startMargin = adjustedLeft; this.endMargin = adjustedRight; } else { this.startMargin = adjustedRight; this.endMargin = adjustedLeft; } } @Override public String toString() { return "Column [adjustedLeft=" + (int) Math.round(adjustedLeft) + ", adjustedRight=" + (int) Math.round(adjustedRight) + ", top=" + top + ", bottom=" + bottom + "]"; } @Override public int compareTo(Column o) { if (this.equals(o)) return 0; boolean verticalOverlap = this.top < o.bottom && o.top < this.bottom; if (sourceImage.isLeftToRight()) { if (this.adjustedRight < o.adjustedLeft && verticalOverlap) { return -1; } else if (o.adjustedRight < this.adjustedLeft && verticalOverlap) { return 1; } else if (this.top < o.top) { return -1; } else { return 1; } } else { if (this.adjustedLeft > o.adjustedRight && verticalOverlap) { return -1; } else if (o.adjustedLeft > this.adjustedRight && verticalOverlap) { return 1; } else if (this.top < o.top) { return -1; } else { return 1; } } } } /** * Draw an image of the segmentation performed. 
* @param sourceImage the image whose paragraphs, rows, groups and shapes are outlined
     */
    void drawSegmentation(SourceImage sourceImage) {
        LOG.debug("########## drawSegmentation #########");
        for (Paragraph paragraph : sourceImage.getParagraphs()) {
            for (RowOfShapes row : paragraph.getRows()) {
                // mean line / base line of the previous group on this row, used to join
                // consecutive groups with a continuous guide line
                int[] lastMeanLine = null;
                int[] lastBaseLine = null;
                for (GroupOfShapes group : row.getGroups()) {
                    int groupLeft = 0;
                    int groupTop = 0;
                    int groupRight = 0;
                    int groupBottom = 0;
                    boolean firstShape = true;

                    // every shape gets a thin blue rectangle
                    graphics2D.setStroke(new BasicStroke(1));
                    graphics2D.setPaint(Color.BLUE);
                    for (Shape shape : group.getShapes()) {
                        if (firstShape) {
                            groupLeft = shape.getLeft();
                            groupTop = shape.getTop();
                            groupRight = shape.getRight();
                            groupBottom = shape.getBottom();
                            firstShape = false;
                        } else {
                            if (shape.getLeft() < groupLeft)
                                groupLeft = shape.getLeft();
                            if (shape.getTop() < groupTop)
                                groupTop = shape.getTop();
                            if (shape.getRight() > groupRight)
                                groupRight = shape.getRight();
                            if (shape.getBottom() > groupBottom)
                                groupBottom = shape.getBottom();
                        }
                        graphics2D.drawRect(shape.getLeft(), shape.getTop(), shape.getWidth(), shape.getHeight());
                    } // next shape

                    // the group rectangle is drawn 2 pixels outside the bounding box of its shapes
                    groupLeft -= 2;
                    groupTop -= 2;
                    groupRight += 2;
                    groupBottom += 2;
                    graphics2D.setStroke(new BasicStroke(2));
                    graphics2D.setPaint(Color.GREEN);
                    graphics2D.drawRect(groupLeft, groupTop, (groupRight - groupLeft) + 1, (groupBottom - groupTop) + 1);

                    // mean line and base line are drawn in red; the 4-element arrays are used as x1, y1, x2, y2
                    graphics2D.setStroke(new BasicStroke(1));
                    graphics2D.setPaint(Color.RED);
                    if (lastBaseLine != null) {
                        int xHeight = group.getBaseLine()[1] - group.getMeanLine()[1];
                        int lastXHeight = lastBaseLine[1] - lastMeanLine[1];
                        // only join this group's lines to the previous group's when both have the same x-height
                        if (xHeight == lastXHeight) {
                            if (sourceImage.isLeftToRight()) {
                                graphics2D.drawLine(group.getMeanLine()[2], group.getMeanLine()[3], lastMeanLine[0], lastMeanLine[1]);
                                graphics2D.drawLine(group.getBaseLine()[2], group.getBaseLine()[3], lastBaseLine[0], lastBaseLine[1]);
                            } else {
                                graphics2D.drawLine(group.getMeanLine()[0], group.getMeanLine()[1], lastMeanLine[2], lastMeanLine[3]);
                                graphics2D.drawLine(group.getBaseLine()[0], group.getBaseLine()[1], lastBaseLine[2], lastBaseLine[3]);
                            }
                        }
                    }
                    graphics2D.drawLine(group.getMeanLine()[0], group.getMeanLine()[1], group.getMeanLine()[2], group.getMeanLine()[3]);
                    graphics2D.drawLine(group.getBaseLine()[0], group.getBaseLine()[1], group.getBaseLine()[2], group.getBaseLine()[3]);
                    lastBaseLine = group.getBaseLine();
                    lastMeanLine = group.getMeanLine();
                } // next group
            } // next row

            // each paragraph gets a dark gray rectangle 2 pixels outside its bounds
            graphics2D.setStroke(new BasicStroke(2));
            graphics2D.setPaint(Color.DARK_GRAY);
            graphics2D.drawRect(paragraph.getLeft() - 2, paragraph.getTop() - 2, paragraph.getRight() - paragraph.getLeft() + 4,
                    paragraph.getBottom() - paragraph.getTop() + 4);
        } // next paragraph
    }

    /**
     * Build the shape containing the pixel at (x, y), by starting from a dot at that pixel
     * and expanding it over all contiguous black pixels.
     * @param sourceImage the image being segmented; also supplies the separation threshold
     * @param mirror grid in which every pixel visited is marked, so that it is never visited again
     * @param x x-coordinate of the starting pixel
     * @param y y-coordinate of the starting pixel
     * @return the expanded shape
     */
    Shape getShape(SourceImage sourceImage, WritableImageGrid mirror, int x, int y) {
        Shape shape = this.graphicsService.getDot(sourceImage, x, y);
        // expand out to the 9 pixel square surrounding this pixel
        // until no other contiguous black pixels have been found.
        this.findContiguousPixels(sourceImage, mirror, shape, x, y, sourceImage.getSeparationThreshold());
        LOG.trace("Got shape for pixel (" + x + "," + y + "): " + shape);
        return shape;
    }

    /**
     * Flood-fill from (x, y) over all 8-connected black pixels, widening the left, right and
     * bottom bounds of <code>shape</code> to include each pixel found, and marking each pixel in
     * <code>mirror</code>.
     * The top bound of the shape is never updated here (see the comment in the body).
     * @param sourceImage the grid queried for black pixels
     * @param mirror grid in which visited pixels are set to 1; pixels whose mirror value is &gt; 0 are skipped
     * @param shape the shape whose bounds are expanded
     * @param x x-coordinate of the starting pixel
     * @param y y-coordinate of the starting pixel
     * @param blackThreshold the threshold passed to {@link ImageGrid#isPixelBlack(int, int, int)}
     */
    void findContiguousPixels(ImageGrid sourceImage, WritableImageGrid mirror, Shape shape, int x, int y, int blackThreshold) {
        // let's imagine
        // 0 X 0 0 x x
        // x x x 0 0 x
        // 0 0 x x x x
        // so we have to go up and to the left to keep finding contiguous black pixels.
        Stack<int[]> pointStack = new Stack<int[]>();

        // Pixels are marked in the mirror as soon as they are pushed (rather than when popped),
        // so that a pixel with several black neighbours is pushed and scanned only once.
        mirror.setPixel(x, y, 1);
        pointStack.push(new int[] { x, y });

        while (!pointStack.isEmpty()) {
            int[] point = pointStack.pop();
            int centreX = point[0];
            int centreY = point[1];

            for (int rely = centreY - 1; rely <= centreY + 1; rely++) {
                for (int relx = centreX - 1; relx <= centreX + 1; relx++) {
                    // already visited (this includes the centre pixel itself)
                    if (mirror.getPixel(relx, rely) > 0)
                        continue;
                    if (sourceImage.isPixelBlack(relx, rely, blackThreshold)) {
                        if (relx < shape.getLeft())
                            shape.setLeft(relx);
                        if (relx > shape.getRight())
                            shape.setRight(relx);
                        if (rely > shape.getBottom())
                            shape.setBottom(rely);

                        // we don't have to check top, cause it's all going
                        // from top to bottom.
                        mirror.setPixel(relx, rely, 1);
                        pointStack.push(new int[] { relx, rely });
                    }
                }
            }
        }
    }

    /**
     * Look for shapes that are wider than average in each row cluster of the image, and attempt
     * to split them into separate shapes wherever a sufficiently thin bridge joins two
     * sufficiently heavy halves. Shapes for which a split is found are replaced in their row by
     * the split shapes, and the row is reordered.
     * @param sourceImage the image whose row clusters are analysed
     * @param fillFactor each unit adds 0.15 mean shape widths to the width a shape must exceed to be considered for splitting
     */
    void splitShapes(SourceImage sourceImage, int fillFactor) {
        LOG.debug("########## splitShapes #########");
        // Cluster rows into rows of a similar height
        // Once we have this, we look for any shapes that are wider than average
        // and attempt to split them by looking for any bridges that are considerable thinner
        // than the stroke thickness and yet have big pixel counts on either side.

        // In order to split, we need four parameters
        // 1) minShapeWidth: the minimum shape width to consider for a split
        // 2) maxBridgeWidth: the maximum bridge width to use as a dividing bridge between two shapes when splitting
        // 3) minLetterWeight: the minimum pixel count that can represent a separate letter when splitting
        // 4) maxHorizontalOverlap: the maximum horizontal overlap between the left-hand and right-hand shape

        // These parameters are different for different font sizes
        // Therefore, we first need to group the rows on the image into clusters by height

        double imageShapeMean = sourceImage.getAverageShapeWidth();
        double maxWidthForSplit = imageShapeMean * 6.0; // avoid splitting horizontal rules!

        Set<Set<RowOfShapes>> rowClusters = sourceImage.getRowClusters();
        for (Set<RowOfShapes> rowCluster : rowClusters) {
            LOG.debug("Analysing row cluster");

            // 1) minShapeWidth: calculate the minimum shape width to be considered for splitting

            // first get the mean
            Mean meanWidth = new Mean();
            for (RowOfShapes row : rowCluster) {
                for (Shape shape : row.getShapes()) {
                    meanWidth.increment(shape.getWidth());
                }
            }
            double shapeWidthMean = meanWidth.getResult();
            LOG.debug("Mean width: " + shapeWidthMean);

            // Note: there is much trial and error for these numbers
            // but the general guideline is that it is easier to deal downstream
            // with bad joins than with bad splits
            // so we prefer to err on the upper side
            double fillFactorScale = 0.15 * fillFactor;
            double widthForSplittingLower = shapeWidthMean * (1.6 + fillFactorScale);
            double widthForSplittingUpper = shapeWidthMean * (2.2 + fillFactorScale);

            LOG.debug("widthForSplittingLower: " + widthForSplittingLower);
            LOG.debug("widthForSplittingUpper: " + widthForSplittingUpper);
            LOG.debug("maxWidthForSplit: " + maxWidthForSplit);

            List<Shape> candidates = new ArrayList<Shape>();
            for (RowOfShapes row : rowCluster) {
                LOG.debug("Next row " + row.getIndex());
                for (Shape shape : row.getShapes()) {
                    LOG.trace("Shape width " + shape.getWidth());
                    if (shape.getWidth() > widthForSplittingLower && shape.getWidth() < maxWidthForSplit) {
                        candidates.add(shape);
                        LOG.debug("Found candidate with width " + shape.getWidth() + ": " + shape);
                    }
                }
            }

            if (!candidates.isEmpty()) {
                // we'll take a random sampling of shapes for the next parameters
                int sampleSize = 30;
                List<Shape> sample = this.getSample(rowCluster, sampleSize, true);

                Mean meanPixelCount = new Mean();
                Vectorizer vectorizer = this.graphicsService.getVectorizer();
                List<Integer> thicknesses = new ArrayList<Integer>();
                for (Shape shape : sample) {
                    BitSet bitset = shape.getBlackAndWhiteBitSet(sourceImage.getSeparationThreshold(), 0);
                    meanPixelCount.increment(bitset.cardinality());
                    List<LineSegment> vectors = vectorizer.vectorize(shape);

                    int height = shape.getHeight();
                    // floating-point division: with integer division the ceil was a no-op
                    // and the step was 0 for any shape less than 8 pixels high
                    int sampleStep = (int) Math.ceil(height / 8.0);
                    for (LineSegment vector : vectors) {
                        List<Integer> vectorThickness = vector.getLineDefinition().findArrayListThickness(shape, vector.getStartX(),
                                vector.getStartY(), vector.getLength(), sourceImage.getSeparationThreshold(), 0, sampleStep);
                        thicknesses.addAll(vectorThickness);
                    }
                }

                double pixelCountMean = meanPixelCount.getResult();

                Mean meanThickness = new Mean();
                for (int thickness : thicknesses) {
                    meanThickness.increment(thickness);
                }
                double overallThicknessMean = meanThickness.getResult();

                // take the mean of the measurements lying strictly below the overall mean
                Mean meanThinThickness = new Mean();
                int thinCount = 0;
                for (int thickness : thicknesses) {
                    if (thickness < overallThicknessMean) {
                        meanThinThickness.increment(thickness);
                        thinCount++;
                    }
                }
                // if nothing lies strictly below the mean (e.g. all measurements are equal),
                // the second mean would be NaN: keep the overall mean instead
                double thicknessMean = thinCount > 0 ? meanThinThickness.getResult() : overallThicknessMean;
                LOG.debug("thicknessMean: " + thicknessMean);

                // 2) maxBridgeWidth: the maximum bridge width to use as a dividing bridge between two shapes when splitting
                double maxBridgeWidthLower = thicknessMean * 0.5;
                double maxBridgeWidthUpper = thicknessMean * 0.8;
                LOG.debug("maxBridgeWidthLower: " + maxBridgeWidthLower);
                LOG.debug("maxBridgeWidthUpper: " + maxBridgeWidthUpper);

                // 3) minLetterWeight: the minimum pixel count that can represent a separate letter when splitting
                int minLetterWeight = (int) Math.floor(pixelCountMean / 4.0);
                LOG.debug("minLetterWeight: " + minLetterWeight);

                // 4) maxHorizontalOverlap: the maximum horizontal overlap between the left-hand and right-hand shape
                int maxOverlap = (int) Math.ceil(shapeWidthMean / 8.0);
                LOG.debug("maxOverlap: " + maxOverlap);

                Map<Shape, List<Shape>> shapesToSplit = new Hashtable<Shape, List<Shape>>();
                for (Shape candidate : candidates) {
                    LOG.debug("Trying to split candidate " + candidate);

                    // ASCII dump of the candidate, one log line per pixel row;
                    // only built when it will actually be logged
                    if (LOG.isDebugEnabled()) {
                        for (int y = 0; y < candidate.getHeight(); y++) {
                            StringBuilder line = new StringBuilder();
                            if (y == candidate.getMeanLine())
                                line.append("M");
                            else if (y == candidate.getBaseLine())
                                line.append("B");
                            else
                                line.append(y);
                            for (int x = 0; x < candidate.getWidth(); x++) {
                                if (candidate.isPixelBlack(x, y, sourceImage.getBlackThreshold()))
                                    line.append("x");
                                else
                                    line.append("o");
                            }
                            LOG.debug(line.toString());
                        }
                    }

                    if (candidate.getHeight() < 3.0 * maxBridgeWidthUpper) {
                        LOG.debug("Shape too narrow - probably a long dash.");
                        continue;
                    }

                    int maxBridgeWidth;
                    if (candidate.getWidth() > widthForSplittingUpper)
                        maxBridgeWidth = (int) Math.ceil(maxBridgeWidthUpper);
                    else {
                        // since many bridges are thicker than expected
                        // add a rule that the thicker the bridge is, the wider the image needs to be
                        maxBridgeWidth = (int) Math.ceil(maxBridgeWidthLower + (((double) candidate.getWidth() - widthForSplittingLower)
                                / (widthForSplittingUpper - widthForSplittingLower) * (maxBridgeWidthUpper - maxBridgeWidthLower)));
                    }

                    List<Shape> splitShapes = this.splitShape(candidate, sourceImage, maxBridgeWidth, minLetterWeight, maxOverlap);
                    if (splitShapes.size() > 1) {
                        LOG.debug("Split found");
                        for (Shape splitShape : splitShapes) {
                            splitShape.setRow(candidate.getRow());
                        }
                        shapesToSplit.put(candidate, splitShapes);
                    }
                }

                LOG.debug("Replacing shapes with split shapes");
                List<RowOfShapes> rowsToReorder = new ArrayList<RowOfShapes>();
                for (Map.Entry<Shape, List<Shape>> entry : shapesToSplit.entrySet()) {
                    Shape shape = entry.getKey();
                    List<Shape> newShapes = entry.getValue();
                    RowOfShapes row = shape.getRow();
                    row.removeShape(shape);
                    row.addShapes(newShapes);
                    rowsToReorder.add(row);
                }

                for (RowOfShapes row : rowsToReorder)
                    row.reorderShapes();
            }
        }
        LOG.debug("splitShapes complete");
    }

    /**
     * Split a shape into 2 or more shapes, in the case where two letters have been mistakenly joined together.
* @param shape the shape to split * @param sourceImage the source image containing this shape * @param maxBridgeWidth maximum width of a bridge between the two letters (measured vertically) * @param minLetterWeight minimum pixel count for a shape portion to be counted a separate letter * @param maxOverlap maximum vertical overlap (in pixels) between a right-hand and left-hand shape to be counted as separate letters * @return List of Shape, where the list is empty if no split was performed */ List<Shape> splitShape(Shape shape, SourceImage sourceImage, int maxBridgeWidth, int minLetterWeight, int maxOverlap) { LOG.debug("Trying to split shape: " + shape.toString()); LOG.debug("maxBridgeWidth " + maxBridgeWidth); LOG.debug("minLetterWeight " + minLetterWeight); LOG.debug("maxOverlap " + maxOverlap); Collection<BridgeCandidate> bridgeCandidates = ((ShapeInternal) shape).getBridgeCandidates(maxBridgeWidth); if (bridgeCandidates.size() > 0) { // (B) weight of right shape & weight of left shape > a certain threshold // (C) little overlap right boundary of left shape, left boundary of right shape // check if the right and left weight of each bridge candidate is sufficiently big LOG.debug("minLetterWeight: " + minLetterWeight); LOG.debug("maxOverlap: " + maxOverlap); LOG.debug("Eliminating candidates based on pixel count and overlap"); Set<BridgeCandidate> candidatesToEliminate = new HashSet<BridgeCandidate>(); for (BridgeCandidate candidate : bridgeCandidates) { LOG.debug("Bridge candidate: leftPixels = " + candidate.leftPixels + ", rightPixels = " + candidate.rightPixels); LOG.debug("leftShapeRightBoundary = " + candidate.leftShapeRightBoundary + ", rightShapeLeftBoundary = " + candidate.rightShapeLeftBoundary); boolean isBridge = true; if (candidate.rightPixels < minLetterWeight || candidate.leftPixels < minLetterWeight) isBridge = false; if (candidate.leftShapeRightBoundary - candidate.rightShapeLeftBoundary > maxOverlap) isBridge = false; if (!isBridge) 
candidatesToEliminate.add(candidate); } bridgeCandidates.removeAll(candidatesToEliminate); LOG.debug("Remaining bridge candidate size: " + bridgeCandidates.size()); } // have candidates List<Shape> shapes = new ArrayList<Shape>(); // apply any splits detected if (bridgeCandidates.size() > 0) { int[] startingPoint = shape.getStartingPoint(); int startX = startingPoint[0]; int startY = startingPoint[1]; for (BridgeCandidate bridge : bridgeCandidates) { bridge.leftGroup.touched = false; bridge.rightGroup.touched = false; } // perform split for (BridgeCandidate bridge : bridgeCandidates) { Shape leftShape = graphicsService.getDot(sourceImage, startX, startY); leftShape.setLeft(shape.getRight()); leftShape.setRight(shape.getLeft()); leftShape.setTop(shape.getBottom()); leftShape.setBottom(shape.getTop()); Shape rightShape = graphicsService.getDot(sourceImage, startX, startY); rightShape.setLeft(shape.getRight()); rightShape.setRight(shape.getLeft()); rightShape.setTop(shape.getBottom()); rightShape.setBottom(shape.getTop()); Stack<VerticalLineGroup> groupStack = new Stack<VerticalLineGroup>(); groupStack.push(bridge.leftGroup); while (!groupStack.isEmpty()) { VerticalLineGroup lineGroup = groupStack.pop(); if (lineGroup.touched) continue; lineGroup.touched = true; LOG.debug("Touching group, pixelCount: " + lineGroup.pixelCount + ", leftBoundary: " + lineGroup.leftBoundary + ", rightBoundary: " + lineGroup.rightBoundary); if (shape.getLeft() + lineGroup.leftBoundary < leftShape.getLeft()) leftShape.setLeft(shape.getLeft() + lineGroup.leftBoundary); if (shape.getLeft() + lineGroup.rightBoundary > leftShape.getRight()) leftShape.setRight(shape.getLeft() + lineGroup.rightBoundary); if (shape.getTop() + lineGroup.topBoundary < leftShape.getTop()) leftShape.setTop(shape.getTop() + lineGroup.topBoundary); if (shape.getTop() + lineGroup.bottomBoundary > leftShape.getBottom()) leftShape.setBottom(shape.getTop() + lineGroup.bottomBoundary); for (BridgeCandidate leftCandidate : 
lineGroup.leftCandidates) { if (!bridge.equals(leftCandidate) && !(bridgeCandidates.contains(leftCandidate))) { groupStack.push(leftCandidate.leftGroup); } } for (BridgeCandidate rightCandidate : lineGroup.rightCandidates) { if (!bridge.equals(rightCandidate) && !(bridgeCandidates.contains(rightCandidate))) { groupStack.push(rightCandidate.rightGroup); } } } // next left group groupStack.push(bridge.rightGroup); while (!groupStack.isEmpty()) { VerticalLineGroup lineGroup = groupStack.pop(); if (lineGroup.touched) continue; lineGroup.touched = true; LOG.debug("Touching group, pixelCount: " + lineGroup.pixelCount + ", leftBoundary: " + lineGroup.leftBoundary + ", rightBoundary: " + lineGroup.rightBoundary); if (shape.getLeft() + lineGroup.leftBoundary < rightShape.getLeft()) rightShape.setLeft(shape.getLeft() + lineGroup.leftBoundary); if (shape.getLeft() + lineGroup.rightBoundary > rightShape.getRight()) rightShape.setRight(shape.getLeft() + lineGroup.rightBoundary); if (shape.getTop() + lineGroup.topBoundary < rightShape.getTop()) rightShape.setTop(shape.getTop() + lineGroup.topBoundary); if (shape.getTop() + lineGroup.bottomBoundary > rightShape.getBottom()) rightShape.setBottom(shape.getTop() + lineGroup.bottomBoundary); for (BridgeCandidate leftCandidate : lineGroup.leftCandidates) { if (!bridge.equals(leftCandidate) && !(bridgeCandidates.contains(leftCandidate))) { groupStack.push(leftCandidate.leftGroup); } } for (BridgeCandidate rightCandidate : lineGroup.rightCandidates) { if (!bridge.equals(rightCandidate) && !(bridgeCandidates.contains(rightCandidate))) { groupStack.push(rightCandidate.rightGroup); } } } // next right group if (leftShape.getWidth() > 0) { LOG.debug("Adding left split: " + leftShape); shapes.add(leftShape); } if (rightShape.getWidth() > 0) { LOG.debug("Adding right split: " + rightShape); shapes.add(rightShape); } } // next bridge } // do we have any bridges? 
// TODO: we need to join split shapes back together when more than 1 split is applied // and the shape in the middle is too small on its own (< minPixelCount) return shapes; } public GraphicsServiceInternal getGraphicsService() { return graphicsService; } public void setGraphicsService(GraphicsServiceInternal graphicsService) { this.graphicsService = graphicsService; } public boolean isDrawSegmentation() { return drawSegmentation; } public void setDrawSegmentation(boolean drawSegmentation) { this.drawSegmentation = drawSegmentation; } public SourceImage getSourceImage() { return sourceImage; } public BufferedImage getSegmentedImage() { return segmentedImage; } /** * Get a random sample (with replacement) of shapes on this image. * @param sourceImage * @param sampleSize * @return */ List<Shape> getSample(Collection<RowOfShapes> rows, int sampleSize, boolean bigShapesOnly) { double minShapeWidth = 0; double minShapeHeight = 0; double maxShapeWidth = Double.MAX_VALUE; double maxShapeHeight = Double.MAX_VALUE; if (bigShapesOnly) { Mean widthMean = new Mean(); Mean heightMean = new Mean(); for (RowOfShapes row : rows) { for (Shape shape : row.getShapes()) { widthMean.increment(shape.getWidth()); heightMean.increment(shape.getHeight()); } } minShapeWidth = widthMean.getResult(); minShapeHeight = heightMean.getResult(); maxShapeWidth = minShapeWidth * 2.5; maxShapeHeight = minShapeHeight * 2.5; } List<Shape> sample = new ArrayList<Shape>(sampleSize); int countBad = 0; while (sample.size() < sampleSize) { if (countBad >= 10) { minShapeWidth = 0; minShapeHeight = 0; maxShapeWidth = Double.MAX_VALUE; maxShapeHeight = Double.MAX_VALUE; } double random = Math.random(); int rowIndex = (int) Math.floor(random * rows.size()); Iterator<RowOfShapes> iRows = rows.iterator(); RowOfShapes row = null; for (int i = 0; i <= rowIndex; i++) { row = iRows.next(); } random = Math.random(); int index = (int) Math.floor(random * row.getShapes().size()); Shape shape = 
row.getShapes().get(index); if (shape.getWidth() > minShapeWidth && shape.getHeight() > minShapeHeight && shape.getWidth() < maxShapeWidth && shape.getHeight() < maxShapeHeight) { sample.add(shape); countBad = 0; } else { countBad++; } } return sample; } int getShapeCount(SourceImage sourceImage) { int totalShapeCount = 0; for (Paragraph paragraph : sourceImage.getParagraphs()) { for (RowOfShapes row : paragraph.getRows()) { for (GroupOfShapes group : row.getGroups()) { totalShapeCount += group.getShapes().size(); } } } return totalShapeCount; } @Override public ProgressMonitor monitorTask() { currentMonitor = new SimpleProgressMonitor(); return currentMonitor; } /** * Should we split and join shapes initially or not. * If not, it is assumed a machine learning algorithm will do this later for us. * @return */ public boolean isSplitAndJoin() { return splitAndJoin; } public void setSplitAndJoin(boolean splitAndJoin) { this.splitAndJoin = splitAndJoin; } }