List of usage examples for java.util.TreeSet.addAll
public boolean addAll(Collection<? extends E> c)
From source file:com.datatorrent.stram.StreamingContainerManager.java
/** * Compute checkpoints required for a given operator instance to be recovered. * This is done by looking at checkpoints available for downstream dependencies first, * and then selecting the most recent available checkpoint that is smaller than downstream. * * @param operator Operator instance for which to find recovery checkpoint * @param ctx Context into which to collect traversal info */// w w w. j ava2s . c o m public void updateRecoveryCheckpoints(PTOperator operator, UpdateCheckpointsContext ctx) { if (operator.getRecoveryCheckpoint().windowId < ctx.committedWindowId.longValue()) { ctx.committedWindowId.setValue(operator.getRecoveryCheckpoint().windowId); } if (operator.getState() == PTOperator.State.ACTIVE && (ctx.currentTms - operator.stats.lastWindowIdChangeTms) > operator.stats.windowProcessingTimeoutMillis) { // if the checkpoint is ahead, then it is not blocked but waiting for activation (state-less recovery, at-most-once) if (ctx.committedWindowId.longValue() >= operator.getRecoveryCheckpoint().windowId) { LOG.debug("Marking operator {} blocked committed window {}, recovery window {}", operator, Codec.getStringWindowId(ctx.committedWindowId.longValue()), Codec.getStringWindowId(operator.getRecoveryCheckpoint().windowId)); ctx.blocked.add(operator); } } // the most recent checkpoint eligible for recovery based on downstream state Checkpoint maxCheckpoint = Checkpoint.INITIAL_CHECKPOINT; Set<OperatorMeta> checkpointGroup = ctx.checkpointGroups.get(operator.getOperatorMeta()); if (checkpointGroup == null) { checkpointGroup = Collections.singleton(operator.getOperatorMeta()); } // find intersection of checkpoints that group can collectively move to TreeSet<Checkpoint> commonCheckpoints = new TreeSet<>(new Checkpoint.CheckpointComparator()); synchronized (operator.checkpoints) { commonCheckpoints.addAll(operator.checkpoints); } Set<PTOperator> groupOpers = new HashSet<>(checkpointGroup.size()); boolean pendingDeploy = operator.getState() == 
PTOperator.State.PENDING_DEPLOY; if (checkpointGroup.size() > 1) { for (OperatorMeta om : checkpointGroup) { Collection<PTOperator> operators = plan.getAllOperators(om); for (PTOperator groupOper : operators) { synchronized (groupOper.checkpoints) { commonCheckpoints.retainAll(groupOper.checkpoints); } // visit all downstream operators of the group ctx.visited.add(groupOper); groupOpers.add(groupOper); pendingDeploy |= operator.getState() == PTOperator.State.PENDING_DEPLOY; } } // highest common checkpoint if (!commonCheckpoints.isEmpty()) { maxCheckpoint = commonCheckpoints.last(); } } else { // without logical grouping, treat partitions as independent // this is especially important for parallel partitioning ctx.visited.add(operator); groupOpers.add(operator); maxCheckpoint = operator.getRecentCheckpoint(); if (ctx.recovery && maxCheckpoint.windowId == Stateless.WINDOW_ID && operator.isOperatorStateLess()) { long currentWindowId = WindowGenerator.getWindowId(ctx.currentTms, this.vars.windowStartMillis, this.getLogicalPlan().getValue(LogicalPlan.STREAMING_WINDOW_SIZE_MILLIS)); maxCheckpoint = new Checkpoint(currentWindowId, 0, 0); } } // DFS downstream operators for (PTOperator groupOper : groupOpers) { for (PTOperator.PTOutput out : groupOper.getOutputs()) { for (PTOperator.PTInput sink : out.sinks) { PTOperator sinkOperator = sink.target; if (groupOpers.contains(sinkOperator)) { continue; // downstream operator within group } if (!ctx.visited.contains(sinkOperator)) { // downstream traversal updateRecoveryCheckpoints(sinkOperator, ctx); } // recovery window id cannot move backwards // when dynamically adding new operators if (sinkOperator.getRecoveryCheckpoint().windowId >= operator .getRecoveryCheckpoint().windowId) { maxCheckpoint = Checkpoint.min(maxCheckpoint, sinkOperator.getRecoveryCheckpoint()); } if (ctx.blocked.contains(sinkOperator)) { if (sinkOperator.stats.getCurrentWindowId() == operator.stats.getCurrentWindowId()) { // downstream operator is 
blocked by this operator ctx.blocked.remove(sinkOperator); } } } } } // find the common checkpoint that is <= downstream recovery checkpoint if (!commonCheckpoints.contains(maxCheckpoint)) { if (!commonCheckpoints.isEmpty()) { maxCheckpoint = Objects.firstNonNull(commonCheckpoints.floor(maxCheckpoint), maxCheckpoint); } } for (PTOperator groupOper : groupOpers) { // checkpoint frozen during deployment if (!pendingDeploy || ctx.recovery) { // remove previous checkpoints Checkpoint c1 = Checkpoint.INITIAL_CHECKPOINT; LinkedList<Checkpoint> checkpoints = groupOper.checkpoints; synchronized (checkpoints) { if (!checkpoints.isEmpty() && (checkpoints.getFirst()).windowId <= maxCheckpoint.windowId) { c1 = checkpoints.getFirst(); Checkpoint c2; while (checkpoints.size() > 1 && ((c2 = checkpoints.get(1)).windowId) <= maxCheckpoint.windowId) { checkpoints.removeFirst(); //LOG.debug("Checkpoint to delete: operator={} windowId={}", operator.getName(), c1); this.purgeCheckpoints.add(new Pair<PTOperator, Long>(groupOper, c1.windowId)); c1 = c2; } } else { if (ctx.recovery && checkpoints.isEmpty() && groupOper.isOperatorStateLess()) { LOG.debug("Adding checkpoint for stateless operator {} {}", groupOper, Codec.getStringWindowId(maxCheckpoint.windowId)); c1 = groupOper.addCheckpoint(maxCheckpoint.windowId, this.vars.windowStartMillis); } } } //LOG.debug("Operator {} checkpoints: commit {} recent {}", new Object[] {operator.getName(), c1, operator.checkpoints}); groupOper.setRecoveryCheckpoint(c1); } else { LOG.debug("Skipping checkpoint update {} during {}", groupOper, groupOper.getState()); } } }
From source file:com.joliciel.jochre.graphics.SegmenterImpl.java
void addRowsToJochreImage(SourceImage sourceImage, List<RowOfShapes> rows) { LOG.debug("########## addRowsToJochreImage #########"); sourceImage.getRows().clear();/*from ww w .j av a2 s. c om*/ TreeSet<RowOfShapes> rowSet = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator()); rowSet.addAll(rows); int i = 0; LOG.debug("====== Row list ========"); for (RowOfShapes row : rowSet) { // order the shapes within the rows // here is where left-to-right or right-to-left matters row.reorderShapes(); sourceImage.addRow(row); int oldIndex = row.getIndex(); row.setIndex(i++); LOG.debug(row.toString() + " (old index = " + oldIndex + ")"); } }
From source file:net.spfbl.core.User.java
/**
 * Returns a fresh, naturally ordered copy of the keys of
 * {@code queryMap.headMap(threshold)}.
 *
 * @param threshold upper bound passed to {@code headMap}
 *        (presumably exclusive, as for {@code SortedMap} - verify against the field's type)
 * @return a new set, empty when {@code queryMap} is {@code null}
 */
public synchronized Set<Long> headSet(long threshold) {
    TreeSet<Long> result = new TreeSet<Long>();
    if (queryMap != null) {
        result.addAll(queryMap.headMap(threshold).keySet());
    }
    return result;
}
From source file:com.joliciel.jochre.graphics.SegmenterImpl.java
/** * Combine rows that represent thin lines directly above or below another row * (e.g. diacritics)//from w w w . j a va 2 s . c o m */ void combineRows(SourceImage sourceImage) { LOG.debug("########## combineRows #########"); // We thought of using row height, but mean row height is not a good enough // indicator when there are title rows with very big characters. // Instead, we need to go with Distance between rows when compared to mean - baseline // where distance between rows is measured between the tops and bottoms of nearby shapes. int maxRowHeight = 0; for (RowOfShapes row : sourceImage.getRows()) { int rowHeight = row.getXHeightMax(); if (rowHeight > maxRowHeight) maxRowHeight = rowHeight; } LOG.debug("maxRowHeight: " + maxRowHeight); TreeSet<RowOfShapes> rowSet = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator()); rowSet.addAll(sourceImage.getRows()); List<RowOfShapes> rows = new ArrayList<RowOfShapes>(rowSet); List<RowOfShapes> rowsToDelete = new ArrayList<RowOfShapes>(); double maxShapeWidth = sourceImage.getAverageShapeWidth() * 8.0; LOG.debug("maxShapeWidth: " + maxShapeWidth); double maxRatioForCombine = 0.6; LOG.debug("maxRatioForCombine: " + maxRatioForCombine); int i = 0; while (i < rows.size()) { RowOfShapes currentRow = rows.get(i); boolean rowsCombined = false; if (!rowsToDelete.contains(currentRow)) { LOG.trace("Checking " + currentRow.toString()); int currentRowHeight = currentRow.getXHeightMax(); LOG.trace("xHeightMax = " + currentRowHeight); RowOfShapes nearestRow = null; double shortestDistance = Double.MAX_VALUE; int masterRowHeight = -1; int j = 0; for (RowOfShapes otherRow : rows) { if (!rowsToDelete.contains(otherRow) && !(currentRow.equals(otherRow))) { // limit our search to nearby rows if (Math.abs(currentRow.getBaseLineMiddlePoint() - otherRow.getBaseLineMiddlePoint()) < (2.0 * maxRowHeight) && (currentRow.getRight() >= otherRow.getLeft()) && (otherRow.getRight() >= currentRow.getLeft())) { LOG.trace("Comparing 
to " + otherRow.toString()); int otherRowHeight = otherRow.getXHeightMax(); LOG.trace("xHeightMax = " + otherRowHeight); RowOfShapes masterRow = currentRowHeight > otherRowHeight ? currentRow : otherRow; RowOfShapes slaveRow = currentRowHeight > otherRowHeight ? otherRow : currentRow; double heightRatio = ((double) slaveRow.getXHeightMax() / (double) masterRow.getXHeightMax()); LOG.trace("height ratio (" + slaveRow.getXHeightMax() + " / " + masterRow.getXHeightMax() + "): " + heightRatio); if (heightRatio > maxRatioForCombine) continue; // avoid combining very long horizontal rules with other rows // their top gives a false impression of being closer to the other row's bottom. if ((masterRow.getMaxShapeWidth() > maxShapeWidth || slaveRow.getMaxShapeWidth() > maxShapeWidth)) continue; double distance = 0; if (currentRow.getBaseLineMiddlePoint() < otherRow.getBaseLineMiddlePoint()) { distance = (otherRow.getBaseLineMiddlePoint() - otherRow.getXHeightMax()) - currentRow.getBaseLineMiddlePoint(); LOG.trace("(otherRow.baseLineMiddlePoint() " + otherRow.getBaseLineMiddlePoint() + " - otherRow.getXHeightMax() " + otherRow.getXHeightMax() + ") - currentRow.baseLineMiddlePoint() " + currentRow.getBaseLineMiddlePoint()); } else { distance = (currentRow.getBaseLineMiddlePoint() - currentRow.getXHeightMax()) - otherRow.getBaseLineMiddlePoint(); LOG.trace("(currentRow.baseLineMiddlePoint() " + currentRow.getBaseLineMiddlePoint() + " - currentRow.getXHeightMax() " + currentRow.getXHeightMax() + ") - otherRow.baseLineMiddlePoint() " + otherRow.getBaseLineMiddlePoint()); } LOG.debug("Distance between rows: " + distance); if (distance < shortestDistance) { LOG.trace("Found new closest row: " + otherRow); nearestRow = otherRow; shortestDistance = distance; masterRowHeight = (currentRowHeight >= otherRowHeight) ? 
currentRowHeight : otherRowHeight; } } } j++; } if (nearestRow != null) { // The number 3 below is chosen arbitrarily - basically we want a // relative way of indicating that the rows are very near to each other. double minDistanceForCombine = ((double) masterRowHeight / 3); LOG.trace("minDistanceForCombine: " + minDistanceForCombine); if (shortestDistance < minDistanceForCombine) { LOG.debug("Combining the two rows"); LOG.debug(currentRow.toString()); LOG.debug(nearestRow.toString()); rowsToDelete.add(nearestRow); currentRow.addShapes(nearestRow.getShapes()); currentRow.reorderShapes(); currentRow.recalculate(); this.joinShapesVertically(currentRow); currentRow.assignGuideLines(); LOG.debug("Resulting row: " + currentRow.toString()); rowsCombined = true; } } } // We may need to combine multiple rows // so we only advance if no combination has taken place if (!rowsCombined) i++; } // actually delete the rows for (RowOfShapes rowToDelete : rowsToDelete) { sourceImage.getRows().remove(rowToDelete); } LOG.debug("########## end combineRows #########"); }
From source file:net.spfbl.core.User.java
public synchronized TreeSet<Long> getTimeSet() { if (queryMap == null) { return new TreeSet<Long>(); } else {//from w w w . j a v a 2s. c o m TreeSet<Long> timeSet = new TreeSet<Long>(); timeSet.addAll(queryMap.keySet()); return timeSet; } }
From source file:net.spfbl.core.User.java
/**
 * Returns a fresh, naturally ordered copy of the keys of
 * {@code queryMap.subMap(begin, end)}.
 *
 * @param begin lower bound passed to {@code subMap}
 * @param end upper bound passed to {@code subMap}
 *        (presumably begin-inclusive / end-exclusive, as for {@code SortedMap} - verify)
 * @return a new set, empty when {@code queryMap} is {@code null}
 */
public synchronized TreeSet<Long> getTimeSet(long begin, long end) {
    TreeSet<Long> result = new TreeSet<Long>();
    if (queryMap != null) {
        result.addAll(queryMap.subMap(begin, end).keySet());
    }
    return result;
}
From source file:org.chiba.tools.schemabuilder.AbstractSchemaFormBuilder.java
/** * Build the type tree//from w ww . ja v a 2s .c o m */ /*private void buildTypeTree(XSTypeDefinition type, TreeSet descendents) { if (type != null) { if (descendents.size() > 0) { TreeSet compatibleTypes = (TreeSet) typeTree.get(type.getName()); if (compatibleTypes == null) { compatibleTypes = new TreeSet(descendents); typeTree.put(type.getName(), compatibleTypes); } else { compatibleTypes.addAll(descendents); } } XSTypeDefinition parentType = type.getBaseType(); if (parentType != null && type.getTypeCategory() == parentType.getTypeCategory()) { String typeName = type.getName(); String parentTypeName = parentType.getName(); if ((typeName == null && parentTypeName != null) || (typeName != null && parentTypeName == null) || (typeName != null && parentTypeName != null && !type.getName().equals(parentType.getName()) && !parentType.getName().equals("anyType"))) { TreeSet newDescendents=new TreeSet(descendents); //extension (we only add it to "newDescendants" because we don't want //to have a type descendant to itself, but to consider it for the parent if (type.getTypeCategory() == XSTypeDefinition.COMPLEX_TYPE) { XSComplexTypeDefinition complexType = (XSComplexTypeDefinition) type; if (complexType.getDerivationMethod() == XSConstants.DERIVATION_EXTENSION && !complexType.getAbstract() && !descendents.contains(type.getName()) //to be tested ) { newDescendents.add(type.getName()); } } //note: extensions are impossible on simpleTypes ! 
buildTypeTree(parentType, newDescendents); } } } }*/ private void buildTypeTree(XSTypeDefinition type, TreeSet descendents) { if (type != null) { if (descendents.size() > 0) { //TreeSet compatibleTypes = (TreeSet) typeTree.get(type.getName()); TreeSet compatibleTypes = (TreeSet) typeTree.get(type.getName()); if (compatibleTypes == null) { //compatibleTypes = new TreeSet(descendents); compatibleTypes = new TreeSet(TypeExtensionSorter.getInstance()); compatibleTypes.addAll(descendents); //typeTree.put(type.getName(), compatibleTypes); typeTree.put(type.getName(), compatibleTypes); } else { compatibleTypes.addAll(descendents); } } XSTypeDefinition parentType = type.getBaseType(); if (parentType != null && type.getTypeCategory() == parentType.getTypeCategory()) { /*String typeName = type.getName(); String parentTypeName = parentType.getName(); if ((typeName == null && parentTypeName != null) || (typeName != null && parentTypeName == null) || (typeName != null && parentTypeName != null && !type.getName().equals(parentType.getName()) && !parentType.getName().equals("anyType"))) {*/ if (type != parentType && (parentType.getName() == null || !parentType.getName().equals("anyType"))) { //TreeSet newDescendents=new TreeSet(descendents); TreeSet newDescendents = new TreeSet(TypeExtensionSorter.getInstance()); newDescendents.addAll(descendents); //extension (we only add it to "newDescendants" because we don't want //to have a type descendant to itself, but to consider it for the parent if (type.getTypeCategory() == XSTypeDefinition.COMPLEX_TYPE) { XSComplexTypeDefinition complexType = (XSComplexTypeDefinition) type; if (complexType.getDerivationMethod() == XSConstants.DERIVATION_EXTENSION && !complexType.getAbstract() && !descendents.contains(type) //to be tested //&& !descendents.contains(type.getName()) //to be tested ) { //newDescendents.add(type.getName()); newDescendents.add(type); } } //note: extensions are impossible on simpleTypes ! 
buildTypeTree(parentType, newDescendents); } } } }
From source file:com.joliciel.jochre.graphics.SegmenterImpl.java
/** * Clear out anything found in the right & left margins * @param sourceImage/* ww w .j ava2 s. c o m*/ */ void cleanMargins(SourceImage sourceImage) { LOG.debug("########## cleanMargins #########"); int minCardinalityForMargin = 8; double averageShapeWidth = sourceImage.getAverageShapeWidth(); LOG.debug("Finding right margin"); double rightLimit = (double) sourceImage.getWidth() * 0.67; // first, create a DBScan cluster of all rows near the right-hand side List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>(); List<double[]> rightCoordinates = new ArrayList<double[]>(); for (RowOfShapes row : sourceImage.getRows()) { double right = row.getRight(); if (right >= rightLimit) { LOG.trace(row.toString()); LOG.trace( "Right: " + right + " + " + row.getXAdjustment() + " = " + (right - row.getXAdjustment())); right -= row.getXAdjustment(); rightHandRows.add(row); rightCoordinates.add(new double[] { right }); } } DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows, rightCoordinates); Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(averageShapeWidth, minCardinalityForMargin, true); TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>( new CardinalityComparator<RowOfShapes>()); orderedRowClusters.addAll(rowClusters); int i = 0; // find the right-most cluster with sufficient cardinality, and assume it's the right margin DescriptiveStatistics rightMarginStats = null; for (Set<RowOfShapes> cluster : orderedRowClusters) { DescriptiveStatistics rightStats = new DescriptiveStatistics(); for (RowOfShapes row : cluster) rightStats.addValue(row.getRight() - row.getXAdjustment()); LOG.debug("Cluster " + i + ". 
Cardinality=" + cluster.size()); LOG.debug("Right mean : " + rightStats.getMean()); LOG.debug("Right std dev: " + rightStats.getStandardDeviation()); if (cluster.size() >= minCardinalityForMargin && (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean())) { rightMarginStats = rightStats; } i++; } // see how many rows would violate this margin - if too many, assume no margin // these rows are only rows which extend across the margin if (rightMarginStats != null) { LOG.debug("Right margin mean : " + rightMarginStats.getMean()); LOG.debug("Right margin std dev: " + rightMarginStats.getStandardDeviation()); double rightMarginLimit = rightMarginStats.getMean() + sourceImage.getAverageShapeWidth(); LOG.debug("rightMarginLimit: " + rightMarginLimit); int numRowsToChop = 0; for (RowOfShapes row : sourceImage.getRows()) { if (row.getRight() >= rightLimit) { if (row.getRight() - row.getXAdjustment() >= rightMarginLimit && row.getLeft() - row.getXAdjustment() <= rightMarginLimit) { LOG.debug("Found overlapping row : " + row); LOG.debug("Adjusted right : " + (row.getRight() - row.getXAdjustment())); numRowsToChop++; } } } if (numRowsToChop >= 3) { LOG.debug("Too many overlapping rows - ignoring margin"); rightMarginStats = null; } } if (rightMarginStats != null) { double rightMarginLimit = rightMarginStats.getMean() + sourceImage.getAverageShapeWidth(); List<RowOfShapes> rowsToRemove = new ArrayList<RowOfShapes>(); for (RowOfShapes row : sourceImage.getRows()) { double right = row.getRight() - row.getXAdjustment(); LOG.trace(row.toString()); LOG.trace("Adjusted right: " + right); if (right >= rightMarginLimit) { LOG.trace("Has out-of-margin stuff!"); // need to chop off groups to the right of this threshold List<GroupOfShapes> groupsToChop = new ArrayList<GroupOfShapes>(); for (GroupOfShapes group : row.getGroups()) { if (group.getLeft() - row.getXAdjustment() > rightMarginLimit) { groupsToChop.add(group); LOG.debug("Chopping group outside of right 
margin: " + group); } } for (GroupOfShapes group : groupsToChop) { row.getShapes().removeAll(group.getShapes()); } row.getGroups().removeAll(groupsToChop); if (row.getGroups().size() == 0) { LOG.debug("Removing empty " + row); rowsToRemove.add(row); } else { row.recalculate(); row.assignGuideLines(); } } // does this row extend beyond the margin? } // next row sourceImage.getRows().removeAll(rowsToRemove); } // have a right margin LOG.debug("Finding left margin"); double leftLimit = (double) sourceImage.getWidth() * 0.33; // first, create a DBScan cluster of all rows near the left-hand side List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>(); List<double[]> leftCoordinates = new ArrayList<double[]>(); for (RowOfShapes row : sourceImage.getRows()) { double left = row.getLeft(); if (left <= leftLimit) { LOG.trace(row.toString()); LOG.trace("Left: " + left + " - " + row.getXAdjustment() + " = " + (left - row.getXAdjustment())); left -= row.getXAdjustment(); leftHandRows.add(row); leftCoordinates.add(new double[] { left }); } } DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows, leftCoordinates); Set<Set<RowOfShapes>> rowClustersLeft = leftMarginClusterer.cluster(averageShapeWidth, minCardinalityForMargin, true); TreeSet<Set<RowOfShapes>> orderedRowClustersLeft = new TreeSet<Set<RowOfShapes>>( new CardinalityComparator<RowOfShapes>()); orderedRowClustersLeft.addAll(rowClustersLeft); i = 0; // find the left-most cluster with sufficient cardinality, and assume it's the left margin DescriptiveStatistics leftMarginStats = null; for (Set<RowOfShapes> cluster : orderedRowClustersLeft) { DescriptiveStatistics leftStats = new DescriptiveStatistics(); for (RowOfShapes row : cluster) leftStats.addValue(row.getLeft() - row.getXAdjustment()); LOG.debug("Cluster " + i + ". 
Cardinality=" + cluster.size()); LOG.debug("Left mean : " + leftStats.getMean()); LOG.debug("Left std dev: " + leftStats.getStandardDeviation()); if (cluster.size() >= minCardinalityForMargin && (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean())) { leftMarginStats = leftStats; } i++; } // see how many rows would violate this margin - if too many, assume no margin // these rows are only rows which extend across the margin if (leftMarginStats != null) { LOG.debug("Left margin mean : " + leftMarginStats.getMean()); LOG.debug("Left margin std dev: " + leftMarginStats.getStandardDeviation()); double leftMarginLimit = leftMarginStats.getMean() - sourceImage.getAverageShapeWidth(); LOG.debug("leftMarginLimit: " + leftMarginLimit); int numRowsToChop = 0; for (RowOfShapes row : sourceImage.getRows()) { if (row.getLeft() <= leftLimit) { if (row.getLeft() - row.getXAdjustment() <= leftMarginLimit && row.getRight() - row.getXAdjustment() >= leftMarginLimit) { LOG.debug("Found overlapping row : " + row); LOG.debug("Adjusted left : " + (row.getLeft() - row.getXAdjustment())); numRowsToChop++; } } } if (numRowsToChop >= 3) { LOG.debug("Too many overlapping rows - ignoring margin"); leftMarginStats = null; } } if (leftMarginStats != null) { double leftMarginLimit = leftMarginStats.getMean() - sourceImage.getAverageShapeWidth(); List<RowOfShapes> rowsToRemove = new ArrayList<RowOfShapes>(); for (RowOfShapes row : sourceImage.getRows()) { double left = row.getLeft() - row.getXAdjustment(); LOG.trace(row.toString()); LOG.trace("Adjusted left: " + left); if (left <= leftMarginLimit) { LOG.trace("Has out-of-margin stuff!"); // need to chop off groups to the left of this threshold List<GroupOfShapes> groupsToChop = new ArrayList<GroupOfShapes>(); for (GroupOfShapes group : row.getGroups()) { if (group.getRight() - row.getXAdjustment() < leftMarginLimit) { groupsToChop.add(group); LOG.debug("Chopping group outside of left margin: " + group); } } for 
(GroupOfShapes group : groupsToChop) { row.getShapes().removeAll(group.getShapes()); } row.getGroups().removeAll(groupsToChop); if (row.getGroups().size() == 0) { LOG.debug("Removing empty " + row); rowsToRemove.add(row); } else { row.recalculate(); row.assignGuideLines(); } } // does this row extend beyond the margin? } // next row sourceImage.getRows().removeAll(rowsToRemove); } // have a left margin }
From source file:com.joliciel.jochre.graphics.SegmenterImpl.java
/** * Detects paragraph splits and assign rows to correct paragraphs. * @param sourceImage/*from w ww . j a va 2s. c o m*/ */ void groupRowsIntoParagraphs(SourceImage sourceImage) { LOG.debug("########## groupRowsIntoParagraphs #########"); // We'll use various possible indicators, including // indented start, indented end, and spacing between rows. // On pages with a single big paragraph makes it hypersensitive to differences in row-start/row-end // This means we cannot use deviation. Instead, we use the average shape width on the page. // We also adjust maxLeft & minRight to match the vertical line slope // This is now complicated by the possibility of multiple columns // Need to take into account a big horizontal space - Pietrushka page 14 // Find horizontal spaces that go all the way across and are wider than a certain threshold // simply do a boolean column and black out everything in a row, than see if there are any remaining spaces above a certain threshold // Columns are thus arranged into "areas", separated by white-space. 
boolean[] fullRows = new boolean[sourceImage.getHeight()]; for (RowOfShapes row : sourceImage.getRows()) { for (int y = row.getTop(); y <= row.getBottom(); y++) { fullRows[y] = true; } } DescriptiveStatistics rowHeightStats = new DescriptiveStatistics(); for (RowOfShapes row : sourceImage.getRows()) { int height = row.getXHeight(); rowHeightStats.addValue(height); } double avgRowHeight = rowHeightStats.getPercentile(50); LOG.debug("meanRowHeight: " + avgRowHeight); double minHeightForWhiteSpace = avgRowHeight * 1.3; LOG.debug("minHeightForWhiteSpace: " + minHeightForWhiteSpace); // find the "white rows" - any horizontal white space // in the page which is sufficiently high List<int[]> whiteRows = new ArrayList<int[]>(); boolean inWhite = false; int startWhite = 0; for (int y = 0; y < sourceImage.getHeight(); y++) { if (!inWhite && !fullRows[y]) { inWhite = true; startWhite = y; } else if (inWhite && fullRows[y]) { int length = y - startWhite; if (length > minHeightForWhiteSpace) { LOG.debug("Adding whiteRow " + startWhite + "," + (y - 1)); whiteRows.add(new int[] { startWhite, y - 1 }); } inWhite = false; } } if (inWhite) whiteRows.add(new int[] { startWhite, sourceImage.getHeight() - 1 }); whiteRows.add(new int[] { sourceImage.getHeight(), sourceImage.getHeight() }); // place rows in "areas" defined by the "white rows" found above List<List<RowOfShapes>> areas = new ArrayList<List<RowOfShapes>>(); int startY = -1; for (int[] whiteRow : whiteRows) { List<RowOfShapes> area = new ArrayList<RowOfShapes>(); for (RowOfShapes row : sourceImage.getRows()) { if (row.getTop() >= startY && row.getBottom() <= whiteRow[0]) { area.add(row); } } if (area.size() > 0) { areas.add(area); } startY = whiteRow[1]; } // break up each area into vertical columns LOG.debug("break up each area into vertical columns"); List<Column> columns = new ArrayList<Column>(); List<List<Column>> columnsPerAreaList = new ArrayList<List<Column>>(); for (List<RowOfShapes> area : areas) { LOG.debug("Next 
area"); List<Column> columnsPerArea = new ArrayList<SegmenterImpl.Column>(); columnsPerAreaList.add(columnsPerArea); TreeSet<RowOfShapes> rows = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator()); rows.addAll(area); for (RowOfShapes row : rows) { // try to place this row in one of the columns directly above it. // this means that a row which overlaps more than one column has to "close" this column, so it is no longer considered List<Column> overlappingColumns = new ArrayList<Column>(); for (Column column : columnsPerArea) { if (!column.closed) { RowOfShapes lastRowInColumn = column.get(column.size() - 1); if (row.getRight() - row.getXAdjustment() >= lastRowInColumn.getLeft() - lastRowInColumn.getXAdjustment() && row.getLeft() - row.getXAdjustment() <= lastRowInColumn.getRight() - lastRowInColumn.getXAdjustment()) { overlappingColumns.add(column); } } } if (overlappingColumns.size() == 1) { Column myColumn = overlappingColumns.get(0); RowOfShapes lastRowInMyColumn = myColumn.get(0); // close any columns that are now at a distance of more than one row for (Column column : columnsPerArea) { if (!column.closed && !column.equals(myColumn)) { RowOfShapes lastRowInColumn = column.get(column.size() - 1); if (lastRowInMyColumn.getTop() > lastRowInColumn.getBottom()) { column.closed = true; LOG.debug("Closing distant column " + lastRowInColumn); } } } myColumn.add(row); LOG.debug(row.toString()); LOG.debug(" added to column " + lastRowInMyColumn); } else { for (Column overlappingColumn : overlappingColumns) { overlappingColumn.closed = true; RowOfShapes lastRowInColumn = overlappingColumn.get(overlappingColumn.size() - 1); LOG.debug("Closing overlapping column " + lastRowInColumn); } Column myColumn = new Column(sourceImage); myColumn.add(row); LOG.debug("Found new column"); LOG.debug(row.toString()); columns.add(myColumn); columnsPerArea.add(myColumn); } } } // next area for (Column column : columns) column.recalculate(); // Intermediate step to reform 
the vertical columns, if they exist // basically the idea is that if the columns are aligned vertically, then the thresholds for paragraph indents // should be shared, to increase the statistical sample size and reduce anomalies. // We'll assume that two columns from two consecutive areas are in the same vertical group if they overlap with each other horizontally // and don't overlap with any other column in the other column's area. List<List<Column>> columnGroups = new ArrayList<List<Column>>(); List<Column> columnsInPrevArea = null; for (List<Column> columnsPerArea : columnsPerAreaList) { if (columnsInPrevArea != null) { for (Column prevColumn : columnsInPrevArea) { LOG.debug("Checking " + prevColumn); // find the column group containing the previous column List<Column> myColumnGroup = null; for (List<Column> columnGroup : columnGroups) { if (columnGroup.contains(prevColumn)) { myColumnGroup = columnGroup; break; } } if (myColumnGroup == null) { myColumnGroup = new ArrayList<SegmenterImpl.Column>(); LOG.debug("Creating column group for column " + prevColumn.toString()); columnGroups.add(myColumnGroup); myColumnGroup.add(prevColumn); } // does only one column overlap with this one? Column overlappingColumn = null; for (Column column : columnsPerArea) { if (column.adjustedRight >= prevColumn.adjustedLeft && column.adjustedLeft <= prevColumn.adjustedRight) { if (overlappingColumn == null) { LOG.debug("I overlap with " + column); overlappingColumn = column; } else { LOG.debug("But I overlap also with " + column); overlappingColumn = null; break; } } } if (overlappingColumn != null) { // does it overlap with only me? 
for (Column otherPrevColumn : columnsInPrevArea) { if (otherPrevColumn.equals(prevColumn)) continue; if (overlappingColumn.adjustedRight >= otherPrevColumn.adjustedLeft && overlappingColumn.adjustedLeft <= otherPrevColumn.adjustedRight) { LOG.debug("But it overlaps also with " + otherPrevColumn); overlappingColumn = null; break; } } } if (overlappingColumn != null) { myColumnGroup.add(overlappingColumn); LOG.debug("Adding " + overlappingColumn); LOG.debug(" to group with " + prevColumn); } } // next previous column } // have previous columns columnsInPrevArea = columnsPerArea; } // next area if (columnsInPrevArea != null) { for (Column prevColumn : columnsInPrevArea) { // find the column group containing the previous column List<Column> myColumnGroup = null; for (List<Column> columnGroup : columnGroups) { if (columnGroup.contains(prevColumn)) { myColumnGroup = columnGroup; break; } } if (myColumnGroup == null) { myColumnGroup = new ArrayList<SegmenterImpl.Column>(); LOG.debug("Creating column group for column " + prevColumn.toString()); columnGroups.add(myColumnGroup); myColumnGroup.add(prevColumn); } } } // What we really want here is, for each column (in the case of right-to-left), // two clusters on the right // and one relatively big cluster on the left. // anything outside of the cluster on the left is an EOP. 
boolean hasTab = false; for (List<Column> columnGroup : columnGroups) { LOG.debug("Next column group"); double averageShapeWidth = sourceImage.getAverageShapeWidth(); LOG.debug("averageShapeWidth: " + averageShapeWidth); double epsilon = averageShapeWidth / 2.0; LOG.debug("epsilon: " + epsilon); int columnGroupTop = sourceImage.getHeight(); int columnGroupBottom = 0; int columnGroupLeft = sourceImage.getWidth(); int columnGroupRight = 0; for (Column column : columnGroup) { if (column.top < columnGroupTop) columnGroupTop = (int) Math.round(column.top); if (column.bottom > columnGroupBottom) columnGroupBottom = (int) Math.round(column.bottom); if (column.adjustedLeft < columnGroupLeft) columnGroupLeft = (int) Math.round(column.adjustedLeft); if (column.adjustedRight > columnGroupRight) columnGroupRight = (int) Math.round(column.adjustedRight); } // right thresholds LOG.debug("Calculating right thresholds"); // first, create a DBScan cluster of all rows by their adjusted right coordinate List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>(); List<double[]> rightCoordinates = new ArrayList<double[]>(); for (Column column : columnGroup) { for (RowOfShapes row : column) { double right = row.getRight() - row.getXAdjustment(); // double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage); // if (rightOverlap==0) { // // leave out any right-overlapping rows here // // since we need accurate statistics for margin detection // // This is questionable - especially since a long vertical bar (see Petriushka) // // tends to give all rows a left overlap. Also, because the overlap is calculated based // // on the mean right & mean left, not based on any sort of margin clusters. 
// rightHandRows.add(row); // rightCoordinates.add(new double[] {right}); // } rightHandRows.add(row); rightCoordinates.add(new double[] { right }); } } int minCardinalityForRightMargin = 5; DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows, rightCoordinates); Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(epsilon, minCardinalityForRightMargin, true); TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>( new CardinalityComparator<RowOfShapes>()); orderedRowClusters.addAll(rowClusters); int i = 0; // find the two right-most clusters, and assume they are the margin & the tab DescriptiveStatistics rightMarginStats = null; DescriptiveStatistics rightTabStats = null; for (Set<RowOfShapes> cluster : orderedRowClusters) { DescriptiveStatistics rightStats = new DescriptiveStatistics(); MeanAbsoluteDeviation rightDev = new MeanAbsoluteDeviation(); for (RowOfShapes row : cluster) { int rowIndex = rightHandRows.indexOf(row); double right = rightCoordinates.get(rowIndex)[0]; rightStats.addValue(right); rightDev.increment(right); } LOG.debug("Cluster " + i + ". 
Cardinality=" + cluster.size()); LOG.debug("Right mean : " + rightStats.getMean()); LOG.debug("Right dev: " + rightDev.getResult()); if (cluster.size() >= minCardinalityForRightMargin) { if (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean()) { if (rightMarginStats != null) rightTabStats = rightMarginStats; rightMarginStats = rightStats; } else if (rightTabStats == null || rightTabStats.getMean() < rightStats.getMean()) { rightTabStats = rightStats; } } else { break; } i++; } // next right-coordinate cluster double rightMargin = sourceImage.getWidth(); double rightTab = sourceImage.getWidth(); if (rightMarginStats != null) { rightMargin = rightMarginStats.getMean(); } else { List<Rectangle> columnSeparators = sourceImage.findColumnSeparators(); for (Rectangle columnSeparator : columnSeparators) { if (columnSeparator.getTop() <= columnGroupTop && columnSeparator.getBottom() >= columnGroupBottom && columnSeparator.getLeft() >= columnGroupRight) { if (columnSeparator.getLeft() < rightMargin) rightMargin = columnSeparator.getLeft(); } } } if (rightTabStats != null) { rightTab = rightTabStats.getMean(); } LOG.debug("rightMargin: " + rightMargin); LOG.debug("rightTab: " + rightTab); // left thresholds LOG.debug("Calculating left thresholds"); // first, create a DBScan cluster of all rows by their adjusted left coordinate List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>(); List<double[]> leftCoordinates = new ArrayList<double[]>(); for (Column column : columnGroup) { for (RowOfShapes row : column) { double left = row.getLeft() - row.getXAdjustment(); // double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage); // if (leftOverlap == 0) { // // leave out any overlapping rows from margin calcs, // // since we need accurate statistics here // leftHandRows.add(row); // leftCoordinates.add(new double[] {left}); // } leftHandRows.add(row); leftCoordinates.add(new double[] { left }); } } int minCardinalityForLeftMargin 
= 5; DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows, leftCoordinates); Set<Set<RowOfShapes>> leftRowClusters = leftMarginClusterer.cluster(epsilon, minCardinalityForLeftMargin, true); TreeSet<Set<RowOfShapes>> orderedLeftRowClusters = new TreeSet<Set<RowOfShapes>>( new CardinalityComparator<RowOfShapes>()); orderedLeftRowClusters.addAll(leftRowClusters); i = 0; // find the two left-most clusters, and assume they are the margin & the tab DescriptiveStatistics leftMarginStats = null; DescriptiveStatistics leftTabStats = null; for (Set<RowOfShapes> cluster : orderedLeftRowClusters) { DescriptiveStatistics leftStats = new DescriptiveStatistics(); MeanAbsoluteDeviation leftDev = new MeanAbsoluteDeviation(); for (RowOfShapes row : cluster) { int rowIndex = leftHandRows.indexOf(row); double left = leftCoordinates.get(rowIndex)[0]; leftStats.addValue(left); leftDev.increment(left); } LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size()); LOG.debug("Left mean : " + leftStats.getMean()); LOG.debug("Left dev: " + leftDev.getResult()); if (cluster.size() >= minCardinalityForLeftMargin) { if (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean()) { if (leftMarginStats != null) leftTabStats = leftMarginStats; leftMarginStats = leftStats; } else if (leftTabStats == null || leftTabStats.getMean() > leftStats.getMean()) { leftTabStats = leftStats; } } else { break; } i++; } // next left-coordinate cluster double leftMargin = 0; double leftTab = 0; if (leftMarginStats != null) { leftMargin = leftMarginStats.getMean(); } else { List<Rectangle> columnSeparators = sourceImage.findColumnSeparators(); for (Rectangle columnSeparator : columnSeparators) { if (columnSeparator.getTop() <= columnGroupTop && columnSeparator.getBottom() >= columnGroupBottom && columnSeparator.getRight() <= columnGroupLeft) { if (columnSeparator.getRight() > leftMargin) leftMargin = columnSeparator.getRight(); } } } if (leftTabStats 
!= null) { leftTab = leftTabStats.getMean(); } LOG.debug("leftMargin: " + leftMargin); LOG.debug("leftTab: " + leftTab); for (Column column : columnGroup) { if (sourceImage.isLeftToRight()) { column.startMargin = leftMargin; if (leftTabStats != null) { column.startTab = leftTab; column.hasTab = true; } else { LOG.debug("No left tab - setting based on left margin"); column.startTab = leftMargin + (5.0 * sourceImage.getAverageShapeWidth()); column.hasTab = false; } column.endMargin = rightMargin; } else { column.startMargin = rightMargin; if (rightTabStats != null) { column.startTab = rightTab; column.hasTab = true; } else { LOG.debug("No right tab - setting based on right margin"); column.startTab = rightMargin - (5.0 * sourceImage.getAverageShapeWidth()); column.hasTab = false; } column.endMargin = leftMargin; } LOG.debug("Margins for " + column); LOG.debug("startMargin: " + column.startMargin); LOG.debug("startTab: " + column.startTab); LOG.debug("endMargin: " + column.endMargin); } // next column } // next column group LOG.debug("hasTab: " + hasTab); double safetyMargin = 1.5 * sourceImage.getAverageShapeWidth(); // Now, paragraphs are either "indented", "outdented" or not "dented" at all (no tabs). // This applies to the entire page. // To recognise indenting vs. outdenting, we have to see if the row preceding each // indent/outdent is full or partial. In the case of indentation, partial rows will // typically be followed by an indent. In the case of outdentation, partial rows will // typically be followed by an outdent. 
boolean isIndented = true; int indentCount = 0; int outdentCount = 0; for (List<Column> columnGroup : columnGroups) { LOG.debug("Next column group"); boolean prevRowPartial = false; for (Column column : columnGroup) { if (column.hasTab) { for (RowOfShapes row : column) { if (sourceImage.isLeftToRight()) { if (prevRowPartial) { if (row.getLeft() - row.getXAdjustment() > column.startTab - safetyMargin) { indentCount++; } else if (row.getLeft() - row.getXAdjustment() < column.startMargin + safetyMargin) { outdentCount++; } } if (row.getRight() - row.getXAdjustment() < column.endMargin - safetyMargin) { prevRowPartial = true; } else { prevRowPartial = false; } } else { if (prevRowPartial) { if (row.getRight() - row.getXAdjustment() < column.startTab + safetyMargin) { indentCount++; } else if (row.getRight() - row.getXAdjustment() > column.startMargin - safetyMargin) { outdentCount++; } } if (row.getLeft() - row.getXAdjustment() > column.endMargin + safetyMargin) { prevRowPartial = true; } else { prevRowPartial = false; } } // left-to-right? 
} // next row } // column has tab } // next column } // next column group isIndented = (indentCount + 2 >= outdentCount); LOG.debug("indentCount: " + indentCount); LOG.debug("outdentCount: " + outdentCount); LOG.debug("isIndented: " + isIndented); // order the columns TreeSet<Column> orderedColumns = new TreeSet<SegmenterImpl.Column>(columns); columns.clear(); columns.addAll(orderedColumns); // find the paragraphs found in each column for (Column column : columns) { LOG.debug("--- Next column ---"); // break up the column into paragraphs Paragraph paragraph = null; RowOfShapes previousRow = null; int maxShapesForStandaloneParagraph = 2; List<RowOfShapes> rowsForStandaloneParagraphs = new ArrayList<RowOfShapes>(); Point2D previousPointStartMargin = null; Point2D previousPointStartTab = null; Point2D previousPointEndMargin = null; for (RowOfShapes row : column) { boolean rowForStandaloneParagraph = false; boolean newParagraph = false; if (row.getShapes().size() <= maxShapesForStandaloneParagraph) { rowsForStandaloneParagraphs.add(row); rowForStandaloneParagraph = true; } else { double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage); double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage); if (drawSegmentation) { double rowVerticalMidPoint = row.getBaseLineMiddlePoint(); double startMarginX = column.startMargin + row.getXAdjustment(); double startTabX = column.startTab + row.getXAdjustment(); double endMarginX = column.endMargin + row.getXAdjustment(); if (sourceImage.isLeftToRight()) { startMarginX += safetyMargin; startTabX -= safetyMargin; endMarginX -= safetyMargin; startMarginX += leftOverlap; startTabX += leftOverlap; endMarginX -= rightOverlap; } else { startMarginX -= safetyMargin; startTabX += safetyMargin; endMarginX += safetyMargin; startMarginX -= rightOverlap; startTabX -= rightOverlap; endMarginX += leftOverlap; } Point2D.Double currentPointStartMargin = new Point2D.Double(startMarginX, 
rowVerticalMidPoint); Point2D.Double currentPointStartTab = new Point2D.Double(startTabX, rowVerticalMidPoint); Point2D.Double currentPointEndMargin = new Point2D.Double(endMarginX, rowVerticalMidPoint); if (previousPointStartMargin != null) { graphics2D.setStroke(new BasicStroke(1)); graphics2D.setPaint(Color.BLUE); graphics2D.drawLine((int) Math.round(previousPointStartMargin.getX()), (int) Math.round(previousPointStartMargin.getY()), (int) Math.round(currentPointStartMargin.getX()), (int) Math.round(currentPointStartMargin.getY())); graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()), (int) Math.round(previousPointEndMargin.getY()), (int) Math.round(currentPointEndMargin.getX()), (int) Math.round(currentPointEndMargin.getY())); graphics2D.setPaint(Color.RED); graphics2D.drawLine((int) Math.round(previousPointStartTab.getX()), (int) Math.round(previousPointStartTab.getY()), (int) Math.round(currentPointStartTab.getX()), (int) Math.round(currentPointStartTab.getY())); graphics2D.setPaint(Color.RED); graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()), (int) Math.round(previousPointEndMargin.getY()), (int) Math.round(currentPointEndMargin.getX()), (int) Math.round(currentPointEndMargin.getY())); } previousPointStartMargin = currentPointStartMargin; previousPointStartTab = currentPointStartTab; previousPointEndMargin = currentPointEndMargin; } if (previousRow == null) { LOG.debug("New paragraph (first)"); newParagraph = true; } else { if (sourceImage.isLeftToRight()) { if (previousRow.getRight() - previousRow.getXAdjustment() - rightOverlap < column.endMargin - safetyMargin) { LOG.debug("New paragraph (previous EOP)"); newParagraph = true; } else if (column.hasTab && isIndented && row.getLeft() - row.getXAdjustment() + leftOverlap > column.startTab - safetyMargin) { LOG.debug("New paragraph (indent)"); newParagraph = true; } else if (column.hasTab && !isIndented && row.getLeft() - row.getXAdjustment() + leftOverlap < 
column.startMargin + safetyMargin) { LOG.debug("New paragraph (outdent)"); newParagraph = true; } } else { if (previousRow.getLeft() - previousRow.getXAdjustment() + leftOverlap > column.endMargin + safetyMargin) { LOG.debug("New paragraph (previous EOP)"); newParagraph = true; } else if (column.hasTab && isIndented && row.getRight() - row.getXAdjustment() - rightOverlap < column.startTab + safetyMargin) { LOG.debug("New paragraph (indent)"); newParagraph = true; } else if (column.hasTab && !isIndented && row.getRight() - row.getXAdjustment() - rightOverlap > column.startMargin - safetyMargin) { LOG.debug("New paragraph (outdent)"); newParagraph = true; } } // left-to-right? } // have previous row } // standalone paragraph? if (!rowForStandaloneParagraph) LOG.debug(row.toString()); if (newParagraph) { if (rowsForStandaloneParagraphs.size() > 0) { for (RowOfShapes oneRow : rowsForStandaloneParagraphs) { LOG.debug("Standalone paragraph"); LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop() + "), right(" + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")"); Paragraph standaloneParagraph = sourceImage.newParagraph(); standaloneParagraph.getRows().add(oneRow); } rowsForStandaloneParagraphs.clear(); } paragraph = sourceImage.newParagraph(); } //LOG.debug("Row: left(" + row.getLeft() + "), right(" + row.getRight() + "), width(" + (row.getRight() - row.getLeft() + 1) + ")"); if (!rowForStandaloneParagraph) { paragraph.getRows().add(row); previousRow = row; } } // next row in column if (rowsForStandaloneParagraphs.size() > 0) { for (RowOfShapes oneRow : rowsForStandaloneParagraphs) { LOG.debug("Standalone paragraph"); LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop() + "), right(" + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")"); Paragraph standaloneParagraph = sourceImage.newParagraph(); standaloneParagraph.getRows().add(oneRow); } rowsForStandaloneParagraphs.clear(); } } // next column 
}
From source file:crawler.HackerEarthCrawler.java
@Override public void crawl() { int flag = 0; //set of urls which should be crawled TreeSet<String> linksset = new TreeSet<String>(); TreeSet<String> tempset = new TreeSet<String>(); TreeSet<String> tutorialset = new TreeSet<String>(); //final set of problem urls TreeSet<String> problemset = new TreeSet<String>(); //visited for maintaing status of if url is already crawled or not TreeMap<String, Integer> visited = new TreeMap<String, Integer>(); //add base url linksset.add(baseUrl);/*from w w w . ja v a2s. c om*/ //mark base url as not crawled visited.put(baseUrl, 0); try { while (true) { flag = 0; tempset.clear(); for (String str : linksset) { //check if url is already crawled or not and it has valid domain name if ((visited.get(str) == 0) && (str.startsWith("https://www.hackerearth.com/"))) { System.out.println("crawling " + str); //retriving response of current url as document Document doc = Jsoup.connect(str).timeout(0).userAgent( "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0") .referrer("http://www.google.com").ignoreHttpErrors(true).get(); //retriving all urls from current page Elements links = doc.select("a[href]"); //mark url as crawled visited.put(str, 1); //mark flag as url is crawled flag = 1; //retrive all urls for (Element link : links) { if (link.absUrl("href").endsWith("/tutorial/")) { tutorialset.add(link.absUrl("href")); } //check if url is problem url then add it in problemurlset if (link.absUrl("href").startsWith("https://www.hackerearth.com/") && isProblemUrl(link.absUrl("href"))) { problemset.add(link.absUrl("href")); } //check if url has valid domain and it has problem urls or not if (link.absUrl("href").contains(("https://www.hackerearth.com/")) && isCrawlable(link.absUrl("href"))) { //if link is not visited then mark it as uncrawled if (!visited.containsKey(link.absUrl("href"))) { visited.put(link.absUrl("href"), 0); } //add it in tempsetorary set tempset.add(link.absUrl("href")); //System.out.println("\n 
base: "+str+" ::: link : " + link.absUrl("href")); } } } } //if nothing is left to crawl break the loop if (flag == 0) { break; } //add all retrieved links to linksset linksset.addAll(tempset); } System.out.println("\n\ntotal problem urls " + problemset.size()); int i = 0; for (String str : problemset) { System.out.println("link " + i + " : " + str); i++; } } catch (IOException ex) { Logger.getLogger(HackerEarthCrawler.class.getName()).log(Level.SEVERE, null, ex); } //scrap and store into database //for every problem url scrap problem page for (String problemUrl : problemset) { System.out.println("problemUrl :" + problemUrl); try { //create problem class to store in database Problem problem = new Problem(); String problemSIOC = "", problemIOC = ""; String problemTitle = "", problemStatement = "", problemInput = "", problemOutput = "", problemConstraints = ""; String sampleInput = "", sampleOutput = ""; String problemExplanation = ""; //set default timelimit to 1 second double problemTimeLimit = 1.0; ArrayList<String> tags = new ArrayList<String>(); //get response for given problem url Response response = Jsoup.connect(problemUrl).execute(); Document doc = response.parse(); //retrieve problem title from page Element elementTitle = doc.getElementsByTag("title").first(); StringTokenizer stTitle = new StringTokenizer(elementTitle.text(), "|"); problemTitle = stTitle.nextToken().trim(); Element content = doc.getElementsByClass("starwars-lab").first(); problemSIOC = content.text(); Elements e = content.children(); //to find problem statement String breakloop[] = { "input", "input:", "input :", "input format:", "input format :", "input format", "Input and output", "constraints :", "constraints:", "constraints", "$$Input :$$" }; flag = 0; for (Element p : e) { String tempStatement = ""; for (Element pp : p.getAllElements()) { for (String strbreak : breakloop) { if (StringUtils.equalsIgnoreCase(pp.ownText(), strbreak)) { //System.out.println("strbreak :"+strbreak); 
tempStatement = p.text().substring(0, p.text().toLowerCase().indexOf(strbreak.toLowerCase())); // System.out.println("temp "+tempStatement); flag = 1; break; } } } if (flag == 1) { problemStatement += tempStatement; //remove extra space at end if (tempStatement.length() == 0) { problemStatement = problemStatement.substring(0, problemStatement.length() - 1); } break; } problemStatement += p.text() + " "; } System.out.println("problemSIOC :" + problemSIOC); System.out.println("problemStatement :" + problemStatement); if (problemStatement.length() <= problemSIOC.length()) { //remove problem statement from whole text and remove extra spaces at the beginning and the end problemIOC = problemSIOC.substring(problemStatement.length()).trim(); } else { problemIOC = ""; } System.out.println("problemIOC :" + problemIOC); //keywords for identifying input String decideInput[] = { "Input format :", "Input format:", "Input format", "inputformat:", "inputformat :", "inputformat", "input and output", "input :", "input:", "input" }; //keywords for identifying output String decideOutput[] = { "output format :", "output format:", "Output format", "outputformat:", "outputformat :", "outputformat", "output :", "output:", "output" }; //keywords for identifying constraint String decideConstraint[] = { "constraints:", "constraints :", "constraints", "Constraints :", "constraint:", "constraint :", "constraint", "Contraints :" }; int posin = 0, posoutput = 0, poscon = 0, idxin, idxout, idxcon, flaginput = 0, flagoutput = 0, flagcon = 0, inlen = 0, outlen = 0, conlen = 0; //find inputformat position,length of keyword for (idxin = 0; idxin < decideInput.length; idxin++) { if (StringUtils.containsIgnoreCase(problemIOC, decideInput[idxin])) { posin = problemIOC.toLowerCase().indexOf(decideInput[idxin].toLowerCase()); flaginput = 1; inlen = decideInput[idxin].length(); //decide it is keyowrd for actucal input or it is "sample input" if (StringUtils.containsIgnoreCase(problemIOC, "sample input")) { 
if (posin > problemIOC.toLowerCase().indexOf("sample input")) { flaginput = 0; inlen = 0; } else { break; } } else { break; } } } //find outputformat position,length of keyword for (idxout = 0; idxout < decideOutput.length; idxout++) { if (StringUtils.containsIgnoreCase(problemIOC, decideOutput[idxout])) { posoutput = problemIOC.toLowerCase().indexOf(decideOutput[idxout].toLowerCase()); flagoutput = 1; outlen = decideOutput[idxout].length(); break; } } //find constraint position,length of keyword for (idxcon = 0; idxcon < decideConstraint.length; idxcon++) { if (StringUtils.containsIgnoreCase(problemIOC, decideConstraint[idxcon])) { poscon = problemIOC.toLowerCase().indexOf(decideConstraint[idxcon].toLowerCase()); flagcon = 1; conlen = decideConstraint[idxcon].length(); break; } } System.out.println("input " + flaginput + " " + inlen + " " + posin); System.out.println("output " + flagoutput + " " + outlen + " " + posoutput); System.out.println("constraint " + flagcon + " " + conlen + " " + poscon); //retrieve problem input and output if present in problem page //if input format is present if (flaginput == 1) { //if input keyword is "input and output" and contraint is present in problem page if (idxin == 6 && flagcon == 1) { problemInput = problemIOC.substring(inlen, poscon); } //if input keyword is "input and output" and contraint is not present in problem page else if (idxin == 6 && flagcon == 0) { problemInput = problemIOC.substring(inlen); } //if output format and constraint is present else if (flagoutput == 1 && flagcon == 1) { //if constraint is present before input format if (poscon < posin) { problemInput = problemIOC.substring(posin + inlen, posoutput); problemOutput = problemIOC.substring(posoutput + outlen); } //if constraint is present before sample else if (poscon < posoutput) { problemInput = problemIOC.substring(inlen, poscon); problemOutput = problemIOC.substring(posoutput + outlen); } else { problemInput = problemIOC.substring(inlen, posoutput); 
problemOutput = problemIOC.substring(posoutput + outlen, poscon); } } //if constraint is not present else if (flagoutput == 1 && flagcon == 0) { problemInput = problemIOC.substring(inlen, posoutput); problemOutput = problemIOC.substring(posoutput + outlen); } else if (flagoutput == 0 && flagcon == 1) { if (poscon < posin) { problemInput = problemIOC.substring(posin + inlen); } else { problemInput = problemIOC.substring(poscon + conlen, posin); } problemOutput = ""; } else { problemInput = problemIOC.substring(inlen); problemOutput = ""; } } //if input format and output format is not present else { problemInput = ""; problemOutput = ""; } //if constraint is present if (flagcon == 1) { //if constraint is present before input format if (poscon < posin) { problemConstraints = problemIOC.substring(0, posin); } //if constraint is present before output format else if (poscon < posoutput) { problemConstraints = problemIOC.substring(poscon + conlen, posoutput); } else { problemConstraints = problemIOC.substring(poscon + conlen); } } System.out.println("problemInput :" + problemInput); System.out.println("problemOutput :" + problemOutput); System.out.println("problemConstraints :" + problemConstraints); //retrieve problem tags from problem page Element elementtag = doc.getElementsByClass("problem-tags").first().child(1); StringTokenizer st = new StringTokenizer(elementtag.text(), ","); while (st.hasMoreTokens()) { tags.add(st.nextToken().trim()); } //retrieve sample input sample output if present Element elementSIO = doc.getElementsByClass("input-output-container").first(); //if sample input output is present if (elementSIO != null) { //find position of sample output int soutpos = elementSIO.text().indexOf("SAMPLE OUTPUT"); sampleInput = elementSIO.text().substring(12, soutpos); sampleOutput = elementSIO.text().substring(soutpos + 13); System.out.println("Sample input :\n" + sampleInput + "\n\n\n"); System.out.println("Sample Output :\n" + sampleOutput); } else { sampleInput 
= ""; sampleOutput = ""; } //retrieve problem explanation from problem page if present Element elementExplanation = doc.getElementsByClass("standard-margin").first().child(0); if (elementExplanation.text().toLowerCase().contains("explanation")) { problemExplanation = elementExplanation.nextElementSibling().text(); } System.out.println("Explanation :" + problemExplanation); //retrieve timelimit Element elementTL = doc.getElementsByClass("problem-guidelines").first().child(0).child(1); StringTokenizer stTL = new StringTokenizer(elementTL.ownText(), " "); problemTimeLimit = Double.parseDouble(stTL.nextToken()); //System.out.println("problemTimeLimit :"+problemTimeLimit); //set all retrieved information to problem class problem.setProblemUrl(problemUrl); if (problemTitle.length() == 0) { problemTitle = null; } if (problemStatement.length() == 0) { problemStatement = null; } if (problemInput.length() == 0) { problemInput = null; } if (problemOutput.length() == 0) { problemOutput = null; } if (problemExplanation.length() == 0) { problemExplanation = null; } if (problemConstraints.length() == 0) { problemConstraints = null; } problem.setTitle(problemTitle); problem.setProblemUrl(problemUrl); problem.setProblemStatement(problemStatement); problem.setInputFormat(problemInput); problem.setOutputFormat(problemOutput); problem.setTimeLimit(problemTimeLimit); problem.setExplanation(problemExplanation); problem.setConstraints(problemConstraints); //set sample input output to problem class SampleInputOutput sampleInputOutput = new SampleInputOutput(problem, sampleInput, sampleOutput); problem.getSampleInputOutputs().add(sampleInputOutput); //set platform as hackerearth problem.setPlatform(Platform.HackerEarth); for (String strtag : tags) { problem.getTags().add(strtag); } //store in database Session session = null; Transaction transaction = null; try { //start session session = HibernateUtil.getSessionFactory().openSession(); transaction = session.beginTransaction(); //check if 
problem is already stored in database String hql = "FROM Problem p where p.problemUrl = :problem_url"; Problem oldProblem = (Problem) session.createQuery(hql).setString("problem_url", problemUrl) .uniqueResult(); String task; //if problem is present in database if (oldProblem != null) { //update the old problem task = "updated"; //retrieve id of old problem problem.setId(oldProblem.getId()); session.delete(oldProblem); session.flush(); session.save(problem); } else { task = "saved"; session.save(problem); } transaction.commit(); //log the info to console Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}", new Object[] { task, problem.getProblemUrl() }); } catch (HibernateException ee) { if (transaction != null) { transaction.rollback(); } Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE, "Cannot Insert/Update problem into databse: " + problemUrl, e); } finally { //close the session if (session != null) { session.close(); } } } catch (Exception ee) { System.out.println(ee.toString()); } } System.out.println("\n\n\n\ntutorial urls\n\n"); try { for (String tutorialurl : tutorialset) { //System.out.println(tutorialurl+"\n\n"); Response tutorialres = Jsoup.connect(tutorialurl).execute(); Document doc = tutorialres.parse(); Tutorial tutorial = new Tutorial(); tutorial.setContent(doc.getElementsByClass("tutorial").first().text()); tutorial.setName(baseUrl); tutorialurl = tutorialurl.substring(0, tutorialurl.length() - 10); StringTokenizer tutorialtok = new StringTokenizer(tutorialurl, "/"); String tempstr = ""; while (tutorialtok.hasMoreTokens()) { tempstr = tutorialtok.nextToken(); } Session session = null; Transaction transaction = null; try { //start session session = HibernateUtil.getSessionFactory().openSession(); transaction = session.beginTransaction(); //check if problem is already stored in database String hql = "FROM Tutorial p where p.name = :name"; Tutorial oldProblem = (Tutorial) 
session.createQuery(hql).setString("name", tempstr) .uniqueResult(); String task; //if problem is present in database if (oldProblem != null) { //update the old problem task = "updated"; //retrieve id of old problem tutorial.setName(oldProblem.getName()); session.delete(oldProblem); session.flush(); session.save(tutorial); } else { task = "saved"; tutorial.setName(tempstr); session.save(tutorial); } transaction.commit(); //log the info to console Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}", new Object[] { task, tutorial.getName() }); } catch (HibernateException ee) { if (transaction != null) { transaction.rollback(); } Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE, "Cannot Insert/Update problem into databse: " + tempstr, ee); } finally { //close the session if (session != null) { session.close(); } } } } catch (Exception e) { System.out.println(e.getMessage()); } }