Example usage for java.util TreeSet addAll

Introduction

This page collects example usages of java.util.TreeSet.addAll.

Prototype

public boolean addAll(Collection<? extends E> c)

Source Link

Document

Adds all of the elements in the specified collection to this set.

Usage

From source file:com.datatorrent.stram.StreamingContainerManager.java

/**
 * Compute checkpoints required for a given operator instance to be recovered.
 * This is done by looking at checkpoints available for downstream dependencies first,
 * and then selecting the most recent available checkpoint that is smaller than downstream.
 * <p>
 * The traversal recurses into every downstream operator that is not yet contained in
 * {@code ctx.visited}. As a side effect the method lowers {@code ctx.committedWindowId},
 * adds to / removes from {@code ctx.blocked}, queues obsolete checkpoints in
 * {@code purgeCheckpoints} and sets the recovery checkpoint of every operator of the
 * checkpoint group.
 *
 * @param operator Operator instance for which to find recovery checkpoint
 * @param ctx      Context into which to collect traversal info
 */
public void updateRecoveryCheckpoints(PTOperator operator, UpdateCheckpointsContext ctx) {
    // the committed window is the minimum recovery window over all visited operators
    if (operator.getRecoveryCheckpoint().windowId < ctx.committedWindowId.longValue()) {
        ctx.committedWindowId.setValue(operator.getRecoveryCheckpoint().windowId);
    }

    if (operator.getState() == PTOperator.State.ACTIVE && (ctx.currentTms
            - operator.stats.lastWindowIdChangeTms) > operator.stats.windowProcessingTimeoutMillis) {
        // if the checkpoint is ahead, then it is not blocked but waiting for activation (state-less recovery, at-most-once)
        if (ctx.committedWindowId.longValue() >= operator.getRecoveryCheckpoint().windowId) {
            LOG.debug("Marking operator {} blocked committed window {}, recovery window {}", operator,
                    Codec.getStringWindowId(ctx.committedWindowId.longValue()),
                    Codec.getStringWindowId(operator.getRecoveryCheckpoint().windowId));
            ctx.blocked.add(operator);
        }
    }

    // the most recent checkpoint eligible for recovery based on downstream state
    Checkpoint maxCheckpoint = Checkpoint.INITIAL_CHECKPOINT;

    Set<OperatorMeta> checkpointGroup = ctx.checkpointGroups.get(operator.getOperatorMeta());
    if (checkpointGroup == null) {
        checkpointGroup = Collections.singleton(operator.getOperatorMeta());
    }
    // find intersection of checkpoints that group can collectively move to
    TreeSet<Checkpoint> commonCheckpoints = new TreeSet<>(new Checkpoint.CheckpointComparator());
    synchronized (operator.checkpoints) {
        commonCheckpoints.addAll(operator.checkpoints);
    }
    Set<PTOperator> groupOpers = new HashSet<>(checkpointGroup.size());
    boolean pendingDeploy = operator.getState() == PTOperator.State.PENDING_DEPLOY;
    if (checkpointGroup.size() > 1) {
        for (OperatorMeta om : checkpointGroup) {
            Collection<PTOperator> operators = plan.getAllOperators(om);
            for (PTOperator groupOper : operators) {
                synchronized (groupOper.checkpoints) {
                    commonCheckpoints.retainAll(groupOper.checkpoints);
                }
                // visit all downstream operators of the group
                ctx.visited.add(groupOper);
                groupOpers.add(groupOper);
                // any group member awaiting deployment freezes the checkpoints of the whole group;
                // the state of the entry operator alone was already captured above
                pendingDeploy |= groupOper.getState() == PTOperator.State.PENDING_DEPLOY;
            }
        }
        // highest common checkpoint
        if (!commonCheckpoints.isEmpty()) {
            maxCheckpoint = commonCheckpoints.last();
        }
    } else {
        // without logical grouping, treat partitions as independent
        // this is especially important for parallel partitioning
        ctx.visited.add(operator);
        groupOpers.add(operator);
        maxCheckpoint = operator.getRecentCheckpoint();
        if (ctx.recovery && maxCheckpoint.windowId == Stateless.WINDOW_ID && operator.isOperatorStateLess()) {
            // stateless operator without a real checkpoint: recover from the current window
            long currentWindowId = WindowGenerator.getWindowId(ctx.currentTms, this.vars.windowStartMillis,
                    this.getLogicalPlan().getValue(LogicalPlan.STREAMING_WINDOW_SIZE_MILLIS));
            maxCheckpoint = new Checkpoint(currentWindowId, 0, 0);
        }
    }

    // DFS downstream operators
    for (PTOperator groupOper : groupOpers) {
        for (PTOperator.PTOutput out : groupOper.getOutputs()) {
            for (PTOperator.PTInput sink : out.sinks) {
                PTOperator sinkOperator = sink.target;
                if (groupOpers.contains(sinkOperator)) {
                    continue; // downstream operator within group
                }
                if (!ctx.visited.contains(sinkOperator)) {
                    // downstream traversal
                    updateRecoveryCheckpoints(sinkOperator, ctx);
                }
                // recovery window id cannot move backwards
                // when dynamically adding new operators
                if (sinkOperator.getRecoveryCheckpoint().windowId >= operator
                        .getRecoveryCheckpoint().windowId) {
                    maxCheckpoint = Checkpoint.min(maxCheckpoint, sinkOperator.getRecoveryCheckpoint());
                }

                if (ctx.blocked.contains(sinkOperator)) {
                    if (sinkOperator.stats.getCurrentWindowId() == operator.stats.getCurrentWindowId()) {
                        // downstream operator is blocked by this operator
                        ctx.blocked.remove(sinkOperator);
                    }
                }
            }
        }
    }

    // find the common checkpoint that is <= downstream recovery checkpoint
    if (!commonCheckpoints.contains(maxCheckpoint)) {
        if (!commonCheckpoints.isEmpty()) {
            // floor() yields null when every common checkpoint is newer; keep maxCheckpoint then
            maxCheckpoint = Objects.firstNonNull(commonCheckpoints.floor(maxCheckpoint), maxCheckpoint);
        }
    }

    for (PTOperator groupOper : groupOpers) {
        // checkpoint frozen during deployment
        if (!pendingDeploy || ctx.recovery) {
            // remove previous checkpoints
            Checkpoint c1 = Checkpoint.INITIAL_CHECKPOINT;
            LinkedList<Checkpoint> checkpoints = groupOper.checkpoints;
            synchronized (checkpoints) {
                if (!checkpoints.isEmpty() && (checkpoints.getFirst()).windowId <= maxCheckpoint.windowId) {
                    c1 = checkpoints.getFirst();
                    Checkpoint c2;
                    // drop leading checkpoints as long as the following one is still eligible,
                    // so that c1 ends up as the newest checkpoint <= maxCheckpoint
                    while (checkpoints.size() > 1
                            && ((c2 = checkpoints.get(1)).windowId) <= maxCheckpoint.windowId) {
                        checkpoints.removeFirst();
                        //LOG.debug("Checkpoint to delete: operator={} windowId={}", operator.getName(), c1);
                        this.purgeCheckpoints.add(new Pair<PTOperator, Long>(groupOper, c1.windowId));
                        c1 = c2;
                    }
                } else {
                    if (ctx.recovery && checkpoints.isEmpty() && groupOper.isOperatorStateLess()) {
                        LOG.debug("Adding checkpoint for stateless operator {} {}", groupOper,
                                Codec.getStringWindowId(maxCheckpoint.windowId));
                        c1 = groupOper.addCheckpoint(maxCheckpoint.windowId, this.vars.windowStartMillis);
                    }
                }
            }
            //LOG.debug("Operator {} checkpoints: commit {} recent {}", new Object[] {operator.getName(), c1, operator.checkpoints});
            groupOper.setRecoveryCheckpoint(c1);
        } else {
            LOG.debug("Skipping checkpoint update {} during {}", groupOper, groupOper.getState());
        }
    }

}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Replaces the rows of the given image by the supplied rows, added in the order
 * defined by {@link RowOfShapesVerticalLocationComparator}, and renumbers each row
 * with its position in that order.
 *
 * @param sourceImage the image whose row list is rebuilt
 * @param rows        the rows to sort, reorder internally and attach to the image
 */
void addRowsToJochreImage(SourceImage sourceImage, List<RowOfShapes> rows) {
    LOG.debug("########## addRowsToJochreImage #########");

    // the image's rows are rebuilt from scratch below
    sourceImage.getRows().clear();

    TreeSet<RowOfShapes> sortedRows = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator());
    sortedRows.addAll(rows);

    LOG.debug("====== Row list ========");
    int newIndex = 0;
    for (RowOfShapes sortedRow : sortedRows) {
        // shape order inside a row is where left-to-right vs. right-to-left matters
        sortedRow.reorderShapes();
        sourceImage.addRow(sortedRow);

        int previousIndex = sortedRow.getIndex();
        sortedRow.setIndex(newIndex);
        newIndex++;

        LOG.debug(sortedRow.toString() + " (old index = " + previousIndex + ")");
    }
}

From source file:net.spfbl.core.User.java

/**
 * Returns a new, independent set holding the keys of
 * {@code queryMap.headMap(threshold)}.
 *
 * @param threshold upper bound handed to {@code headMap}
 * @return a fresh {@link TreeSet}; empty when no query map exists
 */
public synchronized Set<Long> headSet(long threshold) {
    TreeSet<Long> result = new TreeSet<Long>();
    if (queryMap != null) {
        // copy the keys so callers never hold a live view of the map
        result.addAll(queryMap.headMap(threshold).keySet());
    }
    return result;
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Combine rows that represent thin lines directly above or below another row
 * (e.g. diacritics).
 * <p>
 * For every row, the nearest horizontally overlapping row whose x-height ratio to it is
 * at most {@code maxRatioForCombine} is looked up; if the two are closer than a third of
 * the taller row's x-height, the other row's shapes are merged into the current row and
 * the other row is removed from the image once all rows have been processed.
 *
 * @param sourceImage the image whose rows are merged in place
 */
void combineRows(SourceImage sourceImage) {
    LOG.debug("########## combineRows #########");
    // We thought of using row height, but mean row height is not a good enough
    // indicator when there are title rows with very big characters.
    // Instead, we need to go with Distance between rows when compared to mean - baseline
    // where distance between rows is measured between the tops and bottoms of nearby shapes.

    int maxRowHeight = 0;
    for (RowOfShapes row : sourceImage.getRows()) {
        int rowHeight = row.getXHeightMax();
        if (rowHeight > maxRowHeight)
            maxRowHeight = rowHeight;
    }
    LOG.debug("maxRowHeight: " + maxRowHeight);

    // work on a sorted snapshot so the image's own row list is only touched at the very end
    TreeSet<RowOfShapes> rowSet = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator());
    rowSet.addAll(sourceImage.getRows());
    List<RowOfShapes> rows = new ArrayList<RowOfShapes>(rowSet);

    List<RowOfShapes> rowsToDelete = new ArrayList<RowOfShapes>();

    double maxShapeWidth = sourceImage.getAverageShapeWidth() * 8.0;
    LOG.debug("maxShapeWidth: " + maxShapeWidth);

    double maxRatioForCombine = 0.6;
    LOG.debug("maxRatioForCombine: " + maxRatioForCombine);

    int i = 0;
    while (i < rows.size()) {
        RowOfShapes currentRow = rows.get(i);
        boolean rowsCombined = false;

        if (!rowsToDelete.contains(currentRow)) {
            LOG.trace("Checking " + currentRow.toString());
            int currentRowHeight = currentRow.getXHeightMax();
            LOG.trace("xHeightMax =  " + currentRowHeight);

            RowOfShapes nearestRow = null;
            double shortestDistance = Double.MAX_VALUE;
            int masterRowHeight = -1;
            for (RowOfShapes otherRow : rows) {
                if (!rowsToDelete.contains(otherRow) && !(currentRow.equals(otherRow))) {
                    // limit our search to nearby rows
                    if (Math.abs(currentRow.getBaseLineMiddlePoint() - otherRow.getBaseLineMiddlePoint()) < (2.0
                            * maxRowHeight) && (currentRow.getRight() >= otherRow.getLeft())
                            && (otherRow.getRight() >= currentRow.getLeft())) {
                        LOG.trace("Comparing to " + otherRow.toString());
                        int otherRowHeight = otherRow.getXHeightMax();
                        LOG.trace("xHeightMax =  " + otherRowHeight);

                        // the taller row is the "master", the thinner one the candidate to absorb
                        RowOfShapes masterRow = currentRowHeight > otherRowHeight ? currentRow : otherRow;
                        RowOfShapes slaveRow = currentRowHeight > otherRowHeight ? otherRow : currentRow;

                        double heightRatio = ((double) slaveRow.getXHeightMax()
                                / (double) masterRow.getXHeightMax());
                        LOG.trace("height ratio (" + slaveRow.getXHeightMax() + " / "
                                + masterRow.getXHeightMax() + "): " + heightRatio);
                        if (heightRatio > maxRatioForCombine)
                            continue;

                        // avoid combining very long horizontal rules with other rows
                        // their top gives a false impression of being closer to the other row's bottom.
                        if ((masterRow.getMaxShapeWidth() > maxShapeWidth
                                || slaveRow.getMaxShapeWidth() > maxShapeWidth))
                            continue;

                        // distance from the upper row's baseline to the lower row's x-height top
                        double distance = 0;
                        if (currentRow.getBaseLineMiddlePoint() < otherRow.getBaseLineMiddlePoint()) {
                            distance = (otherRow.getBaseLineMiddlePoint() - otherRow.getXHeightMax())
                                    - currentRow.getBaseLineMiddlePoint();
                            LOG.trace("(otherRow.baseLineMiddlePoint() " + otherRow.getBaseLineMiddlePoint()
                                    + " - otherRow.getXHeightMax() " + otherRow.getXHeightMax()
                                    + ") - currentRow.baseLineMiddlePoint() "
                                    + currentRow.getBaseLineMiddlePoint());
                        } else {
                            distance = (currentRow.getBaseLineMiddlePoint() - currentRow.getXHeightMax())
                                    - otherRow.getBaseLineMiddlePoint();
                            LOG.trace("(currentRow.baseLineMiddlePoint() " + currentRow.getBaseLineMiddlePoint()
                                    + " - currentRow.getXHeightMax() " + currentRow.getXHeightMax()
                                    + ") - otherRow.baseLineMiddlePoint() "
                                    + otherRow.getBaseLineMiddlePoint());
                        }
                        LOG.debug("Distance between rows: " + distance);

                        if (distance < shortestDistance) {
                            LOG.trace("Found new closest row: " + otherRow);
                            nearestRow = otherRow;
                            shortestDistance = distance;
                            masterRowHeight = (currentRowHeight >= otherRowHeight) ? currentRowHeight
                                    : otherRowHeight;
                        }
                    }
                }
            }

            if (nearestRow != null) {
                // The number 3 below is chosen arbitrarily - basically we want a
                // relative way of indicating that the rows are very near to each other.
                double minDistanceForCombine = ((double) masterRowHeight / 3);
                LOG.trace("minDistanceForCombine: " + minDistanceForCombine);
                if (shortestDistance < minDistanceForCombine) {
                    LOG.debug("Combining the two rows");
                    LOG.debug(currentRow.toString());
                    LOG.debug(nearestRow.toString());
                    rowsToDelete.add(nearestRow);
                    currentRow.addShapes(nearestRow.getShapes());
                    currentRow.reorderShapes();
                    currentRow.recalculate();

                    this.joinShapesVertically(currentRow);
                    currentRow.assignGuideLines();

                    LOG.debug("Resulting row: " + currentRow.toString());

                    rowsCombined = true;
                }
            }
        }
        // We may need to combine multiple rows
        // so we only advance if no combination has taken place
        if (!rowsCombined)
            i++;

    }

    // actually delete the rows
    for (RowOfShapes rowToDelete : rowsToDelete) {
        sourceImage.getRows().remove(rowToDelete);
    }
    LOG.debug("########## end combineRows #########");
}

From source file:net.spfbl.core.User.java

/**
 * Returns a new, independent set containing every key of {@code queryMap}.
 *
 * @return a fresh {@link TreeSet}; empty when no query map exists
 */
public synchronized TreeSet<Long> getTimeSet() {
    TreeSet<Long> times = new TreeSet<Long>();
    if (queryMap == null) {
        return times;
    }
    // copy the keys so callers never hold a live view of the map
    times.addAll(queryMap.keySet());
    return times;
}

From source file:net.spfbl.core.User.java

/**
 * Returns a new, independent set holding the keys of
 * {@code queryMap.subMap(begin, end)}.
 *
 * @param begin lower bound handed to {@code subMap}
 * @param end   upper bound handed to {@code subMap}
 * @return a fresh {@link TreeSet}; empty when no query map exists
 */
public synchronized TreeSet<Long> getTimeSet(long begin, long end) {
    TreeSet<Long> times = new TreeSet<Long>();
    if (queryMap == null) {
        return times;
    }
    // copy the keys so callers never hold a live view of the map
    times.addAll(queryMap.subMap(begin, end).keySet());
    return times;
}

From source file:org.chiba.tools.schemabuilder.AbstractSchemaFormBuilder.java

/**
 * Build the type tree
 */
/*private void buildTypeTree(XSTypeDefinition type, TreeSet descendents) {
if (type != null) {
        
    if (descendents.size() > 0) {
        TreeSet compatibleTypes = (TreeSet) typeTree.get(type.getName());
        
        if (compatibleTypes == null) {
            compatibleTypes = new TreeSet(descendents);
            typeTree.put(type.getName(), compatibleTypes);
        } else {
            compatibleTypes.addAll(descendents);
        }
    }
        
    XSTypeDefinition parentType = type.getBaseType();
        
    if (parentType != null
        && type.getTypeCategory() == parentType.getTypeCategory()) {
        String typeName = type.getName();
        String parentTypeName = parentType.getName();
        if ((typeName == null && parentTypeName != null)
            || (typeName != null && parentTypeName == null)
            || (typeName != null
                && parentTypeName != null
                && !type.getName().equals(parentType.getName())
                && !parentType.getName().equals("anyType"))) {
        
TreeSet newDescendents=new TreeSet(descendents);
//extension (we only add it to "newDescendants" because we don't want
//to have a type descendant to itself, but to consider it for the parent
if (type.getTypeCategory() == XSTypeDefinition.COMPLEX_TYPE) {
XSComplexTypeDefinition complexType =
            (XSComplexTypeDefinition) type;
if (complexType.getDerivationMethod()
            == XSConstants.DERIVATION_EXTENSION
            && !complexType.getAbstract()
            && !descendents.contains(type.getName()) //to be tested
            ) {
newDescendents.add(type.getName());
}
}
//note: extensions are impossible on simpleTypes !
        
        buildTypeTree(parentType, newDescendents);
        }
    }
}
}*/
/**
 * Records {@code descendents} as compatible types of {@code type} in {@code typeTree}
 * and then walks up the base-type chain, carrying the descendants (plus {@code type}
 * itself when it is a concrete complex type derived by extension) to the parent.
 *
 * @param type        the type currently visited; {@code null} ends the walk
 * @param descendents the types collected so far below {@code type}
 */
private void buildTypeTree(XSTypeDefinition type, TreeSet descendents) {
    if (type == null) {
        return;
    }

    // register what has been collected so far under this type's name
    if (!descendents.isEmpty()) {
        TreeSet known = (TreeSet) typeTree.get(type.getName());
        if (known != null) {
            known.addAll(descendents);
        } else {
            TreeSet created = new TreeSet(TypeExtensionSorter.getInstance());
            created.addAll(descendents);
            typeTree.put(type.getName(), created);
        }
    }

    XSTypeDefinition parentType = type.getBaseType();

    // only climb within the same type category (simple vs. complex)
    if (parentType == null || type.getTypeCategory() != parentType.getTypeCategory()) {
        return;
    }
    // stop when the type is its own base type or the parent is named "anyType"
    if (type == parentType) {
        return;
    }
    if (parentType.getName() != null && parentType.getName().equals("anyType")) {
        return;
    }

    TreeSet newDescendents = new TreeSet(TypeExtensionSorter.getInstance());
    newDescendents.addAll(descendents);

    // extension (we only add it to "newDescendents" because we don't want
    // a type to be a descendant of itself, but it must count for the parent)
    if (type.getTypeCategory() == XSTypeDefinition.COMPLEX_TYPE) {
        XSComplexTypeDefinition complexType = (XSComplexTypeDefinition) type;
        boolean concreteExtension = complexType.getDerivationMethod() == XSConstants.DERIVATION_EXTENSION
                && !complexType.getAbstract();
        if (concreteExtension && !descendents.contains(type)) { // to be tested
            newDescendents.add(type);
        }
    }
    // note: extensions are impossible on simpleTypes !

    buildTypeTree(parentType, newDescendents);
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Clear out anything found in the right & left margins
 * @param sourceImage/* ww  w  .j ava2  s. c  o  m*/
 */
void cleanMargins(SourceImage sourceImage) {
    LOG.debug("########## cleanMargins #########");

    int minCardinalityForMargin = 8;
    double averageShapeWidth = sourceImage.getAverageShapeWidth();

    LOG.debug("Finding right margin");
    double rightLimit = (double) sourceImage.getWidth() * 0.67;

    // first, create a DBScan cluster of all rows near the right-hand side
    List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>();
    List<double[]> rightCoordinates = new ArrayList<double[]>();

    for (RowOfShapes row : sourceImage.getRows()) {
        double right = row.getRight();
        if (right >= rightLimit) {
            LOG.trace(row.toString());
            LOG.trace(
                    "Right: " + right + " + " + row.getXAdjustment() + " = " + (right - row.getXAdjustment()));
            right -= row.getXAdjustment();
            rightHandRows.add(row);
            rightCoordinates.add(new double[] { right });
        }
    }

    DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows,
            rightCoordinates);
    Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(averageShapeWidth, minCardinalityForMargin,
            true);

    TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>(
            new CardinalityComparator<RowOfShapes>());
    orderedRowClusters.addAll(rowClusters);

    int i = 0;

    // find the right-most cluster with sufficient cardinality, and assume it's the right margin
    DescriptiveStatistics rightMarginStats = null;
    for (Set<RowOfShapes> cluster : orderedRowClusters) {
        DescriptiveStatistics rightStats = new DescriptiveStatistics();
        for (RowOfShapes row : cluster)
            rightStats.addValue(row.getRight() - row.getXAdjustment());

        LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
        LOG.debug("Right mean : " + rightStats.getMean());
        LOG.debug("Right std dev: " + rightStats.getStandardDeviation());

        if (cluster.size() >= minCardinalityForMargin
                && (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean())) {
            rightMarginStats = rightStats;
        }
        i++;
    }

    // see how many rows would violate this margin - if too many, assume no margin
    // these rows are only rows which extend across the margin
    if (rightMarginStats != null) {
        LOG.debug("Right margin mean : " + rightMarginStats.getMean());
        LOG.debug("Right margin std dev: " + rightMarginStats.getStandardDeviation());

        double rightMarginLimit = rightMarginStats.getMean() + sourceImage.getAverageShapeWidth();
        LOG.debug("rightMarginLimit: " + rightMarginLimit);
        int numRowsToChop = 0;
        for (RowOfShapes row : sourceImage.getRows()) {
            if (row.getRight() >= rightLimit) {
                if (row.getRight() - row.getXAdjustment() >= rightMarginLimit
                        && row.getLeft() - row.getXAdjustment() <= rightMarginLimit) {
                    LOG.debug("Found overlapping row : " + row);
                    LOG.debug("Adjusted right : " + (row.getRight() - row.getXAdjustment()));
                    numRowsToChop++;
                }
            }
        }
        if (numRowsToChop >= 3) {
            LOG.debug("Too many overlapping rows - ignoring margin");
            rightMarginStats = null;
        }
    }

    if (rightMarginStats != null) {
        double rightMarginLimit = rightMarginStats.getMean() + sourceImage.getAverageShapeWidth();
        List<RowOfShapes> rowsToRemove = new ArrayList<RowOfShapes>();
        for (RowOfShapes row : sourceImage.getRows()) {
            double right = row.getRight() - row.getXAdjustment();
            LOG.trace(row.toString());
            LOG.trace("Adjusted right: " + right);

            if (right >= rightMarginLimit) {
                LOG.trace("Has out-of-margin stuff!");
                // need to chop off groups to the right of this threshold
                List<GroupOfShapes> groupsToChop = new ArrayList<GroupOfShapes>();
                for (GroupOfShapes group : row.getGroups()) {
                    if (group.getLeft() - row.getXAdjustment() > rightMarginLimit) {
                        groupsToChop.add(group);
                        LOG.debug("Chopping group outside of right margin: " + group);
                    }
                }
                for (GroupOfShapes group : groupsToChop) {
                    row.getShapes().removeAll(group.getShapes());
                }
                row.getGroups().removeAll(groupsToChop);

                if (row.getGroups().size() == 0) {
                    LOG.debug("Removing empty " + row);
                    rowsToRemove.add(row);
                } else {
                    row.recalculate();
                    row.assignGuideLines();
                }
            } // does this row extend beyond the margin?
        } // next row
        sourceImage.getRows().removeAll(rowsToRemove);
    } // have a right margin

    LOG.debug("Finding left margin");
    double leftLimit = (double) sourceImage.getWidth() * 0.33;

    // first, create a DBScan cluster of all rows near the left-hand side
    List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>();
    List<double[]> leftCoordinates = new ArrayList<double[]>();

    for (RowOfShapes row : sourceImage.getRows()) {
        double left = row.getLeft();
        if (left <= leftLimit) {
            LOG.trace(row.toString());
            LOG.trace("Left: " + left + " - " + row.getXAdjustment() + " = " + (left - row.getXAdjustment()));
            left -= row.getXAdjustment();
            leftHandRows.add(row);
            leftCoordinates.add(new double[] { left });
        }
    }

    DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows,
            leftCoordinates);
    Set<Set<RowOfShapes>> rowClustersLeft = leftMarginClusterer.cluster(averageShapeWidth,
            minCardinalityForMargin, true);

    TreeSet<Set<RowOfShapes>> orderedRowClustersLeft = new TreeSet<Set<RowOfShapes>>(
            new CardinalityComparator<RowOfShapes>());
    orderedRowClustersLeft.addAll(rowClustersLeft);

    i = 0;

    // find the left-most cluster with sufficient cardinality, and assume it's the left margin
    DescriptiveStatistics leftMarginStats = null;
    for (Set<RowOfShapes> cluster : orderedRowClustersLeft) {
        DescriptiveStatistics leftStats = new DescriptiveStatistics();
        for (RowOfShapes row : cluster)
            leftStats.addValue(row.getLeft() - row.getXAdjustment());

        LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
        LOG.debug("Left mean : " + leftStats.getMean());
        LOG.debug("Left std dev: " + leftStats.getStandardDeviation());

        if (cluster.size() >= minCardinalityForMargin
                && (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean())) {
            leftMarginStats = leftStats;
        }
        i++;
    }

    // see how many rows would violate this margin - if too many, assume no margin
    // these rows are only rows which extend across the margin
    if (leftMarginStats != null) {
        LOG.debug("Left margin mean : " + leftMarginStats.getMean());
        LOG.debug("Left margin std dev: " + leftMarginStats.getStandardDeviation());

        double leftMarginLimit = leftMarginStats.getMean() - sourceImage.getAverageShapeWidth();
        LOG.debug("leftMarginLimit: " + leftMarginLimit);
        int numRowsToChop = 0;
        for (RowOfShapes row : sourceImage.getRows()) {
            if (row.getLeft() <= leftLimit) {
                if (row.getLeft() - row.getXAdjustment() <= leftMarginLimit
                        && row.getRight() - row.getXAdjustment() >= leftMarginLimit) {
                    LOG.debug("Found overlapping row : " + row);
                    LOG.debug("Adjusted left : " + (row.getLeft() - row.getXAdjustment()));
                    numRowsToChop++;
                }
            }
        }
        if (numRowsToChop >= 3) {
            LOG.debug("Too many overlapping rows - ignoring margin");
            leftMarginStats = null;
        }
    }

    if (leftMarginStats != null) {
        double leftMarginLimit = leftMarginStats.getMean() - sourceImage.getAverageShapeWidth();
        List<RowOfShapes> rowsToRemove = new ArrayList<RowOfShapes>();
        for (RowOfShapes row : sourceImage.getRows()) {
            double left = row.getLeft() - row.getXAdjustment();
            LOG.trace(row.toString());
            LOG.trace("Adjusted left: " + left);

            if (left <= leftMarginLimit) {
                LOG.trace("Has out-of-margin stuff!");
                // need to chop off groups to the left of this threshold
                List<GroupOfShapes> groupsToChop = new ArrayList<GroupOfShapes>();
                for (GroupOfShapes group : row.getGroups()) {
                    if (group.getRight() - row.getXAdjustment() < leftMarginLimit) {
                        groupsToChop.add(group);
                        LOG.debug("Chopping group outside of left margin: " + group);
                    }
                }
                for (GroupOfShapes group : groupsToChop) {
                    row.getShapes().removeAll(group.getShapes());
                }
                row.getGroups().removeAll(groupsToChop);

                if (row.getGroups().size() == 0) {
                    LOG.debug("Removing empty " + row);
                    rowsToRemove.add(row);
                } else {
                    row.recalculate();
                    row.assignGuideLines();
                }
            } // does this row extend beyond the margin?
        } // next row
        sourceImage.getRows().removeAll(rowsToRemove);
    } // have a left margin
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Detects paragraph splits and assign rows to correct paragraphs.
 * @param sourceImage the image whose rows are grouped into paragraphs
 */
void groupRowsIntoParagraphs(SourceImage sourceImage) {
    LOG.debug("########## groupRowsIntoParagraphs #########");
    // We'll use various possible indicators, including
    // indented start, indented end, and spacing between rows.

    // On pages with a single big paragraph makes it hypersensitive to differences in row-start/row-end
    // This means we cannot use deviation. Instead, we use the average shape width on the page.
    // We also adjust maxLeft & minRight to match the vertical line slope

    // This is now complicated by the possibility of multiple columns

    // Need to take into account a big horizontal space - Pietrushka page 14
    // Find horizontal spaces that go all the way across and are wider than a certain threshold
    // simply do a boolean column and black out everything in a row, than see if there are any remaining spaces above a certain threshold
    // Columns are thus arranged into "areas", separated by white-space.
    boolean[] fullRows = new boolean[sourceImage.getHeight()];
    for (RowOfShapes row : sourceImage.getRows()) {
        for (int y = row.getTop(); y <= row.getBottom(); y++) {
            fullRows[y] = true;
        }
    }
    DescriptiveStatistics rowHeightStats = new DescriptiveStatistics();

    for (RowOfShapes row : sourceImage.getRows()) {
        int height = row.getXHeight();
        rowHeightStats.addValue(height);
    }
    double avgRowHeight = rowHeightStats.getPercentile(50);
    LOG.debug("meanRowHeight: " + avgRowHeight);
    double minHeightForWhiteSpace = avgRowHeight * 1.3;
    LOG.debug("minHeightForWhiteSpace: " + minHeightForWhiteSpace);

    // find the "white rows" - any horizontal white space
    // in the page which is sufficiently high
    List<int[]> whiteRows = new ArrayList<int[]>();
    boolean inWhite = false;
    int startWhite = 0;
    for (int y = 0; y < sourceImage.getHeight(); y++) {
        if (!inWhite && !fullRows[y]) {
            inWhite = true;
            startWhite = y;
        } else if (inWhite && fullRows[y]) {
            int length = y - startWhite;
            if (length > minHeightForWhiteSpace) {
                LOG.debug("Adding whiteRow " + startWhite + "," + (y - 1));
                whiteRows.add(new int[] { startWhite, y - 1 });
            }
            inWhite = false;
        }
    }
    if (inWhite)
        whiteRows.add(new int[] { startWhite, sourceImage.getHeight() - 1 });
    whiteRows.add(new int[] { sourceImage.getHeight(), sourceImage.getHeight() });

    // place rows in "areas" defined by the "white rows" found above
    List<List<RowOfShapes>> areas = new ArrayList<List<RowOfShapes>>();
    int startY = -1;
    for (int[] whiteRow : whiteRows) {
        List<RowOfShapes> area = new ArrayList<RowOfShapes>();
        for (RowOfShapes row : sourceImage.getRows()) {
            if (row.getTop() >= startY && row.getBottom() <= whiteRow[0]) {
                area.add(row);
            }
        }
        if (area.size() > 0) {
            areas.add(area);
        }
        startY = whiteRow[1];
    }

    // break up each area into vertical columns
    LOG.debug("break up each area into vertical columns");
    List<Column> columns = new ArrayList<Column>();
    List<List<Column>> columnsPerAreaList = new ArrayList<List<Column>>();
    for (List<RowOfShapes> area : areas) {
        LOG.debug("Next area");
        List<Column> columnsPerArea = new ArrayList<SegmenterImpl.Column>();
        columnsPerAreaList.add(columnsPerArea);
        TreeSet<RowOfShapes> rows = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator());
        rows.addAll(area);
        for (RowOfShapes row : rows) {
            // try to place this row in one of the columns directly above it.
            // this means that a row which overlaps more than one column has to "close" this column, so it is no longer considered
            List<Column> overlappingColumns = new ArrayList<Column>();
            for (Column column : columnsPerArea) {
                if (!column.closed) {
                    RowOfShapes lastRowInColumn = column.get(column.size() - 1);
                    if (row.getRight() - row.getXAdjustment() >= lastRowInColumn.getLeft()
                            - lastRowInColumn.getXAdjustment()
                            && row.getLeft() - row.getXAdjustment() <= lastRowInColumn.getRight()
                                    - lastRowInColumn.getXAdjustment()) {
                        overlappingColumns.add(column);
                    }
                }
            }
            if (overlappingColumns.size() == 1) {
                Column myColumn = overlappingColumns.get(0);
                RowOfShapes lastRowInMyColumn = myColumn.get(0);

                // close any columns that are now at a distance of more than one row
                for (Column column : columnsPerArea) {
                    if (!column.closed && !column.equals(myColumn)) {
                        RowOfShapes lastRowInColumn = column.get(column.size() - 1);
                        if (lastRowInMyColumn.getTop() > lastRowInColumn.getBottom()) {
                            column.closed = true;
                            LOG.debug("Closing distant column " + lastRowInColumn);
                        }
                    }
                }

                myColumn.add(row);
                LOG.debug(row.toString());
                LOG.debug("  added to column " + lastRowInMyColumn);
            } else {
                for (Column overlappingColumn : overlappingColumns) {
                    overlappingColumn.closed = true;
                    RowOfShapes lastRowInColumn = overlappingColumn.get(overlappingColumn.size() - 1);
                    LOG.debug("Closing overlapping column " + lastRowInColumn);
                }
                Column myColumn = new Column(sourceImage);
                myColumn.add(row);
                LOG.debug("Found new column");
                LOG.debug(row.toString());
                columns.add(myColumn);
                columnsPerArea.add(myColumn);
            }
        }
    } // next area

    for (Column column : columns)
        column.recalculate();

    // Intermediate step to reform the vertical columns, if they exist
    // basically the idea is that if the columns are aligned vertically, then the thresholds for paragraph indents
    // should be shared, to increase the statistical sample size and reduce anomalies.
    // We'll assume that two columns from two consecutive areas are in the same vertical group if they overlap with each other horizontally
    // and don't overlap with any other column in the other column's area.
    List<List<Column>> columnGroups = new ArrayList<List<Column>>();
    List<Column> columnsInPrevArea = null;
    for (List<Column> columnsPerArea : columnsPerAreaList) {
        if (columnsInPrevArea != null) {
            for (Column prevColumn : columnsInPrevArea) {
                LOG.debug("Checking " + prevColumn);
                // find the column group containing the previous column
                List<Column> myColumnGroup = null;
                for (List<Column> columnGroup : columnGroups) {
                    if (columnGroup.contains(prevColumn)) {
                        myColumnGroup = columnGroup;
                        break;
                    }
                }
                if (myColumnGroup == null) {
                    myColumnGroup = new ArrayList<SegmenterImpl.Column>();
                    LOG.debug("Creating column group for column " + prevColumn.toString());
                    columnGroups.add(myColumnGroup);
                    myColumnGroup.add(prevColumn);
                }

                // does only one column overlap with this one?
                Column overlappingColumn = null;
                for (Column column : columnsPerArea) {
                    if (column.adjustedRight >= prevColumn.adjustedLeft
                            && column.adjustedLeft <= prevColumn.adjustedRight) {
                        if (overlappingColumn == null) {
                            LOG.debug("I overlap with " + column);

                            overlappingColumn = column;
                        } else {
                            LOG.debug("But I overlap also with " + column);

                            overlappingColumn = null;
                            break;
                        }
                    }
                }
                if (overlappingColumn != null) {
                    // does it overlap with only me?
                    for (Column otherPrevColumn : columnsInPrevArea) {
                        if (otherPrevColumn.equals(prevColumn))
                            continue;
                        if (overlappingColumn.adjustedRight >= otherPrevColumn.adjustedLeft
                                && overlappingColumn.adjustedLeft <= otherPrevColumn.adjustedRight) {
                            LOG.debug("But it overlaps also with " + otherPrevColumn);
                            overlappingColumn = null;
                            break;
                        }
                    }
                }
                if (overlappingColumn != null) {
                    myColumnGroup.add(overlappingColumn);
                    LOG.debug("Adding " + overlappingColumn);
                    LOG.debug(" to group with " + prevColumn);
                }

            } // next previous column
        } // have previous columns
        columnsInPrevArea = columnsPerArea;
    } // next area
    if (columnsInPrevArea != null) {
        for (Column prevColumn : columnsInPrevArea) {
            // find the column group containing the previous column
            List<Column> myColumnGroup = null;
            for (List<Column> columnGroup : columnGroups) {
                if (columnGroup.contains(prevColumn)) {
                    myColumnGroup = columnGroup;
                    break;
                }
            }
            if (myColumnGroup == null) {
                myColumnGroup = new ArrayList<SegmenterImpl.Column>();
                LOG.debug("Creating column group for column " + prevColumn.toString());
                columnGroups.add(myColumnGroup);
                myColumnGroup.add(prevColumn);
            }
        }
    }

    // What we really want here is, for each column (in the case of right-to-left),
    // two clusters on the right
    // and one relatively big cluster on the left.
    // anything outside of the cluster on the left is an EOP.
    boolean hasTab = false;
    for (List<Column> columnGroup : columnGroups) {
        LOG.debug("Next column group");
        double averageShapeWidth = sourceImage.getAverageShapeWidth();
        LOG.debug("averageShapeWidth: " + averageShapeWidth);
        double epsilon = averageShapeWidth / 2.0;
        LOG.debug("epsilon: " + epsilon);

        int columnGroupTop = sourceImage.getHeight();
        int columnGroupBottom = 0;
        int columnGroupLeft = sourceImage.getWidth();
        int columnGroupRight = 0;
        for (Column column : columnGroup) {
            if (column.top < columnGroupTop)
                columnGroupTop = (int) Math.round(column.top);
            if (column.bottom > columnGroupBottom)
                columnGroupBottom = (int) Math.round(column.bottom);
            if (column.adjustedLeft < columnGroupLeft)
                columnGroupLeft = (int) Math.round(column.adjustedLeft);
            if (column.adjustedRight > columnGroupRight)
                columnGroupRight = (int) Math.round(column.adjustedRight);
        }

        // right thresholds
        LOG.debug("Calculating right thresholds");

        // first, create a DBScan cluster of all rows by their adjusted right coordinate
        List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>();
        List<double[]> rightCoordinates = new ArrayList<double[]>();

        for (Column column : columnGroup) {
            for (RowOfShapes row : column) {
                double right = row.getRight() - row.getXAdjustment();
                //               double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
                //               if (rightOverlap==0) {
                //                  // leave out any right-overlapping rows here
                //                  // since we need accurate statistics for margin detection
                //               // This is questionable - especially since a long vertical bar (see Petriushka)
                //               // tends to give all rows a left overlap. Also, because the overlap is calculated based
                //               // on the mean right & mean left, not based on any sort of margin clusters.
                //                  rightHandRows.add(row);
                //                  rightCoordinates.add(new double[] {right});
                //               }
                rightHandRows.add(row);
                rightCoordinates.add(new double[] { right });

            }
        }

        int minCardinalityForRightMargin = 5;
        DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows,
                rightCoordinates);
        Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(epsilon, minCardinalityForRightMargin,
                true);

        TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>(
                new CardinalityComparator<RowOfShapes>());
        orderedRowClusters.addAll(rowClusters);

        int i = 0;

        // find the two right-most clusters, and assume they are the margin & the tab
        DescriptiveStatistics rightMarginStats = null;
        DescriptiveStatistics rightTabStats = null;
        for (Set<RowOfShapes> cluster : orderedRowClusters) {
            DescriptiveStatistics rightStats = new DescriptiveStatistics();
            MeanAbsoluteDeviation rightDev = new MeanAbsoluteDeviation();
            for (RowOfShapes row : cluster) {
                int rowIndex = rightHandRows.indexOf(row);
                double right = rightCoordinates.get(rowIndex)[0];
                rightStats.addValue(right);
                rightDev.increment(right);
            }

            LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
            LOG.debug("Right mean : " + rightStats.getMean());
            LOG.debug("Right dev: " + rightDev.getResult());

            if (cluster.size() >= minCardinalityForRightMargin) {
                if (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean()) {
                    if (rightMarginStats != null)
                        rightTabStats = rightMarginStats;
                    rightMarginStats = rightStats;
                } else if (rightTabStats == null || rightTabStats.getMean() < rightStats.getMean()) {
                    rightTabStats = rightStats;
                }
            } else {
                break;
            }
            i++;
        } // next right-coordinate cluster

        double rightMargin = sourceImage.getWidth();
        double rightTab = sourceImage.getWidth();
        if (rightMarginStats != null) {
            rightMargin = rightMarginStats.getMean();
        } else {
            List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
            for (Rectangle columnSeparator : columnSeparators) {
                if (columnSeparator.getTop() <= columnGroupTop
                        && columnSeparator.getBottom() >= columnGroupBottom
                        && columnSeparator.getLeft() >= columnGroupRight) {
                    if (columnSeparator.getLeft() < rightMargin)
                        rightMargin = columnSeparator.getLeft();
                }
            }
        }
        if (rightTabStats != null) {
            rightTab = rightTabStats.getMean();
        }

        LOG.debug("rightMargin: " + rightMargin);
        LOG.debug("rightTab: " + rightTab);

        // left thresholds
        LOG.debug("Calculating left thresholds");

        // first, create a DBScan cluster of all rows by their adjusted left coordinate
        List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>();
        List<double[]> leftCoordinates = new ArrayList<double[]>();

        for (Column column : columnGroup) {
            for (RowOfShapes row : column) {
                double left = row.getLeft() - row.getXAdjustment();
                //               double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);
                //               if (leftOverlap == 0) {
                //                  // leave out any overlapping rows from margin calcs,
                //                  // since we need accurate statistics here
                //                  leftHandRows.add(row);
                //                  leftCoordinates.add(new double[] {left});
                //               }
                leftHandRows.add(row);
                leftCoordinates.add(new double[] { left });
            }
        }

        int minCardinalityForLeftMargin = 5;
        DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows,
                leftCoordinates);
        Set<Set<RowOfShapes>> leftRowClusters = leftMarginClusterer.cluster(epsilon,
                minCardinalityForLeftMargin, true);

        TreeSet<Set<RowOfShapes>> orderedLeftRowClusters = new TreeSet<Set<RowOfShapes>>(
                new CardinalityComparator<RowOfShapes>());
        orderedLeftRowClusters.addAll(leftRowClusters);

        i = 0;

        // find the two left-most clusters, and assume they are the margin & the tab
        DescriptiveStatistics leftMarginStats = null;
        DescriptiveStatistics leftTabStats = null;
        for (Set<RowOfShapes> cluster : orderedLeftRowClusters) {
            DescriptiveStatistics leftStats = new DescriptiveStatistics();
            MeanAbsoluteDeviation leftDev = new MeanAbsoluteDeviation();
            for (RowOfShapes row : cluster) {
                int rowIndex = leftHandRows.indexOf(row);
                double left = leftCoordinates.get(rowIndex)[0];
                leftStats.addValue(left);
                leftDev.increment(left);
            }

            LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
            LOG.debug("Left mean : " + leftStats.getMean());
            LOG.debug("Left dev: " + leftDev.getResult());

            if (cluster.size() >= minCardinalityForLeftMargin) {
                if (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean()) {
                    if (leftMarginStats != null)
                        leftTabStats = leftMarginStats;
                    leftMarginStats = leftStats;
                } else if (leftTabStats == null || leftTabStats.getMean() > leftStats.getMean()) {
                    leftTabStats = leftStats;
                }
            } else {
                break;
            }
            i++;
        } // next left-coordinate cluster

        double leftMargin = 0;
        double leftTab = 0;
        if (leftMarginStats != null) {
            leftMargin = leftMarginStats.getMean();
        } else {
            List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
            for (Rectangle columnSeparator : columnSeparators) {
                if (columnSeparator.getTop() <= columnGroupTop
                        && columnSeparator.getBottom() >= columnGroupBottom
                        && columnSeparator.getRight() <= columnGroupLeft) {
                    if (columnSeparator.getRight() > leftMargin)
                        leftMargin = columnSeparator.getRight();
                }
            }
        }
        if (leftTabStats != null) {
            leftTab = leftTabStats.getMean();
        }

        LOG.debug("leftMargin: " + leftMargin);
        LOG.debug("leftTab: " + leftTab);

        for (Column column : columnGroup) {
            if (sourceImage.isLeftToRight()) {
                column.startMargin = leftMargin;
                if (leftTabStats != null) {
                    column.startTab = leftTab;
                    column.hasTab = true;
                } else {
                    LOG.debug("No left tab - setting based on left margin");
                    column.startTab = leftMargin + (5.0 * sourceImage.getAverageShapeWidth());
                    column.hasTab = false;
                }

                column.endMargin = rightMargin;
            } else {
                column.startMargin = rightMargin;
                if (rightTabStats != null) {
                    column.startTab = rightTab;
                    column.hasTab = true;
                } else {
                    LOG.debug("No right tab - setting based on right margin");
                    column.startTab = rightMargin - (5.0 * sourceImage.getAverageShapeWidth());
                    column.hasTab = false;
                }

                column.endMargin = leftMargin;
            }
            LOG.debug("Margins for " + column);
            LOG.debug("startMargin: " + column.startMargin);
            LOG.debug("startTab: " + column.startTab);
            LOG.debug("endMargin: " + column.endMargin);
        } // next column
    } // next column group
    LOG.debug("hasTab: " + hasTab);

    double safetyMargin = 1.5 * sourceImage.getAverageShapeWidth();

    // Now, paragraphs are either "indented", "outdented" or not "dented" at all (no tabs).
    // This applies to the entire page.
    // To recognise indenting vs. outdenting, we have to see if the row preceding each
    // indent/outdent is full or partial. In the case of indentation, partial rows will
    // typically be followed by an indent. In the case of outdentation, partial rows will
    // typically be followed by an outdent.
    boolean isIndented = true;

    int indentCount = 0;
    int outdentCount = 0;
    for (List<Column> columnGroup : columnGroups) {
        LOG.debug("Next column group");
        boolean prevRowPartial = false;
        for (Column column : columnGroup) {
            if (column.hasTab) {
                for (RowOfShapes row : column) {
                    if (sourceImage.isLeftToRight()) {
                        if (prevRowPartial) {
                            if (row.getLeft() - row.getXAdjustment() > column.startTab - safetyMargin) {
                                indentCount++;
                            } else if (row.getLeft() - row.getXAdjustment() < column.startMargin
                                    + safetyMargin) {
                                outdentCount++;
                            }
                        }
                        if (row.getRight() - row.getXAdjustment() < column.endMargin - safetyMargin) {
                            prevRowPartial = true;
                        } else {
                            prevRowPartial = false;
                        }
                    } else {
                        if (prevRowPartial) {
                            if (row.getRight() - row.getXAdjustment() < column.startTab + safetyMargin) {
                                indentCount++;
                            } else if (row.getRight() - row.getXAdjustment() > column.startMargin
                                    - safetyMargin) {
                                outdentCount++;
                            }
                        }
                        if (row.getLeft() - row.getXAdjustment() > column.endMargin + safetyMargin) {
                            prevRowPartial = true;
                        } else {
                            prevRowPartial = false;
                        }
                    } // left-to-right?
                } // next row  
            } // column has tab
        } // next column
    } // next column group
    isIndented = (indentCount + 2 >= outdentCount);
    LOG.debug("indentCount: " + indentCount);
    LOG.debug("outdentCount: " + outdentCount);
    LOG.debug("isIndented: " + isIndented);

    // order the columns
    TreeSet<Column> orderedColumns = new TreeSet<SegmenterImpl.Column>(columns);
    columns.clear();
    columns.addAll(orderedColumns);

    // find the paragraphs found in each column
    for (Column column : columns) {
        LOG.debug("--- Next column ---");

        // break up the column into paragraphs 
        Paragraph paragraph = null;
        RowOfShapes previousRow = null;
        int maxShapesForStandaloneParagraph = 2;
        List<RowOfShapes> rowsForStandaloneParagraphs = new ArrayList<RowOfShapes>();
        Point2D previousPointStartMargin = null;
        Point2D previousPointStartTab = null;
        Point2D previousPointEndMargin = null;

        for (RowOfShapes row : column) {
            boolean rowForStandaloneParagraph = false;
            boolean newParagraph = false;
            if (row.getShapes().size() <= maxShapesForStandaloneParagraph) {
                rowsForStandaloneParagraphs.add(row);
                rowForStandaloneParagraph = true;
            } else {
                double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
                double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);

                if (drawSegmentation) {
                    double rowVerticalMidPoint = row.getBaseLineMiddlePoint();
                    double startMarginX = column.startMargin + row.getXAdjustment();
                    double startTabX = column.startTab + row.getXAdjustment();
                    double endMarginX = column.endMargin + row.getXAdjustment();

                    if (sourceImage.isLeftToRight()) {
                        startMarginX += safetyMargin;
                        startTabX -= safetyMargin;
                        endMarginX -= safetyMargin;

                        startMarginX += leftOverlap;
                        startTabX += leftOverlap;
                        endMarginX -= rightOverlap;
                    } else {
                        startMarginX -= safetyMargin;
                        startTabX += safetyMargin;
                        endMarginX += safetyMargin;

                        startMarginX -= rightOverlap;
                        startTabX -= rightOverlap;
                        endMarginX += leftOverlap;
                    }

                    Point2D.Double currentPointStartMargin = new Point2D.Double(startMarginX,
                            rowVerticalMidPoint);
                    Point2D.Double currentPointStartTab = new Point2D.Double(startTabX, rowVerticalMidPoint);
                    Point2D.Double currentPointEndMargin = new Point2D.Double(endMarginX, rowVerticalMidPoint);

                    if (previousPointStartMargin != null) {
                        graphics2D.setStroke(new BasicStroke(1));
                        graphics2D.setPaint(Color.BLUE);
                        graphics2D.drawLine((int) Math.round(previousPointStartMargin.getX()),
                                (int) Math.round(previousPointStartMargin.getY()),
                                (int) Math.round(currentPointStartMargin.getX()),
                                (int) Math.round(currentPointStartMargin.getY()));
                        graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                                (int) Math.round(previousPointEndMargin.getY()),
                                (int) Math.round(currentPointEndMargin.getX()),
                                (int) Math.round(currentPointEndMargin.getY()));

                        graphics2D.setPaint(Color.RED);
                        graphics2D.drawLine((int) Math.round(previousPointStartTab.getX()),
                                (int) Math.round(previousPointStartTab.getY()),
                                (int) Math.round(currentPointStartTab.getX()),
                                (int) Math.round(currentPointStartTab.getY()));

                        graphics2D.setPaint(Color.RED);
                        graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                                (int) Math.round(previousPointEndMargin.getY()),
                                (int) Math.round(currentPointEndMargin.getX()),
                                (int) Math.round(currentPointEndMargin.getY()));
                    }
                    previousPointStartMargin = currentPointStartMargin;
                    previousPointStartTab = currentPointStartTab;
                    previousPointEndMargin = currentPointEndMargin;
                }

                if (previousRow == null) {
                    LOG.debug("New paragraph (first)");
                    newParagraph = true;
                } else {
                    if (sourceImage.isLeftToRight()) {
                        if (previousRow.getRight() - previousRow.getXAdjustment()
                                - rightOverlap < column.endMargin - safetyMargin) {
                            LOG.debug("New paragraph (previous EOP)");
                            newParagraph = true;
                        } else if (column.hasTab && isIndented && row.getLeft() - row.getXAdjustment()
                                + leftOverlap > column.startTab - safetyMargin) {
                            LOG.debug("New paragraph (indent)");
                            newParagraph = true;
                        } else if (column.hasTab && !isIndented && row.getLeft() - row.getXAdjustment()
                                + leftOverlap < column.startMargin + safetyMargin) {
                            LOG.debug("New paragraph (outdent)");
                            newParagraph = true;
                        }
                    } else {
                        if (previousRow.getLeft() - previousRow.getXAdjustment()
                                + leftOverlap > column.endMargin + safetyMargin) {
                            LOG.debug("New paragraph (previous EOP)");
                            newParagraph = true;
                        } else if (column.hasTab && isIndented && row.getRight() - row.getXAdjustment()
                                - rightOverlap < column.startTab + safetyMargin) {
                            LOG.debug("New paragraph (indent)");
                            newParagraph = true;
                        } else if (column.hasTab && !isIndented && row.getRight() - row.getXAdjustment()
                                - rightOverlap > column.startMargin - safetyMargin) {
                            LOG.debug("New paragraph (outdent)");
                            newParagraph = true;
                        }
                    } // left-to-right?
                } // have previous row
            } // standalone paragraph?

            if (!rowForStandaloneParagraph)
                LOG.debug(row.toString());

            if (newParagraph) {
                if (rowsForStandaloneParagraphs.size() > 0) {
                    for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
                        LOG.debug("Standalone paragraph");
                        LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop()
                                + "), right(" + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
                        Paragraph standaloneParagraph = sourceImage.newParagraph();
                        standaloneParagraph.getRows().add(oneRow);
                    }
                    rowsForStandaloneParagraphs.clear();
                }
                paragraph = sourceImage.newParagraph();
            }
            //LOG.debug("Row: left(" + row.getLeft() + "), right(" + row.getRight() + "), width(" + (row.getRight() - row.getLeft() + 1) + ")");

            if (!rowForStandaloneParagraph) {
                paragraph.getRows().add(row);
                previousRow = row;
            }
        } // next row in column
        if (rowsForStandaloneParagraphs.size() > 0) {
            for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
                LOG.debug("Standalone paragraph");
                LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop() + "), right("
                        + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
                Paragraph standaloneParagraph = sourceImage.newParagraph();
                standaloneParagraph.getRows().add(oneRow);
            }
            rowsForStandaloneParagraphs.clear();
        }
    } // next column

}

From source file:crawler.HackerEarthCrawler.java

/**
 * Crawls HackerEarth starting at {@code baseUrl}, then scrapes and stores what was found.
 *
 * <p>The method runs three phases in order:
 * <ol>
 * <li><b>Link discovery</b> - repeatedly fetches every not-yet-crawled URL under
 * {@code https://www.hackerearth.com/}, collecting problem URLs (as decided by
 * {@code isProblemUrl}), tutorial URLs (those ending in {@code /tutorial/}) and further
 * crawlable links (as decided by {@code isCrawlable}) until a full pass fetches nothing new.
 * An {@link IOException} during this phase is logged and ends discovery; the URLs collected
 * so far are still processed.</li>
 * <li><b>Problem scraping</b> - for each problem URL, extracts title, statement, input/output
 * format, constraints, tags, sample I/O, explanation and time limit from the page and saves
 * (or replaces) a {@code Problem} through a Hibernate session. Any exception while handling
 * one problem is printed and that problem is skipped.</li>
 * <li><b>Tutorial scraping</b> - for each tutorial URL, stores the page text as a
 * {@code Tutorial} named after the last path segment before {@code /tutorial/}. A
 * non-Hibernate exception here ends the tutorial phase.</li>
 * </ol>
 */
@Override
public void crawl() {

    // 1 when at least one page was fetched in the current pass; reused later as a "header found" marker
    int flag = 0;

    //set of urls which should be crawled
    TreeSet<String> linksset = new TreeSet<String>();
    //links discovered during the current pass; merged into linksset once the pass is over
    TreeSet<String> tempset = new TreeSet<String>();
    TreeSet<String> tutorialset = new TreeSet<String>();
    //final set of problem urls
    TreeSet<String> problemset = new TreeSet<String>();
    //crawl status per url: 0 = not crawled yet, 1 = crawled
    TreeMap<String, Integer> visited = new TreeMap<String, Integer>();

    //add base url
    linksset.add(baseUrl);
    //mark base url as not crawled
    visited.put(baseUrl, 0);

    try {
        while (true) {
            flag = 0;
            tempset.clear();

            for (String str : linksset) {
                //check if url is already crawled or not and it has valid domain name
                if ((visited.get(str) == 0) && (str.startsWith("https://www.hackerearth.com/"))) {
                    System.out.println("crawling  " + str);

                    //retrieve response of current url as document
                    Document doc = Jsoup.connect(str).timeout(0).userAgent(
                            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0")
                            .referrer("http://www.google.com").ignoreHttpErrors(true).get();
                    //retrieve all urls from current page
                    Elements links = doc.select("a[href]");

                    //mark url as crawled
                    visited.put(str, 1);

                    //remember that this pass fetched at least one page
                    flag = 1;
                    for (Element link : links) {
                        //resolve the absolute url once instead of once per check
                        String href = link.absUrl("href");

                        if (href.endsWith("/tutorial/")) {
                            tutorialset.add(href);
                        }
                        //check if url is problem url then add it in problemset
                        if (href.startsWith("https://www.hackerearth.com/") && isProblemUrl(href)) {
                            problemset.add(href);
                        }
                        //check if url has valid domain and is worth crawling further
                        if (href.contains(("https://www.hackerearth.com/")) && isCrawlable(href)) {
                            //if link is not visited then mark it as uncrawled
                            if (!visited.containsKey(href)) {
                                visited.put(href, 0);
                            }
                            //queue it for the next pass
                            tempset.add(href);
                        }
                    }
                }
            }
            //if nothing is left to crawl break the loop
            if (flag == 0) {
                break;
            }
            //add all retrieved links to linksset
            linksset.addAll(tempset);
        }

        System.out.println("\n\ntotal problem urls " + problemset.size());

        int i = 0;
        for (String str : problemset) {
            System.out.println("link " + i + " : " + str);
            i++;
        }

    } catch (IOException ex) {
        Logger.getLogger(HackerEarthCrawler.class.getName()).log(Level.SEVERE, null, ex);
    }

    //scrape and store into database
    //for every problem url scrape problem page
    for (String problemUrl : problemset) {

        System.out.println("problemUrl :" + problemUrl);
        try {
            //create problem class to store in database
            Problem problem = new Problem();
            //problemSIOC = statement + input/output/constraints text, problemIOC = the same without the statement
            String problemSIOC = "", problemIOC = "";
            String problemTitle = "", problemStatement = "", problemInput = "", problemOutput = "",
                    problemConstraints = "";
            String sampleInput = "", sampleOutput = "";
            String problemExplanation = "";
            //set default timelimit to 1 second
            double problemTimeLimit = 1.0;
            ArrayList<String> tags = new ArrayList<String>();

            //get response for given problem url
            Response response = Jsoup.connect(problemUrl).execute();
            Document doc = response.parse();

            //retrieve problem title from page (text before the first '|' of the <title>)
            Element elementTitle = doc.getElementsByTag("title").first();
            StringTokenizer stTitle = new StringTokenizer(elementTitle.text(), "|");
            problemTitle = stTitle.nextToken().trim();

            Element content = doc.getElementsByClass("starwars-lab").first();
            problemSIOC = content.text();
            Elements contentChildren = content.children();

            //headers that mark the end of the problem statement
            String breakloop[] = { "input", "input:", "input :", "input format:", "input format :",
                    "input format", "Input and output", "constraints :", "constraints:", "constraints",
                    "$$Input :$$" };
            flag = 0;
            for (Element p : contentChildren) {
                String tempStatement = "";
                for (Element pp : p.getAllElements()) {

                    for (String strbreak : breakloop) {
                        if (StringUtils.equalsIgnoreCase(pp.ownText(), strbreak)) {
                            //keep only the part of this child that precedes the header
                            tempStatement = p.text().substring(0,
                                    p.text().toLowerCase().indexOf(strbreak.toLowerCase()));
                            flag = 1;
                            break;
                        }
                    }
                }

                if (flag == 1) {
                    problemStatement += tempStatement;
                    //remove extra space at end; nothing to trim when the header was the very first child
                    if (tempStatement.length() == 0 && problemStatement.length() > 0) {
                        problemStatement = problemStatement.substring(0, problemStatement.length() - 1);
                    }
                    break;
                }
                problemStatement += p.text() + " ";
            }

            System.out.println("problemSIOC :" + problemSIOC);
            System.out.println("problemStatement :" + problemStatement);

            if (problemStatement.length() <= problemSIOC.length()) {
                //remove problem statement from whole text and remove extra spaces at the beginning and the end
                problemIOC = problemSIOC.substring(problemStatement.length()).trim();
            } else {
                problemIOC = "";
            }

            System.out.println("problemIOC :" + problemIOC);

            //keywords for identifying input
            String decideInput[] = { "Input format :", "Input format:", "Input format", "inputformat:",
                    "inputformat :", "inputformat", "input and output", "input :", "input:", "input" };
            //keywords for identifying output
            String decideOutput[] = { "output format :", "output format:", "Output format", "outputformat:",
                    "outputformat :", "outputformat", "output :", "output:", "output" };
            //keywords for identifying constraint
            String decideConstraint[] = { "constraints:", "constraints :", "constraints", "Constraints :",
                    "constraint:", "constraint :", "constraint", "Contraints :" };

            int posin = 0, posoutput = 0, poscon = 0, idxin, idxout, idxcon, flaginput = 0, flagoutput = 0,
                    flagcon = 0, inlen = 0, outlen = 0, conlen = 0;

            //find inputformat position,length of keyword
            for (idxin = 0; idxin < decideInput.length; idxin++) {
                if (StringUtils.containsIgnoreCase(problemIOC, decideInput[idxin])) {

                    posin = problemIOC.toLowerCase().indexOf(decideInput[idxin].toLowerCase());
                    flaginput = 1;
                    inlen = decideInput[idxin].length();

                    //decide whether it is the keyword for actual input or part of "sample input"
                    if (StringUtils.containsIgnoreCase(problemIOC, "sample input")) {
                        if (posin > problemIOC.toLowerCase().indexOf("sample input")) {
                            //match lies inside/after "sample input": discard it and try the next keyword
                            flaginput = 0;
                            inlen = 0;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
            }

            //find outputformat position,length of keyword
            for (idxout = 0; idxout < decideOutput.length; idxout++) {
                if (StringUtils.containsIgnoreCase(problemIOC, decideOutput[idxout])) {
                    posoutput = problemIOC.toLowerCase().indexOf(decideOutput[idxout].toLowerCase());
                    flagoutput = 1;
                    outlen = decideOutput[idxout].length();
                    break;
                }
            }

            //find constraint position,length of keyword
            for (idxcon = 0; idxcon < decideConstraint.length; idxcon++) {
                if (StringUtils.containsIgnoreCase(problemIOC, decideConstraint[idxcon])) {
                    poscon = problemIOC.toLowerCase().indexOf(decideConstraint[idxcon].toLowerCase());
                    flagcon = 1;
                    conlen = decideConstraint[idxcon].length();
                    break;
                }
            }

            System.out.println("input " + flaginput + " " + inlen + " " + posin);
            System.out.println("output " + flagoutput + " " + outlen + " " + posoutput);
            System.out.println("constraint " + flagcon + " " + conlen + " " + poscon);
            //retrieve problem input and output if present in problem page

            //if input format is present
            if (flaginput == 1) {
                //index 6 of decideInput is "input and output": a combined section with no separate output
                //if input keyword is "input and output" and constraint is present in problem page
                if (idxin == 6 && flagcon == 1) {
                    problemInput = problemIOC.substring(inlen, poscon);
                }
                //if input keyword is "input and output" and constraint is not present in problem page
                else if (idxin == 6 && flagcon == 0) {
                    problemInput = problemIOC.substring(inlen);
                }
                //if output format and constraint is present
                else if (flagoutput == 1 && flagcon == 1) {
                    //if constraint is present before input format
                    if (poscon < posin) {
                        problemInput = problemIOC.substring(posin + inlen, posoutput);
                        problemOutput = problemIOC.substring(posoutput + outlen);
                    }
                    //if constraint is present between input format and output format
                    else if (poscon < posoutput) {
                        problemInput = problemIOC.substring(inlen, poscon);
                        problemOutput = problemIOC.substring(posoutput + outlen);
                    } else {
                        problemInput = problemIOC.substring(inlen, posoutput);
                        problemOutput = problemIOC.substring(posoutput + outlen, poscon);
                    }
                }
                //if constraint is not present
                else if (flagoutput == 1 && flagcon == 0) {
                    problemInput = problemIOC.substring(inlen, posoutput);
                    problemOutput = problemIOC.substring(posoutput + outlen);
                } else if (flagoutput == 0 && flagcon == 1) {
                    if (poscon < posin) {
                        problemInput = problemIOC.substring(posin + inlen);
                    } else {
                        problemInput = problemIOC.substring(poscon + conlen, posin);
                    }
                    problemOutput = "";
                } else {
                    problemInput = problemIOC.substring(inlen);
                    problemOutput = "";
                }
            }
            //if input format and output format is not present
            else {
                problemInput = "";
                problemOutput = "";
            }

            //if constraint is present
            if (flagcon == 1) {
                //if constraint is present before input format
                if (poscon < posin) {
                    problemConstraints = problemIOC.substring(0, posin);
                }
                //if constraint is present before output format
                else if (poscon < posoutput) {
                    problemConstraints = problemIOC.substring(poscon + conlen, posoutput);
                } else {
                    problemConstraints = problemIOC.substring(poscon + conlen);
                }
            }

            System.out.println("problemInput :" + problemInput);
            System.out.println("problemOutput :" + problemOutput);
            System.out.println("problemConstraints :" + problemConstraints);

            //retrieve problem tags from problem page (comma separated text of the second child)
            Element elementtag = doc.getElementsByClass("problem-tags").first().child(1);
            StringTokenizer st = new StringTokenizer(elementtag.text(), ",");
            while (st.hasMoreTokens()) {
                tags.add(st.nextToken().trim());
            }

            //retrieve sample input sample output if present
            Element elementSIO = doc.getElementsByClass("input-output-container").first();
            //if sample input output is present
            if (elementSIO != null) {
                String sampleText = elementSIO.text();
                //find position of sample output
                int soutpos = sampleText.indexOf("SAMPLE OUTPUT");
                //12 = length of the leading "SAMPLE INPUT" label, 13 = length of "SAMPLE OUTPUT";
                //when the marker is missing the samples stay empty instead of failing the whole problem
                if (soutpos >= 12) {
                    sampleInput = sampleText.substring(12, soutpos);
                    sampleOutput = sampleText.substring(soutpos + 13);
                    System.out.println("Sample input :\n" + sampleInput + "\n\n\n");
                    System.out.println("Sample Output :\n" + sampleOutput);
                }
            }

            //retrieve problem explanation from problem page if present
            Element elementExplanation = doc.getElementsByClass("standard-margin").first().child(0);
            if (elementExplanation.text().toLowerCase().contains("explanation")) {
                problemExplanation = elementExplanation.nextElementSibling().text();
            }
            System.out.println("Explanation :" + problemExplanation);

            //retrieve timelimit (first space-separated token of the element's own text)
            Element elementTL = doc.getElementsByClass("problem-guidelines").first().child(0).child(1);
            StringTokenizer stTL = new StringTokenizer(elementTL.ownText(), " ");
            problemTimeLimit = Double.parseDouble(stTL.nextToken());

            //empty fields are stored as null rather than as empty strings
            if (problemTitle.length() == 0) {
                problemTitle = null;
            }
            if (problemStatement.length() == 0) {
                problemStatement = null;
            }
            if (problemInput.length() == 0) {
                problemInput = null;
            }
            if (problemOutput.length() == 0) {
                problemOutput = null;
            }
            if (problemExplanation.length() == 0) {
                problemExplanation = null;
            }
            if (problemConstraints.length() == 0) {
                problemConstraints = null;
            }

            //set all retrieved information to problem class
            problem.setTitle(problemTitle);
            problem.setProblemUrl(problemUrl);
            problem.setProblemStatement(problemStatement);
            problem.setInputFormat(problemInput);
            problem.setOutputFormat(problemOutput);
            problem.setTimeLimit(problemTimeLimit);
            problem.setExplanation(problemExplanation);
            problem.setConstraints(problemConstraints);

            //set sample input output to problem class
            SampleInputOutput sampleInputOutput = new SampleInputOutput(problem, sampleInput, sampleOutput);
            problem.getSampleInputOutputs().add(sampleInputOutput);
            //set platform as hackerearth
            problem.setPlatform(Platform.HackerEarth);
            for (String strtag : tags) {
                problem.getTags().add(strtag);
            }

            //store in database
            Session session = null;
            Transaction transaction = null;
            try {
                //start session
                session = HibernateUtil.getSessionFactory().openSession();
                transaction = session.beginTransaction();

                //check if problem is already stored in database
                String hql = "FROM Problem p where p.problemUrl = :problem_url";
                Problem oldProblem = (Problem) session.createQuery(hql).setString("problem_url", problemUrl)
                        .uniqueResult();
                String task;

                //if problem is present in database
                if (oldProblem != null) {
                    //replace the old problem: reuse its id, delete it, then save the fresh copy
                    task = "updated";
                    problem.setId(oldProblem.getId());
                    session.delete(oldProblem);
                    session.flush();
                    session.save(problem);
                } else {
                    task = "saved";
                    session.save(problem);
                }

                transaction.commit();
                //log the info to console
                Logger.getLogger(HackerEarthCrawler.class.getName()).log(Level.INFO, "{0} {1}",
                        new Object[] { task, problem.getProblemUrl() });
            } catch (HibernateException dbError) {
                if (transaction != null) {
                    transaction.rollback();
                }
                //pass the exception itself so the stack trace ends up in the log
                Logger.getLogger(HackerEarthCrawler.class.getName()).log(Level.SEVERE,
                        "Cannot Insert/Update problem into databse: " + problemUrl, dbError);
            } finally {
                //close the session
                if (session != null) {
                    session.close();
                }
            }
        } catch (Exception ee) {
            //a page that does not match the expected layout is reported and skipped
            System.out.println(ee.toString());
        }
    }

    System.out.println("\n\n\n\ntutorial urls\n\n");
    try {

        for (String tutorialurl : tutorialset) {
            Response tutorialres = Jsoup.connect(tutorialurl).execute();
            Document doc = tutorialres.parse();

            Tutorial tutorial = new Tutorial();
            tutorial.setContent(doc.getElementsByClass("tutorial").first().text());

            //strip the trailing "/tutorial/" (10 characters); the tutorial name is the last remaining path segment
            String topicUrl = tutorialurl.substring(0, tutorialurl.length() - 10);
            StringTokenizer tutorialtok = new StringTokenizer(topicUrl, "/");

            String tempstr = "";
            while (tutorialtok.hasMoreTokens()) {
                tempstr = tutorialtok.nextToken();
            }

            Session session = null;
            Transaction transaction = null;
            try {
                //start session
                session = HibernateUtil.getSessionFactory().openSession();
                transaction = session.beginTransaction();

                //check if tutorial is already stored in database
                String hql = "FROM Tutorial p where p.name = :name";
                Tutorial oldTutorial = (Tutorial) session.createQuery(hql).setString("name", tempstr)
                        .uniqueResult();
                String task;

                //if tutorial is present in database
                if (oldTutorial != null) {
                    //replace the old tutorial, keeping its name
                    task = "updated";
                    tutorial.setName(oldTutorial.getName());
                    session.delete(oldTutorial);
                    session.flush();
                    session.save(tutorial);
                } else {
                    task = "saved";
                    tutorial.setName(tempstr);
                    session.save(tutorial);
                }

                transaction.commit();
                //log the info to console
                Logger.getLogger(HackerEarthCrawler.class.getName()).log(Level.INFO, "{0} {1}",
                        new Object[] { task, tutorial.getName() });
            } catch (HibernateException ee) {
                if (transaction != null) {
                    transaction.rollback();
                }
                Logger.getLogger(HackerEarthCrawler.class.getName()).log(Level.SEVERE,
                        "Cannot Insert/Update problem into databse: " + tempstr, ee);
            } finally {
                //close the session
                if (session != null) {
                    session.close();
                }
            }

        }
    } catch (Exception e) {
        //a fetch/parse failure ends the tutorial phase; remaining tutorials are not processed
        System.out.println(e.getMessage());
    }
}