org.sejda.impl.sambox.component.split.SplitByTextChangesOutputStrategy.java Source code

Introduction

Here is the source code for org.sejda.impl.sambox.component.split.SplitByTextChangesOutputStrategy.java
Source

/* 
 * This file is part of the Sejda source code
 * Created on 31/Jul/2015
 * Copyright 2015 by Edi Weissmann (edi.weissmann@gmail.com).
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as 
 * published by the Free Software Foundation, either version 3 of the 
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.sejda.impl.sambox.component.split;

import static org.apache.commons.lang3.StringUtils.defaultIfBlank;
import static org.apache.commons.lang3.StringUtils.isBlank;
import static org.apache.commons.lang3.StringUtils.isNotBlank;

import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.sejda.impl.sambox.component.PdfTextExtractorByArea;
import org.sejda.model.TopLeftRectangularBox;
import org.sejda.model.exception.TaskExecutionException;
import org.sejda.model.exception.TaskIOException;
import org.sejda.model.split.NextOutputStrategy;
import org.sejda.model.split.SplitPages;
import org.sejda.sambox.pdmodel.PDDocument;
import org.sejda.sambox.pdmodel.PDPage;
import org.slf4j.LoggerFactory;

/**
 * {@link NextOutputStrategy} that decides where to split a document by looking at the text found in a given
 * area of every page: a split point is recorded whenever that text differs from the last non blank text
 * found on a previous page.
 * <p>
 * The split points are computed once, at construction time, and then handed to a {@link SplitPages} delegate
 * that answers the {@link NextOutputStrategy} questions. The text detected on each page is kept and can be
 * retrieved through {@link #getTextByPage(int)}.
 */
public class SplitByTextChangesOutputStrategy implements NextOutputStrategy {

    private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(SplitByTextChangesOutputStrategy.class);

    private final SplitPages delegate;
    private final Collection<Integer> pages;
    // 1-based page number -> text detected in the area (after optional prefix/suffix removal, possibly empty)
    private final Map<Integer, String> textByPage = new HashMap<Integer, String>();

    /**
     * Scans every page of the document and computes the pages to split at.
     *
     * @param document
     *            the document to scan
     * @param area
     *            the page area the text is extracted from
     * @param startsWith
     *            optional prefix; when not blank, pages whose text does not start with it are treated as having
     *            no text, and the prefix is removed from the text of the pages that match
     * @param endsWith
     *            optional suffix; when not blank, pages whose text does not end with it are treated as having
     *            no text, and the suffix is removed from the text of the pages that match
     * @throws TaskIOException
     *             if the text extraction fails
     */
    public SplitByTextChangesOutputStrategy(PDDocument document, TopLeftRectangularBox area, String startsWith,
            String endsWith) throws TaskIOException {
        this.pages = findPageToSplitAt(document, area, startsWith, endsWith);
        this.delegate = new SplitPages(pages);
    }

    /**
     * Walks the document pages in order and collects the page numbers to split at. As a side effect the text
     * detected on every page is stored so that it can later be returned by {@link #getTextByPage(int)}.
     *
     * @param document
     *            the document to scan
     * @param area
     *            the page area the text is extracted from
     * @param startsWith
     *            optional prefix the area text has to start with
     * @param endsWith
     *            optional suffix the area text has to end with
     * @return the page numbers to split at; page X is returned when page X+1 should start a new document
     * @throws TaskIOException
     *             if the text extraction fails
     */
    Collection<Integer> findPageToSplitAt(PDDocument document, TopLeftRectangularBox area, String startsWith,
            String endsWith) throws TaskIOException {
        Collection<Integer> pagesToSplitAt = new HashSet<Integer>();
        // last non blank text seen so far, null until a page with some text is found
        String prevPageText = null;

        for (int pageNumber = 1; pageNumber <= document.getNumberOfPages(); pageNumber++) {
            PDPage page = document.getDocumentCatalog().getPages().get(pageNumber - 1);

            String pageText = org.sejda.core.support.util.StringUtils
                    .nbspAsWhitespace(extractTextFromPageArea(page, area)).trim();

            // checks if the page text matches any (optional) prefixes/suffixes specified
            if (isNotBlank(startsWith)) {
                if (!pageText.startsWith(startsWith)) {
                    LOG.debug("Detected page text does not match specified prefix: '{}' on '{}'", startsWith,
                            pageText);
                    pageText = "";
                } else {
                    pageText = pageText.substring(startsWith.length()).trim();
                }
            }
            if (isNotBlank(endsWith)) {
                if (!pageText.endsWith(endsWith)) {
                    // both placeholders need a value: the expected suffix first, then the text it was checked on
                    LOG.debug("Detected page text does not match specified suffix: '{}' on '{}'", endsWith,
                            pageText);
                    pageText = "";
                } else {
                    pageText = pageText.substring(0, pageText.length() - endsWith.length()).trim();
                }
            }

            boolean noChanges = (prevPageText == null || // no previous
                    isBlank(pageText) || // if there's no text in the area, include in the prev document
                    prevPageText.equals(pageText));

            if (!noChanges) {
                LOG.debug("Text changed from {} to {} on page {} in area: {}", prevPageText, pageText, pageNumber,
                        area);
                // decrementing with 1 because the splitter expects page X if X+1 should start a new document
                pagesToSplitAt.add(pageNumber - 1);
            }

            // pages without text never become the reference text the following pages are compared to
            if (isNotBlank(pageText)) {
                prevPageText = pageText;
            }

            textByPage.put(pageNumber, pageText);
        }

        return pagesToSplitAt;
    }

    /**
     * Extracts the text found in the given area of the page.
     *
     * @param page
     *            the page to read
     * @param area
     *            the area of the page the text is extracted from
     * @return the stripped text, or an empty string when the extracted text is blank. Never null.
     * @throws TaskIOException
     *             if the text extraction fails
     */
    String extractTextFromPageArea(PDPage page, TopLeftRectangularBox area) throws TaskIOException {
        String text = new PdfTextExtractorByArea().extractTextFromArea(page, area.asRectangle());
        String result = defaultIfBlank(text, "");
        result = StringUtils.strip(result);
        return result;
    }

    /**
     * @return the page numbers to split at, as computed at construction time
     */
    Collection<Integer> getPages() {
        return pages;
    }

    @Override
    public void ensureIsValid() throws TaskExecutionException {
        delegate.ensureIsValid();
    }

    @Override
    public boolean isOpening(Integer page) {
        return delegate.isOpening(page);
    }

    @Override
    public boolean isClosing(Integer page) {
        return delegate.isClosing(page);
    }

    /**
     * @param page
     *            the 1-based page number
     * @return the text detected in the area of the given page (empty if no text was found or it did not match
     *         the optional prefix/suffix), or null if the page was not scanned
     */
    public String getTextByPage(int page) {
        return textByPage.get(page);
    }
}