org.apache.pdfbox.text.BidiTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.pdfbox.text.BidiTest.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.pdfbox.text;

import static org.junit.Assert.fail;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
 * Text extraction test for {@link PDFTextStripper}, run against the
 * {@code BidiSample.pdf} fixture.
 *
 * <p>Each test extracts the text of the PDF into a file below the output
 * directory and compares it, line by line, with an expected-result file
 * stored next to the PDF.</p>
 */
public class BidiTest {
    /**
     * Logger instance.
     */
    private static final Log log = LogFactory.getLog(BidiTest.class);

    private static final File IN_DIR = new File("src/test/resources/org/apache/pdfbox/text/");
    private static final File outDir = new File("target/test-output");
    private static final String NAME_OF_PDF = "BidiSample.pdf";

    private static final String ENCODING = "UTF-8";

    /** Fixture document loaded in {@link #setUp()} and closed in {@link #tearDown()}. */
    private PDDocument document;
    /** Stripper shared by the test methods; configured in {@link #setUp()}. */
    private PDFTextStripper stripper;

    /**
     * Creates the output directory if needed, loads the sample PDF and
     * prepares a stripper that uses {@code "\n"} as line separator.
     *
     * @throws IOException if the output directory cannot be created or the
     * PDF cannot be loaded
     */
    @Before
    public void setUp() throws IOException {
        ensureDirectoryExists(outDir);

        document = PDDocument.load(new File(IN_DIR, NAME_OF_PDF));
        stripper = new PDFTextStripper();
        stripper.setLineSeparator("\n");
    }

    /**
     * Extracts the sample PDF with position sorting enabled and compares the
     * result with the {@code -sorted.txt} expectation.
     *
     * @throws IOException if reading or writing any of the files fails
     */
    @Test
    public void testSorted() throws IOException {
        File testFile = new File(IN_DIR, NAME_OF_PDF);
        doTestFile(testFile, outDir, false, true);
    }

    /**
     * Extracts the sample PDF without position sorting and compares the
     * result with the {@code .txt} expectation.
     *
     * @throws IOException if reading or writing any of the files fails
     */
    @Test
    public void testNotSorted() throws IOException {
        File testFile = new File(IN_DIR, NAME_OF_PDF);
        doTestFile(testFile, outDir, false, false);
    }

    /**
     * Closes the fixture document.
     *
     * @throws IOException if closing the document fails
     */
    @After
    public void tearDown() throws IOException {
        // setUp() may have failed before the document was loaded; do not
        // add a NullPointerException on top of the original failure.
        if (document != null) {
            document.close();
        }
    }

    /**
     * Validate text extraction on a single file.
     *
     * <p>The extracted text is written to {@code outDir} and compared with
     * the expected file located beside {@code inFile}. The test fails on the
     * first mismatching line or if the expected file is missing.</p>
     *
     * @param inFile The PDF file to validate
     * @param outDir The directory to store the output in
     * @param bLogResult Whether to log the extracted text
     * @param bSort Whether or not the extracted text is sorted
     * @throws IOException when the output directory cannot be created or
     * one of the files cannot be read or written
     */
    public void doTestFile(File inFile, File outDir, boolean bLogResult, boolean bSort) throws IOException {
        if (bSort) {
            log.info("Preparing to parse " + inFile.getName() + " for sorted test");
        } else {
            log.info("Preparing to parse " + inFile.getName() + " for standard test");
        }

        ensureDirectoryExists(outDir);

        // Output and expectation share the same file name; only the
        // directory differs.
        String resultName = inFile.getName() + (bSort ? "-sorted.txt" : ".txt");
        File outFile = new File(outDir, resultName);
        File expectedFile = new File(inFile.getParentFile(), resultName);

        // Loaded from inFile (not the fixture field) so that this method
        // works for whatever file the caller passes in.
        PDDocument testDocument = PDDocument.load(inFile);
        try {
            writeExtractedText(testDocument, outFile, bSort);

            if (bLogResult) {
                log.info("Text for " + inFile.getName() + ":");
                log.info(stripper.getText(testDocument));
            }

            if (!expectedFile.exists()) {
                // fail() throws, so nothing after it is executed
                fail("FAILURE: Input verification file: " + expectedFile.getAbsolutePath() + " did not exist");
            }

            compareWithExpected(inFile, expectedFile, outFile, bSort);
        } finally {
            testDocument.close();
        }
    }

    /**
     * Creates the given directory (including parents) if it does not exist.
     *
     * @param dir the directory that must exist afterwards
     * @throws IOException if the directory could not be created
     */
    private static void ensureDirectoryExists(File dir) throws IOException {
        if (!dir.exists() && !dir.mkdirs()) {
            throw new IOException("Error creating " + dir.getAbsolutePath() + " directory");
        }
    }

    /**
     * Writes the text of the document to {@code outFile} using the shared
     * stripper.
     *
     * @param doc the document to extract text from
     * @param outFile the file receiving the extracted text
     * @param bSort value passed to {@code setSortByPosition} on the stripper
     * @throws IOException if extraction or writing fails
     */
    private void writeExtractedText(PDDocument doc, File outFile, boolean bSort) throws IOException {
        OutputStream os = new FileOutputStream(outFile);
        try {
            Writer writer = new OutputStreamWriter(os, ENCODING);
            try {
                //Allows for sorted tests
                stripper.setSortByPosition(bSort);
                stripper.writeText(doc, writer);
            } finally {
                // close the written file before reading it again
                writer.close();
            }
        } finally {
            os.close();
        }
    }

    /**
     * Compares the non-blank lines of the expected and the actual file and
     * fails the test on the first pair that {@link #stringsEqual} rejects.
     * Both readers are closed even when the comparison fails.
     *
     * @param inFile the PDF under test, used for the failure message only
     * @param expectedFile file holding the expected text
     * @param outFile file holding the extracted text
     * @param bSort sort flag, used for the failure message only
     * @throws IOException if one of the files cannot be read
     */
    private void compareWithExpected(File inFile, File expectedFile, File outFile, boolean bSort)
            throws IOException {
        LineNumberReader expectedReader = new LineNumberReader(
                new InputStreamReader(new FileInputStream(expectedFile), ENCODING));
        try {
            LineNumberReader actualReader = new LineNumberReader(
                    new InputStreamReader(new FileInputStream(outFile), ENCODING));
            try {
                while (true) {
                    String expectedLine = readNonBlankLine(expectedReader);
                    String actualLine = readNonBlankLine(actualReader);
                    if (!stringsEqual(expectedLine, actualLine)) {
                        fail("FAILURE: Line mismatch for file " + inFile.getName() + " (sort = " + bSort + ")"
                                + " at expected line: " + expectedReader.getLineNumber() + " at actual line: "
                                + actualReader.getLineNumber() + "\nexpected line was: \"" + expectedLine + "\""
                                + "\nactual line was:   \"" + actualLine + "\"" + "\n");
                    }

                    // null means end of file; a one-sided end was already
                    // reported as a mismatch above.
                    if (expectedLine == null || actualLine == null) {
                        break;
                    }
                }
            } finally {
                actualReader.close();
            }
        } finally {
            expectedReader.close();
        }
    }

    /**
     * Reads the next line that contains something other than whitespace.
     *
     * @param reader the reader to read from
     * @return the next non-blank line, or <code>null</code> at end of input
     * @throws IOException if reading fails
     */
    private static String readNonBlankLine(LineNumberReader reader) throws IOException {
        String line = reader.readLine();
        while (line != null && line.trim().length() == 0) {
            line = reader.readLine();
        }
        return line;
    }

    /**
     * Determine whether two strings are equal, where two null strings are
     * considered equal.
     *
     * <p>Both strings are trimmed before comparison, and runs of characters
     * matched by {@link #skipWhitespace} are collapsed on each side, so only
     * the first character of such a run is compared. If exactly one string
     * is null, the strings are equal only if the other one is blank.</p>
     *
     * @param expected Expected string
     * @param actual Actual String
     * @return <code>true</code> if the strings are both null,
     * or if their contents are the same, otherwise <code>false</code>.
     */
    private boolean stringsEqual(String expected, String actual) {
        boolean equals = true;
        if ((expected == null) && (actual == null)) {
            return true;
        } else if (expected != null && actual != null) {
            expected = expected.trim();
            actual = actual.trim();
            char[] expectedArray = expected.toCharArray();
            char[] actualArray = actual.toCharArray();
            int expectedIndex = 0;
            int actualIndex = 0;
            while (expectedIndex < expectedArray.length && actualIndex < actualArray.length) {
                if (expectedArray[expectedIndex] != actualArray[actualIndex]) {
                    equals = false;
                    log.warn("Lines differ at index" + " expected:" + expectedIndex + "-"
                            + (int) expectedArray[expectedIndex] + " actual:" + actualIndex + "-"
                            + (int) actualArray[actualIndex]);
                    break;
                }
                // each side skips its own run, so runs of different length
                // still compare as equal
                expectedIndex = skipWhitespace(expectedArray, expectedIndex);
                actualIndex = skipWhitespace(actualArray, actualIndex);
                expectedIndex++;
                actualIndex++;
            }
            if (equals) {
                // one side ran out of characters before the other
                if (expectedIndex != expectedArray.length) {
                    equals = false;
                    log.warn("Expected line is longer at:" + expectedIndex);
                }
                if (actualIndex != actualArray.length) {
                    equals = false;
                    log.warn("Actual line is longer at:" + actualIndex);
                }
            }
        } else {
            // exactly one of the two is null here; treat a blank string as
            // equal to a missing one
            String present = (expected != null) ? expected : actual;
            equals = present.trim().isEmpty();
        }
        return equals;
    }

    /**
     * If the character at the current index is a space, or has a char value
     * above 256, then skip any subsequent characters of that kind.
     *
     * <p>NOTE(review): treating every char above 256 like a space means that
     * only the first character of a run of such characters is actually
     * compared by {@link #stringsEqual} - confirm that this leniency is
     * intended for this test.</p>
     *
     * @param array the characters of the line, must not be empty
     * @param index a valid index into {@code array}
     * @return the index of the last character of the run starting at
     * {@code index}, or {@code index} itself if no run starts there
     */
    private int skipWhitespace(char[] array, int index) {
        //if we are at a space character then skip all space
        //characters, but when all done rollback 1 because stringsEqual
        //will roll forward 1
        if (array[index] == ' ' || array[index] > 256) {
            while (index < array.length && (array[index] == ' ' || array[index] > 256)) {
                index++;
            }
            index--;
        }
        return index;
    }

}