org.apache.pdfbox.text.TextPosition.java Source code

Introduction

Here is the source code for org.apache.pdfbox.text.TextPosition.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.text;

import java.text.Normalizer;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.util.Matrix;

/**
 * This represents a string and a position on the screen of those characters.
 *
 * <p>Most of the state is fixed at construction time. Only the Unicode text and the array of
 * individual character widths change afterwards, and only when
 * {@link #mergeDiacritic(TextPosition)} folds an overlapping diacritic into this object.
 *
 * @author Ben Litchfield
 */
public final class TextPosition {
    private static final Log LOG = LogFactory.getLog(TextPosition.class);

    // Maps a spacing (non-combining) diacritic code point to its combining counterpart.
    private static final Map<Integer, String> DIACRITICS = createDiacritics();

    // Adds non-decomposing diacritics to the hash with their related combining character.
    // These are values that the Unicode spec claims are equivalent but that are not mapped by
    // the NFKC normalization form. Determined by going through the Combining Diacritical Marks
    // section of the Unicode spec and identifying which characters are not mapped to by the
    // normalization.
    private static Map<Integer, String> createDiacritics() {
        HashMap<Integer, String> map = new HashMap<Integer, String>();
        map.put(0x0060, "\u0300");
        map.put(0x02CB, "\u0300");
        map.put(0x0027, "\u0301");
        map.put(0x02B9, "\u0301");
        map.put(0x02CA, "\u0301");
        map.put(0x005e, "\u0302");
        map.put(0x02C6, "\u0302");
        map.put(0x007E, "\u0303");
        map.put(0x02C9, "\u0304");
        map.put(0x00B0, "\u030A");
        map.put(0x02BA, "\u030B");
        map.put(0x02C7, "\u030C");
        map.put(0x02C8, "\u030D");
        map.put(0x0022, "\u030E");
        map.put(0x02BB, "\u0312");
        map.put(0x02BC, "\u0313");
        map.put(0x0486, "\u0313");
        map.put(0x055A, "\u0313");
        map.put(0x02BD, "\u0314");
        map.put(0x0485, "\u0314");
        map.put(0x0559, "\u0314");
        map.put(0x02D4, "\u031D");
        map.put(0x02D5, "\u031E");
        map.put(0x02D6, "\u031F");
        map.put(0x02D7, "\u0320");
        map.put(0x02B2, "\u0321");
        map.put(0x02CC, "\u0329");
        map.put(0x02B7, "\u032B");
        map.put(0x02CD, "\u0331");
        map.put(0x005F, "\u0332");
        map.put(0x204E, "\u0359");
        return map;
    }

    // text matrix for the start of the text object, coordinates are in display units
    // and have not been adjusted
    private final Matrix textMatrix;

    // ending X and Y coordinates in display units
    private final float endX;
    private final float endY;

    private final float maxHeight; // maximum height of text, in display units
    private final int rotation; // 0, 90, 180, 270 degrees of page rotation
    private final float x; // page rotation adjusted x, computed once in the constructor
    private final float y; // page rotation adjusted y (0,0 is upper left), computed once
    private final float pageHeight;
    private final float pageWidth;

    private final float widthOfSpace; // width of a space, in display units

    private final int[] charCodes; // internal PDF character codes
    private final PDFont font;
    private final float fontSize;
    private final int fontSizePt;

    // mutable: both are replaced when a diacritic is merged in
    private float[] widths;
    private String unicode;

    /**
     * Constructor.
     *
     * @param pageRotation rotation of the page that the text is located in
     * @param pageWidth width of the page that the text is located in
     * @param pageHeight height of the page that the text is located in
     * @param textMatrix TextMatrix for start of text (in display units)
     * @param endX x coordinate of the end position
     * @param endY y coordinate of the end position
     * @param maxHeight Maximum height of text (in display units)
     * @param individualWidth The width of the given character/string. (in text units)
     * @param spaceWidth The width of the space character. (in display units)
     * @param unicode The string of Unicode characters to be displayed.
     * @param charCodes An array of the internal PDF character codes for the glyphs in this text.
     * @param font The current font for this text position.
     * @param fontSize The new font size.
     * @param fontSizeInPt The font size in pt units.
     */
    public TextPosition(int pageRotation, float pageWidth, float pageHeight, Matrix textMatrix, float endX,
            float endY, float maxHeight, float individualWidth, float spaceWidth, String unicode, int[] charCodes,
            PDFont font, float fontSize, int fontSizeInPt) {
        this.textMatrix = textMatrix;

        this.endX = endX;
        this.endY = endY;

        this.rotation = pageRotation;

        this.maxHeight = maxHeight;
        this.pageHeight = pageHeight;
        this.pageWidth = pageWidth;

        this.widths = new float[] { individualWidth };
        this.widthOfSpace = spaceWidth;
        this.unicode = unicode;
        this.charCodes = charCodes;
        this.font = font;
        this.fontSize = fontSize;
        this.fontSizePt = fontSizeInPt;

        // x and y read textMatrix, pageWidth and pageHeight, so they are derived last.
        // y is flipped so that 0,0 is the upper left; which page dimension is the "height"
        // depends on whether the page is rotated by a quarter turn.
        x = getXRot(pageRotation);
        if (pageRotation == 0 || pageRotation == 180) {
            y = this.pageHeight - getYLowerLeftRot(pageRotation);
        } else {
            y = this.pageWidth - getYLowerLeftRot(pageRotation);
        }
    }

    /**
     * Return the string of characters stored in this object.
     *
     * @return The string on the screen.
     */
    public String getUnicode() {
        return unicode;
    }

    /**
     * Return the internal PDF character codes of the glyphs in this text.
     *
     * @return an array of internal PDF character codes (the internal array, not a copy)
     */
    public int[] getCharacterCodes() {
        return charCodes;
    }

    /**
     * Return the text matrix stored in this object.
     *
     * @return The Matrix containing the starting text position
     */
    public Matrix getTextMatrix() {
        return textMatrix;
    }

    /**
     * Return the direction/orientation of the string in this object based on its text matrix.
     * If the matrix matches none of the four recognized orientations, 0 is returned.
     *
     * @return The direction of the text (0, 90, 180, or 270)
     */
    public float getDir() {
        float a = textMatrix.getScaleY();
        float b = textMatrix.getShearY();
        float c = textMatrix.getShearX();
        float d = textMatrix.getScaleX();

        // 12 0   left to right
        // 0 12
        if (a > 0 && Math.abs(b) < d && Math.abs(c) < a && d > 0) {
            return 0;
        }
        // -12 0   right to left (upside down)
        // 0 -12
        if (a < 0 && Math.abs(b) < Math.abs(d) && Math.abs(c) < Math.abs(a) && d < 0) {
            return 180;
        }
        // 0  12    up
        // -12 0
        if (Math.abs(a) < Math.abs(c) && b > 0 && c < 0 && Math.abs(d) < b) {
            return 90;
        }
        // 0  -12   down
        // 12 0
        if (Math.abs(a) < c && b < 0 && c > 0 && Math.abs(d) < Math.abs(b)) {
            return 270;
        }
        // unrecognized orientation: fall back to left to right
        return 0;
    }

    /**
     * Return the X starting coordinate of the text, adjusted by the given rotation amount.
     * The rotation adjusts where the 0,0 location is relative to the text.
     *
     * @param rotation Rotation to apply (0, 90, 180, or 270).  0 will perform no adjustments.
     * @return X coordinate, or 0 if the rotation is not one of the four supported values
     */
    private float getXRot(float rotation) {
        if (rotation == 0) {
            return textMatrix.getTranslateX();
        } else if (rotation == 90) {
            return textMatrix.getTranslateY();
        } else if (rotation == 180) {
            return pageWidth - textMatrix.getTranslateX();
        } else if (rotation == 270) {
            return pageHeight - textMatrix.getTranslateY();
        }
        return 0;
    }

    /**
     * This will get the page rotation adjusted x position of the character.
     * This is adjusted based on page rotation so that the upper left is 0,0.
     *
     * @return The x coordinate of the character.
     */
    public float getX() {
        return x;
    }

    /**
     * This will get the text direction adjusted x position of the character.
     * This is adjusted based on text direction so that the first character
     * in that direction is in the upper left at 0,0.
     *
     * @return The x coordinate of the text.
     */
    public float getXDirAdj() {
        return getXRot(getDir());
    }

    /**
     * This will get the y position of the character with 0,0 in lower left.
     * This will be adjusted by the given rotation.
     *
     * @param rotation Rotation to apply to text to adjust the 0,0 location (0,90,180,270)
     * @return The y coordinate of the text, or 0 if the rotation is not one of the four
     *         supported values
     */
    private float getYLowerLeftRot(float rotation) {
        if (rotation == 0) {
            return textMatrix.getTranslateY();
        } else if (rotation == 90) {
            return pageWidth - textMatrix.getTranslateX();
        } else if (rotation == 180) {
            return pageHeight - textMatrix.getTranslateY();
        } else if (rotation == 270) {
            return textMatrix.getTranslateX();
        }
        return 0;
    }

    /**
     * This will get the y position of the text, adjusted so that 0,0 is upper left and it is
     * adjusted based on the page rotation.
     *
     * @return The adjusted y coordinate of the character.
     */
    public float getY() {
        return y;
    }

    /**
     * This will get the y position of the text, adjusted so that 0,0 is upper left and it is
     * adjusted based on the text direction.
     *
     * @return The adjusted y coordinate of the character.
     */
    public float getYDirAdj() {
        float dir = getDir();
        // some PDFBox code assumes that the 0,0 point is in upper left, not lower left
        if (dir == 0 || dir == 180) {
            return pageHeight - getYLowerLeftRot(dir);
        } else {
            return pageWidth - getYLowerLeftRot(dir);
        }
    }

    /**
     * Get the length or width of the text, based on a given rotation.
     *
     * @param rotation Rotation that was used to determine coordinates (0,90,180,270)
     * @return Width of text in display units
     */
    private float getWidthRot(float rotation) {
        // for quarter-turn rotations the text advances along the y axis
        if (rotation == 90 || rotation == 270) {
            return Math.abs(endY - textMatrix.getTranslateY());
        } else {
            return Math.abs(endX - textMatrix.getTranslateX());
        }
    }

    /**
     * This will get the width of the string when page rotation adjusted coordinates are used.
     *
     * @return The width of the text in display units.
     */
    public float getWidth() {
        return getWidthRot(rotation);
    }

    /**
     * This will get the width of the string when text direction adjusted coordinates are used.
     *
     * @return The width of the text in display units.
     */
    public float getWidthDirAdj() {
        return getWidthRot(getDir());
    }

    /**
     * This will get the maximum height of all characters in this string.
     *
     * @return The maximum height of all characters in this string.
     */
    public float getHeight() {
        return maxHeight;
    }

    /**
     * This will get the maximum height of all characters in this string.
     *
     * @return The maximum height of all characters in this string.
     */
    public float getHeightDir() {
        // this is not really a rotation-dependent calculation, but this is defined for symmetry
        return maxHeight;
    }

    /**
     * This will get the font size that this object is supposed to be drawn at.
     *
     * @return The font size.
     */
    public float getFontSize() {
        return fontSize;
    }

    /**
     * This will get the font size in pt. To get this size we have to multiply the pdf-fontsize
     * and the scaling from the textmatrix
     *
     * @return The font size in pt.
     */
    public float getFontSizeInPt() {
        return fontSizePt;
    }

    /**
     * This will get the font for the text being drawn.
     *
     * @return The font.
     */
    public PDFont getFont() {
        return font;
    }

    /**
     * This will get the width of a space character. This is useful for some algorithms such as the
     * text stripper, that need to know the width of a space character.
     *
     * @return The width of a space character.
     */
    public float getWidthOfSpace() {
        return widthOfSpace;
    }

    /**
     * @return Returns the xScale.
     */
    public float getXScale() {
        return textMatrix.getScalingFactorX();
    }

    /**
     * @return Returns the yScale.
     */
    public float getYScale() {
        return textMatrix.getScalingFactorY();
    }

    /**
     * Get the widths of each individual character.
     *
     * @return An array that is the same length as the length of the string (the internal
     *         array, not a copy).
     */
    public float[] getIndividualWidths() {
        return widths;
    }

    /**
     * Determine if this TextPosition logically contains another (i.e. they overlap and should be
     * rendered on top of each other).
     *
     * @param tp2 The other TextPosition to compare against
     * @return True if tp2 is contained in the bounding box of this text.
     */
    public boolean contains(TextPosition tp2) {
        // Each *DirAdj getter recomputes the text direction, so read every value once.
        // The sums are deliberately done in float before widening to double.
        float thisX = getXDirAdj();
        float thisWidth = getWidthDirAdj();
        double thisXstart = thisX;
        double thisXend = thisX + thisWidth;

        float otherX = tp2.getXDirAdj();
        double tp2Xstart = otherX;
        double tp2Xend = otherX + tp2.getWidthDirAdj();

        // no X overlap at all so return as soon as possible
        if (tp2Xend <= thisXstart || tp2Xstart >= thisXend) {
            return false;
        }

        // no Y overlap at all so return as soon as possible. Note: 0.0 is in the upper left and
        // y-coordinate is top of TextPosition
        float thisY = getYDirAdj();
        float otherY = tp2.getYDirAdj();
        if (otherY + tp2.getHeightDir() < thisY || otherY > thisY + getHeightDir()) {
            return false;
        }

        // we're going to calculate the percentage of overlap, if it's less than a 15% x-coordinate
        // overlap then we'll return false because it's negligible, .15 was determined by trial and
        // error in the regression test files
        if (tp2Xstart > thisXstart && tp2Xend > thisXend) {
            // tp2 sticks out to the right of this text
            double overlap = thisXend - tp2Xstart;
            double overlapPercent = overlap / thisWidth;
            return overlapPercent > .15;
        }
        if (tp2Xstart < thisXstart && tp2Xend < thisXend) {
            // tp2 sticks out to the left of this text
            double overlap = tp2Xend - thisXstart;
            double overlapPercent = overlap / thisWidth;
            return overlapPercent > .15;
        }
        return true;
    }

    /**
     * Merge a single character TextPosition into the current object. This is to be used only for
     * cases where we have a diacritic that overlaps an existing TextPosition. In a graphical
     * display, we could overlay them, but for text extraction we need to merge them. Use the
     * contains() method to test if two objects overlap.
     *
     * <p>Nothing is merged unless the diacritic's text is exactly one {@code char} long.
     *
     * @param diacritic TextPosition to merge into the current TextPosition.
     */
    public void mergeDiacritic(TextPosition diacritic) {
        // An empty string must be rejected too: combineDiacritic() reads the first code point
        // and would otherwise fail with a StringIndexOutOfBoundsException.
        if (diacritic.getUnicode().length() != 1) {
            return;
        }

        float diacXStart = diacritic.getXDirAdj();
        float diacXEnd = diacXStart + diacritic.widths[0];

        float currCharXStart = getXDirAdj();

        int strLen = unicode.length();
        boolean wasAdded = false;

        for (int i = 0; i < strLen && !wasAdded; i++) {
            // more characters than widths: the text is a ligature with a single width entry
            if (i >= widths.length) {
                LOG.info("diacritic " + diacritic.getUnicode() + " on ligature " + unicode
                        + " is not supported yet and is ignored (PDFBOX-2831)");
                break;
            }
            float currCharXEnd = currCharXStart + widths[i];

            // this is the case where there is an overlap of the diacritic character with the
            // current character and the previous character. If no previous character, just append
            // the diacritic after the current one
            if (diacXStart < currCharXStart && diacXEnd <= currCharXEnd) {
                if (i == 0) {
                    insertDiacritic(i, diacritic);
                } else {
                    // attach the diacritic to whichever neighbour it covers the larger share of
                    float distanceOverlapping1 = diacXEnd - currCharXStart;
                    float percentage1 = distanceOverlapping1 / widths[i];

                    float distanceOverlapping2 = currCharXStart - diacXStart;
                    float percentage2 = distanceOverlapping2 / widths[i - 1];

                    if (percentage1 >= percentage2) {
                        insertDiacritic(i, diacritic);
                    } else {
                        insertDiacritic(i - 1, diacritic);
                    }
                }
                wasAdded = true;
            }
            // diacritic completely covers this character and therefore we assume that this is the
            // character the diacritic belongs to
            else if (diacXStart < currCharXStart && diacXEnd > currCharXEnd) {
                insertDiacritic(i, diacritic);
                wasAdded = true;
            }
            // otherwise, the diacritic modifies this character because it's completely
            // contained by the character width
            else if (diacXStart >= currCharXStart && diacXEnd <= currCharXEnd) {
                insertDiacritic(i, diacritic);
                wasAdded = true;
            }
            // last character in the TextPosition so we add diacritic to the end
            else if (diacXStart >= currCharXStart && diacXEnd > currCharXEnd && i == strLen - 1) {
                insertDiacritic(i, diacritic);
                wasAdded = true;
            }

            // couldn't find anything useful so we go to the next character in the TextPosition
            currCharXStart += widths[i];
        }
    }

    /**
     * Inserts the diacritic TextPosition to the str of this TextPosition and updates the widths
     * array to include the extra character width.
     *
     * @param i index of the base character the diacritic is attached to
     * @param diacritic The diacritic TextPosition
     */
    private void insertDiacritic(int i, TextPosition diacritic) {
        StringBuilder sb = new StringBuilder();
        sb.append(unicode.substring(0, i));

        float[] widths2 = new float[widths.length + 1];
        System.arraycopy(widths, 0, widths2, 0, i);

        // Unicode combining diacritics always go after the base character, regardless of whether
        // the string is in presentation order or logical order
        sb.append(unicode.charAt(i));
        widths2[i] = widths[i];
        sb.append(combineDiacritic(diacritic.getUnicode()));
        // the combining mark takes no horizontal space of its own
        widths2[i + 1] = 0;

        // get the rest of the string
        sb.append(unicode.substring(i + 1, unicode.length()));
        System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1);

        unicode = sb.toString();
        widths = widths2;
    }

    /**
     * Combine the diacritic, for example, convert non-combining diacritic characters to their
     * combining counterparts.
     *
     * @param str String to normalize; must not be empty
     * @return Normalized string
     */
    private String combineDiacritic(String str) {
        // Unicode contains special combining forms of the diacritic characters which we want to use
        int codePoint = str.codePointAt(0);

        // convert the characters not defined in the Unicode spec; the table holds no null
        // values, so a null result means "no entry"
        String combining = DIACRITICS.get(codePoint);
        if (combining != null) {
            return combining;
        }
        return Normalizer.normalize(str, Normalizer.Form.NFKC).trim();
    }

    /**
     * @return True if the current character is a diacritic char.
     */
    public boolean isDiacritic() {
        String text = this.getUnicode();
        if (text.length() != 1) {
            return false;
        }
        if ("\u30FC".equals(text)) {
            // PDFBOX-3833: the Katakana-Hiragana prolonged sound mark is a modifier letter but
            // not a real diacritic; it is printed after the previous glyph. Treating it as a
            // diacritic causes trouble when it slightly overlaps the next glyph.
            return false;
        }
        int type = Character.getType(text.charAt(0));
        return type == Character.NON_SPACING_MARK || type == Character.MODIFIER_SYMBOL
                || type == Character.MODIFIER_LETTER;
    }

    /**
     * Show the string data for this text position.
     *
     * @return A human readable form of this object.
     */
    @Override
    public String toString() {
        return getUnicode();
    }
}