Java tutorial
/*
 * Copyright 2014 Joel Kuiper
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package vortext;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.Writer;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDGamma;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;

/**
 * Adds text-markup annotations (highlights) to a PDF for every occurrence of
 * a {@link Pattern} or literal string.
 *
 * <p>
 * The class extends {@link PDFTextStripper} and overrides the methods that
 * normally write to the output so that they append to an internal
 * {@code TextAggregate} instead. The aggregate remembers, for every character
 * of the extracted text, the {@link TextPosition} it came from. Matches found
 * in the text can therefore be mapped back to bounding boxes (and quads) that
 * are used to build the annotations.
 *
 * <p>
 * Typical usage: configure the instance, call {@link #initialize(PDDocument)}
 * once, then call one of the {@code highlight} methods. See
 * {@link #main(String[])} for an example.
 *
 * @author Joel Kuiper <me@joelkuiper.eu>
 */
public class TextHighlight extends PDFTextStripper {

  /** Maximum baseline difference for two glyphs to count as the same line. */
  private float verticalTolerance = 0;

  /** Factor applied to the glyph height when computing box heights. */
  private float heightModifier = (float) 1.125;

  /**
   * Internal value holder: a matched string together with one
   * {@link TextPosition} per matched character ({@code null} entries stand
   * for separator characters that have no glyph).
   */
  private static final class Match {
    public final String str;
    public final List<TextPosition> positions;

    public Match(final String str, final List<TextPosition> positions) {
      this.str = str;
      this.positions = positions;
    }
  }

  /**
   * Internal utility class that keeps a mapping from the text contents to
   * their TextPositions. This is needed to compute bounding boxes. The data is
   * stored on a per-page basis (keyed on the 1-based page number).
   *
   * <p>
   * Invariant: for every page, the text and the position list have the same
   * length, so a character index in the text is also a valid index in the
   * position list.
   */
  private class TextAggregate {
    // TreeMap so that getText() concatenates pages in page order.
    private final TreeMap<Integer, StringBuilder> texts = new TreeMap<Integer, StringBuilder>();
    private final TreeMap<Integer, ArrayList<TextPosition>> positions =
        new TreeMap<Integer, ArrayList<TextPosition>>();

    private final boolean skipAllWhitespace;
    private final boolean normalizeText;

    public TextAggregate(final boolean skipAllWhitespace, final boolean normalizeText) {
      this.skipAllWhitespace = skipAllWhitespace;
      this.normalizeText = normalizeText;
    }

    /** Returns the text buffer of the page, creating it on first use. */
    private StringBuilder obtainStringBuilder(final Integer pageNo) {
      StringBuilder sb = texts.get(pageNo);
      if (sb == null) {
        sb = new StringBuilder();
        texts.put(pageNo, sb);
      }
      return sb;
    }

    /** Returns the position list of the page, creating it on first use. */
    private ArrayList<TextPosition> obtainTextPositions(final Integer pageNo) {
      ArrayList<TextPosition> textPositions = positions.get(pageNo);
      if (textPositions == null) {
        textPositions = new ArrayList<TextPosition>();
        positions.put(pageNo, textPositions);
      }
      return textPositions;
    }

    /**
     * Returns the text collected for a page, or the empty string when nothing
     * was collected. Does not create an entry for unknown pages.
     */
    public String getText(final Integer pageNo) {
      final StringBuilder sb = texts.get(pageNo);
      return sb == null ? "" : sb.toString();
    }

    /** Returns the text of all pages concatenated in page order. */
    public String getText() {
      final StringBuilder text = new StringBuilder();
      for (final Entry<Integer, StringBuilder> entry : texts.entrySet()) {
        text.append(entry.getValue());
      }
      return text.toString();
    }

    /**
     * Returns the positions collected for a page, or an empty list when
     * nothing was collected. Does not create an entry for unknown pages.
     */
    public List<TextPosition> getTextPositions(final Integer pageNo) {
      final List<TextPosition> pagePositions = positions.get(pageNo);
      return pagePositions == null ? new ArrayList<TextPosition>() : pagePositions;
    }

    /**
     * Appends a string to the current page (as reported by
     * {@code getCurrentPageNo()}) and records {@code pos} once per appended
     * character, so text indices and position indices stay aligned even when
     * normalization expands one glyph into several characters.
     *
     * @param str
     *          the text to append; {@code null} is ignored
     * @param pos
     *          the position the text came from, or {@code null} for
     *          separators
     */
    public void append(String str, final TextPosition pos) {
      if (str == null) {
        // A separator or glyph without text contributes nothing.
        return;
      }
      final int currentPage = getCurrentPageNo();
      final ArrayList<TextPosition> pagePositions = obtainTextPositions(currentPage);
      final StringBuilder sb = obtainStringBuilder(currentPage);

      if (normalizeText) {
        str = Normalizer.normalize(str, Normalizer.Form.NFKD);
      }

      for (int i = 0; i < str.length(); i++) {
        final char nextChar = str.charAt(i);
        if (this.skipAllWhitespace && Character.isWhitespace(nextChar)) {
          continue;
        }
        sb.append(nextChar);
        pagePositions.add(pos);
      }
    }

    /**
     * Given a page and a pattern it will return a list of matches for that
     * pattern. A Match is a tuple of
     * {@code <String, List<TextPosition>>}.
     *
     * @param pageNo
     *          1-based page number
     * @param pattern
     *          the pattern to search for in the page text
     * @return list of matches, empty when there are none
     */
    public List<Match> match(final Integer pageNo, final Pattern pattern) {
      final Matcher matcher = pattern.matcher(this.getText(pageNo));
      final List<TextPosition> pagePositions = getTextPositions(pageNo);
      final List<Match> matches = new ArrayList<Match>();

      while (matcher.find()) {
        // Copy instead of keeping a subList view, which would be invalidated
        // by any later structural change of the backing list.
        final List<TextPosition> elements =
            new ArrayList<TextPosition>(pagePositions.subList(matcher.start(), matcher.end()));
        matches.add(new Match(matcher.group(), elements));
      }
      return matches;
    }
  }

  private TextAggregate textAggregate;

  private PDGamma defaultColor;

  // Whether to skip all the whitespace when extracting the text
  private boolean skipAllWhitespace = false;

  // Whether to apply Unicode NFKD normalization to the extracted text and to
  // literal search strings; more robust but less accurate
  private boolean normalizeText = false;

  /**
   * Creates a new highlighter. The encoding is passed straight to the
   * {@link PDFTextStripper} constructor.
   *
   * @param encoding
   *          The encoding handed to the superclass.
   * @throws IOException
   *           If the superclass constructor fails.
   */
  public TextHighlight(final String encoding) throws IOException {
    super(encoding);
  }

  /**
   * Computes a series of bounding boxes (PDRectangle) from a list of
   * TextPositions. A new bounding box is started whenever the vertical
   * distance between a glyph and the first glyph of the current box exceeds
   * the vertical tolerance. {@code null} entries are skipped.
   *
   * @param positions
   *          the text positions, in reading order
   * @return one rectangle per detected line; empty when {@code positions}
   *         contains no non-null entry
   */
  public List<PDRectangle> getTextBoundingBoxes(final List<TextPosition> positions) {
    final List<PDRectangle> boundingBoxes = new ArrayList<PDRectangle>();

    float lowerLeftX = 0;
    float lowerLeftY = 0;
    float upperRightX = 0;
    float upperRightY = 0;
    // Explicit flag instead of a magic coordinate value, so that a box is
    // emitted exactly when at least one glyph contributed to it.
    boolean boxOpen = false;

    for (final TextPosition position : positions) {
      if (position == null) {
        continue;
      }

      final Matrix textPos = position.getTextPos();
      final float x = textPos.getXPosition();
      final float y = textPos.getYPosition();
      final float height = position.getHeight() * getHeightModifier();

      if (boxOpen && Math.abs(y - lowerLeftY) <= getVerticalTolerance()) {
        // Still on the same line: extend to the right edge of this glyph and
        // keep the tallest top edge seen so far, so that a smaller trailing
        // glyph cannot shrink the box.
        upperRightX = x + position.getWidth();
        upperRightY = Math.max(upperRightY, y + height);
        continue;
      }

      if (boxOpen) {
        // A new line starts: close the previous box.
        boundingBoxes.add(boundingBox(lowerLeftX, lowerLeftY, upperRightX, upperRightY));
      }
      lowerLeftX = x;
      upperRightX = x + position.getWidth();
      lowerLeftY = y;
      upperRightY = y + height;
      boxOpen = true;
    }

    if (boxOpen) {
      boundingBoxes.add(boundingBox(lowerLeftX, lowerLeftY, upperRightX, upperRightY));
    }
    return boundingBoxes;
  }

  /** Builds a PDRectangle from its lower-left and upper-right corners. */
  private PDRectangle boundingBox(final float lowerLeftX, final float lowerLeftY,
      final float upperRightX, final float upperRightY) {
    final PDRectangle boundingBox = new PDRectangle();
    boundingBox.setLowerLeftX(lowerLeftX);
    boundingBox.setLowerLeftY(lowerLeftY);
    boundingBox.setUpperRightX(upperRightX);
    boundingBox.setUpperRightY(upperRightY);
    return boundingBox;
  }

  /** Returns the smallest rectangle that contains all given rectangles. */
  private PDRectangle enclosingRectangle(final List<PDRectangle> rects) {
    final PDRectangle first = rects.get(0);
    float lowerLeftX = first.getLowerLeftX();
    float lowerLeftY = first.getLowerLeftY();
    float upperRightX = first.getUpperRightX();
    float upperRightY = first.getUpperRightY();
    for (final PDRectangle rect : rects) {
      lowerLeftX = Math.min(lowerLeftX, rect.getLowerLeftX());
      lowerLeftY = Math.min(lowerLeftY, rect.getLowerLeftY());
      upperRightX = Math.max(upperRightX, rect.getUpperRightX());
      upperRightY = Math.max(upperRightY, rect.getUpperRightY());
    }
    return boundingBox(lowerLeftX, lowerLeftY, upperRightX, upperRightY);
  }

  /** Removes every character for which Character.isWhitespace is true. */
  private static String stripWhitespace(final String s) {
    final StringBuilder sb = new StringBuilder(s.length());
    for (int i = 0; i < s.length(); i++) {
      final char c = s.charAt(i);
      if (!Character.isWhitespace(c)) {
        sb.append(c);
      }
    }
    return sb.toString();
  }

  /**
   * Highlights a literal string within the PDF with the default color. Returns
   * the list of added annotations for further modification. Note: it will
   * process every page, but cannot process patterns that span multiple pages.
   * Note: it will not work for top-bottom text (such as Chinese).
   *
   * <p>
   * The string is preprocessed the same way as the extracted text (NFKD
   * normalization and whitespace removal, when enabled) and then quoted, so
   * regex metacharacters in it are matched literally.
   *
   * @param pattern
   *          String that will be converted to a literal Regex pattern
   * @return the annotations that were added
   * @throws IOException
   *           If reading the page annotations fails.
   */
  public List<PDAnnotationTextMarkup> highlightDefault(String pattern) throws IOException {
    if (this.normalizeText) {
      pattern = Normalizer.normalize(pattern, Normalizer.Form.NFKD);
    }
    if (this.skipAllWhitespace) {
      // Use the same whitespace predicate as the text aggregation; the regex
      // class \s covers fewer characters than Character.isWhitespace.
      pattern = stripWhitespace(pattern);
    }

    final String p = Pattern.quote(pattern);
    return this.highlightDefault(Pattern.compile(p));
  }

  /**
   * Highlights a pattern within the PDF with the default color. Returns the
   * list of added annotations for further modification. Note: it will process
   * every page, but cannot process patterns that span multiple pages. Note: it
   * will not work for top-bottom text (such as Chinese).
   *
   * @param pattern
   *          Pattern (regex)
   * @return the annotations that were added
   * @throws IOException
   *           If reading the page annotations fails.
   */
  public List<PDAnnotationTextMarkup> highlightDefault(final Pattern pattern)
      throws IOException {
    final List<PDAnnotationTextMarkup> highlights =
        this.highlight(pattern, PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
    for (final PDAnnotationTextMarkup highlight : highlights) {
      highlight.setConstantOpacity((float) 0.8);
      highlight.setColour(getDefaultColor());
      highlight.setPrinted(true);
    }
    return highlights;
  }

  /**
   * Compiles {@code pattern} as a regular expression and delegates to
   * {@link #highlight(Pattern, String)}.
   *
   * @param pattern
   *          regular expression source
   * @param subType
   *          the text-markup annotation subtype
   * @return the annotations that were added
   * @throws IOException
   *           If reading the page annotations fails.
   */
  public List<PDAnnotationTextMarkup> highlight(final String pattern, final String subType)
      throws IOException {
    return this.highlight(Pattern.compile(pattern), subType);
  }

  /**
   * Adds one text-markup annotation of the given subtype for every match of
   * the pattern on the pages between the start page and the end page.
   * {@link #initialize(PDDocument)} must have been called first.
   *
   * @param pattern
   *          the pattern to search for, page by page
   * @param subType
   *          the text-markup annotation subtype
   * @return the annotations that were added, in page order
   * @throws IOException
   *           If reading the page annotations fails.
   * @throws IllegalArgumentException
   *           If the instance has not been initialized.
   */
  @SuppressWarnings("unchecked")
  public List<PDAnnotationTextMarkup> highlight(final Pattern pattern, final String subType)
      throws IOException {
    if (textAggregate == null || document == null) {
      throw new IllegalArgumentException("TextAggregate was not initilized");
    }

    final List<PDPage> pages = document.getDocumentCatalog().getAllPages();
    final ArrayList<PDAnnotationTextMarkup> newAnnotations =
        new ArrayList<PDAnnotationTextMarkup>();

    for (int pageIndex = getStartPage() - 1; pageIndex < getEndPage()
        && pageIndex < pages.size(); pageIndex++) {
      final PDPage page = pages.get(pageIndex);
      final List<PDAnnotation> annotations = page.getAnnotations();

      // The aggregate is keyed on 1-based page numbers.
      final List<Match> matches = textAggregate.match(pageIndex + 1, pattern);

      for (final Match match : matches) {
        final List<PDRectangle> textBoundingBoxes = getTextBoundingBoxes(match.positions);
        if (textBoundingBoxes.isEmpty()) {
          continue;
        }

        final PDAnnotationTextMarkup annotation = new PDAnnotationTextMarkup(subType);
        // The annotation rectangle has to enclose every quad, otherwise the
        // lines after the first one lie outside the annotation for matches
        // that wrap across lines.
        annotation.setRectangle(enclosingRectangle(textBoundingBoxes));
        annotation.setQuadPoints(this.getQuads(textBoundingBoxes));
        annotation.setContents(match.str);

        annotations.add(annotation);
        newAnnotations.add(annotation);
      }
    }
    return newAnnotations;
  }

  /**
   * Computes a float array of size 8 * length(rects) with all the vertices of
   * the consecutive PDRectangles.
   *
   * @param rects
   *          the rectangles to convert
   * @return the concatenated quads, eight floats per rectangle
   */
  public float[] getQuads(final List<PDRectangle> rects) {
    final float[] quads = new float[8 * rects.size()];
    int cursor = 0;
    for (final PDRectangle rect : rects) {
      final float[] tmp = this.getQuads(rect);
      System.arraycopy(tmp, 0, quads, cursor, tmp.length);
      cursor += 8;
    }
    return quads;
  }

  /**
   * Computes a float array of size eight with all the vertices of the
   * PDRectangle, in the order top left, top right, bottom left, bottom right.
   *
   * @param rect
   *          the rectangle to convert
   * @return eight floats: x1 y1 x2 y2 x3 y3 x4 y4
   */
  public float[] getQuads(final PDRectangle rect) {
    final float[] quads = new float[8];
    // top left
    quads[0] = rect.getLowerLeftX(); // x1
    quads[1] = rect.getUpperRightY(); // y1
    // top right
    quads[2] = rect.getUpperRightX(); // x2
    quads[3] = quads[1]; // y2
    // bottom left
    quads[4] = quads[0]; // x3
    quads[5] = rect.getLowerLeftY(); // y3
    // bottom right
    quads[6] = quads[2]; // x4
    quads[7] = quads[5]; // y4
    return quads;
  }

  /**
   * Sets the color used by the {@code highlightDefault} methods.
   *
   * @param color
   *          the color, or {@code null} to fall back to the built-in yellow
   */
  public void setDefaultColor(final PDGamma color) {
    defaultColor = color;
  }

  /**
   * Returns the color used by the {@code highlightDefault} methods: the
   * configured color, or a built-in yellow when none was set.
   */
  public PDGamma getDefaultColor() {
    if (defaultColor != null) {
      return defaultColor;
    }
    // Yellow; with 8 bits per channel these components round to #fbe863.
    final PDGamma c = new PDGamma();
    c.setR((float) 0.9843);
    c.setG((float) 0.9098);
    c.setB((float) 0.3879);
    return c;
  }

  /**
   * The vertical tolerance determines whether a character is still on the same
   * line.
   */
  public float getVerticalTolerance() {
    return verticalTolerance;
  }

  /**
   * See {@link #getVerticalTolerance()}.
   */
  public void setVerticalTolerance(final float tolerance) {
    verticalTolerance = tolerance;
  }

  /**
   * The height modifier is applied to the font height, it allows the
   * annotations to be changed by a certain factor.
   */
  public float getHeightModifier() {
    return heightModifier;
  }

  /**
   * See {@link #getHeightModifier()}.
   */
  public void setHeightModifier(final float heightModifier) {
    this.heightModifier = heightModifier;
  }

  /*
   * The following methods are overwritten from the PDFTextStripper
   */

  /**
   * Processes all pages of the document and fills the internal text cache.
   * Must be called before any of the {@code highlight} methods. The
   * whitespace and normalization settings are read at this point, so they
   * have to be configured beforehand.
   *
   * @param pdf
   *          the document to process
   * @throws IOException
   *           If processing the document fails.
   * @throws IllegalArgumentException
   *           If the document is encrypted and cannot be decrypted with the
   *           empty password.
   */
  @SuppressWarnings("unchecked")
  public void initialize(final PDDocument pdf) throws IOException {
    resetEngine();
    document = pdf;

    if (document.isEncrypted()) {
      // We are expecting non-encrypted documents here, but it is common
      // for users to pass in a document that is encrypted with an empty
      // password (such a document appears to not be encrypted by
      // someone viewing the document, thus the confusion). We will
      // attempt to decrypt with the empty password to handle this case.
      //
      try {
        document.decrypt("");
      } catch (CryptographyException e) {
        throw new IllegalArgumentException("Error decrypting document, details: ", e);
      }
    }

    textAggregate = new TextAggregate(this.isSkipAllWhitespace(), this.isNormalizeText());

    if (getAddMoreFormatting()) {
      setParagraphEnd(getLineSeparator());
      setPageStart(getLineSeparator());
      setArticleStart(getLineSeparator());
      setArticleEnd(getLineSeparator());
    }
    startDocument(pdf);
    processPages(pdf.getDocumentCatalog().getAllPages());
    endDocument(pdf);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void resetEngine() {
    super.resetEngine();
    textAggregate = null;
  }

  /**
   * Start a new article, which is typically defined as a column on a single
   * page (also referred to as a bead). Appends the article-start string to the
   * text cache.
   *
   * @param isltr
   *          true if primary direction of text is left to right.
   * @throws IOException
   *           Declared by the overridden method.
   */
  @Override
  protected void startArticle(final boolean isltr) throws IOException {
    final String articleStart = getArticleStart();
    textAggregate.append(articleStart, null);
  }

  /**
   * End an article. Appends the article-end string to the text cache.
   *
   * @throws IOException
   *           Declared by the overridden method.
   */
  @Override
  protected void endArticle() throws IOException {
    final String articleEnd = getArticleEnd();
    textAggregate.append(articleEnd, null);
  }

  /**
   * Start a new page. This implementation does nothing.
   *
   * @param page
   *          The page we are about to process.
   * @throws IOException
   *           Declared by the overridden method.
   */
  @Override
  protected void startPage(final PDPage page) throws IOException {
    // nothing to do
  }

  /**
   * End a page. This implementation does nothing.
   *
   * @param page
   *          The page that was processed.
   * @throws IOException
   *           Declared by the overridden method.
   */
  @Override
  protected void endPage(final PDPage page) throws IOException {
    // nothing to do
  }

  /**
   * Write the page separator value to the text cache.
   */
  @Override
  protected void writePageSeperator() {
    final String pageSeparator = getPageSeparator();
    textAggregate.append(pageSeparator, null);
  }

  /**
   * Write the line separator value to the text cache.
   */
  @Override
  protected void writeLineSeparator() {
    final String lineSeparator = getLineSeparator();
    textAggregate.append(lineSeparator, null);
  }

  /**
   * Write the word separator value to the text cache.
   */
  @Override
  protected void writeWordSeparator() {
    final String wordSeparator = getWordSeparator();
    textAggregate.append(wordSeparator, null);
  }

  /**
   * Write the string in TextPosition to the text cache.
   *
   * @param text
   *          The text position whose character(s) are appended.
   */
  @Override
  protected void writeCharacters(final TextPosition text) {
    final String character = text.getCharacter();
    textAggregate.append(character, text);
  }

  /**
   * Write a string to the text cache. This implementation ignores
   * <code>text</code> and calls {@link #writeCharacters(TextPosition)} for
   * every element of <code>textPositions</code>.
   *
   * @param text
   *          The text to write (ignored).
   * @param textPositions
   *          The TextPositions belonging to the text.
   */
  @Override
  protected void writeString(final String text, final List<TextPosition> textPositions) {
    for (final TextPosition textPosition : textPositions) {
      writeCharacters(textPosition);
    }
  }

  /** Whether a paragraph start has been written without a matching end. */
  private boolean inParagraph;

  /**
   * Writes the paragraph separator to the text cache: a paragraph end
   * followed by a paragraph start.
   */
  @Override
  protected void writeParagraphSeparator() {
    writeParagraphEnd();
    writeParagraphStart();
  }

  /**
   * Write something (if defined) at the start of a paragraph. A paragraph
   * that is still open is ended first.
   */
  @Override
  protected void writeParagraphStart() {
    if (inParagraph) {
      writeParagraphEnd();
      inParagraph = false;
    }

    final String paragraphStart = getParagraphStart();
    textAggregate.append(paragraphStart, null);
    inParagraph = true;
  }

  /**
   * Write something (if defined) at the end of a paragraph.
   */
  @Override
  protected void writeParagraphEnd() {
    final String paragraphEnd = getParagraphEnd();
    textAggregate.append(paragraphEnd, null);
    inParagraph = false;
  }

  /**
   * Write something (if defined) at the start of a page.
   */
  @Override
  protected void writePageStart() {
    final String pageStart = getPageStart();
    textAggregate.append(pageStart, null);
  }

  /**
   * Write something (if defined) at the end of a page.
   */
  @Override
  protected void writePageEnd() {
    final String pageEnd = getPageEnd();
    textAggregate.append(pageEnd, null);
  }

  /**
   * Not supported; use {@link #initialize(PDDocument)} and {@link #getText()}.
   *
   * @throws IllegalArgumentException
   *           always
   */
  @Override
  public String getText(final PDDocument doc) throws IOException {
    throw new IllegalArgumentException("Not applicable for TextHighlight");
  }

  /**
   * Not supported.
   *
   * @throws IllegalArgumentException
   *           always
   */
  @Override
  @Deprecated
  public String getText(final COSDocument doc) throws IOException {
    throw new IllegalArgumentException("Not applicable for TextHighlight");
  }

  /**
   * Not supported.
   *
   * @throws IllegalArgumentException
   *           always
   */
  @Override
  @Deprecated
  public void writeText(final COSDocument doc, final Writer outputStream) throws IOException {
    throw new IllegalArgumentException("Not applicable for TextHighlight");
  }

  /**
   * Not supported.
   *
   * @throws IllegalArgumentException
   *           always
   */
  @Override
  public void writeText(final PDDocument doc, final Writer outputStream) throws IOException {
    throw new IllegalArgumentException("Not applicable for TextHighlight");
  }

  /**
   * Returns the text collected by {@link #initialize(PDDocument)} for all
   * pages, concatenated in page order.
   *
   * @throws IOException
   *           never thrown by this implementation; kept in the signature for
   *           existing callers
   */
  public String getText() throws IOException {
    return textAggregate.getText();
  }

  public boolean isSkipAllWhitespace() {
    return skipAllWhitespace;
  }

  /**
   * Sets whether whitespace is dropped from the extracted text. Takes effect
   * on the next call to {@link #initialize(PDDocument)}.
   */
  public void setSkipAllWhitespace(final boolean skipAllWhitespace) {
    this.skipAllWhitespace = skipAllWhitespace;
  }

  public boolean isNormalizeText() {
    return normalizeText;
  }

  /**
   * Sets whether the extracted text is NFKD-normalized. Takes effect on the
   * next call to {@link #initialize(PDDocument)}.
   */
  public void setNormalizeText(final boolean normalizeText) {
    this.normalizeText = normalizeText;
  }

  /**
   * Command line entry point: highlights every occurrence of a literal string
   * in a PDF and saves the result.
   *
   * @param args
   *          input PDF path, output PDF path, string to highlight
   * @throws Exception
   *           If parsing, highlighting or saving fails.
   */
  public static void main(final String[] args) throws Exception {
    if (args.length != 3) {
      usage();
      // Without the right number of arguments there is nothing to do.
      return;
    }

    final File file = new File(args[0]);
    if (!file.isFile()) {
      System.err.println("File " + args[0] + " does not exist.");
      return;
    }

    PDDocument pdDoc = null;
    final FileInputStream input = new FileInputStream(file);
    try {
      final PDFParser parser = new PDFParser(input);
      parser.parse();
      pdDoc = new PDDocument(parser.getDocument());

      final TextHighlight pdfHighlight = new TextHighlight("UTF-8");
      // depends on what you want to match, but this creates a long string
      // without newlines
      pdfHighlight.setSkipAllWhitespace(true);
      pdfHighlight.setNormalizeText(true);
      pdfHighlight.initialize(pdDoc);

      pdfHighlight.highlightDefault(args[2]);
      pdDoc.save(args[1]);
    } finally {
      // Release the document and the input stream even when processing or
      // saving failed; failures while closing are only reported.
      if (pdDoc != null) {
        try {
          pdDoc.close();
        } catch (final Exception e) {
          e.printStackTrace();
        }
      }
      try {
        input.close();
      } catch (final IOException e) {
        e.printStackTrace();
      }
    }
  }

  private static void usage() {
    System.err.println("Usage: <input-pdf> <output-pdf> <pattern>");
  }
}