// Java tutorial
package com.infoimage.infotrac.pdfbox;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDGamma;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;

/**
 * Text stripper that records, for every character it extracts, the
 * {@link TextPosition} it came from, so that regex matches on the extracted
 * text can be turned into highlight annotations.
 *
 * <p>Usage: call {@link #initialize(PDDocument)} once to fill the text cache,
 * then call one of the {@code highlight} methods.
 */
public class PDFTextAnnotator extends PDFTextStripper {

  /**
   * Amount subtracted from both the top and bottom y coordinate of every quad.
   * NOTE(review): the reason for this downward shift is not documented in the
   * original code; the value is kept as-is.
   */
  private static final float QUAD_Y_OFFSET = 2f;

  /** Number of floats describing one quadrilateral (four x/y pairs). */
  private static final int FLOATS_PER_QUAD = 8;

  /** Maximum baseline (y) difference for two glyphs to count as the same line. */
  private float verticalTolerance = 0;

  /** Factor applied to a glyph's reported height when sizing a bounding box. */
  private float heightModifier = 2.25f;

  /** Cache filled by {@link #initialize(PDDocument)}; {@code null} until then. */
  private TextCache textCache;

  /** Colour used for new highlights; {@code null} selects the built-in default. */
  private PDGamma defaultColor;

  /** Tracks whether a paragraph start has been written without a matching end. */
  private boolean inParagraph;

  /** A single regex match: the matched text and one position entry per character. */
  private static final class Match {
    public final String str;
    public final List<TextPosition> positions;

    public Match(String str, List<TextPosition> positions) {
      this.str = str;
      this.positions = positions;
    }
  }

  /**
   * Internal class that keeps a mapping from the text contents to their
   * TextPositions. This is needed to compute bounding boxes. The data is
   * stored on a per-page basis (keyed on the 1-based pageNo).
   *
   * <p>For every page the text buffer and the position list always have the
   * same length: index {@code i} of the list is the position of character
   * {@code i} of the text ({@code null} for separators).
   */
  private class TextCache {
    private final Map<Integer, StringBuilder> texts = new HashMap<Integer, StringBuilder>();
    private final Map<Integer, List<TextPosition>> positions =
        new HashMap<Integer, List<TextPosition>>();

    /** Returns the text buffer of the page, creating it on first access. */
    public StringBuilder obtainStringBuilder(Integer pageNo) {
      StringBuilder sb = texts.get(pageNo);
      if (sb == null) {
        sb = new StringBuilder();
        texts.put(pageNo, sb);
      }
      return sb;
    }

    /** Returns the position list of the page, creating it on first access. */
    public List<TextPosition> obtainTextPositions(Integer pageNo) {
      List<TextPosition> textPositions = positions.get(pageNo);
      if (textPositions == null) {
        textPositions = new ArrayList<TextPosition>();
        positions.put(pageNo, textPositions);
      }
      return textPositions;
    }

    /** Returns the text collected so far for the page. */
    public String getText(Integer pageNo) {
      return obtainStringBuilder(pageNo).toString();
    }

    /**
     * Appends a string to the current page's text and records {@code pos}
     * once per appended character so that text indices and position indices
     * stay aligned.
     *
     * @param str text to append; {@code null} is ignored
     * @param pos position of the text, or {@code null} for separators
     */
    public void append(String str, TextPosition pos) {
      if (str == null) {
        // A separator configured as null contributes no text; previously this
        // raised a NullPointerException.
        return;
      }
      int currentPage = getCurrentPageNo();
      List<TextPosition> pagePositions = obtainTextPositions(currentPage);
      obtainStringBuilder(currentPage).append(str);
      for (int i = 0; i < str.length(); i++) {
        pagePositions.add(pos);
      }
    }

    /** Returns the position list of the page (one entry per character). */
    public List<TextPosition> getTextPositions(Integer pageNo) {
      return obtainTextPositions(pageNo);
    }

    /**
     * Finds every occurrence of the pattern in the page's text.
     *
     * @return one {@link Match} per occurrence; its positions are a
     *         {@code subList} view of the page's position list
     */
    public List<Match> getTextPositions(Integer pageNo, Pattern pattern) {
      Matcher matcher = pattern.matcher(getText(pageNo));
      List<TextPosition> pagePositions = getTextPositions(pageNo);
      List<Match> matches = new ArrayList<Match>();
      while (matcher.find()) {
        List<TextPosition> elements = pagePositions.subList(matcher.start(), matcher.end());
        matches.add(new Match(matcher.group(), elements));
      }
      return matches;
    }
  }

  /**
   * Instantiate a new PDFTextAnnotator object. The encoding is passed
   * unchanged to the {@link PDFTextStripper} constructor.
   *
   * @param encoding
   *          The encoding that the output will be written in.
   * @throws IOException
   *           If the superclass constructor fails.
   */
  public PDFTextAnnotator(final String encoding) throws IOException {
    super(encoding);
  }

  /**
   * Computes a series of bounding boxes from the TextPositions. A new
   * bounding box is started whenever the y position of a glyph differs from
   * the y position of the current box by more than the vertical tolerance.
   * {@code null} entries (separators) are skipped.
   *
   * @param matches
   *          positions of the matched characters, in text order
   * @return one rectangle per detected line; empty if there is no non-null
   *         position
   */
  private List<PDRectangle> getTextBoundingBoxes(List<TextPosition> matches) {
    List<PDRectangle> boundingBoxes = new ArrayList<PDRectangle>();
    float lowerLeftX = 0;
    float lowerLeftY = 0;
    float upperRightX = 0;
    float upperRightY = 0;
    // Explicit flag instead of testing the coordinates for an all-zero sentinel.
    boolean boxOpen = false;

    for (TextPosition position : matches) {
      if (position == null) {
        continue;
      }
      Matrix textPos = position.getTextPos();
      float x = textPos.getXPosition();
      float y = textPos.getYPosition();
      float height = position.getHeight() * getHeightModifier();

      if (boxOpen && Math.abs(y - lowerLeftY) <= getVerticalTolerance()) {
        // we are still on the same line: extend the current box to this glyph
        upperRightX = x + position.getWidth();
        upperRightY = y + height;
        continue;
      }

      if (boxOpen) {
        // new line: flush the box of the previous line
        boundingBoxes.add(boundingBox(lowerLeftX, lowerLeftY, upperRightX, upperRightY));
      }
      lowerLeftX = x;
      lowerLeftY = y;
      upperRightX = x + position.getWidth();
      upperRightY = y + height;
      boxOpen = true;
    }

    if (boxOpen) {
      boundingBoxes.add(boundingBox(lowerLeftX, lowerLeftY, upperRightX, upperRightY));
    }
    return boundingBoxes;
  }

  /** Creates a rectangle from its lower-left and upper-right corner. */
  private PDRectangle boundingBox(float lowerLeftX, float lowerLeftY, float upperRightX,
      float upperRightY) {
    PDRectangle boundingBox = new PDRectangle();
    boundingBox.setLowerLeftX(lowerLeftX);
    boundingBox.setLowerLeftY(lowerLeftY);
    boundingBox.setUpperRightX(upperRightX);
    boundingBox.setUpperRightY(upperRightY);
    return boundingBox;
  }

  /**
   * Returns the smallest rectangle that contains every given box. A single
   * box is returned unchanged.
   *
   * @param boxes
   *          non-empty list of rectangles
   */
  private PDRectangle enclosingRectangle(List<PDRectangle> boxes) {
    PDRectangle firstBox = boxes.get(0);
    if (boxes.size() == 1) {
      return firstBox;
    }
    float minX = Math.min(firstBox.getLowerLeftX(), firstBox.getUpperRightX());
    float maxX = Math.max(firstBox.getLowerLeftX(), firstBox.getUpperRightX());
    float minY = Math.min(firstBox.getLowerLeftY(), firstBox.getUpperRightY());
    float maxY = Math.max(firstBox.getLowerLeftY(), firstBox.getUpperRightY());
    for (PDRectangle box : boxes) {
      minX = Math.min(minX, Math.min(box.getLowerLeftX(), box.getUpperRightX()));
      maxX = Math.max(maxX, Math.max(box.getLowerLeftX(), box.getUpperRightX()));
      minY = Math.min(minY, Math.min(box.getLowerLeftY(), box.getUpperRightY()));
      maxY = Math.max(maxY, Math.max(box.getLowerLeftY(), box.getUpperRightY()));
    }
    return boundingBox(minX, minY, maxX, maxY);
  }

  /**
   * Highlights a pattern within the PDF with the default color.
   * Returns the list of added annotations for further modification.
   * Note: it will process every page, but cannot process patterns that span multiple pages.
   * Note: it will not work for top-bottom text (such as Chinese).
   *
   * @param pdf
   *          PDDocument
   * @param pattern
   *          String that will be converted to Regex pattern
   * @return the annotations that were added to the document's pages
   * @throws Exception
   *           see {@link #highlight(PDDocument, Pattern)}
   */
  public List<PDAnnotationTextMarkup> highlight(final PDDocument pdf, final String pattern)
      throws Exception {
    return highlight(pdf, Pattern.compile(pattern));
  }

  /**
   * Highlights a pattern within the PDF with the default color.
   * Returns the list of added annotations for further modification.
   * Note: it will process every page, but cannot process patterns that span multiple pages.
   * Note: it will not work for top-bottom text (such as Chinese).
   *
   * <p>Only pages between {@code getStartPage()} and {@code getEndPage()}
   * are processed. Matches that yield no bounding box (for example matches
   * consisting only of separators) produce no annotation.
   *
   * @param pdf
   *          PDDocument
   * @param pattern
   *          Pattern (regex)
   * @return the annotations that were added to the document's pages
   * @throws IllegalStateException
   *           if {@link #initialize(PDDocument)} has not been run
   * @throws Exception
   *           if reading the annotations of a page fails
   */
  public List<PDAnnotationTextMarkup> highlight(PDDocument pdf, Pattern pattern)
      throws Exception {
    if (textCache == null) {
      throw new IllegalStateException(
          "TextCache was not initilized, please run initialize on the document first");
    }

    @SuppressWarnings("unchecked")
    List<PDPage> pages = pdf.getDocumentCatalog().getAllPages();
    List<PDAnnotationTextMarkup> highlights = new ArrayList<PDAnnotationTextMarkup>();

    for (int pageIndex = getStartPage() - 1; pageIndex < getEndPage()
        && pageIndex < pages.size(); pageIndex++) {
      PDPage page = pages.get(pageIndex);
      List<PDAnnotation> annotations = page.getAnnotations();

      // The text cache is keyed on the 1-based page number.
      for (Match match : this.textCache.getTextPositions(pageIndex + 1, pattern)) {
        List<PDRectangle> textBoundingBoxes = getTextBoundingBoxes(match.positions);
        if (textBoundingBoxes.isEmpty()) {
          continue;
        }

        PDAnnotationTextMarkup markup =
            new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
        // The rectangle has to cover every line of the match, not only the
        // first one, otherwise multi-line highlights extend outside of it.
        markup.setRectangle(enclosingRectangle(textBoundingBoxes));

        float[] quads = new float[FLOATS_PER_QUAD * textBoundingBoxes.size()];
        int cursor = 0;
        for (PDRectangle rect : textBoundingBoxes) {
          System.arraycopy(computeQuads(rect), 0, quads, cursor, FLOATS_PER_QUAD);
          cursor += FLOATS_PER_QUAD;
        }
        markup.setQuadPoints(quads);

        markup.setConstantOpacity(0.8f);
        markup.setColour(getDefaultColor());
        markup.setPrinted(true);
        markup.setContents(match.str);

        annotations.add(markup);
        highlights.add(markup);
      }
    }
    return highlights;
  }

  /**
   * Converts a rectangle into the eight quad-point coordinates of one
   * highlight quadrilateral, ordered top-left, top-right, bottom-left,
   * bottom-right. Both y values are shifted down by {@link #QUAD_Y_OFFSET}.
   */
  private float[] computeQuads(PDRectangle rect) {
    float left = rect.getLowerLeftX();
    float right = rect.getUpperRightX();
    float top = rect.getUpperRightY() - QUAD_Y_OFFSET;
    float bottom = rect.getLowerLeftY() - QUAD_Y_OFFSET;
    return new float[] {
      left, top, // x1, y1: top left
      right, top, // x2, y2: top right
      left, bottom, // x3, y3: bottom left
      right, bottom // x4, y4: bottom right
    };
  }

  /**
   * Sets the colour used for highlights created afterwards.
   *
   * @param color
   *          the highlight colour, or {@code null} to use the built-in default
   */
  public void setDefaultColor(PDGamma color) {
    this.defaultColor = color;
  }

  /**
   * Sets the default highlight colour. Despite its name this method is a
   * setter; it is kept for existing callers.
   *
   * @param color
   *          the highlight colour
   * @deprecated use {@link #setDefaultColor(PDGamma)}
   */
  @Deprecated
  public void getDefaultColor(PDGamma color) {
    setDefaultColor(color);
  }

  /**
   * Returns the colour used for highlights: the configured colour if one was
   * set, otherwise a newly created yellow.
   */
  public PDGamma getDefaultColor() {
    if (this.defaultColor != null) {
      return this.defaultColor;
    }
    // NOTE(review): the original comment called this "#fbe85a", but a blue
    // component of 0.3879 corresponds to roughly 0x63, not 0x5a. The values
    // are kept unchanged.
    PDGamma c = new PDGamma();
    c.setR(0.9843f);
    c.setG(0.9098f);
    c.setB(0.3879f);
    return c;
  }

  /** Returns the maximum y difference for two glyphs to share a bounding box. */
  public float getVerticalTolerance() {
    return this.verticalTolerance;
  }

  /** Sets the maximum y difference for two glyphs to share a bounding box. */
  public void setVerticalTolerance(float tolerance) {
    this.verticalTolerance = tolerance;
  }

  /**
   * {@inheritDoc}
   *
   * <p>Also discards the text cache, so {@link #initialize(PDDocument)} has
   * to be run again before highlighting.
   */
  @Override
  public void resetEngine() {
    super.resetEngine();
    this.textCache = null;
  }

  /**
   * Resets the engine, creates a fresh text cache and processes all pages of
   * the document so that the cache is filled. Must be called before
   * {@code highlight}.
   *
   * @param pdf
   *          the document to read
   * @throws IOException
   *           if processing the document fails
   */
  public void initialize(final PDDocument pdf) throws IOException {
    this.resetEngine();
    this.textCache = new TextCache();

    if (this.getAddMoreFormatting()) {
      // With extra formatting enabled, these markers are all written as the
      // line separator.
      String lineSeparator = this.getLineSeparator();
      this.setParagraphEnd(lineSeparator);
      this.setPageStart(lineSeparator);
      this.setArticleStart(lineSeparator);
      this.setArticleEnd(lineSeparator);
    }
    this.startDocument(pdf);
    this.processPages(pdf.getDocumentCatalog().getAllPages());
    this.endDocument(pdf);
  }

  /**
   * Start a new article, which is typically defined as a column on a single
   * page (also referred to as a bead). Appends the article-start string to
   * the text cache without a position.
   *
   * @param isltr
   *          true if primary direction of text is left to right.
   * @throws IOException
   *           declared by the overridden method; not thrown here.
   */
  @Override
  protected void startArticle(final boolean isltr) throws IOException {
    this.textCache.append(this.getArticleStart(), null);
  }

  /**
   * End an article. Appends the article-end string to the text cache without
   * a position.
   *
   * @throws IOException
   *           declared by the overridden method; not thrown here.
   */
  @Override
  protected void endArticle() throws IOException {
    this.textCache.append(this.getArticleEnd(), null);
  }

  /**
   * Start a new page. This implementation does nothing.
   *
   * @param page
   *          The page we are about to process.
   * @throws IOException
   *           declared by the overridden method; not thrown here.
   */
  @Override
  protected void startPage(final PDPage page) throws IOException {
    // nothing to do
  }

  /**
   * End a page. This implementation does nothing.
   *
   * @param page
   *          The page that was processed.
   * @throws IOException
   *           declared by the overridden method; not thrown here.
   */
  @Override
  protected void endPage(final PDPage page) throws IOException {
    // nothing to do
  }

  /** Write the page separator value to the text cache (without a position). */
  @Override
  protected void writePageSeperator() {
    this.textCache.append(this.getPageSeparator(), null);
  }

  /** Write the line separator value to the text cache (without a position). */
  @Override
  protected void writeLineSeparator() {
    this.textCache.append(this.getLineSeparator(), null);
  }

  /** Write the word separator value to the text cache (without a position). */
  @Override
  protected void writeWordSeparator() {
    this.textCache.append(this.getWordSeparator(), null);
  }

  /**
   * Write the string in TextPosition to the text cache, recording the
   * TextPosition for each of its characters.
   *
   * @param text
   *          The text position whose character string is cached.
   */
  @Override
  protected void writeCharacters(final TextPosition text) {
    this.textCache.append(text.getCharacter(), text);
  }

  /**
   * Write a string to the text cache. This implementation ignores
   * <code>text</code> and calls {@link #writeCharacters(TextPosition)} for
   * every element of <code>textPositions</code>.
   *
   * @param text
   *          The text to write (ignored).
   * @param textPositions
   *          The TextPositions belonging to the text.
   */
  @Override
  protected void writeString(final String text, final List<TextPosition> textPositions) {
    for (final TextPosition textPosition : textPositions) {
      this.writeCharacters(textPosition);
    }
  }

  /**
   * Writes the paragraph separator to the text cache: a paragraph end
   * followed by a paragraph start.
   */
  @Override
  protected void writeParagraphSeparator() {
    this.writeParagraphEnd();
    this.writeParagraphStart();
  }

  /**
   * Write the paragraph-start string to the text cache. If a paragraph is
   * still open, it is ended first.
   */
  @Override
  protected void writeParagraphStart() {
    if (this.inParagraph) {
      // writeParagraphEnd() also clears the inParagraph flag
      this.writeParagraphEnd();
    }
    this.textCache.append(this.getParagraphStart(), null);
    this.inParagraph = true;
  }

  /** Write the paragraph-end string to the text cache and mark the paragraph closed. */
  @Override
  protected void writeParagraphEnd() {
    this.textCache.append(this.getParagraphEnd(), null);
    this.inParagraph = false;
  }

  /** Write the page-start string to the text cache (without a position). */
  @Override
  protected void writePageStart() {
    this.textCache.append(this.getPageStart(), null);
  }

  /** Write the page-end string to the text cache (without a position). */
  @Override
  protected void writePageEnd() {
    this.textCache.append(this.getPageEnd(), null);
  }

  /** Returns the factor applied to glyph heights when sizing bounding boxes. */
  public float getHeightModifier() {
    return heightModifier;
  }

  /** Sets the factor applied to glyph heights when sizing bounding boxes. */
  public void setHeightModifier(float heightModifier) {
    this.heightModifier = heightModifier;
  }
}