Java tutorial
/**
 * Copyright (C) 2010 Orbeon, Inc.
 *
 * This program is free software; you can redistribute it and/or modify it under the terms of the
 * GNU Lesser General Public License as published by the Free Software Foundation; either version
 * 2.1 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 *
 * The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
 */
package biz.taoconsulting.oxf.processor.converter;

import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.dom4j.Document;
import org.orbeon.oxf.pipeline.api.PipelineContext;
import org.orbeon.oxf.xml.XMLParsing;
import org.orbeon.oxf.xml.XMLReceiver;
import org.orbeon.oxf.processor.ProcessorInputOutputInfo;
import org.orbeon.oxf.processor.SimpleProcessor;
import org.orbeon.oxf.util.Base64XMLReceiver;
import org.pdfbox.exceptions.OutlineNotLocalException;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentCatalog;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.pdmodel.common.PDMetadata;
import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.pdfbox.util.PDFTextStripper;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
 * Convert from a binary PDF (Acrobat) file to XML.
 *
 * <p>The {@code config} input selects what is extracted through the text of its
 * {@code <action>} element:
 * <ul>
 * <li>{@code pages} - extracts text page by page</li>
 * <li>{@code bookmarks} - extracts bookmarks together with their text (default)</li>
 * <li>{@code bookmarksonly} - extracts bookmarks only</li>
 * <li>{@code bookmarkpages} - extracts bookmarks, falling back to pages when no outline
 * could be extracted</li>
 * <li>{@code meta} - extracts only meta data</li>
 * </ul>
 * Unknown or missing actions fall back to {@code bookmarks}.
 *
 * <p>The {@code data} input is read as base64-encoded PDF content; the {@code data} output
 * receives a {@code PDFDocument} element. Problems encountered during extraction are
 * reported as {@code Error} elements inside that root element.
 *
 * @author Stephan H. Wissel
 * @version 0.4
 */
public class FromPdfConverter extends SimpleProcessor {

    // TODO: Move that somewhere less visible?
    // Pages of the PDF currently being converted; only valid while extractFromPDF runs.
    // NOTE(review): per-conversion state in an instance field; confirm that processor
    // instances are not shared between concurrent pipeline executions.
    private List<?> allPages = null;

    // Key variables for static info - parameter names
    private static final String INPUT_CONFIG = "config";
    private static final String INPUT_SCOPE = "string(//action)";
    private static final String INPUT_DATA = "data";
    private static final String OUTPUT_DATA = "data";

    // Defined scopes
    private static final String SCOPE_BOOKMARKS = "bookmarks";
    private static final String SCOPE_PAGES = "pages";
    private static final String SCOPE_BOOKMARKSONLY = "bookmarksonly";
    private static final String SCOPE_BOOKMARKPAGES = "bookmarkpages";
    private static final String SCOPE_METADATA = "meta";

    // ToDo: find a smarter way for scope list
    private static final String[] LIST_OF_SCOPES = {
        SCOPE_BOOKMARKS, SCOPE_PAGES, SCOPE_BOOKMARKSONLY, SCOPE_METADATA, SCOPE_BOOKMARKPAGES
    };

    // Tag attributes
    private static final String ATT_CDATA = "CDATA";
    private static final String ATT_PAGES = "pages";
    private static final String ATT_PAGE = "page";
    private static final String ATT_LEVEL = "level";
    private static final String ATT_PAGENUM = "number";
    private static final String ATT_AUTHOR = "author";
    private static final String ATT_TITLE = "title";
    private static final String ATT_SUBJECT = "subject";

    // Tag names
    private static final String TAG_ROOT = "PDFDocument";
    private static final String TAG_META = "PDFMetadata";
    private static final String TAG_PAGE = "Page";
    private static final String TAG_BOOKMARK = "Bookmark";
    private static final String TAG_TITLE = "Title";
    private static final String TAG_TEXT = "Text";
    private static final String TAG_ERROR = "Error";

    // Logger
    private static final Logger logger = Logger.getLogger(FromPdfConverter.class);

    /**
     * Registers the {@code config} and {@code data} inputs and the {@code data} output.
     */
    public FromPdfConverter() {
        addInputInfo(new ProcessorInputOutputInfo(INPUT_CONFIG));
        addInputInfo(new ProcessorInputOutputInfo(INPUT_DATA));
        addOutputInfo(new ProcessorInputOutputInfo(OUTPUT_DATA));
    }

    /**
     * Produces the {@code data} output: reads the scope from the {@code config} input,
     * decodes the base64 PDF from the {@code data} input and streams the extracted XML
     * to {@code xmlReceiver}.
     *
     * <p>NOTE(review): presumably located by {@code SimpleProcessor} through its
     * {@code generate<OutputName>} naming convention - confirm before renaming.
     *
     * @param context     the current pipeline context
     * @param xmlReceiver receiver of the generated SAX events
     */
    public void generateData(PipelineContext context, XMLReceiver xmlReceiver) {
        // The data input is base64, so it is decoded into a stream rather than read as SAX
        String scope = getScopeFromInput(context, INPUT_CONFIG);
        InputStream pdfStream = getPDFStreamFromInput(context, INPUT_DATA);

        logger.info("Ready to call extractFromPDF");
        extractFromPDF(pdfStream, xmlReceiver, scope);
    }

    /**
     * Decodes the base64 content of the given input into an in-memory stream.
     *
     * @param context   the current pipeline context
     * @param inputName name of the input holding the base64-encoded PDF
     * @return a stream over the decoded bytes, or {@code null} when decoding failed
     */
    private InputStream getPDFStreamFromInput(PipelineContext context, String inputName) {
        ByteArrayOutputStream decoded = new ByteArrayOutputStream();

        logger.info("Creating the Base64 Content handler for the uploaded file");
        Base64XMLReceiver base64ContentHandler = new Base64XMLReceiver(decoded);
        try {
            readInputAsSAX(context, inputName, base64ContentHandler);
            return new ByteArrayInputStream(decoded.toByteArray());
        } catch (Exception e) {
            // Pass the throwable separately so log4j records the stack trace
            logger.error("Could not decode the PDF from input '" + inputName + "'", e);
            return null;
        }
    }

    /**
     * Extracts the action to be taken from the input and validates the action
     * against the list of defined actions.
     *
     * @param context   the current pipeline context
     * @param inputName name of the configuration input
     * @return the validated scope; {@link #SCOPE_BOOKMARKS} when missing or unknown
     */
    private String getScopeFromInput(PipelineContext context, String inputName) {
        Document scopeDocument = readInputAsDOM4J(context, inputName);
        String requested = (String) scopeDocument.selectObject(INPUT_SCOPE);

        // If somebody forgot the input we use a default
        if (requested == null) {
            return SCOPE_BOOKMARKS;
        }

        // Surrounding whitespace (e.g. from pretty-printed configs) and letter case are
        // ignored; Locale.ROOT keeps the lower-casing independent of the JVM default locale
        String normalized = requested.trim().toLowerCase(Locale.ROOT);
        for (String knownScope : LIST_OF_SCOPES) {
            if (knownScope.equals(normalized)) {
                return knownScope; // We found a valid scope
            }
        }
        return SCOPE_BOOKMARKS; // If we got here the scope wasn't valid
    }

    /**
     * Drives the whole conversion: loads the PDF, writes the root element with the
     * document attributes, the meta data and the content selected by {@code scope}.
     *
     * <p>Errors that occur before the root element can be opened are collected and
     * written as {@code Error} children of the root, so the output keeps a single root
     * element. The root element, the SAX document and the PDF are always closed.
     *
     * @param inputStream stream over the binary PDF, may be {@code null}
     * @param xmlReceiver receiver of the generated SAX events
     * @param scope       validated scope as returned by {@link #getScopeFromInput}
     */
    private void extractFromPDF(InputStream inputStream, XMLReceiver xmlReceiver, String scope) {
        logger.info("Extract from PDF started");

        PDDocument doc = null;
        boolean documentStarted = false;
        boolean rootOpen = false;
        // Messages of problems that happen before the root element exists
        List<String> deferredErrors = new ArrayList<String>();

        try {
            xmlReceiver.startDocument();
            documentStarted = true;
            logger.info("Start document completed");

            doc = getPDFdocument(inputStream, deferredErrors);
            if (doc == null) {
                // No processing on empty documents; report inside a bare root element
                xmlReceiver.startElement("", TAG_ROOT, TAG_ROOT, new AttributesImpl());
                rootOpen = true;
                writeErrors(xmlReceiver, deferredErrors);
                addErrorTagToOutput(xmlReceiver, "No PDF Information could be extracted");
                return; // the finally block closes root and document
            }

            // Get a handle on all pages in the PDF. Needed for page lookup
            try {
                logger.info("Try to get handle on all pages array");
                this.allPages = doc.getDocumentCatalog().getAllPages();
                logger.info("Got handle to allPages in PDF");
            } catch (Exception e) {
                logger.error("Could not retrieve the pages of the PDF", e);
                deferredErrors.add(e.toString());
                this.allPages = null;
            }

            // Get the document information
            logger.info("Ready to retrieve basic PDF information");
            PDDocumentInformation docInfo = getDocumentInformation(doc, deferredErrors);

            AttributesImpl rootAtts = new AttributesImpl();
            addPageCountAttribute(rootAtts, doc);
            addDocInfoAttributes(rootAtts, docInfo);

            // Start the PDF Document
            logger.info("writing the root element PDFDocument");
            xmlReceiver.startElement("", TAG_ROOT, TAG_ROOT, rootAtts);
            rootOpen = true;
            logger.info("PDFDocument tag succesful opened");
            writeErrors(xmlReceiver, deferredErrors);

            // Pull the meta data from the PDF document
            logger.info("PDFMetadata Element start");
            xmlReceiver.startElement("", TAG_META, TAG_META, new AttributesImpl());
            extractMetaDataFromPDF(xmlReceiver, doc);
            xmlReceiver.endElement("", TAG_META, TAG_META);
            logger.info("PDFMetadata Element end");

            // Get the PDF content based on the selection in config
            if (SCOPE_PAGES.equals(scope)) {
                logger.info("Will extract pages");
                extractPagesFromPDF(xmlReceiver, doc);
            } else if (SCOPE_METADATA.equals(scope)) {
                // No further action required since it was meta data only!
                logger.info("No action bejond meta data");
            } else if (SCOPE_BOOKMARKPAGES.equals(scope)) {
                // Try bookmarks, then pages
                logger.info("Will extract bookmarks first then pages");
                if (!extractOutlineFromPDF(xmlReceiver, doc, scope)) {
                    logger.info("No outline found, using pages");
                    extractPagesFromPDF(xmlReceiver, doc);
                }
            } else {
                // PDF in outlines - default
                logger.info("Will extract: " + scope);
                extractOutlineFromPDF(xmlReceiver, doc, scope);
            }
        } catch (SAXException e) {
            logger.error("SAX error while writing the PDF extraction result", e);
            addErrorTagToOutput(xmlReceiver, e.toString());
        } finally {
            logger.info("About to close PDFDocument and SaxDocument");
            finishOutput(xmlReceiver, rootOpen, documentStarted);
            closePdfQuietly(doc);
            // Do not keep references to pages of a closed document
            this.allPages = null;
            logger.info("Closed PDF and XML");
        }
    }

    /**
     * Writes one {@code Error} element per message.
     *
     * @param contentHandler receiver of the SAX events
     * @param messages       error messages to write, in order
     */
    private void writeErrors(ContentHandler contentHandler, List<String> messages) {
        for (String message : messages) {
            addErrorTagToOutput(contentHandler, message);
        }
    }

    /**
     * Closes the root element and the SAX document as far as they were opened.
     * Failures are logged; the two steps are attempted independently.
     *
     * @param xmlReceiver     receiver of the SAX events
     * @param rootOpen        whether the root element has been started
     * @param documentStarted whether {@code startDocument()} succeeded
     */
    private void finishOutput(XMLReceiver xmlReceiver, boolean rootOpen, boolean documentStarted) {
        if (rootOpen) {
            try {
                logger.info("Writing end element " + TAG_ROOT);
                xmlReceiver.endElement("", TAG_ROOT, TAG_ROOT);
            } catch (SAXException e) {
                logger.error("Could not close element " + TAG_ROOT, e);
            }
        }
        if (documentStarted) {
            try {
                xmlReceiver.endDocument();
            } catch (SAXException e) {
                logger.error("Could not end the SAX document", e);
            }
        }
    }

    /**
     * Closes the PDF document, logging instead of propagating a failure.
     *
     * @param doc the document to close, may be {@code null}
     */
    private void closePdfQuietly(PDDocument doc) {
        if (doc == null) {
            return;
        }
        try {
            doc.close();
        } catch (IOException e) {
            logger.error("Could not close the PDF document", e);
        }
    }

    /**
     * Closes a stream, logging instead of propagating a failure.
     *
     * @param stream the stream to close, may be {@code null}
     */
    private void closeStreamQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            logger.error("Could not close input stream", e);
        }
    }

    /**
     * Adds author, title and subject attributes for every value that is present.
     *
     * @param atts    attribute list to extend
     * @param docInfo document information, may be {@code null}
     */
    private void addDocInfoAttributes(AttributesImpl atts, PDDocumentInformation docInfo) {
        if (docInfo == null) {
            return;
        }
        String author = docInfo.getAuthor();
        if (author != null) {
            atts.addAttribute("", ATT_AUTHOR, ATT_AUTHOR, ATT_CDATA, author);
        }
        String title = docInfo.getTitle();
        if (title != null) {
            atts.addAttribute("", ATT_TITLE, ATT_TITLE, ATT_CDATA, title);
        }
        String subject = docInfo.getSubject();
        if (subject != null) {
            atts.addAttribute("", ATT_SUBJECT, ATT_SUBJECT, ATT_CDATA, subject);
        }
    }

    /**
     * Loads the PDF document from the stream.
     *
     * @param inputStream stream over the binary PDF, may be {@code null}
     * @param errors      collector that receives a message when loading fails
     * @return the loaded document, or {@code null} when it could not be loaded
     */
    private PDDocument getPDFdocument(InputStream inputStream, List<String> errors) {
        if (inputStream == null) {
            // Decoding the input failed earlier; there is nothing to parse
            errors.add("No PDF content could be read from the input");
            return null;
        }
        try {
            return PDDocument.load(inputStream);
        } catch (Exception e) {
            // The content is uploaded data, so any parser failure is reported in the
            // output document to give a chance to see what went wrong
            logger.error("PDFParser(InputStream)", e);
            errors.add(e.toString());
            return null;
        }
    }

    /**
     * Adds an error element to the output.
     *
     * @param contentHandler receiver of the SAX events
     * @param eMessage       the message to report; {@code null} is written as empty text
     */
    private void addErrorTagToOutput(ContentHandler contentHandler, String eMessage) {
        String text = (eMessage == null) ? "" : eMessage;
        try {
            writeTextElement(contentHandler, TAG_ERROR, new AttributesImpl(), text);
        } catch (SAXException e) {
            logger.error("Could not write error element: " + text, e);
        }
    }

    /**
     * Writes {@code <tag ...>text</tag>}.
     *
     * @param contentHandler receiver of the SAX events
     * @param tag            element name (used as local and qualified name)
     * @param atts           attributes of the element, never {@code null}
     * @param text           character content, never {@code null}
     * @throws SAXException when the receiver rejects an event
     */
    private void writeTextElement(ContentHandler contentHandler, String tag, AttributesImpl atts,
            String text) throws SAXException {
        contentHandler.startElement("", tag, tag, atts);
        contentHandler.characters(text.toCharArray(), 0, text.length());
        contentHandler.endElement("", tag, tag);
    }

    /**
     * Reads the document information dictionary.
     *
     * @param doc    the loaded PDF
     * @param errors collector that receives a message when reading fails
     * @return the document information, or {@code null} when it could not be read
     */
    private PDDocumentInformation getDocumentInformation(PDDocument doc, List<String> errors) {
        try {
            return doc.getDocumentInformation();
        } catch (Exception e) {
            logger.error("Could not read the PDF document information", e);
            errors.add(e.toString());
            return null;
        }
    }

    /**
     * Adds the {@code pages} attribute when the page count is known and positive.
     *
     * @param atts attribute list to extend
     * @param doc  the loaded PDF
     */
    private void addPageCountAttribute(AttributesImpl atts, PDDocument doc) {
        int pageCount = 0; // The number of pages in this document
        try {
            pageCount = doc.getPageCount();
        } catch (IOException e) {
            logger.error("Could not determine the page count", e);
            pageCount = 0;
        }
        if (pageCount > 0) {
            atts.addAttribute("", ATT_PAGES, ATT_PAGES, ATT_CDATA, String.valueOf(pageCount));
        }
    }

    /**
     * Reads the XML meta data of the PDF and forwards it to the output through a
     * {@code PdfMetadataXMLReceiver} wrapped around {@code xmlReceiver}.
     *
     * @param xmlReceiver receiver of the SAX events
     * @param doc         the loaded PDF
     * @return {@code true} when meta data was present and processed without error
     */
    private boolean extractMetaDataFromPDF(XMLReceiver xmlReceiver, PDDocument doc) {
        logger.info("Processing META data");
        try {
            PDDocumentCatalog catalog = doc.getDocumentCatalog(); // Where meta data lives
            PDMetadata metadata = catalog.getMetadata();

            // The meta data could be empty!
            if (metadata == null) {
                return false;
            }

            // The content handler for that input stream...
            final XMLReceiver pdfMetaContent = new PdfMetadataXMLReceiver(xmlReceiver);

            // Read the XML metadata into an input stream and release it afterwards
            InputStream xmlInputStream = metadata.createInputStream();
            try {
                logger.info("Before creating sax stream for meta data");
                // NOTE(review): the meta data is XML embedded in uploaded content; confirm
                // that ParserConfiguration.PLAIN does not resolve external entities.
                XMLParsing.inputStreamToSAX(xmlInputStream, "PDF", pdfMetaContent,
                        XMLParsing.ParserConfiguration.PLAIN, false);
                logger.info("Meta data stream created in SAX");
            } finally {
                closeStreamQuietly(xmlInputStream);
            }
            return true;
        } catch (Exception e) {
            logger.error("Could not extract the PDF meta data", e);
            addErrorTagToOutput(xmlReceiver, e.toString());
            return false;
        }
    }

    /**
     * Writes one {@code Page} element per page, numbered from 1, containing the text
     * of that page.
     *
     * @param xmlReceiver receiver of the SAX events
     * @param doc         the loaded PDF
     * @return {@code true} when all pages were written without error
     */
    private boolean extractPagesFromPDF(XMLReceiver xmlReceiver, PDDocument doc) {
        try {
            PDFTextStripper stripper = new PDFTextStripper();
            int pageCount = doc.getPageCount();

            for (int pageNo = 1; pageNo <= pageCount; pageNo++) {
                AttributesImpl atts = new AttributesImpl();
                atts.addAttribute("", ATT_PAGENUM, ATT_PAGENUM, ATT_CDATA, String.valueOf(pageNo));

                // Restrict the stripper to exactly this page; extract the text before
                // opening the element so a failure does not leave it unclosed
                stripper.setStartPage(pageNo);
                stripper.setEndPage(pageNo);
                String pageText = massageTextResult(stripper.getText(doc));

                writeTextElement(xmlReceiver, TAG_PAGE, atts, pageText);
            }
            return true;
        } catch (IOException e) {
            logger.error("Could not extract the page text", e);
            addErrorTagToOutput(xmlReceiver, e.toString());
            return false;
        } catch (SAXException e) {
            logger.error("Could not write the page text", e);
            addErrorTagToOutput(xmlReceiver, e.toString());
            return false;
        }
    }

    /**
     * Writes the document outline (bookmarks) starting with the top-level items.
     *
     * @param contentHandler receiver of the SAX events
     * @param doc            the loaded PDF
     * @param scope          validated scope, decides whether bookmark text is written
     * @return {@code false} when there is no outline, the outline has no first item or
     *         an error occurred; {@code true} otherwise
     */
    private boolean extractOutlineFromPDF(ContentHandler contentHandler, PDDocument doc, String scope) {
        PDDocumentOutline root;
        try {
            root = doc.getDocumentCatalog().getDocumentOutline();
        } catch (Exception e) {
            logger.error("Could not read the document outline", e);
            addErrorTagToOutput(contentHandler, e.toString());
            return false;
        }

        // No further processing if the outline is null
        if (root == null) {
            return false;
        }

        boolean foundItem = false; // No outline without a first element!
        try {
            for (PDOutlineItem item = root.getFirstChild(); item != null; item = item.getNextSibling()) {
                foundItem = true;
                // Recursive call into bookmark processing
                processBookmark(contentHandler, doc, item, scope, 1);
                logger.info(item.getTitle());
            }
        } catch (Exception e) {
            logger.error("Could not process the document outline", e);
            return false;
        }
        return foundItem;
    }

    /**
     * processBookmark gets called recursively for all nested bookmarks and extracts
     * the bookmark and the text.
     *
     * <p>A bookmark whose page cannot be resolved is not written itself, but its
     * children are still processed. The {@code Bookmark} element is only closed when it
     * was opened, and it is closed even when an error occurs while writing its content.
     *
     * @param hd      receiver of the SAX events
     * @param doc     the loaded PDF
     * @param curItem the bookmark to process
     * @param scope   validated scope; text is omitted for {@code bookmarksonly}
     * @param level   nesting depth, 1 for top-level bookmarks
     */
    private void processBookmark(ContentHandler hd, PDDocument doc, PDOutlineItem curItem,
            String scope, int level) {
        boolean bookmarkOpen = false;
        try {
            // NOTE: the page attribute is the 0-based index returned by getPageNumber,
            // unlike the 1-based Page/@number written by extractPagesFromPDF
            int curPageNo = getPageNumber(doc, curItem);
            if (curPageNo > -1) {
                AttributesImpl atts = new AttributesImpl();
                atts.addAttribute("", ATT_LEVEL, ATT_LEVEL, ATT_CDATA, Integer.toString(level));
                atts.addAttribute("", ATT_PAGE, ATT_PAGE, ATT_CDATA, Integer.toString(curPageNo));
                hd.startElement("", TAG_BOOKMARK, TAG_BOOKMARK, atts);
                bookmarkOpen = true;

                // Write the properties of interest; a bookmark may lack a title
                String curTitle = curItem.getTitle();
                writeTextElement(hd, TAG_TITLE, new AttributesImpl(), curTitle == null ? "" : curTitle);

                // Write out the text associated with this bookmark if the scope allows for that
                if (!SCOPE_BOOKMARKSONLY.equalsIgnoreCase(scope)) {
                    PDFTextStripper stripper = new PDFTextStripper();
                    stripper.setStartBookmark(curItem);
                    stripper.setEndBookmark(curItem);
                    String bookmarkText = massageTextResult(stripper.getText(doc));
                    writeTextElement(hd, TAG_TEXT, new AttributesImpl(), bookmarkText);
                }
            }

            // Now check the children
            for (PDOutlineItem child = curItem.getFirstChild(); child != null; child = child.getNextSibling()) {
                processBookmark(hd, doc, child, scope, level + 1);
                logger.info("Child:" + child.getTitle());
            }
        } catch (SAXException e) {
            logger.error("Could not write bookmark", e);
            addErrorTagToOutput(hd, e.toString());
        } catch (IOException e) {
            logger.error("Could not extract bookmark text", e);
            addErrorTagToOutput(hd, e.toString());
        } finally {
            // Close the mark, but only if it was actually opened
            if (bookmarkOpen) {
                try {
                    hd.endElement("", TAG_BOOKMARK, TAG_BOOKMARK);
                } catch (SAXException e) {
                    logger.error("Could not close element " + TAG_BOOKMARK, e);
                }
            }
        }
    }

    /**
     * Removes unwanted characters from extracted text; currently carriage returns
     * (chr(13)).
     *
     * @param rawString the extracted text, may be {@code null}
     * @return the text without carriage returns; an empty string for {@code null}
     */
    private String massageTextResult(String rawString) {
        if (rawString == null) {
            return "";
        }
        return StringUtils.replace(rawString, "\r", "");
    }

    /**
     * Looks up the page a bookmark points to.
     *
     * @param doc     the loaded PDF
     * @param outline the bookmark
     * @return the 0-based index of the destination page within the document's page
     *         list, or -1 when the page cannot be determined
     */
    private int getPageNumber(PDDocument doc, PDOutlineItem outline) {
        int pageNumber = -1;
        try {
            PDPage page = outline.findDestinationPage(doc);
            if (page != null && this.allPages != null) {
                pageNumber = this.allPages.indexOf(page);
            }
        } catch (OutlineNotLocalException e) {
            logger.error("Bookmark destination is not local to this document", e);
        } catch (Exception e) {
            logger.error("Could not resolve the bookmark destination page", e);
        }
        return pageNumber;
    }
}