List of usage examples for com.itextpdf.text.pdf PdfReader getNumberOfPages
public int getNumberOfPages()
From source file:de.aidger.utils.pdf.ControllingConverter.java
License:Open Source License
/** * Places the created document onto the template for this report. * //from www . j ava 2s . co m * @param file * The file to which this report will be saved. * @param preTemplateFile * The report to be used. */ private boolean applyTemplate(File file, File preTemplateFile) { FileOutputStream outStream = null; FileInputStream inStream = null; PdfContentByte contentByte = null; PdfReader reader = null, templateReader = null; try { /* * Use the template located in the configuration path first, if it * exists. */ File template = new File(Runtime.getInstance().getConfigPath() + "/templates/ControllingTemplate.pdf"); URL templateURL = null; if (template.exists()) { templateURL = template.toURI().toURL(); } else { templateURL = getClass().getResource("/de/aidger/res/pdf/ControllingTemplate.pdf"); } if (templateURL == null) { throw new FileNotFoundException(_("The report template could not be loaded.") + " " + _("Please make sure that a fitting template exists in the template folder.")); } Document document = new Document(PageSize.A4); outStream = new FileOutputStream(file.getPath()); inStream = new FileInputStream(preTemplateFile); writer = PdfWriter.getInstance(document, outStream); document.open(); contentByte = writer.getDirectContent(); reader = new PdfReader(inStream); templateReader = new PdfReader(templateURL); /* * Add the template pdf to the document and place the finished * report on top of it. */ for (int i = 1; i <= reader.getNumberOfPages(); i++) { document.newPage(); PdfImportedPage page = writer.getImportedPage(templateReader, 1); int rotation = templateReader.getPageRotation(1); if (rotation == 90 || rotation == 270) { //landscape mode contentByte.addTemplate(page, 0, -1f, 1f, 0, 0, reader.getPageSizeWithRotation(1).getHeight()); } else { //portrait mode contentByte.addTemplate(page, 1f, 0, 0, 1f, 0, 0); } page = writer.getImportedPage(reader, i); rotation = reader.getPageRotation(i); if (rotation == 90 || rotation == 270) { //landscape mode contentByte.addTemplate(page, 0, -1f, 1f, 0, 0, reader.getPageSizeWithRotation(1).getHeight()); } else { //portrait mode contentByte.addTemplate(page, 1f, 0, 0, 1f, 0, 0); } } document.close(); return true; } catch (FileNotFoundException e) { if (e.getMessage() != null) { UI.displayError(e.getMessage()); } } catch (DocumentException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return false; }
From source file:de.aidger.utils.pdf.ProtocolConverter.java
License:Open Source License
/** * Places the created document onto the template for this report. * //from w w w. j av a 2 s.c o m * @param file * The file to which this report will be saved. * @param preTemplateFile * The report to be used. */ private boolean applyTemplate(File file, File preTemplateFile) { FileOutputStream outStream = null; FileInputStream inStream = null; PdfContentByte contentByte = null; PdfReader reader = null, templateReader = null; try { /* * Use the template located in the configuration path first, if it * exists. */ File template = new File(Runtime.getInstance().getConfigPath() + "/templates/ProtocolTemplate.pdf"); URL templateURL = null; if (template.exists()) { templateURL = template.toURI().toURL(); } else { templateURL = getClass().getResource("/de/aidger/res/pdf/ProtocolTemplate.pdf"); } if (templateURL == null) { throw new FileNotFoundException(_("The report template could not be loaded.") + " " + _("Please make sure that a fitting template exists in the template folder.")); } Document document = new Document(PageSize.A4.rotate()); outStream = new FileOutputStream(file.getPath()); inStream = new FileInputStream(preTemplateFile); writer = PdfWriter.getInstance(document, outStream); document.open(); contentByte = writer.getDirectContent(); reader = new PdfReader(inStream); templateReader = new PdfReader(templateURL); /* * Add the template pdf to the document and place the finished * report on top of it. */ for (int i = 1; i <= reader.getNumberOfPages(); i++) { document.newPage(); PdfImportedPage page = writer.getImportedPage(templateReader, 1); int rotation = templateReader.getPageRotation(1); if (rotation == 90 || rotation == 270) { //landscape mode contentByte.addTemplate(page, 0, -1f, 1f, 0, 0, reader.getPageSizeWithRotation(1).getHeight()); } else { //portrait mode contentByte.addTemplate(page, 1f, 0, 0, 1f, 0, 0); } page = writer.getImportedPage(reader, i); rotation = reader.getPageRotation(i); if (rotation == 90 || rotation == 270) { //landscape mode contentByte.addTemplate(page, 0, -1f, 1f, 0, 0, reader.getPageSizeWithRotation(1).getHeight()); } else { //portrait mode contentByte.addTemplate(page, 1f, 0, 0, 1f, 0, 0); } } document.close(); return true; } catch (FileNotFoundException e) { if (e.getMessage() != null) { UI.displayError(e.getMessage()); } } catch (DocumentException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return false; }
From source file:de.extra.xtt.util.pdf.PdfCreatorImpl.java
License:Apache License
/** * Erzeugt das Inhaltsverzeichnis aus den bereits vorhandenen Elementen in * der Liste <code>listEntries</code>. * /* ww w .j a v a 2 s . c o m*/ * @param docPdf * Zieldokument, falls Inhaltsverzeichnis nicht temporr erzeugt * wird * @param temp * Gibt an, ob das Inhaltsverzeichnis temporr in einer neuen * Datei/Dokument erzeugt werden soll * @return Anzahl der Seiten * @throws DocumentException * @throws IOException */ private int erzeugeInhaltsverzeichnis(Document docPdf, boolean temp) throws DocumentException, IOException { int anzPages = 0; Document docInhalt = docPdf; String filePathTempInhaltString = ""; if (temp) { // temp. Dateinamen bestimmen File fileDokuFile = new File(dateiname); filePathTempInhaltString = fileDokuFile.getParent() + "/tmp_inhalt.pdf"; // Neues Dokument erzeugen docInhalt = initPdfWriterAndDocument(filePathTempInhaltString, false); } // berschrift Chapter currChapter = new Chapter(getParagraphChapter("Inhaltsverzeichnis"), 0); // 0, damit keine Nummerierung currChapter.setNumberDepth(0); docInhalt.add(currChapter); // eine Zeile Abstand docInhalt.add(getEmptyLineTextHalf()); for (ContPdfEntry currEntry : listEntries) { // Eintrag erzeugen inkl. Abstand String strEintrag = currEntry.getBezeichnung() + " "; Chunk chunkBezeichnung; Chunk chunkSeitenzahlChunk; if (currEntry.getParentEntry() == null) { // 1. Ebene => fett, Abstand davor einfgen docInhalt.add(getEmptyLineTextHalf()); chunkBezeichnung = getChunkTextBold(strEintrag); chunkSeitenzahlChunk = getChunkTextBold("" + currEntry.getPageNumber()); } else { // 2. Ebene chunkBezeichnung = getChunkText(strEintrag); chunkSeitenzahlChunk = getChunkText("" + currEntry.getPageNumber()); } // Referenz setzen chunkBezeichnung.setLocalGoto(currEntry.getDestination()); chunkSeitenzahlChunk.setLocalGoto(currEntry.getDestination()); // Abstandzeichen generieren, Breite auffllen float widthAbstand = docInhalt.getPageSize().getWidth() * 0.81f; ; while (chunkBezeichnung.getWidthPoint() <= widthAbstand) { chunkBezeichnung.append("."); } // Tabelle erzeugen und formatieren PdfPTable currTable = new PdfPTable(2); currTable.setWidthPercentage(100f); currTable.setWidths(new int[] { 96, 4 }); // Inhalte einfgen // Zelle Bezeichnung PdfPCell currCellBezeichnung = new PdfPCell(new Phrase(chunkBezeichnung)); currCellBezeichnung.setBorder(0); currCellBezeichnung.setHorizontalAlignment(Element.ALIGN_JUSTIFIED_ALL); // Zelle Seitennummer PdfPCell currCellPageNumberCell = new PdfPCell(new Phrase(chunkSeitenzahlChunk)); currCellPageNumberCell.setBorder(0); currCellPageNumberCell.setHorizontalAlignment(Element.ALIGN_RIGHT); // Zellen zur Tabelle hinzufgen currTable.addCell(currCellBezeichnung); currTable.addCell(currCellPageNumberCell); docInhalt.add(currTable); } if (temp) { // Dokument schlieen docInhalt.close(); // Anzahl der Seitenzahlen bestimmen PdfReader reader = new PdfReader(filePathTempInhaltString); anzPages = reader.getNumberOfPages(); reader.close(); // temp. Datei lschen File currFileInhaltFile = new File(filePathTempInhaltString); currFileInhaltFile.delete(); } return anzPages; }
From source file:de.gbv.marginalia.Marginalia.java
License:Open Source License
/** * Inspect a PDF file and write the info to a writer * @param writer Writer to a text file/*from ww w . ja v a2s .c om*/ * @param filename Path to the PDF file * @throws IOException */ public static void inspect(PrintWriter writer, String filename) throws IOException, SAXException { // writer.println(filename); writer.flush(); PdfReader reader = new PdfReader(filename); ContentHandler xmlhandler = new SimpleXMLWriter(writer); xmlhandler.startDocument(); SimpleXMLCreator xml = new SimpleXMLCreator(xmlhandler, Annotation.namespaces, true); /* writer.println("Number of pages: "+reader.getNumberOfPages()); Rectangle mediabox = reader.getPageSize(1); writer.print("Size of page 1: ["); writer.print(mediabox.getLeft()); writer.print(','); writer.print(mediabox.getBottom()); writer.print(','); writer.print(mediabox.getRight()); writer.print(','); writer.print(mediabox.getTop()); writer.println("]"); writer.print("Rotation of page 1: "); writer.println(reader.getPageRotation(1)); writer.print("Page size with rotation of page 1: "); writer.println(reader.getPageSizeWithRotation(1)); writer.println(); writer.flush(); */ List<Annotation> annots = new LinkedList<Annotation>(); xml.startElement("annots"); // TODO: The following elements may be added: // - optionally write <f href="Document.pdf"/> // - optionally write <ids original="ID" modified="ID" /> xml.startElement("m", "pages"); for (int pageNum = 1; pageNum <= reader.getNumberOfPages(); pageNum++) { PdfDictionary pageDic = reader.getPageN(pageNum); Map<String, String> attr = new HashMap<String, String>(); attr.put("number", "" + pageNum); attr.put("rotate", "" + reader.getPageRotation(pageNum)); Rectangle mediabox = reader.getPageSize(pageNum); attr.put("left", "" + mediabox.getLeft()); attr.put("bottom", "" + mediabox.getBottom()); attr.put("right", "" + mediabox.getRight()); attr.put("top", "" + mediabox.getTop()); xml.contentElement("m", "page", "", attr); PdfArray rawannots = pageDic.getAsArray(PdfName.ANNOTS); if (rawannots == null || rawannots.isEmpty()) { // writer.println("page "+pageNum+" contains no annotations"); continue; } // writer.println("page "+pageNum+" has "+rawannots.size()+" annotations"); for (int i = 0; i < rawannots.size(); i++) { PdfObject obj = rawannots.getDirectObject(i); if (!obj.isDictionary()) continue; Annotation a = new Annotation((PdfDictionary) obj, pageNum); annots.add(a); } /** // Now we have all highlight and similar annotations, we need // to find out what words are actually highlighted! PDF in fact // is a dump format to express documents. // For some hints see // http://stackoverflow.com/questions/4028240/extract-each-column-of-a-pdf-file // We could reuse code from LocationTextExtractionStrategy (TODO) // LocationTextExtractionStrategy extr = new LocationTextExtractionStrategy(); String fulltext = PdfTextExtractor.getTextFromPage(reader,pageNum);//,extr writer.println(fulltext); */ } xml.endElement(); for (Annotation a : annots) { a.serializeXML(xmlhandler); } // TODO: add page information (page size and orientation) xml.endAll(); }
From source file:de.jost_net.JVerein.io.FormularAufbereitung.java
License:Open Source License
public void writeForm(Formular formular, Map<String, Object> map) throws RemoteException { try {//from w ww .java 2 s .c o m PdfReader reader = new PdfReader(formular.getInhalt()); int numOfPages = reader.getNumberOfPages(); for (int i = 1; i <= numOfPages; i++) { doc.setPageSize(reader.getPageSize(i)); doc.newPage(); PdfImportedPage page = writer.getImportedPage(reader, i); PdfContentByte contentByte = writer.getDirectContent(); contentByte.addTemplate(page, 0, 0); DBIterator<Formularfeld> it = Einstellungen.getDBService().createList(Formularfeld.class); it.addFilter("formular = ? and seite = ?", new Object[] { formular.getID(), i }); while (it.hasNext()) { Formularfeld f = (Formularfeld) it.next(); goFormularfeld(contentByte, f, map.get(f.getName())); } } } catch (IOException e) { throw new RemoteException("Fehler", e); } catch (DocumentException e) { throw new RemoteException("Fehler", e); } }
From source file:de.mat.utils.pdftools.PdfAddPageNum.java
License:Mozilla Public License
/** * <h4>FeatureDomain:</h4>//from w w w.ja va 2s . co m * PublishingTools * <h4>FeatureDescription:</h4> * adds pagenum with stamper to pages from reader * <h4>FeatureResult:</h4> * <ul> * <li>updates stamper - updates the stamper * </ul> * <h4>FeatureKeywords:</h4> * PDF Publishing * @param reader - reader with the pages * @param stamper - stamper to add the canvas * @param pageOffset - add to pagenumber * @throws DocumentException * @throws IOException */ public void addPageNumber(PdfReader reader, PdfStamper stamper, int pageOffset) throws DocumentException, IOException { // ierate all pages from reader for (int zaehler = 1; zaehler <= reader.getNumberOfPages(); zaehler++) { // read pagesize Rectangle pageSize = reader.getPageSize(zaehler); float xpos = pageSize.getLeft() + pageSize.getWidth() / 2; float ypos = 20; float fontSize = 7; // Default-Positions for --page-width 150mm --page-height 212mm == 601px if (pageSize.getHeight() > 602 || pageSize.getHeight() < 598) { // correct it relative float factor = pageSize.getHeight() / 601; if (LOGGER.isDebugEnabled()) LOGGER.debug(" PageHeight:" + pageSize.getHeight() + " Factor:" + factor); ypos = ypos * factor; fontSize = fontSize * factor; } // add pagenumber-canvas PdfContentByte canvas = stamper.getOverContent(zaehler); BaseFont bf_helv = BaseFont.createFont(BaseFont.HELVETICA, "Cp1252", false); canvas.setFontAndSize(bf_helv, fontSize); canvas.beginText(); canvas.showTextAligned(PdfContentByte.ALIGN_CENTER, "" + new Integer(zaehler + pageOffset - 1), xpos, ypos, 0); canvas.endText(); } }
From source file:de.mat.utils.pdftools.PdfExtractEmptyPages.java
License:Mozilla Public License
/** * <h4>FeatureDomain:</h4>//w w w .j a v a 2 s. co m * PublishingTools * <h4>FeatureDescription:</h4> * reads readerOrig and adds pages to writerRemoved if empty, or to * writerTrimmed if not empty * <h4>FeatureResult:</h4> * <ul> * <li>updates writerTrimmed - add all pages which are not empty * <li>updates writerRemoved - add all empty pages * </ul> * <h4>FeatureKeywords:</h4> * PDF Publishing * @param origFileName - orig filename of the sourcepdf * @param readerOrig - reader of source * @param writerTrimmed - writer for trimmed pages * @param writerRemoved - writer for empty pages * @param flgTrim - ?? * @return - count of trimmed pages * @throws Exception */ public static int addTrimmedPages(String origFileName, PdfReader readerOrig, PdfCopy writerTrimmed, PdfCopy writerRemoved, boolean flgTrim) throws Exception { PdfImportedPage page = null; int countTrimmedPages = 0; //loop each page for (int i = 1; i <= readerOrig.getNumberOfPages(); i++) { boolean flgIsEmpty = true; // get dictionary PdfDictionary pageDict = readerOrig.getPageN(i); // every pdf-version has its own way :-( char version = readerOrig.getPdfVersion(); if (version == '3') { // PDF-Version: 3 // examine the resource dictionary for /Font or // /XObject keys. If either are present, they're almost // certainly actually used on the page -> not blank. PdfObject myObj = pageDict.get(PdfName.RESOURCES); PdfDictionary resDict = null; if (myObj instanceof PdfDictionary) { resDict = (PdfDictionary) myObj; } else { resDict = (PdfDictionary) PdfReader.getPdfObject(myObj); } if (resDict != null) { flgIsEmpty = resDict.get(PdfName.FONT) == null && resDict.get(PdfName.XOBJECT) == null; if (LOGGER.isInfoEnabled()) { if (flgIsEmpty) { LOGGER.info("probably empty page " + i + " Version: 1." + version + " FONT/XOBJECT found in File:" + origFileName); } else { LOGGER.info("normal page " + i + " Version: 1." + version + " no FONT/XOBJECT found in File:" + origFileName); } } } } else if (version == '4') { // PDF-Version: 4 // check the contentsize. // get the page content byte bContent[] = readerOrig.getPageContent(i); ByteArrayOutputStream bs = new ByteArrayOutputStream(); // write the content to an output stream bs.write(bContent); flgIsEmpty = true; if (bs.size() > blankPdfsize) { if (LOGGER.isInfoEnabled()) LOGGER.info("normal page " + i + " Version: 1." + version + " BS:" + bs.size() + " File:" + origFileName); flgIsEmpty = false; } else { if (LOGGER.isInfoEnabled()) LOGGER.info("probably empty page " + i + " Version: 1." + version + " BS:" + bs.size() + " File:" + origFileName); } } else if (version == '5') { // PDF-Version: 5 // check the contentsize. // get the page content byte bContent[] = readerOrig.getPageContent(i); ByteArrayOutputStream bs = new ByteArrayOutputStream(); // write the content to an output stream bs.write(bContent); flgIsEmpty = true; if (bs.size() > blankPdfsize_v5) { if (LOGGER.isInfoEnabled()) LOGGER.info("normal page " + i + " Version: 1." + version + " BS:" + bs.size() + " File:" + origFileName); flgIsEmpty = false; } else { if (LOGGER.isInfoEnabled()) LOGGER.info("probably empty page " + i + " Version: 1." + version + " BS:" + bs.size() + " File:" + origFileName); } } // add page to removed or trimmed document if (!flgIsEmpty || !flgTrim) { if (LOGGER.isInfoEnabled()) LOGGER.info("add page " + i); page = writerTrimmed.getImportedPage(readerOrig, i); writerTrimmed.addPage(page); countTrimmedPages++; } else { if (LOGGER.isInfoEnabled()) LOGGER.info("skip page " + i + " Version: 1." + version + " File:" + origFileName); if (writerRemoved != null) { page = writerRemoved.getImportedPage(readerOrig, i); writerRemoved.addPage(page); } } } return countTrimmedPages; }
From source file:de.mat.utils.pdftools.PdfResize.java
License:Mozilla Public License
/** /**/*from w w w. j av a2 s .c o m*/ * <h4>FeatureDomain:</h4> * PublishingTools * <h4>FeatureDescription:</h4> * scales and move comntents of the pdf pages from fileSrc and output to * fileNew * <h4>FeatureResult:</h4> * <ul> * <li>create PDF - fileNew * </ul> * <h4>FeatureKeywords:</h4> * PDF Publishing * @param fileSrc - source-pdf * @param fileNew - scaled dest-pdf * @param factorX - scaling x * @param factorY - scaling y * @param pixelLeft - move right * @param pixelTop - move down * @throws Exception */ public static void resizePdf(String fileSrc, String fileNew, float factorX, float factorY, float pixelLeft, float pixelTop) throws Exception { // open reader PdfReader reader = new PdfReader(fileSrc); // get pagebasedata int pageCount = reader.getNumberOfPages(); Rectangle psize = reader.getPageSize(1); float width = psize.getHeight(); float height = psize.getWidth(); // open writer Document documentNew = new Document(new Rectangle(height * factorY, width * factorX)); PdfWriter writerNew = PdfWriter.getInstance(documentNew, new FileOutputStream(fileNew)); documentNew.open(); PdfContentByte cb = writerNew.getDirectContent(); // iterate pages int i = 0; while (i < pageCount) { i++; // imoport page from reader and scale it to writer documentNew.newPage(); PdfImportedPage page = writerNew.getImportedPage(reader, i); cb.addTemplate(page, factorX, 0, 0, factorY, pixelLeft, pixelTop); if (LOGGER.isInfoEnabled()) LOGGER.info("AddPage " + i + " from:" + fileSrc + " to:" + fileNew); } documentNew.close(); writerNew.close(); }
From source file:de.mat.utils.pdftools.PdfSort4Print.java
License:Mozilla Public License
public static void sortPdfPages(String pdfSourceFile, String pdfDestinationFile, int perPage) throws Exception { PdfImportedPage page = null;//from www . j av a 2 s . c om if (perPage != 2 && perPage != 4) { throw new IllegalArgumentException( "Sorry, perPage must only be " + "2 or 4. All other is not implemented yet :-("); } // ####### // # fill to odd pagecount // ####### // create reader PdfReader readerOrig = new PdfReader(pdfSourceFile); // calc data int countPage = readerOrig.getNumberOfPages(); int blaetter = new Double(Math.ceil((countPage + 0.0) / perPage / 2)).intValue(); int zielPages = (blaetter * perPage * 2) - countPage; if (LOGGER.isInfoEnabled()) LOGGER.info("CurPages: " + countPage + " Blaetter:" + blaetter + " AddPage:" + zielPages); // add sites String oddFile = pdfDestinationFile + ".filled.pdf"; PdfStamper stamper = new PdfStamper(readerOrig, new FileOutputStream(oddFile)); // add empty pages for (int i = 1; i <= zielPages; i++) { if (LOGGER.isDebugEnabled()) LOGGER.debug("addEmptyPage: " + i); stamper.insertPage(readerOrig.getNumberOfPages() + 1, readerOrig.getPageSizeWithRotation(1)); } stamper.close(); readerOrig.close(); // ######## // # read new odd document and sort pages // ######## // step 1: create new reader PdfReader readerOdd = new PdfReader(oddFile); // create writerSorted String sortedFile = pdfDestinationFile; Document documentSorted = new Document(readerOrig.getPageSizeWithRotation(1)); PdfCopy writerSorted = new PdfCopy(documentSorted, new FileOutputStream(sortedFile)); documentSorted.open(); // add pages in calced order List<Integer> lstPageNr = new ArrayList<Integer>(); int pageCount = readerOdd.getNumberOfPages(); int startseite = 1; for (int i = 1; i <= blaetter; i++) { if (perPage == 2) { startseite = ((i - 1) * perPage) + 1; if (LOGGER.isDebugEnabled()) LOGGER.debug("Blatt:" + i + " Startseite: " + startseite); // front top lstPageNr.add(new Integer(pageCount - startseite + 1)); // front bottom lstPageNr.add(new Integer(startseite)); // back top lstPageNr.add(new Integer(startseite + 1)); // back bottom lstPageNr.add(new Integer(pageCount - startseite + 1 - 1)); } else if (perPage == 4) { startseite = ((i - 1) * perPage) + 1; if (LOGGER.isDebugEnabled()) LOGGER.debug("Blatt:" + i + " Startseite: " + startseite); // front top left lstPageNr.add(new Integer(pageCount - startseite + 1)); // front top right lstPageNr.add(new Integer(startseite)); // front bottom lefts lstPageNr.add(new Integer(pageCount - startseite + 1 - 2)); // front bottom right lstPageNr.add(new Integer(startseite + 2)); // back top left lstPageNr.add(new Integer(startseite + 1)); // back top right lstPageNr.add(new Integer(pageCount - startseite + 1 - 1)); // back bottom left lstPageNr.add(new Integer(startseite + 1 + 2)); // back bottom right lstPageNr.add(new Integer(pageCount - startseite + 1 - 1 - 2)); } else { throw new IllegalArgumentException( "Sorry, perPage must " + "only be 2 or 4. All other is not implemented yet :-("); } } if (LOGGER.isInfoEnabled()) LOGGER.info("Seiten:" + lstPageNr.size()); // copy pages for (Iterator iter = lstPageNr.iterator(); iter.hasNext();) { int pageNum = ((Integer) iter.next()).intValue(); if (LOGGER.isDebugEnabled()) LOGGER.debug("addSortPage: " + pageNum); page = writerSorted.getImportedPage(readerOdd, pageNum); writerSorted.addPage(page); } // close everything documentSorted.close(); writerSorted.close(); readerOdd.close(); // delete Tmp-File File file = new File(oddFile); file.delete(); }
From source file:de.mpg.escidoc.services.extraction.ExtractionChain.java
License:Open Source License
public ExtractionResult doExtract(String infileName, String outfileName) { File outfile = new File(outfileName); Date stepStart = new Date(); Date current;//from www. ja va2 s. com logger.info("Extracting PDF content ----------------------------------------"); logger.info("Infile: " + infileName); logger.info("Outfile: " + outfileName); logger.info(stepStart + " -- started"); // xPDF try { logger.info("Extracting with xPDF"); StringBuffer command = new StringBuffer(2048); command.append(System.getProperty("os.name").contains("Windows") ? pdftotext + " -enc UTF-8 " : "/usr/bin/pdftotext -enc UTF-8 "); command.append(infileName); command.append(" "); command.append(outfileName); Process proc = Runtime.getRuntime().exec(command.toString()); StreamGobbler inputGobbler = new StreamGobbler(proc.getInputStream(), "xPDF"); StreamGobbler errorGobbler = new StreamGobbler(proc.getErrorStream(), "xPDF"); inputGobbler.start(); errorGobbler.start(); int exitCode = proc.waitFor(); if (proc.exitValue() == 0) { if (verbose) { BufferedReader bufferedReader = new BufferedReader( new InputStreamReader(new FileInputStream(outfile), "UTF-8")); String line; while ((line = bufferedReader.readLine()) != null) { logger.info(line); } bufferedReader.close(); } current = new Date(); logger.info(current + " -- finished successfully"); logger.info("Extraction took " + (current.getTime() - stepStart.getTime())); return ExtractionResult.OK; } } catch (Exception e) { logger.warn("Error extracting PDF with xPDF:"); logger.warn(e.getStackTrace()); } current = new Date(); logger.info(current + " -- finished unsuccessfully"); logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime())); // PDFBox try { logger.info("Extracting with PDFBox"); stepStart = new Date(); StringBuffer command = new StringBuffer(1024); command.append(System.getProperty("os.name").contains("Windows") ? "java -Dfile.encoding=UTF-8 -jar " + pdfboxAppJar + " ExtractText " : "/usr/bin/java -Dfile.encoding=UTF-8 -jar " + pdfboxAppJar + " ExtractText "); command.append(infileName); command.append(" "); command.append(outfileName); Process proc = Runtime.getRuntime().exec(command.toString()); StreamGobbler inputGobbler = new StreamGobbler(proc.getInputStream(), "PDFBox"); StreamGobbler errorGobbler = new StreamGobbler(proc.getErrorStream(), "PDFBox"); inputGobbler.start(); errorGobbler.start(); int exitCode = proc.waitFor(); if (exitCode == 0) { if (verbose) { BufferedReader bufferedReader = new BufferedReader( new InputStreamReader(new FileInputStream(outfile), "UTF-8")); String line; while ((line = bufferedReader.readLine()) != null) { logger.info(line); } bufferedReader.close(); } current = new Date(); logger.info(current + " -- finished successfully"); logger.info("Extraction took " + (current.getTime() - stepStart.getTime())); return ExtractionResult.OK; } } catch (Exception e) { logger.warn("Error extracting PDF with PDFBox:"); logger.warn(e.getStackTrace()); } current = new Date(); logger.info(current + " -- finished unsuccessfully"); logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime())); // iText try { logger.info("Extracting with iText"); stepStart = new Date(); PdfReader reader = new PdfReader(infileName); int numberOfPages = reader.getNumberOfPages(); outputStreamWriter = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"); for (int i = 0; i < numberOfPages; i++) { outputStreamWriter.write(PdfTextExtractor.getTextFromPage(reader, i + 1)); } if (verbose) { BufferedReader bufferedReader = new BufferedReader( new InputStreamReader(new FileInputStream(outfile), "UTF-8")); String line; while ((line = bufferedReader.readLine()) != null) { logger.info(line); } bufferedReader.close(); } current = new Date(); logger.info(current + " -- finished successfully"); logger.info("Extraction took " + (current.getTime() - stepStart.getTime())); return ExtractionResult.OK; } catch (Exception e) { logger.warn("Error extracting PDF with iText:", e); } // tika InputStream stream = null; try { logger.info("Extracting with Tika"); stepStart = new Date(); stream = TikaInputStream.get(new File(infileName)); ContentHandler handler = new BodyContentHandler(TIKA_CONTENT_SIZE); new AutoDetectParser().parse(stream, handler, new Metadata(), new ParseContext()); String content = handler.toString(); FileUtils.writeStringToFile(outfile, content); stream.close(); if (verbose) { BufferedReader bufferedReader = new BufferedReader( new InputStreamReader(new FileInputStream(outfile), "UTF-8")); String line; while ((line = bufferedReader.readLine()) != null) { logger.info(line); } bufferedReader.close(); } current = new Date(); logger.info(current + " -- finished successfully"); logger.info("Extraction took " + (current.getTime() - stepStart.getTime())); return ExtractionResult.OK; } catch (Exception e) { logger.warn("Error extracting Tika:", e); try { stream.close(); } catch (IOException e1) { e1.printStackTrace(); } } current = new Date(); logger.warn(current + " -- finished unsuccessfully"); logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime())); logger.info("... giving up"); return ExtractionResult.FAILURE; }