List of usage examples for com.itextpdf.text.pdf.PdfReader#getNumberOfPages()
public int getNumberOfPages()
From source file:spntoolsdata.pdf.util.RemoveBlankPageFromPDF.java
public static void removeBlankPdfPages(String source, String destination) throws IOException, DocumentException { PdfReader r = null; RandomAccessSourceFactory rasf = null; RandomAccessFileOrArray raf = null;/*from www. j a va 2 s . co m*/ Document document = null; PdfCopy writer = null; try { r = new PdfReader(source); // deprecated // RandomAccessFileOrArray raf // = new RandomAccessFileOrArray(pdfSourceFile); // itext 5.4.1 rasf = new RandomAccessSourceFactory(); raf = new RandomAccessFileOrArray(rasf.createBestSource(source)); document = new Document(r.getPageSizeWithRotation(1)); writer = new PdfCopy(document, new FileOutputStream(destination)); document.open(); PdfImportedPage page = null; for (int i = 1; i <= r.getNumberOfPages(); i++) { // first check, examine the resource dictionary for /Font or // /XObject keys. If either are present -> not blank. PdfDictionary pageDict = r.getPageN(i); PdfDictionary resDict = (PdfDictionary) pageDict.get(PdfName.RESOURCES); boolean noFontsOrImages = true; if (resDict != null) { noFontsOrImages = resDict.get(PdfName.FONT) == null && resDict.get(PdfName.XOBJECT) == null; } if (!noFontsOrImages) { byte bContent[] = r.getPageContent(i, raf); ByteArrayOutputStream bs = new ByteArrayOutputStream(); bs.write(bContent); if (bs.size() > BLANK_THRESHOLD) { page = writer.getImportedPage(r, i); writer.addPage(page); } } } } finally { if (document != null) document.close(); if (writer != null) writer.close(); if (raf != null) raf.close(); if (r != null) r.close(); } }
From source file:tan.jam.jsf.OrignalFileUploadBean.java
private void parseFiles() { for (UploadedFile f : uploadedFiles) { try {// w w w . ja v a2s . co m PdfReader reader = new PdfReader(f.getInputstream()); String[] pages = new String[reader.getNumberOfPages()]; for (int a = 0; a < pages.length; a++) { pages[a] = PdfTextExtractor.getTextFromPage(reader, a + 1); } if (pages[0].length() > 1) { orignalFiles.add(pages); } else { System.out.println("File is SCANNED"); } } catch (IOException ex) { FacesMessage message = new FacesMessage("Error Parsing File ... "); FacesContext.getCurrentInstance().addMessage(null, message); Logger.getLogger(OrignalFileUploadBean.class.getName()).log(Level.SEVERE, null, ex); } } }
From source file:textextractor.PDFManager.java
/**
 * Extracts the text of every page of a PDF.
 *
 * <p>The text of each page is printed to standard output and appended to
 * {@code listPdf}. No file is written.
 *
 * @param pdf the original PDF, as accepted by the {@code PdfReader} constructor
 * @return {@code listPdf}, including the page texts appended by this call
 * @throws IOException if the PDF cannot be read
 */
public ArrayList parsePdf(String pdf) throws IOException {
    PdfReader reader = new PdfReader(pdf);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        int pageCount = reader.getNumberOfPages();
        for (int i = 1; i <= pageCount; i++) {
            TextExtractionStrategy strategy =
                    parser.processContent(i, new SimpleTextExtractionStrategy());
            String pageText = strategy.getResultantText();
            System.out.println(pageText);
            listPdf.add(pageText);
        }
    } finally {
        // Release the reader whether or not extraction succeeded.
        reader.close();
    }
    return listPdf;
}
From source file:tutorial.PDFtoText.java
public void convertPDFtoText() throws IOException { /*variabel "pdf" digunakan untuk menampung alamat direktori tempat file pdf disimpan.*/ String pdf = txtDirektori.getText(); StringBuilder text = new StringBuilder(); String resultText;//from w w w .ja v a 2 s. c o m /*Buat file Text ".txt"*/ File namaFile = new File(txtDirektori.getText().replace("pdf", "txt")); if (namaFile.createNewFile()) { System.out.println("File .txt berhasil dibuat."); } try { /*Panggil class yang ada pada library iText untuk membaca file PDF*/ PdfReader reader = new PdfReader(pdf); PdfReaderContentParser parser = new PdfReaderContentParser(reader); TextExtractionStrategy strategy; for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); text.append(strategy.getResultantText()); } resultText = text.toString(); /*Code untuk menuliskan hasil pembacaan file PDF ke file Text*/ StringTokenizer stringTokenizer = new StringTokenizer(resultText, "\n"); PrintWriter lineWriter = new PrintWriter(new FileOutputStream(namaFile)); while (stringTokenizer.hasMoreTokens()) { String curToken = stringTokenizer.nextToken(); lineWriter.println(curToken); } lineWriter.flush(); lineWriter.close(); } catch (IOException e) { e.printStackTrace(); } }
From source file:tutorials.readpdf.readpdf.java
/**
 * Prints the page count, the tampered/encrypted flags and the text of every
 * page of "test.pdf" to standard output.
 *
 * @param args unused
 */
public static void main(String[] args) {
    PdfReader reader = null;
    try {
        reader = new PdfReader("test.pdf");
        int pageCount = reader.getNumberOfPages();
        System.out.println("This PDF has " + pageCount + " pages.");
        System.out.println("Is this document tampered: " + reader.isTampered());
        System.out.println("Is this document encrypted: " + reader.isEncrypted());
        for (int i = 1; i <= pageCount; i++) {
            String page = PdfTextExtractor.getTextFromPage(reader, i);
            System.out.println("Page Content:\n\n" + page + "\n\n");
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the reader even when extraction throws.
        if (reader != null) {
            reader.close();
        }
    }
}
From source file:uk.ac.tgac.conan.core.service.impl.PdfOperationsServiceImpl.java
License:Open Source License
@Override public void extractPage(File in, File out, int page) throws IOException, DocumentException { log.debug("Starting PDF page extraction"); Document document = new Document(); // Create a reader for the input file PdfReader reader = new PdfReader(new FileInputStream(in)); if (page > reader.getNumberOfPages()) throw new IndexOutOfBoundsException("Page number " + page + " does not exist in " + in.getPath()); // Create a copier for the output file PdfCopy copy = new PdfCopy(document, new FileOutputStream(out)); log.debug("PDF extraction resources created"); document.open();//from w w w . ja va 2 s . c om copy.addPage(copy.getImportedPage(reader, page)); document.close(); log.debug("Starting PDF page extracted successfully"); }
From source file:uk.bl.dpt.qa.flint.wrappers.iTextWrapper.java
License:Apache License
/** * Extracts text from a PDF./*from w w w . ja v a2 s . c o m*/ * @param pFile input file * @param pOutput output file * @param pOverwrite whether or not to overwrite an existing output file * @return true if converted ok, otherwise false */ public boolean extractTextFromPDF(File pFile, File pOutput, boolean pOverwrite) { if (pOutput.exists() & (!pOverwrite)) return false; boolean ret = true; PrintWriter pw = null; PdfReader reader = null; try { pw = new PrintWriter(new FileWriter(pOutput)); reader = new PdfReader(pFile.getAbsolutePath()); PdfReaderContentParser parser = new PdfReaderContentParser(reader); TextExtractionStrategy strategy; for (int i = 0; i < reader.getNumberOfPages(); i++) { try { //page numbers start at 1 strategy = parser.processContent((i + 1), new SimpleTextExtractionStrategy()); //write text out to file pw.println(strategy.getResultantText()); } catch (ExceptionConverter e) { e.printStackTrace(); ret = false; pw.println("iText Exception: Page " + (i + 1) + ": " + e.getClass().getName() + ": " + e.getMessage()); } } } catch (IOException e) { ret = false; // TODO Auto-generated catch block e.printStackTrace(); } finally { if (pw != null) pw.close(); if (reader != null) reader.close(); } return ret; }
From source file:uk.bl.dpt.qa.flint.wrappers.iTextWrapper.java
License:Apache License
/** * Check if a PDF file is valid or not//from w ww . ja v a2 s. c o m * @param pFile file to check * @return whether the file is valid or not */ public boolean isValid(File pFile) { boolean ret = false; PdfReader reader = null; try { reader = new PdfReader(pFile.getAbsolutePath()); LOGGER.debug("validating through {} pages of {}", reader.getNumberOfPages(), pFile.getName()); for (int i = 0; i < reader.getNumberOfPages(); i++) { //page numbers start at 1 PdfTextExtractor.getTextFromPage(reader, (i + 1)); } ret = true; } catch (BadPasswordException e) { //actually an error??? } catch (InvalidPdfException e) { LOGGER.warn("InvalidPdfException leads to invalidity: {}", e); } catch (IOException e) { LOGGER.warn("IOException leads to invalidity: {}", e); } catch (Exception e) { LOGGER.warn("Exception leads to invalidity: {}", e); } finally { if (reader != null) reader.close(); } return ret; }
From source file:uk.bl.wa.tika.parser.pdf.itext.PDFParser.java
License:Apache License
private static String extractText(PdfReader reader) { StringBuilder output = new StringBuilder(); try {//w w w. j a v a 2s. c o m int numPages = reader.getNumberOfPages(); int page = 1; while (page <= numPages) { output.append(PdfTextExtractor.getTextFromPage(reader, page)); page++; } } catch (Exception e) { System.err.println("PDFParser.extractText(): " + e.getMessage()); } return output.toString(); }
From source file:uk.bl.wa.tika.parser.pdf.itext.PDFParser.java
License:Apache License
private static void extractMetadata(PdfReader reader, Metadata metadata) { try {/*w w w .j a v a 2 s . c om*/ HashMap<String, String> map = reader.getInfo(); // Clone the PDF info: for (String key : map.keySet()) { metadata.set(key.toLowerCase(), map.get(key)); } // Add other data of interest: metadata.set("pdf:version", "1." + reader.getPdfVersion()); metadata.set("pdf:numPages", "" + reader.getNumberOfPages()); metadata.set("pdf:cryptoMode", "" + getCryptoModeAsString(reader)); metadata.set("pdf:openedWithFullPermissions", "" + reader.isOpenedWithFullPermissions()); metadata.set("pdf:encrypted", "" + reader.isEncrypted()); metadata.set("pdf:metadataEncrypted", "" + reader.isMetadataEncrypted()); metadata.set("pdf:128key", "" + reader.is128Key()); metadata.set("pdf:tampered", "" + reader.isTampered()); // Also grap XMP metadata, if present: byte[] xmpmd = reader.getMetadata(); if (xmpmd != null) { // This is standard Tika code for parsing standard stuff from the XMP: JempboxExtractor extractor = new JempboxExtractor(metadata); extractor.parse(new ByteArrayInputStream(xmpmd)); // This is custom XMP-handling code: XMPMetadata xmp = XMPMetadata.load(new ByteArrayInputStream(xmpmd)); // There is a special class for grabbing data in the PDF schema - not sure it will add much here: // Could parse xmp:CreatorTool and pdf:Producer etc. etc. out of here. 
//XMPSchemaPDF pdfxmp = xmp.getPDFSchema(); // Added a PDF/A schema class: xmp.addXMLNSMapping(XMPSchemaPDFA.NAMESPACE, XMPSchemaPDFA.class); XMPSchemaPDFA pdfaxmp = (XMPSchemaPDFA) xmp.getSchemaByClass(XMPSchemaPDFA.class); if (pdfaxmp != null) { metadata.set("pdfaid:part", pdfaxmp.getPart()); metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(); //metadata.set("pdfa:version", version ); metadata.set("pdf:version", version); } } // Attempt to determine Adobe extension level: PdfDictionary extensions = reader.getCatalog().getAsDict(PdfName.EXTENSIONS); if (extensions != null) { PdfDictionary adobeExt = extensions.getAsDict(PdfName.ADBE); if (adobeExt != null) { PdfName baseVersion = adobeExt.getAsName(PdfName.BASEVERSION); int el = adobeExt.getAsNumber(PdfName.EXTENSIONLEVEL).intValue(); metadata.set("pdf:version", baseVersion.toString().substring(1) + " Adobe Extension Level " + el); } } // Ensure the normalised metadata are mapped in: if (map.get("Title") != null) metadata.set(Metadata.TITLE, map.get("Title")); if (map.get("Author") != null) metadata.set(Metadata.AUTHOR, map.get("Author")); } catch (Exception e) { System.err.println("PDFParser.extractMetadata() caught Exception: " + e.getMessage()); e.printStackTrace(); } }