Example usage for com.itextpdf.text.pdf PdfReader getNumberOfPages

List of usage examples for com.itextpdf.text.pdf PdfReader getNumberOfPages

Introduction

On this page you can find example usages of com.itextpdf.text.pdf.PdfReader.getNumberOfPages().

Prototype

public int getNumberOfPages() 

Source Link

Document

Gets the number of pages in the document.

Usage

From source file:spntoolsdata.pdf.util.RemoveBlankPageFromPDF.java

/**
 * Copies a PDF to a new file, leaving out the pages that look blank.
 *
 * A page is copied only when its resource dictionary contains a /Font or
 * /XObject entry AND its content stream is longer than BLANK_THRESHOLD bytes.
 * Pages without a resource dictionary are treated as blank.
 *
 * @param source      path of the PDF to read
 * @param destination path of the PDF to write
 * @throws IOException       if reading the source or writing the destination fails
 * @throws DocumentException if iText cannot build the output document
 */
public static void removeBlankPdfPages(String source, String destination)
        throws IOException, DocumentException {
    PdfReader r = null;
    RandomAccessFileOrArray raf = null;
    Document document = null;
    PdfCopy writer = null;

    try {
        r = new PdfReader(source);
        // The RandomAccessFileOrArray(String) constructor is deprecated;
        // since iText 5.4.1 the source is obtained through a factory.
        raf = new RandomAccessFileOrArray(new RandomAccessSourceFactory().createBestSource(source));
        document = new Document(r.getPageSizeWithRotation(1));
        writer = new PdfCopy(document, new FileOutputStream(destination));
        document.open();

        int pageCount = r.getNumberOfPages();
        for (int i = 1; i <= pageCount; i++) {
            // First check: examine the resource dictionary for /Font or
            // /XObject keys. If neither is present the page is considered blank.
            PdfDictionary pageDict = r.getPageN(i);
            // getAsDict resolves indirect references; a plain cast of get()
            // fails when /Resources is stored as an indirect object.
            PdfDictionary resDict = pageDict.getAsDict(PdfName.RESOURCES);
            boolean noFontsOrImages = true;
            if (resDict != null) {
                noFontsOrImages = resDict.get(PdfName.FONT) == null
                        && resDict.get(PdfName.XOBJECT) == null;
            }

            if (!noFontsOrImages) {
                // Second check: the content stream must be longer than the threshold.
                byte[] bContent = r.getPageContent(i, raf);
                if (bContent.length > BLANK_THRESHOLD) {
                    writer.addPage(writer.getImportedPage(r, i));
                }
            }
        }
    } finally {
        // Nested so that a failure while closing one resource does not
        // prevent the remaining ones from being released.
        try {
            if (document != null) {
                document.close();
            }
        } finally {
            try {
                if (writer != null) {
                    writer.close();
                }
            } finally {
                try {
                    if (raf != null) {
                        raf.close();
                    }
                } finally {
                    if (r != null) {
                        r.close();
                    }
                }
            }
        }
    }
}

From source file:tan.jam.jsf.OrignalFileUploadBean.java

/**
 * Extracts the text of every uploaded PDF, one string per page.
 *
 * A file whose first page yields more than one character of text is added to
 * orignalFiles; any other file is reported on standard output as scanned.
 * An IOException for one file is reported as a Faces message and logged, and
 * processing continues with the next file.
 */
private void parseFiles() {
    for (UploadedFile f : uploadedFiles) {
        PdfReader reader = null;
        try {
            reader = new PdfReader(f.getInputstream());
            String[] pages = new String[reader.getNumberOfPages()];
            for (int a = 0; a < pages.length; a++) {
                // iText page numbers start at 1.
                pages[a] = PdfTextExtractor.getTextFromPage(reader, a + 1);
            }
            // Guard against a document without pages before looking at page one.
            if (pages.length > 0 && pages[0].length() > 1) {
                orignalFiles.add(pages);
            } else {
                System.out.println("File is SCANNED");
            }
        } catch (IOException ex) {
            FacesMessage message = new FacesMessage("Error Parsing File ... ");
            FacesContext.getCurrentInstance().addMessage(null, message);
            Logger.getLogger(OrignalFileUploadBean.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            // Release the reader for every file, whether parsing succeeded or not.
            if (reader != null) {
                reader.close();
            }
        }
    }
}

From source file:textextractor.PDFManager.java

/**
 * Extracts the text of every page of a PDF.
 *
 * The text of each page is printed to standard output and appended to
 * listPdf. The list is not cleared here, so entries added by earlier calls
 * remain in it.
 *
 * @param pdf the original PDF, as accepted by the PdfReader(String) constructor
 * @return listPdf, with one added entry per page of the PDF
 * @throws IOException if the PDF cannot be read
 */
public ArrayList parsePdf(String pdf) throws IOException {
    PdfReader reader = new PdfReader(pdf);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        int pageCount = reader.getNumberOfPages();
        for (int i = 1; i <= pageCount; i++) {
            TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
            String pageText = strategy.getResultantText();
            System.out.println(pageText);
            listPdf.add(pageText);
        }
    } finally {
        // Always release the reader, even when extraction fails part-way.
        reader.close();
    }
    return listPdf;
}

From source file:tutorial.PDFtoText.java

/**
 * Extracts the text of the PDF whose path is entered in txtDirektori and
 * writes it, one line at a time, to a ".txt" file with the same base name.
 *
 * Empty lines are not written, because the text is split with a
 * StringTokenizer on "\n". An IOException raised while reading the PDF or
 * writing the text file is printed and not rethrown.
 *
 * @throws IOException if the output file cannot be created
 */
public void convertPDFtoText() throws IOException {
    /* "pdf" holds the path of the PDF file to read. */
    String pdf = txtDirektori.getText();
    StringBuilder text = new StringBuilder();

    /*
     * Create the ".txt" output file. Only a trailing ".pdf" extension (any
     * case) is swapped; replacing every "pdf" in the path would also rewrite
     * directory names, and a path without the extension would otherwise map
     * onto the source file itself.
     */
    boolean hasPdfExtension = pdf.regionMatches(true, pdf.length() - 4, ".pdf", 0, 4);
    String txtPath = (hasPdfExtension ? pdf.substring(0, pdf.length() - 4) : pdf) + ".txt";
    File namaFile = new File(txtPath);
    if (namaFile.createNewFile()) {
        System.out.println("File .txt berhasil dibuat.");
    }

    PdfReader reader = null;
    PrintWriter lineWriter = null;
    try {
        /* Use the iText library classes to read the PDF file. */
        reader = new PdfReader(pdf);
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        int pageCount = reader.getNumberOfPages();
        for (int i = 1; i <= pageCount; i++) {
            TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
            text.append(strategy.getResultantText());
        }
        String resultText = text.toString();

        /* Write the extracted text to the text file, line by line. */
        StringTokenizer stringTokenizer = new StringTokenizer(resultText, "\n");
        lineWriter = new PrintWriter(new FileOutputStream(namaFile));
        while (stringTokenizer.hasMoreTokens()) {
            lineWriter.println(stringTokenizer.nextToken());
        }
        lineWriter.flush();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        /* Release both resources on success and on failure. */
        if (lineWriter != null) {
            lineWriter.close();
        }
        if (reader != null) {
            reader.close();
        }
    }
}

From source file:tutorials.readpdf.readpdf.java

/**
 * Prints the page count, the tampered and encrypted flags, and the text of
 * every page of "test.pdf" to standard output.
 *
 * @param args unused
 */
public static void main(String[] args) {
    PdfReader reader = null;
    try {
        reader = new PdfReader("test.pdf");
        int pageCount = reader.getNumberOfPages();
        System.out.println("This PDF has " + pageCount + " pages.");
        System.out.println("Is this document tampered: " + reader.isTampered());
        System.out.println("Is this document encrypted: " + reader.isEncrypted());
        // iText page numbers start at 1.
        for (int i = 1; i <= pageCount; i++) {
            String page = PdfTextExtractor.getTextFromPage(reader, i);
            System.out.println("Page Content:\n\n" + page + "\n\n");
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the reader even when text extraction fails part-way.
        if (reader != null) {
            reader.close();
        }
    }
}

From source file:uk.ac.tgac.conan.core.service.impl.PdfOperationsServiceImpl.java

License:Open Source License

/**
 * Copies a single page of a PDF into a new PDF file.
 *
 * @param in   the PDF to read
 * @param out  the PDF to write; it is only created once the page number has been validated
 * @param page the page to extract, between 1 and the number of pages in {@code in}
 * @throws IOException               if reading or writing fails
 * @throws DocumentException         if iText cannot build the output document
 * @throws IndexOutOfBoundsException if {@code page} is outside the document's page range
 */
@Override
public void extractPage(File in, File out, int page) throws IOException, DocumentException {

    log.debug("Starting PDF page extraction");

    FileInputStream input = new FileInputStream(in);
    PdfReader reader = null;
    try {
        // Create a reader for the input file
        reader = new PdfReader(input);

        // Page numbers start at 1; reject anything outside the document.
        if (page < 1 || page > reader.getNumberOfPages())
            throw new IndexOutOfBoundsException("Page number " + page + " does not exist in " + in.getPath());

        FileOutputStream output = new FileOutputStream(out);
        try {
            // Create a copier for the output file
            Document document = new Document();
            PdfCopy copy = new PdfCopy(document, output);

            log.debug("PDF extraction resources created");

            document.open();
            copy.addPage(copy.getImportedPage(reader, page));
            document.close();
        } finally {
            // Releases the file handle when the copy fails before document.close();
            // closing an already closed FileOutputStream has no effect.
            output.close();
        }
    } finally {
        if (reader != null) {
            reader.close();
        }
        input.close();
    }

    log.debug("PDF page extracted successfully");
}

From source file:uk.bl.dpt.qa.flint.wrappers.iTextWrapper.java

License:Apache License

/**
 * Extracts text from a PDF./*from  w  w w  . ja  v a2  s . c  o m*/
 * @param pFile input file
 * @param pOutput output file
 * @param pOverwrite whether or not to overwrite an existing output file
 * @return true if converted ok, otherwise false
 */
public boolean extractTextFromPDF(File pFile, File pOutput, boolean pOverwrite) {
    if (pOutput.exists() & (!pOverwrite))
        return false;

    boolean ret = true;

    PrintWriter pw = null;
    PdfReader reader = null;

    try {
        pw = new PrintWriter(new FileWriter(pOutput));
        reader = new PdfReader(pFile.getAbsolutePath());
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        TextExtractionStrategy strategy;
        for (int i = 0; i < reader.getNumberOfPages(); i++) {
            try {
                //page numbers start at 1
                strategy = parser.processContent((i + 1), new SimpleTextExtractionStrategy());
                //write text out to file
                pw.println(strategy.getResultantText());
            } catch (ExceptionConverter e) {
                e.printStackTrace();
                ret = false;
                pw.println("iText Exception: Page " + (i + 1) + ": " + e.getClass().getName() + ": "
                        + e.getMessage());
            }
        }
    } catch (IOException e) {
        ret = false;
        // TODO Auto-generated catch block
        e.printStackTrace();
    } finally {
        if (pw != null)
            pw.close();
        if (reader != null)
            reader.close();
    }

    return ret;
}

From source file:uk.bl.dpt.qa.flint.wrappers.iTextWrapper.java

License:Apache License

/**
 * Checks whether a PDF file is valid by opening it and extracting the text
 * of every page.
 *
 * @param pFile file to check
 * @return true if the file opened and every page was processed without an
 *         exception, otherwise false
 */
public boolean isValid(File pFile) {
    PdfReader pdf = null;
    try {
        pdf = new PdfReader(pFile.getAbsolutePath());
        LOGGER.debug("validating through {} pages of {}", pdf.getNumberOfPages(), pFile.getName());
        // Page numbers start at 1; the extracted text itself is discarded.
        int pageNo = 1;
        while (pageNo <= pdf.getNumberOfPages()) {
            PdfTextExtractor.getTextFromPage(pdf, pageNo);
            pageNo++;
        }
        return true;
    } catch (BadPasswordException e) {
        // Not logged; open question from the original author whether this
        // should count as an error at all. The file is reported as invalid.
    } catch (InvalidPdfException e) {
        LOGGER.warn("InvalidPdfException leads to invalidity: {}", e);
    } catch (IOException e) {
        LOGGER.warn("IOException leads to invalidity: {}", e);
    } catch (Exception e) {
        LOGGER.warn("Exception leads to invalidity: {}", e);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }

    // Reached only when one of the catch blocks above handled a failure.
    return false;
}

From source file:uk.bl.wa.tika.parser.pdf.itext.PDFParser.java

License:Apache License

/**
 * Concatenates the text of all pages of the given reader.
 *
 * If any page fails, the message of the exception is printed to standard
 * error and the text gathered up to that point is returned.
 *
 * @param reader the open PDF to read from
 * @return the text of the pages processed, in page order
 */
private static String extractText(PdfReader reader) {
    StringBuilder collected = new StringBuilder();
    try {
        int pageCount = reader.getNumberOfPages();
        // Page numbers start at 1.
        for (int pageNo = 1; pageNo <= pageCount; pageNo++) {
            collected.append(PdfTextExtractor.getTextFromPage(reader, pageNo));
        }
    } catch (Exception e) {
        System.err.println("PDFParser.extractText(): " + e.getMessage());
    }
    return collected.toString();
}

From source file:uk.bl.wa.tika.parser.pdf.itext.PDFParser.java

License:Apache License

/**
 * Copies metadata from a PDF into the given Metadata object.
 *
 * In order: the info dictionary (keys lower-cased), a set of "pdf:*"
 * properties read from the reader, XMP metadata including the PDF/A
 * identification if present, the Adobe extension level if present, and
 * finally the normalised title and author taken from the info dictionary.
 * "pdf:version" may be overwritten by the PDF/A and extension-level steps.
 *
 * The XMP and extension-level steps are optional: a failure in either one
 * is reported on standard error and the remaining steps still run, so a
 * malformed XMP packet no longer prevents title and author being mapped.
 *
 * @param reader   the open PDF to read from
 * @param metadata the object that receives the extracted values
 */
private static void extractMetadata(PdfReader reader, Metadata metadata) {
    try {
        HashMap<String, String> map = reader.getInfo();
        // Clone the PDF info:
        for (String key : map.keySet()) {
            metadata.set(key.toLowerCase(), map.get(key));
        }
        // Add other data of interest:
        metadata.set("pdf:version", "1." + reader.getPdfVersion());
        metadata.set("pdf:numPages", "" + reader.getNumberOfPages());
        metadata.set("pdf:cryptoMode", "" + getCryptoModeAsString(reader));
        metadata.set("pdf:openedWithFullPermissions", "" + reader.isOpenedWithFullPermissions());
        metadata.set("pdf:encrypted", "" + reader.isEncrypted());
        metadata.set("pdf:metadataEncrypted", "" + reader.isMetadataEncrypted());
        metadata.set("pdf:128key", "" + reader.is128Key());
        metadata.set("pdf:tampered", "" + reader.isTampered());

        // Also grab XMP metadata, if present. Isolated so that a broken XMP
        // packet does not abort the steps that follow.
        try {
            byte[] xmpmd = reader.getMetadata();
            if (xmpmd != null) {
                // This is standard Tika code for parsing standard stuff from the XMP:
                JempboxExtractor extractor = new JempboxExtractor(metadata);
                extractor.parse(new ByteArrayInputStream(xmpmd));
                // This is custom XMP-handling code:
                XMPMetadata xmp = XMPMetadata.load(new ByteArrayInputStream(xmpmd));
                // Register the PDF/A schema class so its properties can be read:
                xmp.addXMLNSMapping(XMPSchemaPDFA.NAMESPACE, XMPSchemaPDFA.class);
                XMPSchemaPDFA pdfaxmp = (XMPSchemaPDFA) xmp.getSchemaByClass(XMPSchemaPDFA.class);
                if (pdfaxmp != null) {
                    metadata.set("pdfaid:part", pdfaxmp.getPart());
                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase();
                    metadata.set("pdf:version", version);
                }
            }
        } catch (Exception e) {
            reportMetadataFailure(e);
        }

        // Attempt to determine Adobe extension level. Also isolated, and both
        // entries are checked because either may be missing from the dictionary.
        try {
            PdfDictionary extensions = reader.getCatalog().getAsDict(PdfName.EXTENSIONS);
            if (extensions != null) {
                PdfDictionary adobeExt = extensions.getAsDict(PdfName.ADBE);
                if (adobeExt != null) {
                    PdfName baseVersion = adobeExt.getAsName(PdfName.BASEVERSION);
                    if (baseVersion != null && adobeExt.getAsNumber(PdfName.EXTENSIONLEVEL) != null) {
                        int el = adobeExt.getAsNumber(PdfName.EXTENSIONLEVEL).intValue();
                        // substring(1) drops the leading "/" of the PDF name.
                        metadata.set("pdf:version",
                                baseVersion.toString().substring(1) + " Adobe Extension Level " + el);
                    }
                }
            }
        } catch (Exception e) {
            reportMetadataFailure(e);
        }

        // Ensure the normalised metadata are mapped in:
        if (map.get("Title") != null)
            metadata.set(Metadata.TITLE, map.get("Title"));
        if (map.get("Author") != null)
            metadata.set(Metadata.AUTHOR, map.get("Author"));
    } catch (Exception e) {
        reportMetadataFailure(e);
    }
}

/** Prints a metadata extraction failure and its stack trace to standard error. */
private static void reportMetadataFailure(Exception e) {
    System.err.println("PDFParser.extractMetadata() caught Exception: " + e.getMessage());
    e.printStackTrace();
}