Example usage for com.itextpdf.text.pdf PdfReader PdfReader

List of usage examples for com.itextpdf.text.pdf PdfReader PdfReader

Introduction

This page collects usage examples for the PdfReader constructors of the class com.itextpdf.text.pdf.PdfReader.

Prototype

public PdfReader(final PdfReader reader) 

Source Link

Document

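Creates an independent duplicate of the given reader. Note that most of the usage examples below construct a PdfReader from a file path (String) or an InputStream rather than through this copy constructor.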

Usage

From source file:integrator.Pdf.java

/**
 * Parses a PDF to a plain text file.
 * <p>
 * The text of every page is extracted with a {@link SimpleTextExtractionStrategy}
 * and written to the output file with one {@code println} call per page.
 *
 * @param pdf the original PDF
 * @param txt the resulting text
 * @throws IOException if the PDF cannot be read or the text file cannot be written
 */
public void parsePdf(String pdf, String txt) throws IOException {
    PdfReader reader = new PdfReader(pdf);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        PrintWriter out = new PrintWriter(new FileOutputStream(txt));
        try {
            int pageCount = reader.getNumberOfPages();
            // iText page numbers are 1-based.
            for (int i = 1; i <= pageCount; i++) {
                TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                out.println(strategy.getResultantText());
            }
            // PrintWriter never throws on write failures; checkError() flushes
            // and reports whether any write went wrong.
            if (out.checkError()) {
                throw new IOException("Failed to write extracted text to " + txt);
            }
        } finally {
            // Release the output file even when extraction fails half-way.
            out.close();
        }
    } finally {
        // Release the PDF even when opening the output file or extraction fails.
        reader.close();
    }
}

From source file:io.github.jonestimd.finance.file.pdf.TextExtractor.java

License:Open Source License

/**
 * Reads a PDF from the given stream and collects the text of each page.
 * <p>
 * Every page is processed with its own {@code ImportRenderListener}; the
 * listener's {@code text} is appended to {@code pageText} in page order.
 *
 * @param is stream providing the PDF content
 * @throws IOException if the PDF cannot be read
 */
public TextExtractor(InputStream is) throws IOException {
    PdfReader pdfReader = new PdfReader(is);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);
        int pages = pdfReader.getNumberOfPages();
        // iText page numbers are 1-based.
        for (int i = 1; i <= pages; i++) {
            ImportRenderListener renderListener = new ImportRenderListener();
            parser.processContent(i, renderListener);
            pageText.add(renderListener.text);
        }
    } finally {
        // The reader is only needed while parsing; release it even on failure.
        pdfReader.close();
    }
}

From source file:io.konik.carriage.itext.ITextInvoiceAppender.java

License:Open Source License

/**
 * Append invoice intern./*from www  .  j av a2  s .c  o  m*/
 *
 * @param appendable the appendable
 * @throws IOException Signals that an I/O exception has occurred.
 * @throws DocumentException the document exception
 * @throws XMPException the XMP exception
 */
private void appendInvoiceIntern(AppendParameter appendable)
        throws IOException, DocumentException, XMPException {
    byte[] attachmentFile = convertToByteArray(appendable.attachmentFile());
    PdfReader reader = new PdfReader(appendable.inputPdf());
    PdfAStamper stamper = new PdfAStamper(reader, appendable.resultingPdf(), PdfAConformanceLevel.PDF_A_3B);

    appendZfMetadata(stamper, appendable.zugferdConformanceLevel(), appendable.zugferdVersion());
    attachFile(attachmentFile, stamper);

    stamper.close();
    reader.close();
}

From source file:io.konik.carriage.itext.ITextInvoiceExtractor.java

License:Open Source License

/**
 * Opens a {@link PdfReader} on the given stream.
 *
 * @param pdfStream stream providing the PDF content
 * @return the reader created from the stream
 * @throws InvoiceExtractionError if creating the reader fails with an {@link IOException}
 */
private static PdfReader getPdfReader(InputStream pdfStream) {
    final PdfReader openedReader;
    try {
        openedReader = new PdfReader(pdfStream);
    } catch (IOException cause) {
        // Translate the checked I/O failure, keeping the original cause attached.
        throw new InvoiceExtractionError("Could not read or open pdf.", cause);
    }
    return openedReader;
}

From source file:itextblast.ITextBlast.java

/**
 * Scans a question PDF page by page and hands each detected page range to
 * {@code ITextBlast.copySelectedQuestionPage}.
 * <p>
 * A page starts a new range when its text matches one of the liberal
 * "N..O..SOALAN" / "SOALAN..N..O" patterns. The number captured after the
 * heading becomes the label of the new range; the range that was open until
 * then is written out under its own label. If no number can be captured, the
 * number found on page 1 (the "smart start" number) or a timestamp-suffixed
 * label is used instead.
 *
 * @param qa_filename   value substituted into the {@code SOURCE} format string to locate the PDF
 * @param has_frontpage whether the file begins with a front page; when false the first
 *                      range is labelled with the smart start number instead of "0-intro"
 * @throws IOException       if the PDF cannot be read
 * @throws DocumentException declared for the page-copy step
 */
private static void processQAFile(String qa_filename, Boolean has_frontpage)
        throws IOException, DocumentException {

    // Create a reader on the existing source file.
    PdfReader reader = new PdfReader(String.format(ITextBlast.working_dir + SOURCE, qa_filename));
    // Publish the reader through the static field.
    // NOTE(review): presumably copySelectedQuestionPage reads it from there - confirm.
    ITextBlast.my_reader = reader;
    try {
        int n = reader.getNumberOfPages();

        // START SMART Start Number ********
        // Try to pick the first question number off page 1.
        Pattern smart_start_pattern;
        smart_start_pattern = Pattern.compile(".*?SOALAN.*?N.*?O.*?(\\d+)\\b+.*", Pattern.CASE_INSENSITIVE);
        String cover_page_content = PdfTextExtractor.getTextFromPage(reader, 1);
        Matcher smart_start_matcher = smart_start_pattern.matcher(cover_page_content);
        String smart_start_question_number = null;
        if (smart_start_matcher.find()) {
            // Extract the question number from capture group 1
            smart_start_question_number = smart_start_matcher.group(1);
            out.println("Matched " + smart_start_matcher.group(0) + " and SMART Start Number: "
                    + smart_start_question_number);
        }
        // END SMART Start Number ********

        // Liberal patterns only detect that a page carries a question heading;
        // the stricter patterns below additionally capture the number.
        Pattern liberal_found_question_pattern_uno;
        liberal_found_question_pattern_uno = Pattern.compile(".*N.*O.*SOALAN.*", Pattern.CASE_INSENSITIVE);
        Pattern liberal_found_question_pattern_dos = Pattern.compile(".*SOALAN.*N.*O.*", Pattern.CASE_INSENSITIVE);
        Pattern pattern_uno;
        pattern_uno = Pattern.compile(".*N.*O.*SOALAN.*?(\\d+)\\b+.*", Pattern.CASE_INSENSITIVE);
        Pattern pattern_dos = Pattern.compile(".*SOALAN.*N.*O.*?(\\d+)\\b+.*", Pattern.CASE_INSENSITIVE);

        // Init start and end page of the range currently being collected
        int start_page = 1;
        int end_page = 1;
        String question_number = "0-intro";

        // Files without a front page start directly with a question, so the
        // first range is labelled with the SMART Start Number (may be null if
        // nothing matched on page 1).
        if (!has_frontpage) {
            question_number = smart_start_question_number;
        }

        // NOTE(review): the bound is i < n, so the last page (n) is never scanned
        // for a heading - confirm whether that is intentional.
        for (int i = 1; i < n; i++) {
            // init found_question_number
            String found_question_number = null;
            boolean found_match = false;
            out.println("Page " + i);
            out.println("===========");
            String content = PdfTextExtractor.getTextFromPage(reader, i);
            Matcher liberal_uno_matcher = liberal_found_question_pattern_uno.matcher(content);
            if (liberal_uno_matcher.find()) {
                out.println("Matched UNO!");
                found_match = true;
                Matcher matcher = pattern_uno.matcher(content);
                // Loop to find the digit; it is possible it is not found and left as null.
                // The last match on the page wins.
                while (matcher.find()) {
                    // Extract the question number from capture group 1
                    found_question_number = matcher.group(1);
                    out.println("Matched " + matcher.group(0) + " and Question Number: " + found_question_number);
                }
            } else if (liberal_found_question_pattern_dos.matcher(content).find()) {
                if ("0-intro".equals(question_number)) {
                    // While still in the intro range a DOS-style match does not
                    // start a new range.
                    out.println("SMART!!!");
                } else {
                    found_match = true;
                    out.println("Matched DOS!");
                    Matcher matcher = pattern_dos.matcher(content);
                    // Loop to find the digit; it is possible it is not found and left as null.
                    while (matcher.find()) {
                        // Extract the question number from capture group 1
                        found_question_number = matcher.group(1);
                        out.println(
                                "Matched " + matcher.group(0) + " and Question Number: " + found_question_number);
                    }

                }
            }
            // If matched, close the range collected so far
            if (found_match) {
                // The previous range ends on the page before this heading,
                // but never before page 1.
                end_page = i - 1;
                if (end_page < 1) {
                    end_page = 1;
                }
                if (null == found_question_number) {
                    if ("0-intro".equals(question_number)) {
                        // First heading after the intro without a number: fall back to the smart start
                        found_question_number = smart_start_question_number;
                        out.println("First question could not determine number; using Q No. => "
                                + found_question_number);
                        // Print out content to debug
                        out.println("*****DEBUG Content*******");
                        out.println(content);
                    } else {
                        // otherwise use the current question and append a Unix timestamp
                        found_question_number = question_number + "_" + (System.currentTimeMillis() / 1000L);
                        out.println(
                                "Unexpectedly could not determine number; using Q No. => " + found_question_number);
                        // Print out content to debug
                        out.println("*****DEBUG Content*******");
                        out.println(content);
                    }
                }
                // Write the finished range under the previously confirmed question_number
                ITextBlast.copySelectedQuestionPage(start_page, end_page, question_number);
                // A new range starts on the current page
                start_page = i;
                end_page = i;
                question_number = found_question_number;
            }
            out.println();
            out.println();
        }
        // Write out whatever range is still open at the end of the loop.
        // NOTE(review): end_page is only advanced when a heading is found, so this
        // final range covers a single page (start_page == end_page) even if more
        // pages follow - confirm whether the trailing pages should be included.
        if (start_page <= end_page) {
            // Should always happen actually ..
            ITextBlast.copySelectedQuestionPage(start_page, end_page, question_number);
        }
    } finally {
        // Release the source PDF even when extraction or copying fails.
        reader.close();
    }
}

From source file:itextblast.ITextBlast.java

/**
 * Splits the source PDF into one output PDF per page.
 * <p>
 * The source path is hard-coded to {@code ./source/imokman.pdf}; each output
 * file name is produced by formatting {@code RESULT} with the 1-based page number.
 *
 * @param args currently unused
 * @throws IOException       if the source cannot be read or an output file cannot be written
 * @throws DocumentException if iText fails to build an output document
 */
public static void splitByPage(String[] args) throws IOException, DocumentException {

    // Create a reader from the existing file.
    PdfReader reader = new PdfReader("./source/imokman.pdf");
    try {
        // We'll create as many new PDFs as there are pages
        int n = reader.getNumberOfPages();
        for (int pageNumber = 1; pageNumber <= n; pageNumber++) {
            FileOutputStream target = new FileOutputStream(String.format(RESULT, pageNumber));
            try {
                // step 1
                Document document = new Document();
                // step 2
                PdfCopy copy = new PdfCopy(document, target);
                // step 3
                document.open();
                // step 4
                copy.addPage(copy.getImportedPage(reader, pageNumber));
                // step 5
                document.close();
            } finally {
                // Make sure the file handle is released if any step above fails;
                // closing a FileOutputStream that is already closed is harmless.
                target.close();
            }
        }
    } finally {
        // Release the source PDF even when a page copy fails.
        reader.close();
    }
}

From source file:jasperSoft.MergePDF.java

/**
 * /*  w  w  w  .  j ava 2 s .co m*/
 * @param streamOfPDFFiles
 * @param outputStream
 * @param paginate 
 */
public static void concatPDFs(List<InputStream> streamOfPDFFiles, OutputStream outputStream, boolean paginate) {

    Document document = new Document();
    try {
        List<InputStream> pdfs = streamOfPDFFiles;
        List<PdfReader> readers = new ArrayList<PdfReader>();
        int totalPages = 0;
        Iterator<InputStream> iteratorPDFs = pdfs.iterator();

        // Create Readers for the pdfs.
        while (iteratorPDFs.hasNext()) {
            InputStream pdf = iteratorPDFs.next();
            PdfReader pdfReader = new PdfReader(pdf);
            readers.add(pdfReader);
            totalPages += pdfReader.getNumberOfPages();
        }
        // Create a writer for the outputstream
        PdfWriter writer = PdfWriter.getInstance(document, outputStream);

        document.open();
        BaseFont bf = BaseFont.createFont(BaseFont.HELVETICA, BaseFont.CP1252, BaseFont.NOT_EMBEDDED);
        PdfContentByte cb = writer.getDirectContent(); // Holds the PDF
        // data

        PdfImportedPage page;
        int currentPageNumber = 0;
        int pageOfCurrentReaderPDF = 0;
        Iterator<PdfReader> iteratorPDFReader = readers.iterator();

        // Loop through the PDF files and add to the output.
        while (iteratorPDFReader.hasNext()) {
            PdfReader pdfReader = iteratorPDFReader.next();

            // Create a new page in the target for each source page.
            while (pageOfCurrentReaderPDF < pdfReader.getNumberOfPages()) {
                document.newPage();
                pageOfCurrentReaderPDF++;
                currentPageNumber++;
                page = writer.getImportedPage(pdfReader, pageOfCurrentReaderPDF);
                cb.addTemplate(page, 0, 0);

                // Code for pagination.
                if (paginate) {
                    cb.beginText();
                    cb.setFontAndSize(bf, 9);
                    cb.showTextAligned(PdfContentByte.ALIGN_CENTER,
                            "" + currentPageNumber + " of " + totalPages, 520, 5, 0);
                    cb.endText();
                }
            }
            pageOfCurrentReaderPDF = 0;
        }
        outputStream.flush();
        document.close();
        outputStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (document.isOpen()) {
            document.close();
        }
        try {
            if (outputStream != null) {
                outputStream.close();
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }
}

From source file:jati.GerandoArquivoCarimbado.java

/**
 * Opens the PDF at the given path into the {@code reader} field and returns
 * a new, empty A4 {@link Document}.
 * <p>
 * Opening the PDF is best-effort: an {@link IOException} is logged and the
 * A4 document is still returned.
 *
 * @param caminho path of the PDF to open
 * @return a new A4 document
 */
public static Document montaraAquivo(String caminho) {

    try {
        reader = new PdfReader(caminho);
        // Placeholders left by the author for steps that are not implemented yet:
        //   - n receives the total number of pages
        //   - size of the first page
        //   - create the second PDF
    } catch (IOException e) {
        // Previously swallowed silently; report it so a missing or corrupt
        // file can be diagnosed. Fully qualified to avoid new imports.
        java.util.logging.Logger.getLogger("jati.GerandoArquivoCarimbado")
                .log(java.util.logging.Level.WARNING, "Could not open PDF: " + caminho, e);
    }
    return new Document(PageSize.A4);

}

From source file:javaapplication1.JavaApplication1.java

/**
 * Cuts a rendered page image into horizontal slices based on where text
 * lines are found on the first page of a PDF.
 * <p>
 * The page of {@code pdf + ".pdf"} is probed in steps of {@code fontchecksize}
 * with {@code mycheckline}. A cut position is recorded before the first
 * non-empty line, between lines whenever a line starts with digits followed
 * by a dot, and after the last non-empty line. The image {@code txt + ".png"}
 * is then split at those positions, scaled from page units to image pixels,
 * and each slice is written to {@code txt + <index> + ".png"}.
 *
 * @param pdf path of the PDF without the ".pdf" extension
 * @param txt path prefix of the page image and of the produced slices, without ".png"
 * @throws IOException if the PDF or the image cannot be read, or a slice cannot be written
 */
public void parsePdf(String pdf, String txt) throws IOException {
    PdfReader reader = new PdfReader(pdf + ".pdf");
    try {
        // Only the first page is processed; the original page-count lookup was disabled.
        int p = 1;
        for (int k = 1; k <= p; k++) {
            ArrayList<Integer> cutlist = new ArrayList<Integer>();
            int lastword = 0;
            Rectangle mrect = reader.getPageSize(k);
            int ph = (int) mrect.getHeight();
            int pw = (int) mrect.getWidth();
            for (int i = 0; i <= (ph - fontchecksize); i += fontchecksize) {
                String g = mycheckline(reader, ph, pw, i, k);
                if (g.isEmpty()) {
                    continue;
                }
                if (lastword == 0) {
                    // First text line seen: cut two probe steps before it (or at 0).
                    if (i > 2 * fontchecksize) {
                        cutlist.add(i - 2 * fontchecksize);
                    } else {
                        cutlist.add(0);
                    }
                } else if (g.matches("\\d+\\..*")) {
                    // A line starting with "<digits>." gets a cut midway between
                    // it and the previous text line.
                    cutlist.add(((lastword + i) / 2));
                }
                lastword = i;
            }
            // Closing cut: three probe steps after the last text line, clamped to the page.
            if (lastword + 3 * fontchecksize < ph) {
                cutlist.add(lastword + 3 * fontchecksize);
            } else {
                cutlist.add(ph - 1);
            }

            BufferedImage img = ImageIO.read(new File(txt + ".png"));
            if (img == null) {
                // ImageIO.read returns null when no registered reader can decode the file.
                throw new IOException("Unsupported or unreadable image: " + txt + ".png");
            }

            // Scale from page units to image pixels. The cast matters: int / int
            // would truncate the factor (e.g. 2.08 -> 2) and misplace every cut.
            double f = (double) img.getHeight() / ph;
            // pw is truncated from the real page width, so f * pw can overshoot
            // the image by a pixel; clamp to keep getSubimage in bounds.
            int sliceWidth = Math.min(img.getWidth(), (int) (f * pw));
            for (int s = 0; s < (cutlist.size() - 1); s++) {
                int top = cutlist.get(s);
                int bottom = cutlist.get(s + 1);
                ImageIO.write(
                        img.getSubimage(0, (int) (f * top), sliceWidth, (int) (f * (bottom - top))),
                        "png", new File(txt + s + ".png"));
            }
        }
    } finally {
        // Release the PDF even when probing or image handling fails.
        reader.close();
    }
}

From source file:javaapplication1.PDF.java

/**
 * Runs a fresh {@code ImageRenderListener} over page 3 of the given PDF.
 * <p>
 * The listener is stored in the {@code listener} field before processing.
 *
 * @param filename path of the PDF to process
 * @throws IOException       if the PDF cannot be read
 * @throws DocumentException declared by the original signature
 */
public void extractImages(String filename) throws IOException, DocumentException {
    PdfReader reader = new PdfReader(filename);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        listener = new ImageRenderListener();
        // NOTE(review): the page number is hard-coded to 3; documents with fewer
        // pages will presumably fail here - confirm the intended page.
        parser.processContent(3, listener);
    } finally {
        // Release the PDF even when content processing fails.
        reader.close();
    }
}