List of usage examples for the com.itextpdf.text.pdf.PdfReader constructor (PdfReader)
public PdfReader(final PdfReader reader)
From source file:integrator.Pdf.java
/**
 * Parses a PDF to a plain text file.
 *
 * <p>Each page is processed with a fresh {@link SimpleTextExtractionStrategy} and the
 * extracted text is written to the output file with one {@code println} per page.
 *
 * @param pdf the original PDF
 * @param txt the resulting text
 * @throws IOException if the PDF cannot be read or the output file cannot be opened
 */
public void parsePdf(String pdf, String txt) throws IOException {
    PdfReader reader = new PdfReader(pdf);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        PrintWriter out = new PrintWriter(new FileOutputStream(txt));
        try {
            int pageCount = reader.getNumberOfPages();
            // PDF page numbers are 1-based.
            for (int i = 1; i <= pageCount; i++) {
                TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                out.println(strategy.getResultantText());
            }
            out.flush();
        } finally {
            // Release the output file even when extraction fails half-way.
            out.close();
        }
    } finally {
        // Release the reader on every path, not only on success.
        reader.close();
    }
}
From source file:io.github.jonestimd.finance.file.pdf.TextExtractor.java
License:Open Source License
/**
 * Reads a PDF from the given stream and collects the text of every page.
 *
 * <p>Each page is processed with its own {@code ImportRenderListener}; the listener's
 * {@code text} is appended to {@code pageText} (declared outside this block) in page order.
 *
 * @param is stream containing the PDF
 * @throws IOException if the PDF cannot be read
 */
public TextExtractor(InputStream is) throws IOException {
    PdfReader pdfReader = new PdfReader(is);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);
        int pages = pdfReader.getNumberOfPages();
        // PDF page numbers are 1-based.
        for (int i = 1; i <= pages; i++) {
            ImportRenderListener renderListener = new ImportRenderListener();
            parser.processContent(i, renderListener);
            pageText.add(renderListener.text);
        }
    } finally {
        // The reader is not needed once the page text has been captured; it was never closed before.
        pdfReader.close();
    }
}
From source file:io.konik.carriage.itext.ITextInvoiceAppender.java
License:Open Source License
/** * Append invoice intern./*from www . j av a2 s .c o m*/ * * @param appendable the appendable * @throws IOException Signals that an I/O exception has occurred. * @throws DocumentException the document exception * @throws XMPException the XMP exception */ private void appendInvoiceIntern(AppendParameter appendable) throws IOException, DocumentException, XMPException { byte[] attachmentFile = convertToByteArray(appendable.attachmentFile()); PdfReader reader = new PdfReader(appendable.inputPdf()); PdfAStamper stamper = new PdfAStamper(reader, appendable.resultingPdf(), PdfAConformanceLevel.PDF_A_3B); appendZfMetadata(stamper, appendable.zugferdConformanceLevel(), appendable.zugferdVersion()); attachFile(attachmentFile, stamper); stamper.close(); reader.close(); }
From source file:io.konik.carriage.itext.ITextInvoiceExtractor.java
License:Open Source License
/**
 * Opens a {@link PdfReader} on the given stream.
 *
 * @param pdfStream stream containing the PDF
 * @return a reader for the PDF
 * @throws InvoiceExtractionError wrapping the {@link IOException} when the PDF cannot be opened
 */
private static PdfReader getPdfReader(InputStream pdfStream) {
    final PdfReader opened;
    try {
        opened = new PdfReader(pdfStream);
    } catch (IOException cause) {
        throw new InvoiceExtractionError("Could not read or open pdf.", cause);
    }
    return opened;
}
From source file:itextblast.ITextBlast.java
private static void processQAFile(String qa_filename, Boolean has_frontpage) throws IOException, DocumentException { // use one of the previous examples to create a PDF // new MovieTemplates().createPdf(MovieTemplates.RESULT); // Create a reader; from current existing file // Next time pass it from args .. PdfReader reader = new PdfReader(String.format(ITextBlast.working_dir + SOURCE, qa_filename)); ITextBlast.my_reader = reader;//from w w w .j ava 2s .co m // We'll create as many new PDFs as there are pages // Document document; // PdfCopy copy; // loop over all the pages in the original PDF int n = reader.getNumberOfPages(); // For test of extraction and regexp; use first 5 pages .. // n = 15; // Text Extraction Strategy here ... // LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy(); // SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy(); // Both ^ does not work well; weird behavior ... no need so clever .. // START SMART Start Number ******** Pattern smart_start_pattern; smart_start_pattern = Pattern.compile(".*?SOALAN.*?N.*?O.*?(\\d+)\\b+.*", Pattern.CASE_INSENSITIVE); // Extract cover page number as smartly as possible?? String cover_page_content = PdfTextExtractor.getTextFromPage(reader, 1); Matcher smart_start_matcher = smart_start_pattern.matcher(cover_page_content); String smart_start_question_number = null; if (smart_start_matcher.find()) { // Extract the question number based on backreference smart_start_question_number = smart_start_matcher.group(1); // How will it look when using a different strategy? 
out.println("Matched " + smart_start_matcher.group(0) + " and SMART Start Number: " + smart_start_question_number); } // END SMART Start Number ******** Pattern liberal_found_question_pattern_uno; liberal_found_question_pattern_uno = Pattern.compile(".*N.*O.*SOALAN.*", Pattern.CASE_INSENSITIVE); Pattern liberal_found_question_pattern_dos = Pattern.compile(".*SOALAN.*N.*O.*", Pattern.CASE_INSENSITIVE); Pattern pattern_uno; // pattern = Pattern.compile("^.*NO.*SOALAN.*?(\\d+).*$", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); // pattern = Pattern.compile(".*SOALAN.*?(\\d+).*", Pattern.CASE_INSENSITIVE); pattern_uno = Pattern.compile(".*N.*O.*SOALAN.*?(\\d+)\\b+.*", Pattern.CASE_INSENSITIVE); Pattern pattern_dos = Pattern.compile(".*SOALAN.*N.*O.*?(\\d+)\\b+.*", Pattern.CASE_INSENSITIVE); // OPTION 2 is to try with the next available number between word boundaries .. but may then need non-greedy .. // Init start and end page int start_page = 1; int end_page = 1; String question_number = "0-intro"; // This is for SOALAN LISAN; which has no Front Page // the Start Question Number should then be set to SMART Start Number if (!has_frontpage) { question_number = smart_start_question_number; } for (int i = 1; i < n; i++) { // init found_question_number String found_question_number = null; boolean found_match = false; // PdfDictionary page = reader.getPageN(i); // use location based strategy out.println("Page " + i); out.println("==========="); // out.println(PdfTextExtractor.getTextFromPage(reader, i, strategy)); String content = PdfTextExtractor.getTextFromPage(reader, i); // DEBUG: Uncomment below .. // out.println(content); Matcher liberal_uno_matcher = liberal_found_question_pattern_uno.matcher(content); if (liberal_uno_matcher.find()) { out.println("Matched UNO!"); found_match = true; Matcher matcher = pattern_uno.matcher(content); // Loop to find the digit; it is possible it is not found an dleft as null .. 
while (matcher.find()) { // Extract the question number based on backreference found_question_number = matcher.group(1); // How will it look when using a different strategy? out.println("Matched " + matcher.group(0) + " and Question Number: " + found_question_number); } } else if (liberal_found_question_pattern_dos.matcher(content).find()) { if ("0-intro".equals(question_number)) { out.println("SMART!!!"); } else { found_match = true; out.println("Matched DOS!"); Matcher matcher = pattern_dos.matcher(content); // Loop to find the digit; it is possible it is not found an dleft as null .. while (matcher.find()) { // Extract the question number based on backreference found_question_number = matcher.group(1); // How will it look when using a different strategy? out.println( "Matched " + matcher.group(0) + " and Question Number: " + found_question_number); } } } // If matched; take out the last start, end if (found_match) { // copy page over and write it down .. end_page = i - 1; if (end_page < 1) { end_page = 1; } if (null == found_question_number) { if ("0-intro".equals(question_number)) { // After intro; if got problem; try the smart start found_question_number = smart_start_question_number; out.println("First question could not determine number; using Q No. => " + found_question_number); // Print out content to debug out.println("*****DEBUG Content*******"); out.println(content); } else { // otherwise; use current question and just append Unix timestamp .. found_question_number = question_number + "_" + (System.currentTimeMillis() / 1000L); out.println( "Unexpectedly could not determine number; using Q No. 
=> " + found_question_number); // Print out content to debug out.println("*****DEBUG Content*******"); out.println(content); } } // Write based on previous confirmed question_number ITextBlast.copySelectedQuestionPage(start_page, end_page, question_number); // re-set to current page start_page = i; end_page = i; question_number = found_question_number; } // out.println(PdfTextExtractor.getTextFromPage(reader, i)); // Pattern RegExp: #^.*NO.*SOALAN.*(\d)+$#im out.println(); out.println(); // use helper file to dump out // Look out for pattern "NO. SOALAN" // Once see pattern or reach end; snip off copy from start to end // reset start/end // else increase the end } // If end of the loop there are still straglers; mark with the special question_number = 999 if (start_page <= end_page) { // Should always happen actually .. ITextBlast.copySelectedQuestionPage(start_page, end_page, question_number); } reader.close(); }
From source file:itextblast.ITextBlast.java
public static void splitByPage(String[] args) throws IOException, DocumentException { // use one of the previous examples to create a PDF // new MovieTemplates().createPdf(MovieTemplates.RESULT); // Create a reader; from current existing file // Next time pass it from args .. PdfReader reader = new PdfReader("./source/imokman.pdf"); // We'll create as many new PDFs as there are pages Document document;/*from w w w.jav a2s . com*/ PdfCopy copy; // loop over all the pages in the original PDF int n = reader.getNumberOfPages(); for (int i = 0; i < n;) { // step 1 document = new Document(); // step 2 copy = new PdfCopy(document, new FileOutputStream(String.format(RESULT, ++i))); // step 3 document.open(); // step 4 copy.addPage(copy.getImportedPage(reader, i)); // step 5 document.close(); } reader.close(); }
From source file:jasperSoft.MergePDF.java
/** * /* w w w . j ava 2 s .co m*/ * @param streamOfPDFFiles * @param outputStream * @param paginate */ public static void concatPDFs(List<InputStream> streamOfPDFFiles, OutputStream outputStream, boolean paginate) { Document document = new Document(); try { List<InputStream> pdfs = streamOfPDFFiles; List<PdfReader> readers = new ArrayList<PdfReader>(); int totalPages = 0; Iterator<InputStream> iteratorPDFs = pdfs.iterator(); // Create Readers for the pdfs. while (iteratorPDFs.hasNext()) { InputStream pdf = iteratorPDFs.next(); PdfReader pdfReader = new PdfReader(pdf); readers.add(pdfReader); totalPages += pdfReader.getNumberOfPages(); } // Create a writer for the outputstream PdfWriter writer = PdfWriter.getInstance(document, outputStream); document.open(); BaseFont bf = BaseFont.createFont(BaseFont.HELVETICA, BaseFont.CP1252, BaseFont.NOT_EMBEDDED); PdfContentByte cb = writer.getDirectContent(); // Holds the PDF // data PdfImportedPage page; int currentPageNumber = 0; int pageOfCurrentReaderPDF = 0; Iterator<PdfReader> iteratorPDFReader = readers.iterator(); // Loop through the PDF files and add to the output. while (iteratorPDFReader.hasNext()) { PdfReader pdfReader = iteratorPDFReader.next(); // Create a new page in the target for each source page. while (pageOfCurrentReaderPDF < pdfReader.getNumberOfPages()) { document.newPage(); pageOfCurrentReaderPDF++; currentPageNumber++; page = writer.getImportedPage(pdfReader, pageOfCurrentReaderPDF); cb.addTemplate(page, 0, 0); // Code for pagination. 
if (paginate) { cb.beginText(); cb.setFontAndSize(bf, 9); cb.showTextAligned(PdfContentByte.ALIGN_CENTER, "" + currentPageNumber + " of " + totalPages, 520, 5, 0); cb.endText(); } } pageOfCurrentReaderPDF = 0; } outputStream.flush(); document.close(); outputStream.close(); } catch (Exception e) { e.printStackTrace(); } finally { if (document.isOpen()) { document.close(); } try { if (outputStream != null) { outputStream.close(); } } catch (IOException ioe) { ioe.printStackTrace(); } } }
From source file:jati.GerandoArquivoCarimbado.java
public static Document montaraAquivo(String caminho) { try {/* w w w . j a va 2 s.c om*/ reader = new PdfReader(caminho); // n recebe o numero total de paginas //Tamanho da primeira Pagina //Cria Segundo PDF } catch (IOException e) { } Document documento = new Document(PageSize.A4); return documento; }
From source file:javaapplication1.JavaApplication1.java
public void parsePdf(String pdf, String txt) throws IOException { PdfReader reader = new PdfReader(pdf + ".pdf"); // PrintWriter out = new PrintWriter(new FileOutputStream(txt)); // Rectangle rect = new Rectangle(0,0, 300,800); // RenderFilter filter = new RegionTextRenderFilter(rect); // TextExtractionStrategy strategy; /* for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter); out.println(PdfTextExtractor.getTextFromPage(reader, i, strategy)); }*//*from w w w. j a v a2s .com*/ int pw, ph; Rectangle mrect; //reader = new PdfReader(pdf); String g = new String(); ArrayList<Integer> cutlist; int lastblank; int lastword; //int p=reader.getNumberOfPages(); int p = 1; double f; BufferedImage img; BufferedImage img1; for (int k = 1; k <= p; k++) { cutlist = new ArrayList<Integer>(); lastword = 0; lastblank = -1; mrect = reader.getPageSize(k); ph = (int) mrect.getHeight(); pw = (int) mrect.getWidth(); for (int i = 0; i <= (ph - fontchecksize); i += fontchecksize) { g = mycheckline(reader, ph, pw, i, k); if (g.isEmpty()) lastblank = i; else { // System.out.println(g); if (lastword == 0) { if (i > 2 * fontchecksize) cutlist.add(i - 2 * fontchecksize); else cutlist.add(0); lastword = i; } else if (g.matches("\\d+\\..*")) { cutlist.add(((lastword + i) / 2)); } lastword = i; } } if (lastword + 3 * fontchecksize < ph) { cutlist.add(lastword + 3 * fontchecksize); } else cutlist.add(ph - 1); // System.out.println("The arraylist contains the following elements: "+ cutlist); img = ImageIO.read(new File(txt + ".png")); //img1 = ImageIO.read(new File("prefix-1.png")); f = img.getHeight() / ph; //System.out.println(f); int s; // ImageIO.write(img.getSubimage(0,(int)(f*(84)),(int)(f*pw),(int)(f*(156-84))), "png", new File("7.png")); for (s = 0; s < (cutlist.size() - 1); s++) { // System.out.println(cutlist.get(s)); ImageIO.write( img.getSubimage(0, (int) (f * (cutlist.get(s))), (int) (f * pw), (int) 
(f * (cutlist.get(s + 1) - cutlist.get(s)))), "png", new File(txt + s + ".png")); } /* ImageIO.write(img.getSubimage(0,(int)(f*(84)),(int)(f*pw),(int)(f*(156-84))), "png", new File("7.png")); ImageIO.write(img.getSubimage(0,(int)(f*(156)),(int)(f*pw),(int)(f*(222-156))), "png", new File("8.png")); ImageIO.write(img1.getSubimage(0,(int)(f*(186)),(int)(f*pw),(int)(f*(225-186))), "png", new File("1.png")); ImageIO.write(img1.getSubimage(0,(int)(f*(225)),(int)(f*pw),(int)(f*(297-225))), "png", new File("2.png")); ImageIO.write(img1.getSubimage(0,(int)(f*(297)),(int)(f*pw),(int)(f*(339-297))), "png", new File("3.png")); ImageIO.write(img1.getSubimage(0,(int)(f*(339)),(int)(f*pw),(int)(f*(465-339))), "png", new File("4.png")); ImageIO.write(img1.getSubimage(0,(int)(f*(465)),(int)(f*pw),(int)(f*(585-465))), "png", new File("5.png")); ImageIO.write(img1.getSubimage(0,(int)(f*(585)),(int)(f*pw),(int)(f*(630-585))), "png", new File("6.png"));*/ } }
From source file:javaapplication1.PDF.java
public void extractImages(String filename) throws IOException, DocumentException { // System.out.println("Processing PDF at " + filename); PdfReader reader = new PdfReader(filename); PdfReaderContentParser parser = new PdfReaderContentParser(reader); listener = new ImageRenderListener(); RenderListener print = parser.processContent(3, listener); reader.close();/* ww w .ja v a 2 s .com*/ }