Example usage for org.apache.pdfbox.pdmodel PDPage getContents

Introduction

On this page you can find example usage for org.apache.pdfbox.pdmodel PDPage getContents.

Prototype

@Override
public InputStream getContents() throws IOException

Source Link

Document

Returns the content stream(s) of this page as a single input stream.

Usage

From source file:com.giaybac.traprange.test.TESTPDFBox.java

License:MIT License

/**
 * Runs {@code processStream} over the first page of a sample PDF, then prints the collected
 * {@code ranges} (sorted by lower endpoint) followed by the ranges returned by
 * {@code trapRangeBuilder.build()}.
 *
 * @throws IOException if the PDF cannot be loaded or its content stream cannot be processed
 */
@Test
public void test() throws IOException {
    String filePath = "D:\\traprange\\_Docs\\TK0976-AB5-0-2014042211.pdf";
    File pdfFile = new File(filePath);
    PDDocument pdDocument = PDDocument.load(pdfFile);
    try {
        List pages = pdDocument.getDocumentCatalog().getAllPages();
        PDPage page = (PDPage) pages.get(0);
        PDStream stream = page.getContents();

        this.processStream(page, page.findResources(), stream.getStream());
        // Print out all collected ranges, ordered by their lower endpoint
        ranges.sort(new Comparator<Range>() {
            @Override
            public int compare(Range o1, Range o2) {
                return o1.lowerEndpoint().compareTo(o2.lowerEndpoint());
            }
        });
        for (Range range : ranges) {
            System.out.println("> " + range);
        }
        // Print out the trap ranges
        List<Range<Integer>> trapRanges = trapRangeBuilder.build();
        for (Range trapRange : trapRanges) {
            System.out.println("TrapRange: " + trapRange);
        }
    } finally {
        // Always release the loaded document, even when processing or an assertion fails.
        pdDocument.close();
    }
}

From source file:com.odc.pdfextractor.parser.CleanPdfParser.java

License:Apache License

/**
 * Loads the PDF at the given path, passes every page that has a content stream to
 * {@code processStream}, and returns the document held by {@code docBuilder}.
 *
 * <p>Progress is written to standard output: a header, one dot per page and a final newline.
 * An encrypted document is opened with the empty password.
 *
 * @param filename path of the PDF file to load
 * @return the result of {@code docBuilder.getDoc()} after all pages have been visited
 *
 * @throws InvalidPasswordException If the document is encrypted and the empty password is rejected.
 * @throws Exception If there is an error loading or parsing the document.
 */
public DocumentLocation processPdf(String filename) throws Exception {

    PDDocument document = null;
    try {
        document = PDDocument.load(filename);
        if (document.isEncrypted()) {
            try {
                document.decrypt("");
            } catch (InvalidPasswordException e) {
                System.err.println("Error: Document is encrypted with a password.");
                // Propagate instead of calling System.exit(1): terminating the JVM from a
                // parser method takes the whole host application down and skips the
                // finally block below, so the document would never be closed.
                throw e;
            }
        }
        List allPages = document.getDocumentCatalog().getAllPages();
        System.out.print("Extracting text from PDF");
        for (int i = 0; i < allPages.size(); i++) {
            PDPage page = (PDPage) allPages.get(i);
            System.out.print(".");
            PDStream contents = page.getContents();
            if (contents != null) {
                // Reuse the stream fetched for the null check instead of looking it up again.
                this.processStream(page, page.findResources(), contents.getStream());
            }
            // The page counter advances for pages without content as well.
            docBuilder.incrementPage();
        }
    } finally {
        System.out.println();
        if (document != null) {
            document.close();
        }
    }
    return docBuilder.getDoc();
}

From source file:com.santaanna.friendlyreader.pdfstod.pdfstod3.ReplaceStringStreamEngine.java

License:Apache License

/**
 * Runs the multi-pass sentence processing over a PDF document and writes the result to a file.
 *
 * <p>The steps, in the order this method performs them:
 * <ol>
 *   <li>load {@code inputFile} into {@code doc1} and append the text of every page that has a
 *       content stream to {@code helaTexten} (via {@code getHelaTexten});</li>
 *   <li>split that text into sentences with {@code Hittameningarna} ({@code meningsvektor});</li>
 *   <li>three per-page passes, {@code splitMeningar}, {@code grayInsert} and
 *       {@code byggStrukturer}, each carrying the remaining sentence text
 *       ({@code meningsrest}) from one page to the next;</li>
 *   <li>{@code listTextBlocks()} and {@code listMeningar()};</li>
 *   <li>{@code sammanfatta} when {@code DoHighlight} is set and {@code helaTexten} is not empty;</li>
 *   <li>a per-page {@code highlight} pass;</li>
 *   <li>per page, write {@code pageTokens.get(i)} into a new content stream and set it on the page;</li>
 *   <li>in {@code finally}, {@code saveAndClose(outputFile, doc1)}.</li>
 * </ol>
 *
 * @param inputFile The PDF to open.
 * @param outputFile1 The PDF to write to; stored in the {@code outputFile} field.
 * @param DoHighlight Enables the {@code sammanfatta} call; also passed on to {@code highlight}.
 * @param sumslidval Passed to {@code sammanfatta} together with the full text.
 * @param valdmening Passed to {@code highlight} for every page.
 * @return {@code meningsvektor}, the sentences produced by {@code Hittameningarna}.
 *
 * @throws IOException If there is an error writing the data.
 * @throws COSVisitorException If there is an error writing the PDF.
 */
public Collection<SEmening> doIt(String inputFile, String outputFile1, boolean DoHighlight, int sumslidval,
        int valdmening) throws IOException, COSVisitorException {
    // NOTE(review): gop, cfloat5, cfloat1, gray1 and filesaved are not used by any active
    // statement in this method; they were only referenced from code that is now disabled.
    PDFOperator gop = PDFOperator.getOperator("g");
    COSFloat cfloat5 = new COSFloat("0.25");
    COSFloat cfloat1 = new COSFloat("0.75");
    Boolean gray1 = true;
    outputFile = outputFile1;
    String meningsrest = "";
    String sidtext = "";
    Boolean filesaved = false;

    try {
        helaTexten = "";
        SkrivUt(3, "Fre DoIt doc1 load");
        doc1 = PDDocument.load(inputFile); // The input document.
        SkrivUt(3, "Efter DoIt doc1 load");
        List pages = doc1.getDocumentCatalog().getAllPages();
        // First phase: collect the whole text of the document, page by page.
        for (int i = 0; i < pages.size(); i++) {
            SkrivUt(4, "Ny sida helaTexten: " + i);
            PDPage page = (PDPage) pages.get(i);
            PDStream contents = page.getContents();
            // AH: code taken from PageDrawer.
            if (contents != null) {
                PDResources resources = page.findResources();
                SkrivUt(4, "Fre getHelaTexten.");
                // The relative-to-absolute phase is not implemented yet.
                setSumcharAlla(0); // Reset the character counter for strings.
                sidtext = getHelaTexten(page.getContents().getStream()); //getTextFromPDF, Robin
                // Accumulated over all pages; must not be reset between pages.
                helaTexten += sidtext;
                SkrivUt(4, "Efter getHelaTexten.");
                // (A disabled second processStream pass using the "splitstrings" phase
                // used to follow here.)
            }
        }
        // Extract the sentences from the full text.
        meningsvektor = Hittameningarna(helaTexten); // Split the text into sentences.
        SkrivUt(1, "Meningsvektor.Size: " + meningsvektor.size());
        SEmening semen = null;
        // Log every sentence that was found.
        for (int n = 0; n < meningsvektor.size(); n++) {
            SkrivUt(1, "Mening: " + meningsvektor.get(n).helameningen);
        }

        // In the next pass, strings and arrays are split at sentence boundaries.
        mind = 0;
        // Remaining text on the current page.
        // NOTE(review): get(0) is reached even when no sentence was found — presumably this
        // throws for an empty meningsvektor; confirm against Hittameningarna.
        meningsrest = meningsvektor.get(mind).helameningen;
        for (int i = 0; i < pages.size(); i++) {
            SkrivUt(4, "Ny sida Split: " + i);
            // page1 is assigned without a local declaration, i.e. it is declared outside this method.
            page1 = (PDPage) pages.get(i);
            PDStream contents = page1.getContents();
            // AH: code taken from PageDrawer.
            SkrivUt(4, "Innan contents test.");
            if (contents != null) {
                PDResources resources = page1.findResources();
                SkrivUt(3, "Fre splitMeningar 1.");
                setSumcharAlla(0); // Reset the character counter for strings.
                SkrivUt(4, "Fre splitMeningar 2.");
                // The returned remainder is carried over to the next page.
                meningsrest = splitMeningar(meningsrest, page1.getContents().getStream());
                SkrivUt(4, "*** meningsrest: " + meningsrest);
                SkrivUt(4, "3, Efter splitMeningar.");
            }
        }

        // TODO(original author): check this part — a save-and-reload of doc1 between the
        // passes was tried here and is currently disabled.

        // Here the "g" operators are to be added for every TJ and Tj.
        mind = 0; // Is this needed here? (question from the original author)
        meningsrest = meningsvektor.get(mind).helameningen; // Remaining text on the current page.

        for (int i = 0; i < pages.size(); i++) {
            SkrivUt(4, "Ny sida Gray: " + i);
            page1 = (PDPage) pages.get(i);
            PDStream contents = page1.getContents();
            // AH: code taken from PageDrawer.
            SkrivUt(4, "Innan contents test.");
            if (contents != null) {
                PDResources resources = page1.findResources();
                SkrivUt(3, "Fre GrayInsert 1.");
                setSumcharAlla(0); // Reset the character counter for strings.
                meningsrest = grayInsert(meningsrest, page1.getContents().getStream(), i);
                SkrivUt(3, "Efter grayInsert av sida.");
            }
        }
        SkrivUt(3, "Efter hela grayInsert.");

        // Build the text-block (TB) and sentence structures.
        mind = 0;
        meningsrest = meningsvektor.get(mind).helameningen; // Remaining text on the current page.

        for (int i = 0; i < pages.size(); i++) {
            TBIndex = 0; // Index into the text-block structure.
            // This is where the data structures are built and used.
            TBVector = new Vector<SETextBlock>(); // The TB vector for this page.
            PageVector.add(i, TBVector); // Overall structure: one TB vector per page.
            tbpagenr = i;
            SkrivUt(4, "Ny sida Split: " + i);
            page1 = (PDPage) pages.get(i);
            PDStream contents = page1.getContents();
            // AH: code taken from PageDrawer.
            SkrivUt(4, "Innan contents test.");
            if (contents != null) {
                PDResources resources = page1.findResources();
                SkrivUt(3, "Fre byggStrukturer 1.");
                setSumcharAlla(0); // Reset the character counter for strings.
                SkrivUt(4, "Fre byggStrukturer 2.");
                meningsrest = byggStrukturer(meningsrest, page1.getContents().getStream(), i);
                SkrivUt(4, "*** meningsrest: " + meningsrest);
                SkrivUt(3, "Efter byggStrukturer.");
            }
        }

        // Print the contents of the TB structure:
        listTextBlocks();

        // Print the sentences:
        listMeningar();
        // The last pass is meant to collect font metrics and store them in the TB structures.

        SkrivUt(4, "Fr lngt.");
        mind = 0; // Is this needed here? (question from the original author)

        // This pass currently only logs: its processStream call is disabled.
        for (int i = 0; i < pages.size(); i++) {
            tempsidnr = i;
            SkrivUt(4, "Ny sida A: " + i);
            PDPage page = (PDPage) pages.get(i);
            PDStream contents = page.getContents();
            // AH: code taken from PageDrawer.
            if (contents != null) {
                PDResources resources = page.findResources();
                SkrivUt(4, "Fre processStream.");
                setSumcharAlla(0); // Reset the character counter for strings.
                SkrivUt(4, "Efter processStream. fre nya");
            }
        }
        SkrivUt(3, "Efter processStream.");

        // Here the text is handed to EasyReader, which returns the list of sentences
        // that are to be highlighted.
        if ((DoHighlight) && !(helaTexten.equals(""))) {
            SkrivUt(2, "Fre sammanfatta. helaTexten = \"\"");
            menisammanfattningen = sammanfatta(helaTexten, sumslidval);
            System.out.println(menisammanfattningen);
        } else
            menisammanfattningen = null;
        // Here the arguments of the "g" operators are to be modified for the sentences
        // that are part of the summary.
        mind = 0; // Is this needed here? (question from the original author)
        meningsrest = meningsvektor.get(mind).helameningen; // Remaining text on the current page.
        cosenr = 0; // Index of the COSString or COSArray.
        mennr = 0; // Index of the current sentence.
        mendelnr = 0;
        mendelantal = 0; // Number of parts the sentence consists of.
        valdsida = -1; // The selected page is not known yet.
        for (int i = 0; i < pages.size(); i++) {
            SkrivUt(4, "Ny sida highlight: " + i);
            page1 = (PDPage) pages.get(i);
            PDStream contents = page1.getContents();
            // AH: code taken from PageDrawer.
            SkrivUt(4, "Innan contents test.");
            if (contents != null) {
                PDResources resources = page1.findResources();
                SkrivUt(1, "Fre highlight 1. Sida: " + i);
                setSumcharAlla(0); // Reset the character counter for strings.
                // highlight is called regardless of DoHighlight; the flag is passed along.
                meningsrest = highlight(meningsrest, page1.getContents().getStream(), i, DoHighlight,
                        valdmening);
                SkrivUt(1, "Efter highlight av sida:" + i);
            }
        }
        SkrivUt(3, "Efter hela highlight.");

        // Final pass: write the (modified) tokens of every page back into the document.
        // Fetching font metrics in this pass is currently disabled.

        for (int i = 0; i < pages.size(); i++) {
            TBVector = new Vector<SETextBlock>(); // The TB vector for this page.
            // NOTE(review): PageVector.add(i, ...) was already called for every page in the
            // byggStrukturer pass above — confirm that a second insertion per page is intended.
            PageVector.add(i, TBVector); // Add the TB vector for this page.
            SkrivUt(4, "Ny sida X: " + i);
            // Unlike the first pass, page is not a local variable here.
            page = (PDPage) pages.get(i);
            PDStream contents = page.getContents();
            // AH: code taken from PageDrawer.
            if (contents != null) {
                PDResources resources = page.findResources();
                SkrivUt(4, "Fre processStream.");
                setSumcharAlla(0); // Reset the character counter for strings.
                // AH: a processStream call that fetched the font metrics is disabled here.
                SkrivUt(4, "Efter andra processStream.");
            }
            SkrivUt(3, "Efter hela andra processStream.");
            // The tokens are NOT re-parsed from the file here; a disabled token loop that
            // replaced strings in Tj/TJ operands and inserted "g" operators used to live at
            // this point. The previously collected pageTokens are used instead.

            // Now that the tokens are updated, replace the page content stream,
            // i.e. write the updated data into the document.
            SkrivUt(3, ">>> Fre spara tokens i DoIt.");
            PDStream updatedStream = new PDStream(doc1);
            SkrivUt(3, ">>> Efter updated stream i DoIt.");
            // NOTE(review): out is not closed in this method — confirm whether that is required.
            OutputStream out = updatedStream.createOutputStream();
            ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
            tokenWriter.writeTokens(pageTokens.get(i));
            page.setContents(updatedStream);
            SkrivUt(3, ">>> Efter spara tokens i DoIt.");
        }
    } finally {
        // Runs on success and on failure alike; replaces an earlier inline save()/close().
        // NOTE(review): this is also reached when PDDocument.load failed — verify that
        // saveAndClose copes with the doc1 value left over in that case.
        saveAndClose(outputFile, doc1);
    }

    return meningsvektor;
}

From source file:de.tudarmstadt.ukp.dkpro.core.io.pdf.PdfLayoutEventStripper.java

License:Apache License

/**
 * Visits every page in order. The page counter is advanced for each page, and pages that
 * have a content stream are handed to {@code processPage}.
 *
 * @param pages
 *            The pages object in the document.
 *
 * @throws IOException
 *             If there is an error parsing the text.
 */
protected void processPages(List<PDPage> pages) throws IOException {
    maxPage = pages.size();

    for (final PDPage current : pages) {
        currentPageNo++;
        final PDStream pageContents = current.getContents();
        if (pageContents == null) {
            // Nothing to process on this page; it still counts towards the page number.
            continue;
        }
        processPage(current, pageContents.getStream());
    }
}

From source file:edu.isi.bmkeg.lapdf.extraction.LAPDFTextStripper.java

License:Apache License

/**
 * Resolves the start/end bookmark page numbers and then walks all pages, passing every page
 * that has a content stream to {@code processPage}.
 *
 * @param pages The pages object in the document.
 *
 * @throws IOException If there is an error parsing the text.
 */
protected void processPages(List<COSObjectable> pages) throws IOException {
    if (startBookmark != null) {
        startBookmarkPageNumber = getPageNumber(startBookmark, pages);
    }
    if (endBookmark != null) {
        endBookmarkPageNumber = getPageNumber(endBookmark, pages);
    }

    // Special case: start and end bookmark are the same object but point to nothing.
    // Both page numbers are set to 0 so that no text is extracted.
    if (startBookmarkPageNumber == -1 && startBookmark != null) {
        if (endBookmarkPageNumber == -1 && endBookmark != null) {
            if (startBookmark.getCOSObject() == endBookmark.getCOSObject()) {
                startBookmarkPageNumber = 0;
                endBookmarkPageNumber = 0;
            }
        }
    }

    for (COSObjectable entry : pages) {
        PDPage nextPage = (PDPage) entry;
        PDStream pageStream = nextPage.getContents();
        // The counter advances only after the content lookup succeeded.
        currentPageNo++;
        if (pageStream != null) {
            processPage(nextPage, pageStream.getStream());
        }
    }
}

From source file:edworld.pdfreader4humans.impl.MainPDFComponentLocator.java

License:Apache License

/**
 * Extracts the text components of a page by running an anonymous {@link PDFTextStripper}
 * over the page's content stream.
 *
 * <p>Glyph runs are turned into {@code TextComponent}s in {@code writeString}. Consecutive
 * components are then joined with {@code SPACE} and the result is sorted with
 * {@code Collections.sort}.
 *
 * <p>Side effect: a grid component that is fused into a character (see {@code writeString})
 * is removed from the caller's {@code gridComponents} list.
 *
 * @param page the page whose text is to be located
 * @param gridComponents grid components of the page; entries may be removed by this call
 * @return the sorted text components; empty when the page has no content stream
 * @throws IOException if the content stream cannot be processed
 */
protected List<TextComponent> locateAllTextComponents(final PDPage page,
        final List<GridComponent> gridComponents) throws IOException {
    return new PDFTextStripper() {
        // Maps a two-character pair (previous character + overlapping character) to its
        // replacement.
        // NOTE(review): both replacement values are empty strings here; this may be lost
        // non-ASCII text — verify against the upstream source.
        private Map<String, String> fusions;
        List<Component> horizontalComponents;
        private ArrayList<TextComponent> list;
        {
            fusions = new HashMap<String, String>();
            fusions.put("o-", "");
            fusions.put("a-", "");
        }

        /** Runs the stripper over the page and returns the joined, sorted components. */
        public List<TextComponent> locateTextComponents() throws IOException {
            horizontalComponents = Component.horizontal(gridComponents);
            list = new ArrayList<TextComponent>();
            PDStream contents = page.getContents();
            setStartPage(getCurrentPageNo());
            setEndPage(getCurrentPageNo());
            setSortByPosition(false);
            if (contents != null) {
                output = new StringWriter();
                processPage(page, contents.getStream());
            }
            joinConsecutiveTexts(list);
            Collections.sort(list);
            return list;
        }

        /** Merges each pair of neighbouring components that report being consecutive. */
        protected void joinConsecutiveTexts(ArrayList<TextComponent> textComponents) {
            for (int i = 0; i < textComponents.size() - 1; i++) {
                TextComponent currentComponent = textComponents.get(i);
                TextComponent nextComponent = textComponents.get(i + 1);
                if (currentComponent.consecutive(nextComponent, false)) {
                    textComponents.set(i, joinTextComponents(currentComponent, SPACE, nextComponent));
                    textComponents.remove(i + 1);
                    // Stay on the merged element so it is compared with its new neighbour.
                    i--;
                }
            }
        }

        /**
         * Builds one component spanning both inputs. The text is joined with the separator;
         * font name and size are taken from the first component.
         */
        protected TextComponent joinTextComponents(TextComponent component1, String separatorCharacter,
                TextComponent component2) {
            return new TextComponent(component1.getText() + separatorCharacter + component2.getText(),
                    component1.getFromX(), Math.min(component1.getFromY(), component2.getFromY()),
                    component2.getToX(), Math.max(component1.getToY(), component2.getToY()),
                    component1.getFontName(), component1.getFontSize());
        }

        /**
         * Converts one run of glyph positions into text components.
         *
         * <p>When a glyph starts to the left of the previous one, the run collected so far is
         * emitted and the remaining glyphs, starting with the current one, are processed
         * recursively as a new run.
         */
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
            float fromX = Float.POSITIVE_INFINITY;
            float fromY = Float.POSITIVE_INFINITY;
            float toX = Float.NEGATIVE_INFINITY;
            float toY = Float.NEGATIVE_INFINITY;
            float fontSize = -1;
            String fontName = "";
            String partialText = "";
            float lastLeft = Float.NEGATIVE_INFINITY;
            float lastRight = Float.NEGATIVE_INFINITY;
            for (int index = 0; index < textPositions.size(); index++) {
                TextPosition textPosition = textPositions.get(index);
                String character = textPosition.getCharacter();
                // A horizontal shape of matching width over the glyph is fused into it.
                Component overlappingShape = findOverlappingHorizontalShape(textPosition);
                if (overlappingShape != null && fusible(character, "-")) {
                    character = fusion(character, "-");
                    removeOverlappingShape(overlappingShape);
                }
                float x1 = textPosition.getX();
                float y1 = textPosition.getY();
                if (x1 < lastLeft) {
                    list.add(new TextComponent(partialText, fromX, fromY, toX, toY, fontName, fontSize));
                    // Restart from the current glyph. The loop index is used as the offset:
                    // the number of glyphs appended so far undercounts once a glyph has been
                    // fused, which made the recursion re-process already consumed glyphs.
                    // index is at least 1 here (lastLeft starts at -infinity), so the
                    // recursion always makes progress.
                    writeString(text.substring(Math.min(partialText.length(), text.length())),
                            textPositions.subList(index, textPositions.size()));
                    return;
                } else if (x1 < lastRight && fusible(partialText, character)) {
                    // The glyph overlaps the previous one horizontally: merge it into it.
                    partialText = fusion(partialText, character);
                } else {
                    if (x1 < fromX) {
                        fromX = x1;
                        fromY = y1 - textPosition.getHeight();
                        fontName = textPosition.getFont().getBaseFont();
                        fontSize = textPosition.getFontSizeInPt();
                    }
                    partialText += character;
                }
                toX = Math.max(x1 + textPosition.getWidth(), toX);
                toY = Math.max(y1, toY);
                lastLeft = x1;
                lastRight = x1 + textPosition.getWidth();
            }
            list.add(new TextComponent(partialText, fromX, fromY, toX, toY, fontName, fontSize));
        }

        /**
         * Returns the first horizontal component that intersects the glyph's rectangle and
         * whose width differs from the glyph's by less than 0.1, or {@code null} if none.
         */
        private Component findOverlappingHorizontalShape(TextPosition textPosition) {
            GridComponent component = new GridComponent("rect", textPosition.getX(),
                    textPosition.getY() - textPosition.getHeight(),
                    textPosition.getX() + textPosition.getWidth(), textPosition.getY(), 1);
            for (Component candidate : horizontalComponents)
                if (candidate.intersects(component)
                        && Math.abs(candidate.getWidth() - component.getWidth()) < 0.1)
                    return candidate;
            return null;
        }

        /** Removes a shape that was fused into a character from the caller's grid list. */
        private void removeOverlappingShape(Component overlappingShape) throws IOException {
            gridComponents.remove(overlappingShape);
        }

        /**
         * Tells whether {@code character} can be merged into the end of {@code partialText}:
         * either the text ends with {@code SPACE} or the pair is a known fusion.
         */
        private boolean fusible(String partialText, String character) {
            // Empty input cannot form a pair; without this guard fusionPair would throw.
            if (partialText.isEmpty() || character.isEmpty())
                return false;
            return partialText.endsWith(SPACE) || fusions.containsKey(fusionPair(partialText, character));
        }

        /** Replaces the last character of {@code partialText} by the fused result. */
        private String fusion(String partialText, String character) {
            if (partialText.endsWith(SPACE))
                return partialText.substring(0, partialText.length() - 1) + character;
            return partialText.substring(0, partialText.length() - 1)
                    + fusions.get(fusionPair(partialText, character));
        }

        /** Builds the lookup key: last character of the text plus first of the character. */
        private String fusionPair(String partialText, String character) {
            return partialText.substring(partialText.length() - 1) + character.charAt(0);
        }
    }.locateTextComponents();
}

From source file:fi.nls.oskari.printout.printing.PDPageContentStream.java

License:Apache License

/**
 * Create a new PDPage content stream.
 * 
 * @param document
 *            The document the page is part of.
 * @param sourcePage
 *            The page to write the contents to.
 * @param appendContent
 *            Indicates whether content will be overwritten. If false all
 *            previous content is deleted.
 * @param compress
 *            Tell if the content stream should compress the page contents.
 * @param resetContext
 *            Tell if the graphic context should be reseted.
 * @throws IOException
 *             If there is an error writing to the page contents.
 */
public PDPageContentStream(PDDocument document, PDPage sourcePage, boolean appendContent, boolean compress,
        boolean resetContext) throws IOException {

    page = sourcePage;
    resources = page.getResources();
    if (resources == null) {
        resources = new PDResources();
        page.setResources(resources);
    }

    // Get the pdstream from the source page instead of creating a new one
    PDStream contents = sourcePage.getContents();
    boolean hasContent = contents != null;

    // If request specifies the need to append to the document
    if (appendContent && hasContent) {

        // Create a pdstream to append new content
        PDStream contentsToAppend = new PDStream(document);

        // This will be the resulting COSStreamArray after existing and new
        // streams are merged
        COSStreamArray compoundStream = null;

        // If contents is already an array, a new stream is simply appended
        // to it
        if (contents.getStream() instanceof COSStreamArray) {
            compoundStream = (COSStreamArray) contents.getStream();
            compoundStream.appendStream(contentsToAppend.getStream());
        } else {
            // Creates the COSStreamArray and adds the current stream plus a
            // new one to it
            COSArray newArray = new COSArray();
            newArray.add(contents.getCOSObject());
            newArray.add(contentsToAppend.getCOSObject());
            compoundStream = new COSStreamArray(newArray);
        }

        if (compress) {
            List<COSName> filters = new ArrayList<COSName>();
            filters.add(COSName.FLATE_DECODE);
            contentsToAppend.setFilters(filters);
        }

        if (resetContext) {
            // create a new stream to encapsulate the existing stream
            PDStream saveGraphics = new PDStream(document);
            output = saveGraphics.createOutputStream();
            // save the initial/unmodified graphics context
            saveGraphicsState();
            close(); // ?
            if (compress) {
                List<COSName> filters = new ArrayList<COSName>();
                filters.add(COSName.FLATE_DECODE);
                saveGraphics.setFilters(filters);
            }
            // insert the new stream at the beginning
            compoundStream.insertCOSStream(saveGraphics);
        }

        // Sets the compoundStream as page contents
        sourcePage.setContents(new PDStream(compoundStream));
        output = contentsToAppend.createOutputStream();
        if (resetContext) {
            // restore the initial/unmodified graphics context
            restoreGraphicsState();
        }
    } else {
        if (hasContent) {
            LOG.warn("You are overwriting an existing content, you should use the append mode");
        }
        contents = new PDStream(document);
        if (compress) {
            List<COSName> filters = new ArrayList<COSName>();
            filters.add(COSName.FLATE_DECODE);
            contents.setFilters(filters);
        }
        sourcePage.setContents(contents);
        output = contents.createOutputStream();
    }
    formatDecimal.setMaximumFractionDigits(10);
    formatDecimal.setGroupingUsed(false);
}

From source file:function.PrintImageLocations.java

License:Apache License

/**
 * Loads a PDF and feeds every page's content stream to a
 * {@code PrintImageLocations} instance, printing a banner per page.
 *
 * @param args The command line arguments. If at least one is given, the
 *             first is used as the path of the PDF to process; otherwise the
 *             built-in default path is used.
 *
 * @throws Exception If there is an error parsing the document.
 */
public static void main(String[] args) throws Exception {

    // Previously the path was hard-coded and args were ignored; the old path
    // stays as the fallback so argument-less invocations behave as before.
    String pdfPath = (args != null && args.length > 0) ? args[0]
            : "C:/Users/ATUL/Desktop/Page-layout/output1.pdf";

    PDDocument document = null;
    try {
        document = PDDocument.load(new File(pdfPath));
        if (document.isEncrypted()) {
            // Try the empty password.
            document.decrypt("");
        }
        PrintImageLocations printer = new PrintImageLocations();
        List<?> allPages = document.getDocumentCatalog().getAllPages();
        for (int i = 0; i < allPages.size(); i++) {
            System.out.println("\n***********************************************************");
            PDPage page = (PDPage) allPages.get(i);
            System.out.println("Processing page: " + (i + 1));
            printer.processStream(page, page.findResources(), page.getContents().getStream());
        }
    } finally {
        if (document != null) {
            document.close();
        }
    }

}

From source file:name.marcelomorales.siqisiqi.pdfbox.CoordinatesGenerator.java

License:Apache License

/**
 * Stamps text described by a line-oriented template onto the first page of
 * the PDF at {@code path} and saves the result to {@code os}.
 *
 * <p>Template handling, as implemented below: lines starting with {@code #}
 * are skipped; every other line is split on commas (fields trimmed, empty
 * fields dropped) and ignored when it has fewer than four fields. If field 0
 * starts with a digit, fields 0 and 1 are parsed as the position of a new
 * text object; otherwise the text continues after the previous string,
 * shifted by its computed width. Field 2 equal to {@code "b"} selects the
 * bold font. Field 3 is either a literal wrapped in double quotes or a key
 * looked up in {@code m}.
 *
 * @param os          where the resulting document is saved
 * @param template    the template text
 * @param m           values for non-literal fields, keyed by field 3
 * @param path        path of the PDF to stamp
 * @param coordenates not used by this method
 * @param fontSize    font size for all text
 * @param ancho       factor applied to a string's width to obtain the
 *                    horizontal offset of a following continuation
 * @throws IOException if loading, writing or saving fails
 */
public void generarPdf(OutputStream os, String template, Map<String, Object> m, String path, String coordenates,
        float fontSize, float ancho) throws IOException {
    long t = System.currentTimeMillis();
    PDDocument doc = null;
    try {
        doc = PDDocument.load(new File(path));

        List<?> pages = doc.getDocumentCatalog().getAllPages();
        PDPage sourcePage = (PDPage) pages.get(0);

        boolean append = sourcePage.getContents() != null;
        PDPageContentStream contentStream = new PDPageContentStream(doc, sourcePage, append, true);
        try {
            StringReader fileReader = null;
            try {
                fileReader = new StringReader(template);
                boolean textHasBegun = false;
                float currentOffset = 0f;
                for (String line : CharStreams.readLines(fileReader)) {

                    if (line.startsWith("#")) {
                        continue;
                    }

                    final Iterable<String> str = Splitter.on(',').omitEmptyStrings().trimResults().split(line);
                    final String[] split = Iterables.toArray(str, String.class);
                    if (split.length < 4) {
                        continue;
                    }

                    if (Character.isDigit(split[0].charAt(0))) {
                        // Positioned entry: start a fresh text object.
                        if (textHasBegun) {
                            contentStream.endText();
                        }
                        contentStream.beginText();
                        textHasBegun = true;
                        contentStream.moveTextPositionByAmount(parseFloat(split[0]), parseFloat(split[1]));
                    } else {
                        // Continuation entry. The text object must be open
                        // BEFORE the position is moved; the fallback used to
                        // run only after the move, which is too late.
                        if (!textHasBegun) {
                            LOGGER.warn("Hay un posible mal uso de un .ree", new Throwable());
                            contentStream.beginText();
                            textHasBegun = true;
                        }
                        contentStream.moveTextPositionByAmount(currentOffset, 0);
                    }

                    PDType1Font font;
                    if ("b".equals(split[2])) {
                        font = HELVETICA_BOLD;
                    } else {
                        font = HELVETICA;
                    }
                    contentStream.setFont(font, fontSize);

                    Object text = resolveText(m, split[3]);

                    if (text == null) {
                        LOGGER.warn("Propiedad {} no se encuentra", split[3]);
                        contentStream.drawString(" ");
                    } else {
                        String string = text.toString();
                        currentOffset = font.getStringWidth(string) * ancho;
                        contentStream.drawString(string);
                    }
                }

                if (textHasBegun) {
                    contentStream.endText();
                }
            } finally {
                Closeables.closeQuietly(fileReader);
            }
        } finally {
            // Close the stream on failure too; on success this still happens
            // before the document is saved, as before.
            contentStream.close();
        }

        try {
            doc.save(os);
        } catch (COSVisitorException e) {
            throw new IOException("Ha ocurrido un error al escribir en el Os", e);
        }
    } finally {
        if (doc != null) {
            doc.close();
        }
        LOGGER.info("Me ha tomado {} milisegundos hacer el pdf", System.currentTimeMillis() - t);
    }
}

/**
 * Resolves the text for a template field: a field starting with a double
 * quote yields its content without the first and last character; any other
 * field is used as a key into {@code m}.
 *
 * TODO: only plain keys are looked up; nested property expressions are not
 * resolved and yield {@code null}.
 */
private static Object resolveText(Map<String, Object> m, String field) {
    if (field.startsWith("\"")) {
        return field.length() >= 2 ? field.substring(1, field.length() - 1) : "";
    }
    return m == null ? null : m.get(field);
}

From source file:net.bookinaction.TextInfoExtractor.java

License:Apache License

/**
 * Records the positions of lines, tokens and images found on one page to
 * {@code writer} and, in test mode, draws their bounding boxes on the page.
 *
 * @param document      the document containing the page
 * @param stripperParam configuration passed to the text location listener
 * @param pageNum       1-based page number
 * @param writer        receives the recorded page size and positions
 * @param testMode      when true, boxes are drawn on the page for every
 *                      character, line, token and image
 * @throws IOException if processing the page or writing to it fails; note
 *                     that a failure of the initial text extraction makes
 *                     the method return silently instead
 */
public static void getTextPositionFromPage(PDDocument document, StripperParam stripperParam, int pageNum,
        PrintWriter writer, boolean testMode) throws IOException {

    PDPage page = document.getPage(pageNum - 1); // pdfbox uses the 0-base index
    PDRectangle cropBox = page.getCropBox();

    // extract image locations
    ImageLocationListener imageLocationsListener = new ImageLocationListener();

    List<Rectangle2D> imageRects = new ArrayList<Rectangle2D>();
    imageLocationsListener.setImageRects(imageRects);
    imageLocationsListener.processPage(page);

    // extract Text locations
    StripString stripString = new StripString();

    TextLocationListener stripper = new TextLocationListener(stripperParam, stripString);
    stripper.setSortByPosition(true);

    List<StripLine> stripLines = new ArrayList<StripLine>();

    stripper.setStartPage(pageNum);
    stripper.setEndPage(pageNum);

    try {
        // The written text itself is discarded.
        stripper.writeText(document, new OutputStreamWriter(new ByteArrayOutputStream()));
    } catch (IOException e) {
        // Existing behaviour kept: the page is skipped without any report.
        // NOTE(review): consider propagating, since the method declares IOException.
        return;
    }

    if (page.getContents() != null)
        stripper.processPage(page);

    // declare canvas and keep this position
    PDPageContentStream canvas = new PDPageContentStream(document, page, true, true, true);
    try {
        Stamper s = new Stamper(); // utility class

        if (testMode) {
            // draw the bounding box of each character
            for (int i = 0; i < stripString.size(); i++) {
                // original Rectangle
                s.showBox(canvas, stripString.boundingRect(i), cropBox, Color.GRAY80);
            }
        }

        s.recordPageSize(writer, pageNum, cropBox);

        // splits into lines
        int lineNum = 1;
        int lineStart = 0, lineEnd = 0;
        String[] splits = stripString.toString().split("\r");

        SimpleTokenizer simpleTokenizer = new SimpleTokenizer();

        for (String lineText : splits) {

            if (lineText.length() < 1) {
                // An empty line still accounts for one "\r" in the text that
                // was split; step over it so the offsets of all following
                // lines stay aligned.
                lineStart++;
                continue;
            }

            lineEnd = lineStart + lineText.length();

            Rectangle2D mergedRect = stripString.boundingRect(lineStart, lineEnd - 1);
            String sub = stripString.substring(lineStart, lineEnd);

            stripLines.add(new StripLine(pageNum, lineNum, lineStart, lineEnd, mergedRect));

            if (testMode) {
                s.showBox(canvas, mergedRect, cropBox, Color.GREEN);
            }

            s.recordTextPosition(writer, sub, pageNum, mergedRect, "LINE");

            // get words in the line
            List<Token> tokens = simpleTokenizer.getTokens(sub);

            for (String pattern : circles_patterns) {
                List<Token> symbolTokens = PatternAnalyzer.getTokensByPattern(sub, pattern);
                tokens.addAll(symbolTokens);
            }

            for (Token t : tokens) {
                // Token offsets are relative to the line, hence the lineStart shift.
                mergedRect = stripString.boundingRect(lineStart + t.getStart(), lineStart + t.getEnd() - 1);

                s.recordTextPosition(writer, t.getStem(), pageNum, mergedRect, "TEXT");

                if (testMode) {
                    s.showBox(canvas, mergedRect, cropBox, Color.RED);
                }

            }

            // +1 skips the "\r" that terminated this line
            lineStart += lineText.length() + 1;
            lineNum++;
        }

        // record (and in test mode outline) every image found on the page
        int imageNum = 1;
        for (Rectangle2D imRect : imageRects) {

            if (testMode) {
                s.showBox(canvas, imRect, cropBox, Color.YELLOW);
            }
            s.recordTextPosition(writer, "[image" + imageNum + "]", pageNum, imRect, "IMAGE");

            imageNum++;
        }
    } finally {
        // Close the canvas even when recording or drawing fails.
        canvas.close();
    }
}