List of usage examples for the method org.apache.pdfbox.pdmodel.PDPage#getContents
@Override public InputStream getContents() throws IOException
From source file:com.giaybac.traprange.test.TESTPDFBox.java
License:MIT License
@Test public void test() throws IOException { String filePath = "D:\\traprange\\_Docs\\TK0976-AB5-0-2014042211.pdf"; //String filePath = "C:\\Users\\ThoLuong\\Downloads\\Download\\1986 NL Batting - Sheet1.pdf"; File pdfFile = new File(filePath); PDDocument pdDocument = PDDocument.load(pdfFile); //PrintTextLocations printer = new PrinTextLocations(); List pages = pdDocument.getDocumentCatalog().getAllPages(); PDPage page = (PDPage) pages.get(0); PDStream stream = page.getContents(); this.processStream(page, page.findResources(), stream.getStream()); //Print out all text ranges.sort(new Comparator<Range>() { @Override// w w w. ja v a2s.co m public int compare(Range o1, Range o2) { return o1.lowerEndpoint().compareTo(o2.lowerEndpoint()); } }); for (Range range : ranges) { System.out.println("> " + range); } //Print out all ranges List<Range<Integer>> trapRanges = trapRangeBuilder.build(); for (Range trapRange : trapRanges) { System.out.println("TrapRange: " + trapRange); } }
From source file:com.odc.pdfextractor.parser.CleanPdfParser.java
License:Apache License
/**
 * Loads a PDF, runs this stream engine over every page that has a content
 * stream, and returns the document structure accumulated by {@code docBuilder}.
 *
 * <p>An encrypted document is decrypted with the empty password. If that
 * password is rejected, the error is reported on {@code System.err} and the
 * {@link InvalidPasswordException} is propagated to the caller (it used to
 * terminate the JVM via {@code System.exit(1)}).
 *
 * @param filename path of the PDF file to load
 * @return the result of {@code docBuilder.getDoc()} after all pages were processed
 * @throws Exception if the document cannot be loaded, decrypted or parsed
 */
public DocumentLocation processPdf(String filename) throws Exception {
    PDDocument document = null;
    try {
        document = PDDocument.load(filename);
        if (document.isEncrypted()) {
            try {
                document.decrypt("");
            } catch (InvalidPasswordException e) {
                System.err.println("Error: Document is encrypted with a password.");
                // Let the caller decide how to react; a parser must not kill the JVM.
                throw e;
            }
        }

        List allPages = document.getDocumentCatalog().getAllPages();
        System.out.print("Extracting text from PDF");
        for (int i = 0; i < allPages.size(); i++) {
            PDPage page = (PDPage) allPages.get(i);
            System.out.print(".");
            PDStream contents = page.getContents();
            if (contents != null) {
                // Reuse the stream fetched above instead of looking it up a second time.
                this.processStream(page, page.findResources(), contents.getStream());
            }
            // The page counter advances even for pages without a content stream.
            docBuilder.incrementPage();
        }
    } finally {
        // Terminates the progress-dot line started above.
        System.out.println();
        if (document != null) {
            document.close();
        }
    }
    return docBuilder.getDoc();
}
From source file:com.santaanna.friendlyreader.pdfstod.pdfstod3.ReplaceStringStreamEngine.java
License:Apache License
/**
 * Summarizes the PDF document: loads {@code inputFile}, runs several passes
 * over all pages (collect text, split into sentences, split strings at
 * sentence boundaries, insert gray operators, build text-block structures,
 * highlight), writes the stored page tokens back into each page and finally
 * saves and closes the document.
 *
 * <p>The method works almost entirely through fields of the enclosing class
 * ({@code doc1}, {@code helaTexten}, {@code meningsvektor}, {@code mind},
 * {@code page}, {@code page1}, {@code PageVector}, {@code pageTokens}, ...),
 * so the order of the passes matters.
 *
 * @param inputFile   The PDF to open.
 * @param outputFile1 The PDF to write to; copied into the {@code outputFile}
 *                    field and handed to {@code saveAndClose}.
 * @param DoHighlight When true (and some text was collected) the text is
 *                    passed to {@code sammanfatta}; also forwarded to
 *                    {@code highlight}.
 * @param sumslidval  Forwarded unchanged to {@code sammanfatta}.
 * @param valdmening  Forwarded unchanged to {@code highlight}.
 * @return the sentence vector ({@code meningsvektor}) built from the document text.
 *
 * @throws IOException If there is an error writing the data.
 * @throws COSVisitorException If there is an error writing the PDF.
 */
public Collection<SEmening> doIt(String inputFile, String outputFile1, boolean DoHighlight, int sumslidval,
        int valdmening) throws IOException, COSVisitorException {
    // NOTE(review): gop, cfloat5, cfloat1, gray1 and filesaved are only referenced
    // from code that is commented out further down; they are unused in the live path.
    PDFOperator gop = PDFOperator.getOperator("g");
    COSFloat cfloat5 = new COSFloat("0.25");
    COSFloat cfloat1 = new COSFloat("0.75");
    Boolean gray1 = true;
    outputFile = outputFile1;
    String meningsrest = ""; // remainder of the current sentence carried from page to page
    String sidtext = ""; // text of the page currently being read
    Boolean filesaved = false;
    try {
        helaTexten = "";
        SkrivUt(3, "Fre DoIt doc1 load");
        doc1 = PDDocument.load(inputFile); // The input document.
        SkrivUt(3, "Efter DoIt doc1 load");
        List pages = doc1.getDocumentCatalog().getAllPages();

        // Pass 1: collect the whole text of the document, page by page.
        // (Converting relative to absolute positioning was planned here but is
        // not implemented, according to the original author's note.)
        for (int i = 0; i < pages.size(); i++) {
            SkrivUt(4, "Ny sida helaTexten: " + i);
            PDPage page = (PDPage) pages.get(i);
            PDStream contents = page.getContents(); // AH: code taken from PageDrawer
            if (contents != null) {
                // NOTE(review): 'resources' is not used in the live code of this loop.
                PDResources resources = page.findResources();
                SkrivUt(4, "Fre getHelaTexten.");
                setSumcharAlla(0); // Reset the character counter for strings.
                sidtext = getHelaTexten(page.getContents().getStream());
                helaTexten += sidtext; // Accumulate; deliberately NOT reset between pages.
                SkrivUt(4, "Efter getHelaTexten.");
                // (A disabled second processStream/split-strings pass used to follow here.)
            }
        }

        // Extract the sentences from the whole text.
        meningsvektor = Hittameningarna(helaTexten); // Split the text into sentences.
        SkrivUt(1, "Meningsvektor.Size: " + meningsvektor.size());
        SEmening semen = null; // NOTE(review): only used by commented-out code.
        for (int n = 0; n < meningsvektor.size(); n++) {
            SkrivUt(1, "Mening: " + meningsvektor.get(n).helameningen);
        }

        // Pass 2: strings and arrays are to be split at sentence boundaries.
        mind = 0;
        // NOTE(review): get(mind) presumably throws if Hittameningarna returned an
        // empty collection (e.g. a document without text) - confirm and guard.
        meningsrest = meningsvektor.get(mind).helameningen; // Remaining text on the current page.
        for (int i = 0; i < pages.size(); i++) {
            SkrivUt(4, "Ny sida Split: " + i);
            // Original note: "PDPage changed to local variable" - page1 is nevertheless
            // assigned without a declaration here, so it is declared outside this method.
            page1 = (PDPage) pages.get(i);
            PDStream contents = page1.getContents(); // AH: code taken from PageDrawer
            SkrivUt(4, "Innan contents test.");
            if (contents != null) {
                PDResources resources = page1.findResources();
                SkrivUt(3, "Fre splitMeningar 1.");
                setSumcharAlla(0); // Reset the character counter for strings.
                SkrivUt(4, "Fre splitMeningar 2.");
                meningsrest = splitMeningar(meningsrest, page1.getContents().getStream());
                SkrivUt(4, "*** meningsrest: " + meningsrest);
                SkrivUt(4, "3, Efter splitMeningar.");
            }
        }

        // (Original author flagged the following disabled idea for review: save and
        // close here, then reload the input document before continuing.)

        // Pass 3: gray ("g") operators are to be added for every TJ and Tj.
        mind = 0; // Original author's question: is this reset needed here?
        meningsrest = meningsvektor.get(mind).helameningen; // Remaining text on the current page.
        for (int i = 0; i < pages.size(); i++) {
            SkrivUt(4, "Ny sida Gray: " + i);
            page1 = (PDPage) pages.get(i);
            PDStream contents = page1.getContents(); // AH: code taken from PageDrawer
            SkrivUt(4, "Innan contents test.");
            if (contents != null) {
                PDResources resources = page1.findResources();
                SkrivUt(3, "Fre GrayInsert 1.");
                setSumcharAlla(0); // Reset the character counter for strings.
                meningsrest = grayInsert(meningsrest, page1.getContents().getStream(), i);
                SkrivUt(3, "Efter grayInsert av sida.");
            }
        }
        SkrivUt(3, "Efter hela grayInsert.");

        // Pass 4: build the text-block (TB) and sentence structures.
        // NOTE(review): unlike the other passes, "mind = 0;" appears commented out in
        // front of this pass, so mind keeps whatever value the previous pass left -
        // confirm that this is intended.
        // mind = 0;
        meningsrest = meningsvektor.get(mind).helameningen; // Remaining text on the current page.
        for (int i = 0; i < pages.size(); i++) {
            TBIndex = 0; // Index into the text-block structure.
            // The data structures are created and used here.
            TBVector = new Vector<SETextBlock>(); // The TB vector for this page.
            PageVector.add(i, TBVector); // Add this page's TB vector to the overall structure.
            tbpagenr = i;
            SkrivUt(4, "Ny sida Split: " + i);
            page1 = (PDPage) pages.get(i);
            PDStream contents = page1.getContents(); // AH: code taken from PageDrawer
            SkrivUt(4, "Innan contents test.");
            if (contents != null) {
                PDResources resources = page1.findResources();
                SkrivUt(3, "Fre byggStrukturer 1.");
                setSumcharAlla(0); // Reset the character counter for strings.
                SkrivUt(4, "Fre byggStrukturer 2.");
                meningsrest = byggStrukturer(meningsrest, page1.getContents().getStream(), i);
                SkrivUt(4, "*** meningsrest: " + meningsrest);
                SkrivUt(3, "Efter byggStrukturer.");
            }
        }
        // Print the contents of the TB structure.
        listTextBlocks();
        // Print the sentences.
        listMeningar();

        // Pass 5: meant to collect font metrics into the TB structures. The actual
        // processStream call is commented out, so in its current form the loop only
        // records the page number, resets the counter and logs.
        SkrivUt(4, "Fr lngt.");
        mind = 0; // Original author's question: is this reset needed here?
        for (int i = 0; i < pages.size(); i++) {
            tempsidnr = i;
            SkrivUt(4, "Ny sida A: " + i);
            PDPage page = (PDPage) pages.get(i);
            PDStream contents = page.getContents(); // AH: code taken from PageDrawer
            if (contents != null) {
                PDResources resources = page.findResources();
                SkrivUt(4, "Fre processStream.");
                setSumcharAlla(0); // Reset the character counter for strings.
                // (Disabled: processStream(page, resources, ...) - the page handling call.)
                SkrivUt(4, "Efter processStream. fre nya");
            }
        }
        SkrivUt(3, "Efter processStream.");

        // The text is handed to the summarizer, which returns the sentences that are
        // to be highlighted. Skipped when highlighting is off or no text was found.
        if ((DoHighlight) && !(helaTexten.equals(""))) {
            SkrivUt(2, "Fre sammanfatta. helaTexten = \"\"");
            menisammanfattningen = sammanfatta(helaTexten, sumslidval);
            System.out.println(menisammanfattningen);
        } else
            menisammanfattningen = null;

        // Pass 6: the arguments of the "g" operators are modified for the sentences
        // that are part of the summary.
        mind = 0; // Original author's question: is this reset needed here?
        meningsrest = meningsvektor.get(mind).helameningen; // Remaining text on the current page.
        cosenr = 0; // Index of the COSString or COSArray.
        mennr = 0; // Index of the current sentence.
        mendelnr = 0;
        mendelantal = 0; // Number of parts the sentence consists of.
        valdsida = -1; // The selected page is not known yet.
        for (int i = 0; i < pages.size(); i++) {
            SkrivUt(4, "Ny sida highlight: " + i);
            page1 = (PDPage) pages.get(i);
            PDStream contents = page1.getContents(); // AH: code taken from PageDrawer
            SkrivUt(4, "Innan contents test.");
            if (contents != null) {
                PDResources resources = page1.findResources();
                SkrivUt(1, "Fre highlight 1. Sida: " + i);
                setSumcharAlla(0); // Reset the character counter for strings.
                // highlight() is called regardless of DoHighlight; the flag is passed in.
                meningsrest = highlight(meningsrest, page1.getContents().getStream(), i, DoHighlight,
                        valdmening);
                SkrivUt(1, "Efter highlight av sida:" + i);
            }
        }
        SkrivUt(3, "Efter hela highlight.");

        // Pass 7: originally meant to fetch font metrics; now it only writes the
        // stored tokens of each page back into the document.
        // Original author's note: "should not be done at present".
        // NOTE(review): PageVector.add(i, ...) is called a second time per page here,
        // inserting a fresh empty vector at index i after pass 4 already added one -
        // confirm whether that is intended.
        for (int i = 0; i < pages.size(); i++) {
            TBVector = new Vector<SETextBlock>(); // The TB vector for this page.
            PageVector.add(i, TBVector); // Add the TB vector for this page.
            SkrivUt(4, "Ny sida X: " + i);
            // Original note: PDPage changed, "not local any longer" - 'page' is assigned
            // without a declaration here, so it is declared outside this method.
            page = (PDPage) pages.get(i);
            PDStream contents = page.getContents(); // AH: code taken from PageDrawer
            if (contents != null) {
                PDResources resources = page.findResources();
                SkrivUt(4, "Fre processStream.");
                setSumcharAlla(0); // Reset the character counter for strings.
                // (Disabled: processStream call that was used to fetch font metrics.)
                SkrivUt(4, "Efter andra processStream.");
            }
            SkrivUt(3, "Efter hela andra processStream.");

            // (Disabled: a token loop derived from the PDFBox replace-string example that
            // rewrote Tj/TJ operands and inserted alternating gray operators. The tokens
            // are NOT re-parsed from the file; previously stored data is used instead.)

            // Now that the tokens are updated, replace the page content stream
            // with the tokens stored for page i.
            SkrivUt(3, ">>> Fre spara tokens i DoIt.");
            PDStream updatedStream = new PDStream(doc1);
            SkrivUt(3, ">>> Efter updated stream i DoIt.");
            // NOTE(review): 'out' is never closed or flushed in this method - confirm
            // that ContentStreamWriter/PDStream do not require it.
            OutputStream out = updatedStream.createOutputStream();
            ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
            tokenWriter.writeTokens(pageTokens.get(i));
            page.setContents(updatedStream);
            SkrivUt(3, ">>> Efter spara tokens i DoIt.");
        }
    } finally {
        // Runs on success and on failure alike; saving and closing are delegated.
        saveAndClose(outputFile, doc1);
    }
    return meningsvektor;
}
From source file:de.tudarmstadt.ukp.dkpro.core.io.pdf.PdfLayoutEventStripper.java
License:Apache License
/**
 * Walks over every page, bumping the running page counter, and hands each
 * page that actually has a content stream to {@code processPage}.
 *
 * @param pages
 *            The pages of the document.
 *
 * @throws IOException
 *             If a page's content cannot be read or parsed.
 */
protected void processPages(List<PDPage> pages) throws IOException {
    maxPage = pages.size();
    for (final PDPage current : pages) {
        // The counter advances for every page, including pages without content.
        currentPageNo++;
        final PDStream pageContents = current.getContents();
        if (pageContents == null) {
            continue;
        }
        processPage(current, pageContents.getStream());
    }
}
From source file:edu.isi.bmkeg.lapdf.extraction.LAPDFTextStripper.java
License:Apache License
/** * This will process all of the pages and the text that is in them. * * @param pages The pages object in the document. * * @throws IOException If there is an error parsing the text. *//*from w w w .j a va 2 s. c o m*/ protected void processPages(List<COSObjectable> pages) throws IOException { if (startBookmark != null) { startBookmarkPageNumber = getPageNumber(startBookmark, pages); } if (endBookmark != null) { endBookmarkPageNumber = getPageNumber(endBookmark, pages); } if (startBookmarkPageNumber == -1 && startBookmark != null && endBookmarkPageNumber == -1 && endBookmark != null && startBookmark.getCOSObject() == endBookmark.getCOSObject()) { //this is a special case where both the start and end bookmark //are the same but point to nothing. In this case //we will not extract any text. startBookmarkPageNumber = 0; endBookmarkPageNumber = 0; } Iterator<COSObjectable> pageIter = pages.iterator(); while (pageIter.hasNext()) { PDPage nextPage = (PDPage) pageIter.next(); PDStream contentStream = nextPage.getContents(); currentPageNo++; if (contentStream != null) { COSStream contents = contentStream.getStream(); processPage(nextPage, contents); } } }
From source file:edworld.pdfreader4humans.impl.MainPDFComponentLocator.java
License:Apache License
/**
 * Extracts the text of one page as a sorted list of {@code TextComponent}s,
 * using an anonymous {@code PDFTextStripper} subclass whose
 * {@code writeString} override records positions instead of writing text.
 *
 * <p>Side effect: grid components that are found overlapping a character and
 * get fused into it are removed from {@code gridComponents}.
 *
 * @param page           the page to extract text from
 * @param gridComponents shapes already located on the page; may be modified
 * @return the text components of the page after joining and sorting
 * @throws IOException if the page content cannot be processed
 */
protected List<TextComponent> locateAllTextComponents(final PDPage page,
        final List<GridComponent> gridComponents) throws IOException {
    return new PDFTextStripper() {
        // Maps a two-character pair (last char of the text + first char of the
        // candidate) to its replacement string.
        private Map<String, String> fusions;
        List<Component> horizontalComponents;
        // Components collected by writeString during processPage.
        private ArrayList<TextComponent> list;

        {
            // Instance initializer: both registered pairs map to the empty string,
            // i.e. fusing "o"/"a" with "-" removes the character.
            // NOTE(review): presumably this handles underlined ordinal indicators - confirm.
            fusions = new HashMap<String, String>();
            fusions.put("o-", "");
            fusions.put("a-", "");
        }

        /** Processes the page (if it has contents), then joins and sorts the collected components. */
        public List<TextComponent> locateTextComponents() throws IOException {
            horizontalComponents = Component.horizontal(gridComponents);
            list = new ArrayList<TextComponent>();
            PDStream contents = page.getContents();
            setStartPage(getCurrentPageNo());
            setEndPage(getCurrentPageNo());
            setSortByPosition(false);
            if (contents != null) {
                output = new StringWriter();
                processPage(page, contents.getStream());
            }
            joinConsecutiveTexts(list);
            Collections.sort(list);
            return list;
        }

        /** Merges each pair of neighbours for which {@code consecutive(next, false)} holds, in place. */
        protected void joinConsecutiveTexts(ArrayList<TextComponent> textComponents) {
            for (int i = 0; i < textComponents.size() - 1; i++) {
                TextComponent currentComponent = textComponents.get(i);
                TextComponent nextComponent = textComponents.get(i + 1);
                if (currentComponent.consecutive(nextComponent, false)) {
                    textComponents.set(i, joinTextComponents(currentComponent, SPACE, nextComponent));
                    textComponents.remove(i + 1);
                    // Stay on the same index so the merged component is compared
                    // with its new neighbour on the next iteration.
                    i--;
                }
            }
        }

        /**
         * Builds one component spanning both inputs: texts joined by the separator,
         * X range from component1's start to component2's end, Y range covering both,
         * font name and size taken from component1.
         */
        protected TextComponent joinTextComponents(TextComponent component1, String separatorCharacter,
                TextComponent component2) {
            return new TextComponent(component1.getText() + separatorCharacter + component2.getText(),
                    component1.getFromX(), Math.min(component1.getFromY(), component2.getFromY()),
                    component2.getToX(), Math.max(component1.getToY(), component2.getToY()),
                    component1.getFontName(), component1.getFontSize());
        }

        /**
         * Turns a run of text positions into one or more components added to {@code list}.
         * When a position starts left of the previous one, the text gathered so far is
         * emitted and the remainder is handled by a recursive call.
         */
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
            // Bounding box and font of the component being accumulated.
            float fromX = Float.POSITIVE_INFINITY;
            float fromY = Float.POSITIVE_INFINITY;
            float toX = Float.NEGATIVE_INFINITY;
            float toY = Float.NEGATIVE_INFINITY;
            float fontSize = -1;
            String fontName = "";
            List<TextPosition> partialList = new ArrayList<TextPosition>();
            String partialText = "";
            // Horizontal extent of the previously visited position.
            float lastLeft = Float.NEGATIVE_INFINITY;
            float lastRight = Float.NEGATIVE_INFINITY;
            for (TextPosition textPosition : textPositions) {
                String character = textPosition.getCharacter();
                // A horizontal shape of (almost) the same width over the character is
                // treated as "-" and fused into the character when a fusion applies.
                Component overlappingShape = findOverlappingHorizontalShape(textPosition);
                if (overlappingShape != null && fusible(character, "-")) {
                    character = fusion(character, "-");
                    removeOverlappingShape(overlappingShape);
                }
                float x1 = textPosition.getX();
                float y1 = textPosition.getY();
                if (x1 < lastLeft) {
                    // X moved backwards: close the current component and recurse on the rest.
                    // NOTE(review): the remainder is located via partialText.length() and
                    // partialList.size(); these can differ from the consumed input once a
                    // fusion changed the text - confirm the offsets stay aligned.
                    list.add(new TextComponent(partialText, fromX, fromY, toX, toY, fontName, fontSize));
                    writeString(text.substring(partialText.length()),
                            textPositions.subList(partialList.size(), textPositions.size()));
                    return;
                } else if (x1 < lastRight && fusible(partialText, character)) {
                    // The character overlaps the previous one horizontally: fuse instead of append.
                    partialText = fusion(partialText, character);
                } else {
                    if (x1 < fromX) {
                        // New leftmost position: it defines the origin and the font.
                        fromX = x1;
                        fromY = y1 - textPosition.getHeight();
                        fontName = textPosition.getFont().getBaseFont();
                        fontSize = textPosition.getFontSizeInPt();
                    }
                    partialList.add(textPosition);
                    partialText += character;
                }
                toX = Math.max(x1 + textPosition.getWidth(), toX);
                toY = Math.max(y1, toY);
                lastLeft = x1;
                lastRight = x1 + textPosition.getWidth();
            }
            list.add(new TextComponent(partialText, fromX, fromY, toX, toY, fontName, fontSize));
        }

        /**
         * Returns the first horizontal component that intersects the character's box and
         * whose width differs from it by less than 0.1, or null when there is none.
         */
        private Component findOverlappingHorizontalShape(TextPosition textPosition) {
            GridComponent component = new GridComponent("rect", textPosition.getX(),
                    textPosition.getY() - textPosition.getHeight(),
                    textPosition.getX() + textPosition.getWidth(), textPosition.getY(), 1);
            for (Component candidate : horizontalComponents)
                if (candidate.intersects(component)
                        && Math.abs(candidate.getWidth() - component.getWidth()) < 0.1)
                    return candidate;
            return null;
        }

        /** Removes the shape from the caller-supplied grid component list. */
        private void removeOverlappingShape(Component overlappingShape) throws IOException {
            gridComponents.remove(overlappingShape);
        }

        /**
         * True when the text ends with SPACE or the (last char, first char) pair is a registered fusion.
         * NOTE(review): an empty partialText or character would make fusionPair throw - confirm
         * callers never pass empty strings.
         */
        private boolean fusible(String partialText, String character) {
            return partialText.endsWith(SPACE) || fusions.containsKey(fusionPair(partialText, character));
        }

        /** Replaces the last char of the text: a trailing SPACE by the character, otherwise by the registered fusion. */
        private String fusion(String partialText, String character) {
            if (partialText.endsWith(SPACE))
                return partialText.substring(0, partialText.length() - 1) + character;
            return partialText.substring(0, partialText.length() - 1)
                    + fusions.get(fusionPair(partialText, character));
        }

        /** Key used in the fusion table: last char of the text followed by the first char of the candidate. */
        private String fusionPair(String partialText, String character) {
            return partialText.substring(partialText.length() - 1) + character.charAt(0);
        }
    }.locateTextComponents();
}
From source file:fi.nls.oskari.printout.printing.PDPageContentStream.java
License:Apache License
/** * Create a new PDPage content stream./*from w w w .j av a2 s .c o m*/ * * @param document * The document the page is part of. * @param sourcePage * The page to write the contents to. * @param appendContent * Indicates whether content will be overwritten. If false all * previous content is deleted. * @param compress * Tell if the content stream should compress the page contents. * @param resetContext * Tell if the graphic context should be reseted. * @throws IOException * If there is an error writing to the page contents. */ public PDPageContentStream(PDDocument document, PDPage sourcePage, boolean appendContent, boolean compress, boolean resetContext) throws IOException { page = sourcePage; resources = page.getResources(); if (resources == null) { resources = new PDResources(); page.setResources(resources); } // Get the pdstream from the source page instead of creating a new one PDStream contents = sourcePage.getContents(); boolean hasContent = contents != null; // If request specifies the need to append to the document if (appendContent && hasContent) { // Create a pdstream to append new content PDStream contentsToAppend = new PDStream(document); // This will be the resulting COSStreamArray after existing and new // streams are merged COSStreamArray compoundStream = null; // If contents is already an array, a new stream is simply appended // to it if (contents.getStream() instanceof COSStreamArray) { compoundStream = (COSStreamArray) contents.getStream(); compoundStream.appendStream(contentsToAppend.getStream()); } else { // Creates the COSStreamArray and adds the current stream plus a // new one to it COSArray newArray = new COSArray(); newArray.add(contents.getCOSObject()); newArray.add(contentsToAppend.getCOSObject()); compoundStream = new COSStreamArray(newArray); } if (compress) { List<COSName> filters = new ArrayList<COSName>(); filters.add(COSName.FLATE_DECODE); contentsToAppend.setFilters(filters); } if (resetContext) { // create a new stream to 
encapsulate the existing stream PDStream saveGraphics = new PDStream(document); output = saveGraphics.createOutputStream(); // save the initial/unmodified graphics context saveGraphicsState(); close(); // ? if (compress) { List<COSName> filters = new ArrayList<COSName>(); filters.add(COSName.FLATE_DECODE); saveGraphics.setFilters(filters); } // insert the new stream at the beginning compoundStream.insertCOSStream(saveGraphics); } // Sets the compoundStream as page contents sourcePage.setContents(new PDStream(compoundStream)); output = contentsToAppend.createOutputStream(); if (resetContext) { // restore the initial/unmodified graphics context restoreGraphicsState(); } } else { if (hasContent) { LOG.warn("You are overwriting an existing content, you should use the append mode"); } contents = new PDStream(document); if (compress) { List<COSName> filters = new ArrayList<COSName>(); filters.add(COSName.FLATE_DECODE); contents.setFilters(filters); } sourcePage.setContents(contents); output = contents.createOutputStream(); } formatDecimal.setMaximumFractionDigits(10); formatDecimal.setGroupingUsed(false); }
From source file:function.PrintImageLocations.java
License:Apache License
/**
 * Loads a fixed sample PDF and runs a {@code PrintImageLocations} engine
 * over every page that has a content stream.
 *
 * @param args The command line arguments (currently not used).
 *
 * @throws Exception If there is an error parsing the document.
 */
public static void main(String[] args) throws Exception {
    PDDocument document = null;
    try {
        document = PDDocument.load(new File("C:/Users/ATUL/Desktop/Page-layout/output1.pdf"));
        if (document.isEncrypted()) {
            document.decrypt("");
        }
        PrintImageLocations printer = new PrintImageLocations();
        List allPages = document.getDocumentCatalog().getAllPages();
        for (int i = 0; i < allPages.size(); i++) {
            System.out.println("\n***********************************************************");
            PDPage page = (PDPage) allPages.get(i);
            System.out.println("Processing page: " + (i + 1));
            // A page without a content stream has nothing to scan; calling
            // getStream() on the missing contents would throw a NullPointerException.
            if (page.getContents() != null) {
                printer.processStream(page, page.findResources(), page.getContents().getStream());
            }
        }
    } finally {
        if (document != null) {
            document.close();
        }
    }
}
From source file:name.marcelomorales.siqisiqi.pdfbox.CoordinatesGenerator.java
License:Apache License
public void generarPdf(OutputStream os, String template, Map<String, Object> m, String path, String coordenates, float fontSize, float ancho) throws IOException { long t = System.currentTimeMillis(); PDDocument doc = null;//from w w w. j a va 2s . c o m try { doc = PDDocument.load(new File(path)); List pages = doc.getDocumentCatalog().getAllPages(); PDPage sourcePage = (PDPage) pages.get(0); boolean append = sourcePage.getContents() != null; PDPageContentStream contentStream = new PDPageContentStream(doc, sourcePage, append, true); StringReader fileReader = null; try { fileReader = new StringReader(template); List<String> list = CharStreams.readLines(fileReader); boolean textHasBegun = false; float currentOffset = 0f; for (String line : list) { if (line == null) { continue; } if (line.startsWith("#")) { continue; } final Iterable<String> str = Splitter.on(',').omitEmptyStrings().trimResults().split(line); final String[] split = Iterables.toArray(str, String.class); if (split == null || split.length < 4) { continue; } if (Character.isDigit(split[0].charAt(0))) { if (textHasBegun) { contentStream.endText(); } contentStream.beginText(); textHasBegun = true; contentStream.moveTextPositionByAmount(parseFloat(split[0]), parseFloat(split[1])); } else { contentStream.moveTextPositionByAmount(currentOffset, 0); } if (!textHasBegun) { LOGGER.warn("Hay un posible mal uso de un .ree", new Throwable()); contentStream.beginText(); textHasBegun = true; } PDType1Font font; if ("b".equals(split[2])) { font = HELVETICA_BOLD; } else { font = HELVETICA; } contentStream.setFont(font, fontSize); Object text = null; if (split[3].startsWith("\"")) { // TODO: text = substring(split[3], 1, -1); } else { // TODO: text = new PropertyModel(m, split[3]).getObject(); } if (text == null) { LOGGER.warn("Propiedad {} no se encuentra", split[3]); //contentStream.drawString("ERROR: propiedad no encontrada"); contentStream.drawString(" "); } else { String string = text.toString(); currentOffset = 
font.getStringWidth(string) * ancho; contentStream.drawString(string); } } if (textHasBegun) { contentStream.endText(); } } finally { Closeables.closeQuietly(fileReader); } contentStream.close(); try { doc.save(os); } catch (COSVisitorException e) { throw new IOException("Ha ocurrido un error al escribir en el Os", e); } } finally { if (doc != null) { doc.close(); } LOGGER.info("Me ha tomado {} milisegundos hacer el pdf", System.currentTimeMillis() - t); } }
From source file:net.bookinaction.TextInfoExtractor.java
License:Apache License
public static void getTextPositionFromPage(PDDocument document, StripperParam stripperParam, int pageNum, PrintWriter writer, boolean testMode) throws IOException { //System.out.println(String.format("getPage: %d", pageNum)); PDPage page = document.getPage(pageNum - 1); // pdfbox uses the 0-base index PDRectangle cropBox = page.getCropBox(); // extract image locations ImageLocationListener imageLocationsListener = new ImageLocationListener(); List<Rectangle2D> imageRects = new ArrayList<Rectangle2D>(); imageLocationsListener.setImageRects(imageRects); imageLocationsListener.processPage(page); // extract Text locations StripString stripString = new StripString(); TextLocationListener stripper = new TextLocationListener(stripperParam, stripString); stripper.setSortByPosition(true);/*from w w w . j a v a2s .com*/ List<StripLine> stripLines = new ArrayList<StripLine>(); stripper.setStartPage(pageNum); stripper.setEndPage(pageNum); try { stripper.writeText(document, new OutputStreamWriter(new ByteArrayOutputStream())); } catch (IOException e) { return; } if (page.getContents() != null) stripper.processPage(page); // declare canvas and keep this position PDPageContentStream canvas = new PDPageContentStream(document, page, true, true, true); Stamper s = new Stamper(); // utility class if (testMode) { // draw the bounding box of each character for (int i = 0; i < stripString.size(); i++) { // original Rectangle s.showBox(canvas, stripString.boundingRect(i), cropBox, Color.GRAY80); } } s.recordPageSize(writer, pageNum, cropBox); // splits into lines int lineNum = 1; int lineStart = 0, lineEnd = 0; String[] splits = stripString.toString().split("\r"); SimpleTokenizer simpleTokenizer = new SimpleTokenizer(); for (String lineText : splits) { if (lineText.length() < 1) continue; lineEnd = lineStart + lineText.length(); Rectangle2D mergedRect = stripString.boundingRect(lineStart, lineEnd - 1); String sub = stripString.substring(lineStart, lineEnd); stripLines.add(new 
StripLine(pageNum, lineNum, lineStart, lineEnd, mergedRect)); //System.out.println(String.format("%d-%d: %s - [%.0f %.0f %.0f %.0f]", pageNum, lineNum, sub, // mergedRect.getX(), mergedRect.getY(), mergedRect.getWidth(), mergedRect.getHeight())); if (testMode) { s.showBox(canvas, mergedRect, cropBox, Color.GREEN); } s.recordTextPosition(writer, sub, pageNum, mergedRect, "LINE"); /******* get words in the line *********/ List<Token> tokens = simpleTokenizer.getTokens(sub); for (String pattern : circles_patterns) { List<Token> symbolTokens = PatternAnalyzer.getTokensByPattern(sub, pattern); tokens.addAll(symbolTokens); } for (Token t : tokens) { mergedRect = stripString.boundingRect(lineStart + t.getStart(), lineStart + t.getEnd() - 1); //System.out.println(String.format("%d-%d: %s - [%.0f %.0f %.0f %.0f]", pageNum, lineNum, t.getStem(), mergedRect.getX(), mergedRect.getY(), mergedRect.getWidth(), mergedRect.getHeight())); s.recordTextPosition(writer, t.getStem(), pageNum, mergedRect, "TEXT"); if (testMode) { s.showBox(canvas, mergedRect, cropBox, Color.RED); } } lineStart += lineText.length() + 1; lineNum++; } // ------------------- // markup textMark annotation to the image int imageNum = 1; for (Rectangle2D imRect : imageRects) { //page.getAnnotations().add(annotationMaker.textMarkupAnnotation(Color.YELLOW, (Rectangle2D.Float) imRect, "image"+imageNum)); if (testMode) { s.showBox(canvas, imRect, cropBox, Color.YELLOW); } s.recordTextPosition(writer, "[image" + imageNum + "]", pageNum, imRect, "IMAGE"); imageNum++; } canvas.close(); }