// Java tutorial
// Copyright 2012-01-10 PlanBase Inc. & Glen Peterson // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package com.planbase.pdf.layoutmanager; import org.apache.pdfbox.exceptions.COSVisitorException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.edit.PDPageContentStream; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace; import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; import org.apache.pdfbox.pdmodel.graphics.xobject.PDJpeg; import org.apache.pdfbox.pdmodel.graphics.xobject.PDPixelMap; import java.awt.Color; import java.awt.image.BufferedImage; import java.io.InputStream; import java.io.FileInputStream; import java.io.OutputStream; import java.io.File; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * <p>The main class in this package; it handles page and line breaks.</p> * <p/> * <h3>Usage (the unit test is a much better example):</h3> * <pre><code>// Create a new manager * PdfLayoutMgr pageMgr = PdfLayoutMgr.newRgbPageMgr(); * <p/> * LogicalPage lp = pageMgr.logicalPageStart(); * // defaults to Landscape 
orientation * // call various lp.tableBuilder() or lp.put...() methods here. * // They will page-break and create extra physical pages as needed. * // ... * lp.commit(); * <p/> * lp = pageMgr.logicalPageStart(LogicalPage.Orientation.PORTRAIT); * // These pages will be in Portrait orientation * // call various lp methods to put things on the next page grouping * // ... * lp.commit(); * <p/> * // The file to write to * OutputStream os = new FileOutputStream("test.pdf"); * <p/> * // Commit all pages to output stream. * pageMgr.save(os);</code></pre> * <br> * <h3>Note:</h3> * <p>Because this class buffers and writes to an underlying stream, it is mutable, has side effects, * and is NOT thread-safe!</p> */ public class PdfLayoutMgr { // private Logger logger = Logger.getLogger(PdfLayoutMgr.class); // logger.info("Ascent: " + PDType1Font.HELVETICA.getFontDescriptor().getAscent()); // logger.info("StemH: " + PDType1Font.HELVETICA.getFontDescriptor().getStemH()); // logger.info("CapHeight: " + PDType1Font.HELVETICA.getFontDescriptor().getCapHeight()); // logger.info("XHeight: " + PDType1Font.HELVETICA.getFontDescriptor().getXHeight()); // logger.info("Descent: " + PDType1Font.HELVETICA.getFontDescriptor().getDescent()); // logger.info("Leading: " + PDType1Font.HELVETICA.getFontDescriptor().getLeading()); // // logger.info("Height: " + PDType1Font.HELVETICA.getFontDescriptor().getFontBoundingBox().getHeight()); // // Ascent: 718.0 // StemH: 0.0 // CapHeight: 718.0 // XHeight: 523.0 // Descent: -207.0 // Leading: 0.0 // Height: 1156.0 // CapHeight - descent = 925 // 925 - descent = 1132 which is still less than 1156. // I'm going to make line-height = // Java FontMetrics says getHeight() = getAscent() + getDescent() + getLeading(). // I think ascent and descent are compatible with this. I'm going to make Leading be // -descent/2 /** * If you use no scaling when printing the output PDF, PDFBox shows approximately 72 * Document-Units Per Inch. 
This makes one pixel on an average desktop monitor correspond to * roughly one document unit. This is a useful constant for page layout math. */ public static final float DOC_UNITS_PER_INCH = 72f; // TODO: add Sensible defaults, such as textStyle? // private TextStyle textStyle; // private PDRectangle pageDimensions; // private Padding pageMargins; // private PDRectangle printableArea; // // public TextStyle textStyle() { return textStyle; } // public PDRectangle pageDimensions() { return pageDimensions; } // public Padding pageMargins() { return pageMargins; } // public PDRectangle printableArea() { return printableArea; } // You can have many DrawJpegs backed by only a few images - it is a flyweight, and this // hash map keeps track of the few underlying images, even as intances of DrawJpeg // represent all the places where these images are used. // CRITICAL: This means that the the set of jpgs must be thrown out and created anew for each // document! Thus, a private final field on the PdfLayoutMgr instead of DrawJpeg, and DrawJpeg // must be an inner class (or this would have to be package scoped). private final Map<BufferedImage, PDJpeg> jpegMap = new HashMap<BufferedImage, PDJpeg>(); private PDJpeg ensureCached(final ScaledJpeg sj) { BufferedImage bufferedImage = sj.bufferedImage(); PDJpeg temp = jpegMap.get(bufferedImage); if (temp == null) { try { temp = new PDJpeg(doc, bufferedImage); } catch (IOException ioe) { // can there ever be an exception here? Doesn't it get written later? throw new IllegalStateException("Caught exception creating a PDJpeg from a bufferedImage", ioe); } jpegMap.put(bufferedImage, temp); } return temp; } // You can have many DrawPngs backed by only a few images - it is a flyweight, and this // hash map keeps track of the few underlying images, even as intances of DrawPng // represent all the places where these images are used. // CRITICAL: This means that the the set of jpgs must be thrown out and created anew for each // document! 
Thus, a private final field on the PdfLayoutMgr instead of DrawPng, and DrawPng // must be an inner class (or this would have to be package scoped). private final Map<BufferedImage, PDPixelMap> pngMap = new HashMap<BufferedImage, PDPixelMap>(); private PDPixelMap ensureCached(final ScaledPng sj) { BufferedImage bufferedImage = sj.bufferedImage(); PDPixelMap temp = pngMap.get(bufferedImage); if (temp == null) { try { temp = new PDPixelMap(doc, bufferedImage); } catch (IOException ioe) { // can there ever be an exception here? Doesn't it get written later? throw new IllegalStateException("Caught exception creating a PDPixelMap from a bufferedImage", ioe); } pngMap.put(bufferedImage, temp); } return temp; } /** * Please don't access this class directly if you don't have to. It's a little bit like a model for stuff that * needs to be drawn on a page, but much more like a heap of random functionality that sort of landed in an * inner class. This will probably be refactored away in future releases. 
*/ static class PageBuffer { public final int pageNum; private long lastOrd = 0; private final Set<PdfItem> items = new TreeSet<PdfItem>(); private PageBuffer(int pn) { pageNum = pn; } void fillRect(final float xVal, final float yVal, final float w, final float h, final Color c, final float z) { items.add(FillRect.of(xVal, yVal, w, h, c, lastOrd++, z)); } // public void fillRect(final float xVal, final float yVal, final float w, final Color c, // final float h) { // fillRect(xVal, yVal, w, h, c, PdfItem.DEFAULT_Z_INDEX); // } // // public void drawJpeg(final float xVal, final float yVal, final BufferedImage bi, // final PdfLayoutMgr mgr, final float z) { // items.add(DrawJpeg.of(xVal, yVal, bi, mgr, lastOrd++, z)); // } void drawJpeg(final float xVal, final float yVal, final ScaledJpeg sj, final PdfLayoutMgr mgr) { items.add(DrawJpeg.of(xVal, yVal, sj, mgr, lastOrd++, PdfItem.DEFAULT_Z_INDEX)); } void drawPng(final float xVal, final float yVal, final ScaledPng sj, final PdfLayoutMgr mgr) { items.add(DrawPng.of(xVal, yVal, sj, mgr, lastOrd++, PdfItem.DEFAULT_Z_INDEX)); } private void drawLine(final float xa, final float ya, final float xb, final float yb, final LineStyle ls, final float z) { items.add(DrawLine.of(xa, ya, xb, yb, ls, lastOrd++, z)); } void drawLine(final float xa, final float ya, final float xb, final float yb, final LineStyle ls) { drawLine(xa, ya, xb, yb, ls, PdfItem.DEFAULT_Z_INDEX); } private void drawStyledText(final float xCoord, final float yCoord, final String text, TextStyle s, final float z) { items.add(Text.of(xCoord, yCoord, text, s, lastOrd++, z)); } void drawStyledText(final float xCoord, final float yCoord, final String text, TextStyle s) { drawStyledText(xCoord, yCoord, text, s, PdfItem.DEFAULT_Z_INDEX); } private void commit(PDPageContentStream stream) throws IOException { // Since items are z-ordered, then sub-ordered by entry-order, we will draw // everything in the correct order. 
for (PdfItem item : items) { item.commit(stream); } } private static class DrawLine extends PdfItem { private final float x1, y1, x2, y2; private final LineStyle style; private DrawLine(final float xa, final float ya, final float xb, final float yb, LineStyle s, final long ord, final float z) { super(ord, z); x1 = xa; y1 = ya; x2 = xb; y2 = yb; style = s; } public static DrawLine of(final float xa, final float ya, final float xb, final float yb, LineStyle s, final long ord, final float z) { return new DrawLine(xa, ya, xb, yb, s, ord, z); } @Override public void commit(PDPageContentStream stream) throws IOException { stream.setStrokingColor(style.color()); stream.setLineWidth(style.width()); stream.drawLine(x1, y1, x2, y2); } } private static class FillRect extends PdfItem { private final float x, y, width, height; private final Color color; private FillRect(final float xVal, final float yVal, final float w, final float h, final Color c, final long ord, final float z) { super(ord, z); x = xVal; y = yVal; width = w; height = h; color = c; } public static FillRect of(final float xVal, final float yVal, final float w, final float h, final Color c, final long ord, final float z) { return new FillRect(xVal, yVal, w, h, c, ord, z); } @Override public void commit(PDPageContentStream stream) throws IOException { stream.setNonStrokingColor(color); stream.fillRect(x, y, width, height); } } static class Text extends PdfItem { public final float x, y; public final String t; public final TextStyle style; private Text(final float xCoord, final float yCoord, final String text, TextStyle s, final long ord, final float z) { super(ord, z); x = xCoord; y = yCoord; t = text; style = s; } public static Text of(final float xCoord, final float yCoord, final String text, TextStyle s, final long ord, final float z) { return new Text(xCoord, yCoord, text, s, ord, z); } @Override public void commit(PDPageContentStream stream) throws IOException { stream.beginText(); 
stream.setNonStrokingColor(style.textColor()); stream.setFont(style.font(), style.fontSize()); stream.moveTextPositionByAmount(x, y); stream.drawString(t); stream.endText(); } } private static class DrawPng extends PdfItem { private final float x, y; private final PDPixelMap png; private final ScaledPng scaledPng; // private Log logger = LogFactory.getLog(DrawPng.class); private DrawPng(final float xVal, final float yVal, final ScaledPng sj, final PdfLayoutMgr mgr, final long ord, final float z) { super(ord, z); x = xVal; y = yVal; png = mgr.ensureCached(sj); scaledPng = sj; } public static DrawPng of(final float xVal, final float yVal, final ScaledPng sj, final PdfLayoutMgr mgr, final long ord, final float z) { return new DrawPng(xVal, yVal, sj, mgr, ord, z); } @Override public void commit(PDPageContentStream stream) throws IOException { // stream.drawImage(png, x, y); XyDim dim = scaledPng.dimensions(); stream.drawXObject(png, x, y, dim.x(), dim.y()); } } private static class DrawJpeg extends PdfItem { private final float x, y; private final PDJpeg jpeg; private final ScaledJpeg scaledJpeg; // private Log logger = LogFactory.getLog(DrawJpeg.class); private DrawJpeg(final float xVal, final float yVal, final ScaledJpeg sj, final PdfLayoutMgr mgr, final long ord, final float z) { super(ord, z); x = xVal; y = yVal; jpeg = mgr.ensureCached(sj); scaledJpeg = sj; } public static DrawJpeg of(final float xVal, final float yVal, final ScaledJpeg sj, final PdfLayoutMgr mgr, final long ord, final float z) { return new DrawJpeg(xVal, yVal, sj, mgr, ord, z); } @Override public void commit(PDPageContentStream stream) throws IOException { // stream.drawImage(jpeg, x, y); XyDim dim = scaledJpeg.dimensions(); stream.drawXObject(jpeg, x, y, dim.x(), dim.y()); } } } private final List<PageBuffer> pages = new ArrayList<PageBuffer>(); private final PDDocument doc; // pages.size() counts the first page as 1, so 0 is the appropriate sentinel value private int unCommittedPageIdx = 0; 
private final PDColorSpace colorSpace; List<PageBuffer> pages() { return Collections.unmodifiableList(pages); } private PdfLayoutMgr(PDColorSpace cs) throws IOException { doc = new PDDocument(); colorSpace = cs; } /** * Returns a new PdfLayoutMgr with the given color space. * * @param cs the color-space. * @return a new PdfLayoutMgr * @throws IOException */ public static PdfLayoutMgr of(PDColorSpace cs) throws IOException { return new PdfLayoutMgr(cs); } /** * Creates a new PdfLayoutMgr with the PDDeviceRGB color space. * * @return a new Page Manager with an RGB color space * @throws IOException */ @SuppressWarnings("UnusedDeclaration") // Part of end-user public interface public static PdfLayoutMgr newRgbPageMgr() throws IOException { return new PdfLayoutMgr(PDDeviceRGB.INSTANCE); } /** * Returns the correct page for the given value of y. This lets the user use any Y value and * we continue extending their canvas downward (negative) by adding extra pages. * * @param y the un-adjusted y value. * @return the proper page and adjusted y value for that page. */ LogicalPage.PageBufferAndY appropriatePage(LogicalPage lp, float y) { if (pages.size() < 1) { throw new IllegalStateException( "Cannot work with the any pages until one has been created by calling newPage()."); } int idx = unCommittedPageIdx; // Get the first possible page while (y < lp.yPageBottom()) { // logger.info("Adjusting y. Was: " + y + " about to add " + printAreaHeight); y += lp.printAreaHeight(); // y could even be negative. Just keep moving to the top of the next // page until it's in the printable area. idx++; if (pages.size() <= idx) { pages.add(new PageBuffer(pages.size() + 1)); } } PageBuffer ps = pages.get(idx); return new LogicalPage.PageBufferAndY(ps, y); } /** * Call this to commit the PDF information to the underlying stream after it is completely built. 
*/ public void save(OutputStream os) throws IOException, COSVisitorException { doc.save(os); } // TODO: Add logicalPage() method and call pages.add() lazily for the first item actually shown on a page, and logicalPageEnd called before a save. // TODO: Add feature for different paper size or orientation for each group of logical pages. /** * Tells this PdfLayoutMgr that you want to start a new logical page (which may be broken across * two or more physical pages) in the requested page orientation. */ @SuppressWarnings("UnusedDeclaration") // Part of end-user public interface public LogicalPage logicalPageStart(LogicalPage.Orientation o) { PageBuffer pb = new PageBuffer(pages.size() + 1); pages.add(pb); return LogicalPage.of(this, o); } /** * Get a new logical page (which may be broken across two or more physical pages) in Landscape orientation. */ public LogicalPage logicalPageStart() { return logicalPageStart(LogicalPage.Orientation.LANDSCAPE); } // void addLogicalPage(PageBuffer pb) { // pages.add(pb); // } /** * Call this when you are through with your current set of pages to commit all pending text and * drawing operations. This is the only method that throws an IOException because the purpose of * PdfLayoutMgr is to buffer all operations until a page is complete so that it can safely be * written to the underlying stream. This method turns the potential pages into real output. * Call when you need a page break, or your document is done and you need to write it out. * * @throws IOException - if there is a failure writing to the underlying stream. */ @SuppressWarnings("UnusedDeclaration") // Part of end-user public interface void logicalPageEnd(LogicalPage lp) throws IOException { // Write out all uncommitted pages. 
while (unCommittedPageIdx < pages.size()) { PDPage pdPage = new PDPage(); pdPage.setMediaBox(PDPage.PAGE_SIZE_LETTER); if (lp.orientation() == LogicalPage.Orientation.LANDSCAPE) { pdPage.setRotation(90); } PDPageContentStream stream = null; try { stream = new PDPageContentStream(doc, pdPage); doc.addPage(pdPage); if (lp.orientation() == LogicalPage.Orientation.LANDSCAPE) { stream.concatenate2CTM(0, 1, -1, 0, lp.pageWidth(), 0); } stream.setStrokingColorSpace(colorSpace); stream.setNonStrokingColorSpace(colorSpace); PageBuffer pb = pages.get(unCommittedPageIdx); pb.commit(stream); lp.commitBorderItems(stream); stream.close(); // Set to null to show that no exception was thrown and no need to close again. stream = null; } finally { // Let it throw an exception if the closing doesn't work. if (stream != null) { stream.close(); } } unCommittedPageIdx++; } } @Override public boolean equals(Object other) { // First, the obvious... if (this == other) { return true; } if (other == null) { return false; } if (!(other instanceof PdfLayoutMgr)) { return false; } // Details... final PdfLayoutMgr that = (PdfLayoutMgr) other; return this.doc.equals(that.doc) && (this.pages.equals(that.pages)); } @Override public int hashCode() { return doc.hashCode() + pages.hashCode(); } // public XyOffset putRect(XyOffset outerTopLeft, XyDim outerDimensions, final Color c) { //// System.out.println("putRect(" + outerTopLeft + " " + outerDimensions + " " + //// Utils.toString(c) + ")"); // putRect(outerTopLeft.x(), outerTopLeft.y(), outerDimensions.x(), outerDimensions.y(), c); // return XyOffset.of(outerTopLeft.x() + outerDimensions.x(), // outerTopLeft.y() - outerDimensions.y()); // } // /** // Puts text on the page. // @param x the x-value of the top-left corner. // @param origY the logical-page Y-value of the top-left corner. // @param cell the cell containing the styling and text to render. // @return the bottom Y-value (logical-page) of the rendered cell. 
// */ // public float putCell(final float x, float origY, final Cell cell) { // return cell.processRows(x, origY, false, this); // } private static final String ISO_8859_1 = "ISO_8859_1"; private static final String UNICODE_BULLET = "\u2022"; // PDFBox uses an encoding that the PDF spec calls WinAnsiEncoding. The spec says this is // Windows Code Page 1252. // http://en.wikipedia.org/wiki/Windows-1252 // It has a lot in common with ISO-8859-1, but it defines some additional characters such as // the Euro symbol. private static final Map<String, String> utf16ToWinAnsi; static { Map<String, String> tempMap = new HashMap<String, String>(); try { // 129, 141, 143, 144, and 157 are undefined in WinAnsi. // I had mapped A0-FF to 160-255 without noticing that that maps each character to // itself, meaning that Unicode and WinAnsii are the same in that range. // Unicode characters with exact WinAnsi equivalents tempMap.put("\u0152", new String(new byte[] { 0, (byte) 140 }, ISO_8859_1)); // OE tempMap.put("\u0153", new String(new byte[] { 0, (byte) 156 }, ISO_8859_1)); // oe tempMap.put("\u0160", new String(new byte[] { 0, (byte) 138 }, ISO_8859_1)); // S Acron tempMap.put("\u0161", new String(new byte[] { 0, (byte) 154 }, ISO_8859_1)); // s acron tempMap.put("\u0178", new String(new byte[] { 0, (byte) 159 }, ISO_8859_1)); // Y Diaeresis tempMap.put("\u017D", new String(new byte[] { 0, (byte) 142 }, ISO_8859_1)); // Capital Z-caron tempMap.put("\u017E", new String(new byte[] { 0, (byte) 158 }, ISO_8859_1)); // Lower-case Z-caron tempMap.put("\u0192", new String(new byte[] { 0, (byte) 131 }, ISO_8859_1)); // F with a hook (like jf put together) tempMap.put("\u02C6", new String(new byte[] { 0, (byte) 136 }, ISO_8859_1)); // circumflex (up-caret) tempMap.put("\u02DC", new String(new byte[] { 0, (byte) 152 }, ISO_8859_1)); // Tilde // Cyrillic letters map to their closest Romanizations according to ISO 9:1995 // http://en.wikipedia.org/wiki/ISO_9 // 
http://en.wikipedia.org/wiki/A_(Cyrillic) // Cyrillic extensions // 0400 Cyrillic capital letter IE WITH GRAVE // 0415 0300 (left-accent) tempMap.put("\u0400", new String(new byte[] { 0, (byte) 200 }, ISO_8859_1)); // 0401 ? Cyrillic capital letter IO // 0415 0308 (diuresis) tempMap.put("\u0401", new String(new byte[] { 0, (byte) 203 }, ISO_8859_1)); // 0402 Cyrillic capital letter DJE tempMap.put("\u0402", new String(new byte[] { 0, (byte) 208 }, ISO_8859_1)); // 0403 Cyrillic capital letter GJE // 0413 0301 (accent) // Ghe only maps to G-acute, which is not in our charset. // 0404 Cyrillic capital letter UKRAINIAN IE tempMap.put("\u0404", new String(new byte[] { 0, (byte) 202 }, ISO_8859_1)); // 0405 Cyrillic capital letter DZE tempMap.put("\u0405", "S"); // // 0406 Cyrillic capital letter BYELORUSSIAN- // UKRAINIAN I // 0049 I latin capital letter i // 0456 cyrillic small letter byelorussian- // ukrainian i // 04C0 cyrillic letter palochka tempMap.put("\u0406", new String(new byte[] { 0, (byte) 204 }, ISO_8859_1)); // 0407 Cyrillic capital letter YI // 0406 0308 (diuresis) tempMap.put("\u0407", new String(new byte[] { 0, (byte) 207 }, ISO_8859_1)); // 0408 Cyrillic capital letter JE // 0409 Cyrillic capital letter LJE // 040A Cyrillic capital letter NJE // 040B Cyrillic capital letter TSHE // 040C Cyrillic capital letter KJE // 041A 0301 (accent) // 040D ? Cyrillic capital letter I WITH GRAVE // 0418 0300 (accent) // 040E Cyrillic capital letter SHORT U // 0423 0306 (accent) // 040F ? Cyrillic capital letter DZHE // Basic Russian alphabet // See: http://www.unicode.org/charts/PDF/U0400.pdf // 0410 ? 
Cyrillic capital letter A => Latin A tempMap.put("\u0410", "A"); // 0411 Cyrillic capital letter BE => Latin B // 0183 latin small letter b with topbar tempMap.put("\u0411", "B"); // 0412 Cyrillic capital letter VE => Latin V tempMap.put("\u0412", "V"); // 0413 Cyrillic capital letter GHE => Latin G tempMap.put("\u0413", "G"); // 0414 Cyrillic capital letter DE => Latin D tempMap.put("\u0414", "D"); // 0415 Cyrillic capital letter IE => Latin E tempMap.put("\u0415", "E"); // 0416 Cyrillic capital letter ZHE => Z-caron tempMap.put("\u0416", new String(new byte[] { 0, (byte) 142 }, ISO_8859_1)); // 0417 Cyrillic capital letter ZE => Latin Z tempMap.put("\u0417", "Z"); // 0418 Cyrillic capital letter I => Latin I tempMap.put("\u0418", "I"); // 0419 Cyrillic capital letter SHORT I => Latin J // 0418 0306 (a little mark) // The two-character form (reversed N plus the mark) is not supported. tempMap.put("\u0419", "J"); // 041A Cyrillic capital letter KA => Latin K tempMap.put("\u041A", "K"); // 041B Cyrillic capital letter EL => Latin L tempMap.put("\u041B", "L"); // 041C Cyrillic capital letter EM => Latin M tempMap.put("\u041C", "M"); // 041D ? Cyrillic capital letter EN => Latin N tempMap.put("\u041D", "N"); // 041E Cyrillic capital letter O => Latin O tempMap.put("\u041E", "O"); // 041F Cyrillic capital letter PE => Latin P tempMap.put("\u041F", "P"); // 0420 Cyrillic capital letter ER => Latin R tempMap.put("\u0420", "R"); // 0421 Cyrillic capital letter ES => Latin S tempMap.put("\u0421", "S"); // 0422 Cyrillic capital letter TE => Latin T tempMap.put("\u0422", "T"); // 0423 Cyrillic capital letter U => Latin U // 0478 cyrillic capital letter uk // 04AF cyrillic small letter straight u // A64A cyrillic capital letter monograph uk tempMap.put("\u0423", "U"); tempMap.put("\u0478", "U"); // Is this right? tempMap.put("\u04AF", "U"); // Is this right? tempMap.put("\uA64A", "U"); // Is this right? 
// 0424 Cyrillic capital letter EF => Latin F tempMap.put("\u0424", "F"); // 0425 Cyrillic capital letter HA => Latin H tempMap.put("\u0425", "H"); // 0426 Cyrillic capital letter TSE => Latin C tempMap.put("\u0426", "C"); // 0427 Cyrillic capital letter CHE => Mapping to "Ch" because there is no // C-caron - hope this is the best choice! A also had this as "CH" but some make it // Tch as in Tchaikovsky, really didn't know what to do here. tempMap.put("\u0427", "Ch"); // 0428 Cyrillic capital letter SHA => S-caron tempMap.put("\u0428", new String(new byte[] { 0, (byte) 138 }, ISO_8859_1)); // 0429 Cyrillic capital letter SHCHA => Latin "Shch" because there is no // S-circumflex to map it to. Should it go to S-caron like SHA? tempMap.put("\u0429", "Shch"); // 042A Cyrillic capital letter HARD SIGN => Latin double prime, or in this case, // right double-quote. tempMap.put("\u042A", new String(new byte[] { 0, (byte) 148 }, ISO_8859_1)); // 042B Cyrillic capital letter YERU => Latin Y tempMap.put("\u042B", "Y"); // 042C Cyrillic capital letter SOFT SIGN => Latin prime, or in this case, // the right-single-quote. 
tempMap.put("\u042C", new String(new byte[] { 0, (byte) 146 }, ISO_8859_1)); // 042D Cyrillic capital letter E => Latin E-grave tempMap.put("\u042D", new String(new byte[] { 0, (byte) 200 }, ISO_8859_1)); // 042E Cyrillic capital letter YU => Latin U-circumflex tempMap.put("\u042E", new String(new byte[] { 0, (byte) 219 }, ISO_8859_1)); // 042F Cyrillic capital letter YA => A-circumflex tempMap.put("\u042F", new String(new byte[] { 0, (byte) 194 }, ISO_8859_1)); // 0430 Cyrillic small letter A tempMap.put("\u0430", "a"); // 0431 Cyrillic small letter BE tempMap.put("\u0431", "b"); // 0432 Cyrillic small letter VE tempMap.put("\u0432", "v"); // 0433 Cyrillic small letter GHE tempMap.put("\u0433", "g"); // 0434 Cyrillic small letter DE tempMap.put("\u0434", "d"); // 0435 Cyrillic small letter IE tempMap.put("\u0435", "e"); // 0436 Cyrillic small letter ZHE tempMap.put("\u0436", new String(new byte[] { 0, (byte) 158 }, ISO_8859_1)); // 0437 Cyrillic small letter ZE tempMap.put("\u0437", "z"); // 0438 Cyrillic small letter I tempMap.put("\u0438", "i"); // 0439 Cyrillic small letter SHORT I // 0438 0306 (accent) tempMap.put("\u0439", "j"); // 043A Cyrillic small letter KA tempMap.put("\u043A", "k"); // 043B Cyrillic small letter EL tempMap.put("\u043B", "l"); // 043C Cyrillic small letter EM tempMap.put("\u043C", "m"); // 043D Cyrillic small letter EN tempMap.put("\u043D", "n"); // 043E Cyrillic small letter O tempMap.put("\u043E", "o"); // 043F Cyrillic small letter PE tempMap.put("\u043F", "p"); // 0440 Cyrillic small letter ER tempMap.put("\u0440", "r"); // 0441 ? 
Cyrillic small letter ES tempMap.put("\u0441", "s"); // 0442 Cyrillic small letter TE tempMap.put("\u0442", "t"); // 0443 Cyrillic small letter U tempMap.put("\u0443", "u"); // 0444 Cyrillic small letter EF tempMap.put("\u0444", "f"); // 0445 Cyrillic small letter HA tempMap.put("\u0445", "h"); // 0446 Cyrillic small letter TSE tempMap.put("\u0446", "c"); // 0447 Cyrillic small letter CHE - see notes on capital letter. tempMap.put("\u0447", "ch"); // 0448 Cyrillic small letter SHA tempMap.put("\u0448", new String(new byte[] { 0, (byte) 154 }, ISO_8859_1)); // 0449 Cyrillic small letter SHCHA tempMap.put("\u0449", "shch"); // 044A Cyrillic small letter HARD SIGN tempMap.put("\u044A", new String(new byte[] { 0, (byte) 148 }, ISO_8859_1)); // 044B Cyrillic small letter YERU // A651 cyrillic small letter yeru with back yer tempMap.put("\u044B", "y"); // 044C Cyrillic small letter SOFT SIGN // 0185 latin small letter tone six // A64F ? cyrillic small letter neutral yer tempMap.put("\u044C", new String(new byte[] { 0, (byte) 146 }, ISO_8859_1)); // 044D ? Cyrillic small letter E tempMap.put("\u044D", new String(new byte[] { 0, (byte) 232 }, ISO_8859_1)); // 044E Cyrillic small letter YU // A655 cyrillic small letter reversed yu tempMap.put("\u044E", new String(new byte[] { 0, (byte) 251 }, ISO_8859_1)); tempMap.put("\uA655", new String(new byte[] { 0, (byte) 251 }, ISO_8859_1)); // is this right? // 044F ? Cyrillic small letter YA => a-circumflex tempMap.put("\u044F", new String(new byte[] { 0, (byte) 226 }, ISO_8859_1)); // Cyrillic extensions // 0450 ? CYRILLIC SMALL LETTER IE WITH GRAVE // Macedonian // 0435 0300 $ tempMap.put("\u0450", new String(new byte[] { 0, (byte) 232 }, ISO_8859_1)); // e-grave => e-grave // 0451 CYRILLIC SMALL LETTER IO // Russian, ... 
// 0435 0308 $ tempMap.put("\u0451", new String(new byte[] { 0, (byte) 235 }, ISO_8859_1)); // 0452 CYRILLIC SMALL LETTER DJE // Serbian // 0111 latin small letter d with stroke tempMap.put("\u0452", new String(new byte[] { 0, (byte) 240 }, ISO_8859_1)); // 0453 CYRILLIC SMALL LETTER GJE - only maps to g-acute, which is not in our charset. // Macedonian // 0433 0301 $? // 0454 CYRILLIC SMALL LETTER UKRAINIAN IE // = Old Cyrillic yest tempMap.put("\u0454", new String(new byte[] { 0, (byte) 234 }, ISO_8859_1)); // 0455 CYRILLIC SMALL LETTER DZE // Macedonian // A643 cyrillic small letter dzelo tempMap.put("\u0455", "s"); // 0456 CYRILLIC SMALL LETTER BYELORUSSIAN- // UKRAINIAN I // = Old Cyrillic i tempMap.put("\u0456", new String(new byte[] { 0, (byte) 236 }, ISO_8859_1)); // 0457 CYRILLIC SMALL LETTER YI // Ukrainian // 0456 0308 $ tempMap.put("\u0457", new String(new byte[] { 0, (byte) 239 }, ISO_8859_1)); // 0458 CYRILLIC SMALL LETTER JE // Serbian, Azerbaijani, Altay // 0459 CYRILLIC SMALL LETTER LJE // Serbian, Macedonian // 01C9 lj latin small letter lj // 045A CYRILLIC SMALL LETTER NJE // Serbian, Macedonian // 01CC nj latin small letter nj // 045B CYRILLIC SMALL LETTER TSHE // Serbian // 0107 latin small letter c with acute // 0127 latin small letter h with stroke // 040B cyrillic capital letter tshe // 210F planck constant over two pi // A649 cyrillic small letter djerv // 045C CYRILLIC SMALL LETTER KJE // Macedonian // 043A 0301 $? // 045D ? CYRILLIC SMALL LETTER I WITH GRAVE // Macedonian, Bulgarian // 0438 0300 $ // 045E CYRILLIC SMALL LETTER SHORT U // Byelorussian, Uzbek // 0443 0306 $ // 045F CYRILLIC SMALL LETTER DZHE // Serbian, Macedonian, Abkhasian // 01C6 d latin small letter dz with caron // Extended Cyrillic // ... // 0490 ? CYRILLIC CAPITAL LETTER GHE WITH UPTURN => G ? 
tempMap.put("\u0490", "G"); // Ghe with upturn // 0491 CYRILLIC SMALL LETTER GHE WITH UPTURN // Ukrainian tempMap.put("\u0491", "g"); // Other commonly-used unicode characters with exact WinAnsi equivalents tempMap.put("\u2013", new String(new byte[] { 0, (byte) 150 }, ISO_8859_1)); // En-dash tempMap.put("\u2014", new String(new byte[] { 0, (byte) 151 }, ISO_8859_1)); // Em-dash tempMap.put("\u2018", new String(new byte[] { 0, (byte) 145 }, ISO_8859_1)); // Curved single open quote tempMap.put("\u2019", new String(new byte[] { 0, (byte) 146 }, ISO_8859_1)); // Curved single close-quote tempMap.put("\u201A", new String(new byte[] { 0, (byte) 130 }, ISO_8859_1)); // Low single curved-quote tempMap.put("\u201C", new String(new byte[] { 0, (byte) 147 }, ISO_8859_1)); // Curved double open quote tempMap.put("\u201D", new String(new byte[] { 0, (byte) 148 }, ISO_8859_1)); // Curved double close-quote tempMap.put("\u201E", new String(new byte[] { 0, (byte) 132 }, ISO_8859_1)); // Low right double quote. tempMap.put("\u2020", new String(new byte[] { 0, (byte) 134 }, ISO_8859_1)); // Dagger tempMap.put("\u2021", new String(new byte[] { 0, (byte) 135 }, ISO_8859_1)); // Double dagger tempMap.put(UNICODE_BULLET, new String(new byte[] { 0, (byte) 149 }, ISO_8859_1)); // Bullet - use this as replacement character. 
tempMap.put("\u2026", new String(new byte[] { 0, (byte) 133 }, ISO_8859_1)); // Ellipsis tempMap.put("\u2030", new String(new byte[] { 0, (byte) 137 }, ISO_8859_1)); // Permille tempMap.put("\u2039", new String(new byte[] { 0, (byte) 139 }, ISO_8859_1)); // Left angle-quote tempMap.put("\u203A", new String(new byte[] { 0, (byte) 155 }, ISO_8859_1)); // Right angle-quote tempMap.put("\u20ac", new String(new byte[] { 0, (byte) 128 }, ISO_8859_1)); // Euro symbol tempMap.put("\u2122", new String(new byte[] { 0, (byte) 153 }, ISO_8859_1)); // Trademark symbol } catch (UnsupportedEncodingException uee) { throw new IllegalStateException( "Problem creating translation table due to Unsupported Encoding (coding error)", uee); } utf16ToWinAnsi = Collections.unmodifiableMap(tempMap); } // private static final Pattern whitespacePattern = Pattern.compile("\\p{Z}+"); // What about \u00ba?? // \u00a0-\u00a9 \u00ab-\u00b9 \u00bb-\u00bf \u00d7 \u00f7 private static final Pattern nonAsciiPattern = Pattern.compile("[^\u0000-\u00ff]"); /** * <p>PDF files are limited to the 217 characters of Windows-1252 which the PDF spec calls WinAnsi * and Java calls ISO-8859-1. This method transliterates the standard Java UTF-16 character * representations to their Windows-1252 equivalents where such translation is possible. Any * character (e.g. 
Kanji) which does not have an appropriate substitute in Windows-1252 will be * mapped to the bullet character (a round dot).</p> * <p/> * <p>This transliteration covers the modern alphabets of the following languages:<br> * <p/> * Afrikaans (af), * Albanian (sq), Basque (eu), Catalan (ca), Danish (da), Dutch (nl), English (en), Faroese (fo), * Finnish (fi), French (fr), Galician (gl), German (de), Icelandic (is), Irish (ga), * Italian (it), Norwegian (no), Portuguese (pt), Scottish (gd), Spanish (es), Swedish (sv).</p> * <p/> * <p>Romanized substitutions are used for the Cyrillic characters of the modern Russian (ru) * alphabet according to ISO 9:1995 with the following phonetic substitutions: 'Ch' for and * 'Shch' for .</p> * <p/> * <p>The PdfLayoutMgr calls this method internally whenever it renders text (transliteration has * to happen before line breaking), but is available externally in case you wish to use it * directly with PDFBox.</p> * * @param in a string in the standard Java UTF-16 encoding * @return a string in Windows-1252 (informally called ISO-8859-1 or WinAnsi) */ public static String convertJavaStringToWinAnsi(String in) { // ByteBuffer bb = StandardCharsets.UTF_16.encode(CharBuffer.wrap(in)); // // then decode those bytes as US-ASCII // return StandardCharsets.ISO_8859_1.decode(bb).toString(); // return java.nio.charset.StandardCharsets.ISO_8859_1.encode(in); Matcher m = nonAsciiPattern.matcher(in); StringBuilder sB = new StringBuilder(); int idx = 0; while (m.find()) { int start = m.start(); // first character of match. if (idx < start) { // Append everything from the last match up to this one. sB.append(in.subSequence(idx, start)); } String s = utf16ToWinAnsi.get(m.group()); // "In WinAnsiEncoding, all unused codes greater than 40 map to the bullet character." // source: PDF spec, Annex D.3 PDFDocEncoding Character Set p. 656 footnote about // WinAnsiEncoding. 
// // I think the bullet is the closest thing to a "replacement character" in the // WinAnsi character set, so that's what I'll use it for. It looks tons better than // nullnullnull... if (s == null) { s = utf16ToWinAnsi.get(UNICODE_BULLET); } sB.append(s); idx = m.end(); // m.end() is exclusive } if (idx < in.length()) { sB.append(in.subSequence(idx, in.length())); } return sB.toString(); } public PDFont loadTTFFont(InputStream fontStream) throws IOException { return PDTrueTypeFont.loadTTF(doc, fontStream); } public PDFont loadTTFFont(File fontFile) throws IOException { return loadTTFFont(new FileInputStream(fontFile)); } }