Java tutorial
// Copyright 2012-01-10 PlanBase Inc. & Glen Peterson
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.planbase.pdf.layoutmanager;

import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.edit.PDPageContentStream;
import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDJpeg;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDPixelMap;

import java.awt.Color;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 <p>The main class in this package; it handles page and line breaks.</p>

 <h3>Usage (the unit test is a much better example):</h3>
 <pre><code>// Create a new manager
PdfLayoutMgr pageMgr = PdfLayoutMgr.newRgbPageMgr();

LogicalPage lp = pageMgr.logicalPageStart();
// defaults to Landscape orientation
// call various lp.tableBuilder() or lp.put...() methods here.
// They will page-break and create extra physical pages as needed.
// ...
lp.commit();

lp = pageMgr.logicalPageStart(LogicalPage.Orientation.PORTRAIT);
// These pages will be in Portrait orientation
// call various lp methods to put things on the next page grouping
// ...
lp.commit();

// The file to write to
OutputStream os = new FileOutputStream("test.pdf");

// Commit all pages to output stream.
pageMgr.save(os);</code></pre>
 <br>
 <h3>Note:</h3>
 <p>Because this class buffers and writes to an underlying stream, it is mutable, has side
 effects, and is NOT thread-safe!</p>
 */
public class PdfLayoutMgr {

    // private Logger logger = Logger.getLogger(PdfLayoutMgr.class);

    //        logger.info("Ascent: " + PDType1Font.HELVETICA.getFontDescriptor().getAscent());
    //        logger.info("StemH: " + PDType1Font.HELVETICA.getFontDescriptor().getStemH());
    //        logger.info("CapHeight: " + PDType1Font.HELVETICA.getFontDescriptor().getCapHeight());
    //        logger.info("XHeight: " + PDType1Font.HELVETICA.getFontDescriptor().getXHeight());
    //        logger.info("Descent: " + PDType1Font.HELVETICA.getFontDescriptor().getDescent());
    //        logger.info("Leading: " + PDType1Font.HELVETICA.getFontDescriptor().getLeading());
    //
    //        logger.info("Height: " + PDType1Font.HELVETICA.getFontDescriptor().getFontBoundingBox().getHeight());
    //
    //        Ascent:    718.0
    //        StemH:       0.0
    //        CapHeight: 718.0
    //        XHeight:   523.0
    //        Descent:  -207.0
    //        Leading:     0.0
    //        Height:   1156.0
    //
    // CapHeight - descent = 925
    // 925 - descent = 1132 which is still less than 1156.
    // I'm going to make line-height =
    // Java FontMetrics says getHeight() = getAscent() + getDescent() + getLeading().
    // I think ascent and descent are compatible with this.  I'm going to make Leading be
    // -descent/2

    /**
     If you use no scaling when printing the output PDF, PDFBox shows approximately 72
     Document-Units Per Inch.  This makes one pixel on an average desktop monitor correspond to
     roughly one document unit.  This is a useful constant for page layout math.
     */
    public static final float DOC_UNITS_PER_INCH = 72f;

    // TODO: add Sensible defaults, such as textStyle?
    //    private TextStyle textStyle;
    //    private PDRectangle pageDimensions;
    //    private Padding pageMargins;
    //    private PDRectangle printableArea;
    //
    //    public TextStyle textStyle() { return textStyle; }
    //    public PDRectangle pageDimensions() { return pageDimensions; }
    //    public Padding pageMargins() { return pageMargins; }
    //    public PDRectangle printableArea() { return printableArea; }

    // You can have many DrawJpegs backed by only a few images - it is a flyweight, and this
    // hash map keeps track of the few underlying images, even as instances of DrawJpeg
    // represent all the places where these images are used.
    // CRITICAL: This means that the set of jpgs must be thrown out and created anew for each
    // document!  Thus, a private final field on the PdfLayoutMgr instead of DrawJpeg, and DrawJpeg
    // must be an inner class (or this would have to be package scoped).
    private final Map<BufferedImage, PDJpeg> jpegMap = new HashMap<BufferedImage, PDJpeg>();

    /**
     Returns the PDJpeg for the image behind the given ScaledJpeg, creating and caching it in
     this manager's document the first time that image is seen.

     @param sj the scaled JPEG whose underlying BufferedImage is the cache key.
     @return the cached (or newly created) PDJpeg.
     @throws IllegalStateException wrapping any IOException thrown while creating the PDJpeg.
     */
    private PDJpeg ensureCached(final ScaledJpeg sj) {
        BufferedImage bufferedImage = sj.bufferedImage();
        PDJpeg temp = jpegMap.get(bufferedImage);
        if (temp == null) {
            try {
                temp = new PDJpeg(doc, bufferedImage);
            } catch (IOException ioe) {
                // can there ever be an exception here?  Doesn't it get written later?
                throw new IllegalStateException("Caught exception creating a PDJpeg from a bufferedImage", ioe);
            }
            jpegMap.put(bufferedImage, temp);
        }
        return temp;
    }

    // You can have many DrawPngs backed by only a few images - it is a flyweight, and this
    // hash map keeps track of the few underlying images, even as instances of DrawPng
    // represent all the places where these images are used.
    // CRITICAL: This means that the set of pngs must be thrown out and created anew for each
    // document!  Thus, a private final field on the PdfLayoutMgr instead of DrawPng, and DrawPng
    // must be an inner class (or this would have to be package scoped).
    private final Map<BufferedImage, PDPixelMap> pngMap = new HashMap<BufferedImage, PDPixelMap>();

    /**
     Returns the PDPixelMap for the image behind the given ScaledPng, creating and caching it in
     this manager's document the first time that image is seen.

     @param sj the scaled PNG whose underlying BufferedImage is the cache key.
     @return the cached (or newly created) PDPixelMap.
     @throws IllegalStateException wrapping any IOException thrown while creating the PDPixelMap.
     */
    private PDPixelMap ensureCached(final ScaledPng sj) {
        BufferedImage bufferedImage = sj.bufferedImage();
        PDPixelMap temp = pngMap.get(bufferedImage);
        if (temp == null) {
            try {
                temp = new PDPixelMap(doc, bufferedImage);
            } catch (IOException ioe) {
                // can there ever be an exception here?  Doesn't it get written later?
                throw new IllegalStateException("Caught exception creating a PDPixelMap from a bufferedImage", ioe);
            }
            pngMap.put(bufferedImage, temp);
        }
        return temp;
    }

    /**
     * Please don't access this class directly if you don't have to.  It's a little bit like a model for stuff that
     * needs to be drawn on a page, but much more like a heap of random functionality that sort of landed in an
     * inner class.  This will probably be refactored away in future releases.
     */
    static class PageBuffer {
        public final int pageNum;
        // Entry-order counter handed to each item so items with equal z keep insertion order.
        private long lastOrd = 0;
        private final Set<PdfItem> items = new TreeSet<PdfItem>();

        private PageBuffer(int pn) {
            pageNum = pn;
        }

        /** Buffers a filled rectangle at the given position, size, color and z-index. */
        void fillRect(final float xVal, final float yVal, final float w, final float h,
                      final Color c, final float z) {
            items.add(FillRect.of(xVal, yVal, w, h, c, lastOrd++, z));
        }

        //        public void fillRect(final float xVal, final float yVal, final float w, final Color c,
        //                             final float h) {
        //            fillRect(xVal, yVal, w, h, c, PdfItem.DEFAULT_Z_INDEX);
        //        }
        //
        //        public void drawJpeg(final float xVal, final float yVal, final BufferedImage bi,
        //                             final PdfLayoutMgr mgr, final float z) {
        //            items.add(DrawJpeg.of(xVal, yVal, bi, mgr, lastOrd++, z));
        //        }

        /** Buffers a JPEG at the given position with the default z-index. */
        void drawJpeg(final float xVal, final float yVal, final ScaledJpeg sj,
                      final PdfLayoutMgr mgr) {
            items.add(DrawJpeg.of(xVal, yVal, sj, mgr, lastOrd++, PdfItem.DEFAULT_Z_INDEX));
        }

        /** Buffers a PNG at the given position with the default z-index. */
        void drawPng(final float xVal, final float yVal, final ScaledPng sj,
                     final PdfLayoutMgr mgr) {
            items.add(DrawPng.of(xVal, yVal, sj, mgr, lastOrd++, PdfItem.DEFAULT_Z_INDEX));
        }

        private void drawLine(final float xa, final float ya, final float xb, final float yb,
                              final LineStyle ls, final float z) {
            items.add(DrawLine.of(xa, ya, xb, yb, ls, lastOrd++, z));
        }

        /** Buffers a line between the two points with the default z-index. */
        void drawLine(final float xa, final float ya, final float xb, final float yb,
                      final LineStyle ls) {
            drawLine(xa, ya, xb, yb, ls, PdfItem.DEFAULT_Z_INDEX);
        }

        private void drawStyledText(final float xCoord, final float yCoord, final String text,
                                    TextStyle s, final float z) {
            items.add(Text.of(xCoord, yCoord, text, s, lastOrd++, z));
        }

        /** Buffers styled text at the given position with the default z-index. */
        void drawStyledText(final float xCoord, final float yCoord, final String text,
                            TextStyle s) {
            drawStyledText(xCoord, yCoord, text, s, PdfItem.DEFAULT_Z_INDEX);
        }

        /** Writes every buffered item to the given stream in the set's iteration order. */
        private void commit(PDPageContentStream stream) throws IOException {
            // Since items are z-ordered, then sub-ordered by entry-order, we will draw
            // everything in the correct order.
            for (PdfItem item : items) {
                item.commit(stream);
            }
        }

        private static class DrawLine extends PdfItem {
            private final float x1, y1, x2, y2;
            private final LineStyle style;

            private DrawLine(final float xa, final float ya, final float xb, final float yb,
                             LineStyle s, final long ord, final float z) {
                super(ord, z);
                x1 = xa;
                y1 = ya;
                x2 = xb;
                y2 = yb;
                style = s;
            }

            public static DrawLine of(final float xa, final float ya, final float xb,
                                      final float yb, LineStyle s, final long ord,
                                      final float z) {
                return new DrawLine(xa, ya, xb, yb, s, ord, z);
            }

            @Override
            public void commit(PDPageContentStream stream) throws IOException {
                stream.setStrokingColor(style.color());
                stream.setLineWidth(style.width());
                stream.drawLine(x1, y1, x2, y2);
            }
        }

        private static class FillRect extends PdfItem {
            private final float x, y, width, height;
            private final Color color;

            private FillRect(final float xVal, final float yVal, final float w, final float h,
                             final Color c, final long ord, final float z) {
                super(ord, z);
                x = xVal;
                y = yVal;
                width = w;
                height = h;
                color = c;
            }

            public static FillRect of(final float xVal, final float yVal, final float w,
                                      final float h, final Color c, final long ord,
                                      final float z) {
                return new FillRect(xVal, yVal, w, h, c, ord, z);
            }

            @Override
            public void commit(PDPageContentStream stream) throws IOException {
                stream.setNonStrokingColor(color);
                stream.fillRect(x, y, width, height);
            }
        }

        static class Text extends PdfItem {
            public final float x, y;
            public final String t;
            public final TextStyle style;

            private Text(final float xCoord, final float yCoord, final String text,
                         TextStyle s, final long ord, final float z) {
                super(ord, z);
                x = xCoord;
                y = yCoord;
                t = text;
                style = s;
            }

            public static Text of(final float xCoord, final float yCoord, final String text,
                                  TextStyle s, final long ord, final float z) {
                return new Text(xCoord, yCoord, text, s, ord, z);
            }

            @Override
            public void commit(PDPageContentStream stream) throws IOException {
                stream.beginText();
                stream.setNonStrokingColor(style.textColor());
                stream.setFont(style.font(), style.fontSize());
                stream.moveTextPositionByAmount(x, y);
                stream.drawString(t);
                stream.endText();
            }
        }

        private static class DrawPng extends PdfItem {
            private final float x, y;
            private final PDPixelMap png;
            private final ScaledPng scaledPng;

            // private Log logger = LogFactory.getLog(DrawPng.class);

            private DrawPng(final float xVal, final float yVal, final ScaledPng sj,
                            final PdfLayoutMgr mgr, final long ord, final float z) {
                super(ord, z);
                x = xVal;
                y = yVal;
                png = mgr.ensureCached(sj);
                scaledPng = sj;
            }

            public static DrawPng of(final float xVal, final float yVal, final ScaledPng sj,
                                     final PdfLayoutMgr mgr, final long ord, final float z) {
                return new DrawPng(xVal, yVal, sj, mgr, ord, z);
            }

            @Override
            public void commit(PDPageContentStream stream) throws IOException {
                // stream.drawImage(png, x, y);
                XyDim dim = scaledPng.dimensions();
                stream.drawXObject(png, x, y, dim.x(), dim.y());
            }
        }

        private static class DrawJpeg extends PdfItem {
            private final float x, y;
            private final PDJpeg jpeg;
            private final ScaledJpeg scaledJpeg;

            // private Log logger = LogFactory.getLog(DrawJpeg.class);

            private DrawJpeg(final float xVal, final float yVal, final ScaledJpeg sj,
                             final PdfLayoutMgr mgr, final long ord, final float z) {
                super(ord, z);
                x = xVal;
                y = yVal;
                jpeg = mgr.ensureCached(sj);
                scaledJpeg = sj;
            }

            public static DrawJpeg of(final float xVal, final float yVal, final ScaledJpeg sj,
                                      final PdfLayoutMgr mgr, final long ord, final float z) {
                return new DrawJpeg(xVal, yVal, sj, mgr, ord, z);
            }

            @Override
            public void commit(PDPageContentStream stream) throws IOException {
                // stream.drawImage(jpeg, x, y);
                XyDim dim = scaledJpeg.dimensions();
                stream.drawXObject(jpeg, x, y, dim.x(), dim.y());
            }
        }
    }

    private final List<PageBuffer> pages = new ArrayList<PageBuffer>();
    private final PDDocument doc;

    // pages.size() counts the first page as 1, so 0 is the appropriate sentinel value
    private int unCommittedPageIdx = 0;

    private final PDColorSpace colorSpace;

    /** Returns a read-only view of the page buffers created so far. */
    List<PageBuffer> pages() {
        return Collections.unmodifiableList(pages);
    }

    private PdfLayoutMgr(PDColorSpace cs) throws IOException {
        doc = new PDDocument();
        colorSpace = cs;
    }

    /**
     Returns a new PdfLayoutMgr with the given color space.
     @param cs the color-space.
     @return a new PdfLayoutMgr
     @throws IOException if the underlying PDDocument cannot be created.
     */
    public static PdfLayoutMgr of(PDColorSpace cs) throws IOException {
        return new PdfLayoutMgr(cs);
    }

    /**
     Creates a new PdfLayoutMgr with the PDDeviceRGB color space.
     @return a new Page Manager with an RGB color space
     @throws IOException if the underlying PDDocument cannot be created.
     */
    @SuppressWarnings("UnusedDeclaration") // Part of end-user public interface
    public static PdfLayoutMgr newRgbPageMgr() throws IOException {
        return new PdfLayoutMgr(PDDeviceRGB.INSTANCE);
    }

    /**
     Returns the correct page for the given value of y.  This lets the user use any Y
     value and we continue extending their canvas downward (negative) by adding extra pages.
     @param lp the logical page that supplies the page-bottom and printable-area height.
     @param y the un-adjusted y value.
     @return the proper page and adjusted y value for that page.
     @throws IllegalStateException if no page has been started yet.
     */
    LogicalPage.PageBufferAndY appropriatePage(LogicalPage lp, float y) {
        if (pages.size() < 1) {
            throw new IllegalStateException(
                    "Cannot work with any pages until one has been created by calling logicalPageStart().");
        }
        int idx = unCommittedPageIdx;
        // Get the first possible page

        while (y < lp.yPageBottom()) {
            // logger.info("Adjusting y.  Was: " + y + " about to add " + printAreaHeight);
            y += lp.printAreaHeight(); // y could even be negative.  Just keep moving to the top of the next
            // page until it's in the printable area.
            idx++;
            if (pages.size() <= idx) {
                pages.add(new PageBuffer(pages.size() + 1));
            }
        }
        PageBuffer ps = pages.get(idx);
        return new LogicalPage.PageBufferAndY(ps, y);
    }

    /**
     Call this to commit the PDF information to the underlying stream after it is completely
     built.  The document is closed whether or not the save succeeds.

     @param os the stream to write the finished document to.
     @throws IOException if writing or closing the document fails.
     @throws COSVisitorException if PDFBox fails while serializing the document.
     */
    public void save(OutputStream os) throws IOException, COSVisitorException {
        boolean saved = false;
        try {
            doc.save(os);
            saved = true;
        } finally {
            if (saved) {
                // Normal path: let a failure to close propagate to the caller.
                doc.close();
            } else {
                // save() is already throwing; release the document without masking that failure.
                try {
                    doc.close();
                } catch (IOException ignored) {
                    // Intentionally ignored so the original save() exception reaches the caller.
                }
            }
        }
    }

    // TODO: Add logicalPage() method and call pages.add() lazily for the first item actually shown on a page, and logicalPageEnd called before a save.
    // TODO: Add feature for different paper size or orientation for each group of logical pages.

    /**
     Tells this PdfLayoutMgr that you want to start a new logical page (which may be broken across
     two or more physical pages) in the requested page orientation.

     @param o the orientation for the new logical page.
     @return the new logical page.
     */
    @SuppressWarnings("UnusedDeclaration") // Part of end-user public interface
    public LogicalPage logicalPageStart(LogicalPage.Orientation o) {
        PageBuffer pb = new PageBuffer(pages.size() + 1);
        pages.add(pb);
        return LogicalPage.of(this, o);
    }

    /**
     Get a new logical page (which may be broken across two or more physical pages)
     in Landscape orientation.

     @return the new logical page.
     */
    public LogicalPage logicalPageStart() {
        return logicalPageStart(LogicalPage.Orientation.LANDSCAPE);
    }

    //    void addLogicalPage(PageBuffer pb) {
    //        pages.add(pb);
    //    }

    /**
     Call this when you are through with your current set of pages to commit all pending text and
     drawing operations.  This is the only method that throws an IOException because the purpose of
     PdfLayoutMgr is to buffer all operations until a page is complete so that it can safely be
     written to the underlying stream.  This method turns the potential pages into real output.
     Call when you need a page break, or your document is done and you need to write it out.

     @param lp the logical page whose orientation, width and border items apply to every
               uncommitted physical page.
     @throws IOException - if there is a failure writing to the underlying stream.
     */
    @SuppressWarnings("UnusedDeclaration") // Part of end-user public interface
    void logicalPageEnd(LogicalPage lp) throws IOException {

        // Write out all uncommitted pages.
        while (unCommittedPageIdx < pages.size()) {
            PDPage pdPage = new PDPage();
            pdPage.setMediaBox(PDPage.PAGE_SIZE_LETTER);
            if (lp.orientation() == LogicalPage.Orientation.LANDSCAPE) {
                pdPage.setRotation(90);
            }
            PDPageContentStream stream = null;
            try {
                stream = new PDPageContentStream(doc, pdPage);
                doc.addPage(pdPage);

                if (lp.orientation() == LogicalPage.Orientation.LANDSCAPE) {
                    stream.concatenate2CTM(0, 1, -1, 0, lp.pageWidth(), 0);
                }
                stream.setStrokingColorSpace(colorSpace);
                stream.setNonStrokingColorSpace(colorSpace);

                PageBuffer pb = pages.get(unCommittedPageIdx);
                pb.commit(stream);
                lp.commitBorderItems(stream);

                stream.close();
                // Set to null to show that no exception was thrown and no need to close again.
                stream = null;
            } finally {
                // Let it throw an exception if the closing doesn't work.
                if (stream != null) {
                    stream.close();
                }
            }
            unCommittedPageIdx++;
        }
    }

    @Override
    public boolean equals(Object other) {
        // First, the obvious...
        if (this == other) { return true; }
        if (other == null) { return false; }
        if (!(other instanceof PdfLayoutMgr)) { return false; }
        // Details...
        final PdfLayoutMgr that = (PdfLayoutMgr) other;
        return this.doc.equals(that.doc) && (this.pages.equals(that.pages));
    }

    @Override
    public int hashCode() {
        return doc.hashCode() + pages.hashCode();
    }

    //    public XyOffset putRect(XyOffset outerTopLeft, XyDim outerDimensions, final Color c) {
    ////        System.out.println("putRect(" + outerTopLeft + " " + outerDimensions + " " +
    ////                           Utils.toString(c) + ")");
    //        putRect(outerTopLeft.x(), outerTopLeft.y(), outerDimensions.x(), outerDimensions.y(), c);
    //        return XyOffset.of(outerTopLeft.x() + outerDimensions.x(),
    //                           outerTopLeft.y() - outerDimensions.y());
    //    }

    //    /**
    //     Puts text on the page.
    //     @param x the x-value of the top-left corner.
    //     @param origY the logical-page Y-value of the top-left corner.
    //     @param cell the cell containing the styling and text to render.
    //     @return the bottom Y-value (logical-page) of the rendered cell.
    //     */
    //    public float putCell(final float x, float origY, final Cell cell) {
    //        return cell.processRows(x, origY, false, this);
    //    }

    private static final String ISO_8859_1 = "ISO_8859_1";
    private static final String UNICODE_BULLET = "\u2022";

    /**
     Builds the replacement string for one WinAnsi code by decoding the two bytes
     {0, code} as ISO-8859-1.

     NOTE(review): ISO-8859-1 decodes one char per byte, so the result is two chars long and
     starts with a NUL character (U+0000).  That is what this class has always produced;
     confirm whether the leading zero byte is intentional before changing it.

     @param code the WinAnsi (Windows-1252) code, 0-255.
     @return the two-character replacement string.
     @throws UnsupportedEncodingException if the ISO-8859-1 charset is unavailable.
     */
    private static String winAnsi(final int code) throws UnsupportedEncodingException {
        return new String(new byte[] { 0, (byte) code }, ISO_8859_1);
    }

    // PDFBox uses an encoding that the PDF spec calls WinAnsiEncoding.  The spec says this is
    // Windows Code Page 1252.
    // http://en.wikipedia.org/wiki/Windows-1252
    // It has a lot in common with ISO-8859-1, but it defines some additional characters such as
    // the Euro symbol.
    private static final Map<String, String> utf16ToWinAnsi;
    static {
        Map<String, String> tempMap = new HashMap<String, String>();

        try {
            // 129, 141, 143, 144, and 157 are undefined in WinAnsi.
            // I had mapped A0-FF to 160-255 without noticing that that maps each character to
            // itself, meaning that Unicode and WinAnsi are the same in that range.

            // Unicode characters with exact WinAnsi equivalents
            tempMap.put("\u0152", winAnsi(140)); // OE
            tempMap.put("\u0153", winAnsi(156)); // oe
            tempMap.put("\u0160", winAnsi(138)); // S caron
            tempMap.put("\u0161", winAnsi(154)); // s caron
            tempMap.put("\u0178", winAnsi(159)); // Y Diaeresis
            tempMap.put("\u017D", winAnsi(142)); // Capital Z-caron
            tempMap.put("\u017E", winAnsi(158)); // Lower-case Z-caron
            tempMap.put("\u0192", winAnsi(131)); // F with a hook (like jf put together)
            tempMap.put("\u02C6", winAnsi(136)); // circumflex (up-caret)
            tempMap.put("\u02DC", winAnsi(152)); // Tilde

            // Cyrillic letters map to their closest Romanizations according to ISO 9:1995
            // http://en.wikipedia.org/wiki/ISO_9
            // http://en.wikipedia.org/wiki/A_(Cyrillic)

            // Cyrillic extensions
            // 0400 Cyrillic capital letter IE WITH GRAVE
            //      0415 0300 (left-accent)
            tempMap.put("\u0400", winAnsi(200));
            // 0401 Cyrillic capital letter IO
            //      0415 0308 (diaeresis)
            tempMap.put("\u0401", winAnsi(203));
            // 0402 Cyrillic capital letter DJE
            tempMap.put("\u0402", winAnsi(208));
            // 0403 Cyrillic capital letter GJE
            //      0413 0301 (accent)
            // Ghe only maps to G-acute, which is not in our charset.
            // 0404 Cyrillic capital letter UKRAINIAN IE
            tempMap.put("\u0404", winAnsi(202));
            // 0405 Cyrillic capital letter DZE
            tempMap.put("\u0405", "S");
            // 0406 Cyrillic capital letter BYELORUSSIAN-UKRAINIAN I
            //      0049 I latin capital letter i
            //      0456 cyrillic small letter byelorussian-ukrainian i
            //      04C0 cyrillic letter palochka
            tempMap.put("\u0406", winAnsi(204));
            // 0407 Cyrillic capital letter YI
            //      0406 0308 (diaeresis)
            tempMap.put("\u0407", winAnsi(207));
            // 0408 Cyrillic capital letter JE
            // 0409 Cyrillic capital letter LJE
            // 040A Cyrillic capital letter NJE
            // 040B Cyrillic capital letter TSHE
            // 040C Cyrillic capital letter KJE
            //      041A 0301 (accent)
            // 040D Cyrillic capital letter I WITH GRAVE
            //      0418 0300 (accent)
            // 040E Cyrillic capital letter SHORT U
            //      0423 0306 (accent)
            // 040F Cyrillic capital letter DZHE

            // Basic Russian alphabet
            // See: http://www.unicode.org/charts/PDF/U0400.pdf
            // 0410 Cyrillic capital letter A => Latin A
            tempMap.put("\u0410", "A");
            // 0411 Cyrillic capital letter BE => Latin B
            //      0183 latin small letter b with topbar
            tempMap.put("\u0411", "B");
            // 0412 Cyrillic capital letter VE => Latin V
            tempMap.put("\u0412", "V");
            // 0413 Cyrillic capital letter GHE => Latin G
            tempMap.put("\u0413", "G");
            // 0414 Cyrillic capital letter DE => Latin D
            tempMap.put("\u0414", "D");
            // 0415 Cyrillic capital letter IE => Latin E
            tempMap.put("\u0415", "E");
            // 0416 Cyrillic capital letter ZHE => Z-caron
            tempMap.put("\u0416", winAnsi(142));
            // 0417 Cyrillic capital letter ZE => Latin Z
            tempMap.put("\u0417", "Z");
            // 0418 Cyrillic capital letter I => Latin I
            tempMap.put("\u0418", "I");
            // 0419 Cyrillic capital letter SHORT I => Latin J
            //      0418 0306 (a little mark)
            // The two-character form (reversed N plus the mark) is not supported.
            tempMap.put("\u0419", "J");
            // 041A Cyrillic capital letter KA => Latin K
            tempMap.put("\u041A", "K");
            // 041B Cyrillic capital letter EL => Latin L
            tempMap.put("\u041B", "L");
            // 041C Cyrillic capital letter EM => Latin M
            tempMap.put("\u041C", "M");
            // 041D Cyrillic capital letter EN => Latin N
            tempMap.put("\u041D", "N");
            // 041E Cyrillic capital letter O => Latin O
            tempMap.put("\u041E", "O");
            // 041F Cyrillic capital letter PE => Latin P
            tempMap.put("\u041F", "P");
            // 0420 Cyrillic capital letter ER => Latin R
            tempMap.put("\u0420", "R");
            // 0421 Cyrillic capital letter ES => Latin S
            tempMap.put("\u0421", "S");
            // 0422 Cyrillic capital letter TE => Latin T
            tempMap.put("\u0422", "T");
            // 0423 Cyrillic capital letter U => Latin U
            //      0478 cyrillic capital letter uk
            //      04AF cyrillic small letter straight u
            //      A64A cyrillic capital letter monograph uk
            tempMap.put("\u0423", "U");
            tempMap.put("\u0478", "U"); // Is this right?
            tempMap.put("\u04AF", "U"); // Is this right?
            tempMap.put("\uA64A", "U"); // Is this right?
            // 0424 Cyrillic capital letter EF => Latin F
            tempMap.put("\u0424", "F");
            // 0425 Cyrillic capital letter HA => Latin H
            tempMap.put("\u0425", "H");
            // 0426 Cyrillic capital letter TSE => Latin C
            tempMap.put("\u0426", "C");
            // 0427 Cyrillic capital letter CHE => Mapping to "Ch" because there is no
            // C-caron - hope this is the best choice!  A also had this as "CH" but some make it
            // Tch as in Tchaikovsky, really didn't know what to do here.
            tempMap.put("\u0427", "Ch");
            // 0428 Cyrillic capital letter SHA => S-caron
            tempMap.put("\u0428", winAnsi(138));
            // 0429 Cyrillic capital letter SHCHA => Latin "Shch" because there is no
            // S-circumflex to map it to.  Should it go to S-caron like SHA?
            tempMap.put("\u0429", "Shch");
            // 042A Cyrillic capital letter HARD SIGN => Latin double prime, or in this case,
            // right double-quote.
            tempMap.put("\u042A", winAnsi(148));
            // 042B Cyrillic capital letter YERU => Latin Y
            tempMap.put("\u042B", "Y");
            // 042C Cyrillic capital letter SOFT SIGN => Latin prime, or in this case,
            // the right-single-quote.
            tempMap.put("\u042C", winAnsi(146));
            // 042D Cyrillic capital letter E => Latin E-grave
            tempMap.put("\u042D", winAnsi(200));
            // 042E Cyrillic capital letter YU => Latin U-circumflex
            tempMap.put("\u042E", winAnsi(219));
            // 042F Cyrillic capital letter YA => A-circumflex
            tempMap.put("\u042F", winAnsi(194));
            // 0430 Cyrillic small letter A
            tempMap.put("\u0430", "a");
            // 0431 Cyrillic small letter BE
            tempMap.put("\u0431", "b");
            // 0432 Cyrillic small letter VE
            tempMap.put("\u0432", "v");
            // 0433 Cyrillic small letter GHE
            tempMap.put("\u0433", "g");
            // 0434 Cyrillic small letter DE
            tempMap.put("\u0434", "d");
            // 0435 Cyrillic small letter IE
            tempMap.put("\u0435", "e");
            // 0436 Cyrillic small letter ZHE
            tempMap.put("\u0436", winAnsi(158));
            // 0437 Cyrillic small letter ZE
            tempMap.put("\u0437", "z");
            // 0438 Cyrillic small letter I
            tempMap.put("\u0438", "i");
            // 0439 Cyrillic small letter SHORT I
            //      0438 0306 (accent)
            tempMap.put("\u0439", "j");
            // 043A Cyrillic small letter KA
            tempMap.put("\u043A", "k");
            // 043B Cyrillic small letter EL
            tempMap.put("\u043B", "l");
            // 043C Cyrillic small letter EM
            tempMap.put("\u043C", "m");
            // 043D Cyrillic small letter EN
            tempMap.put("\u043D", "n");
            // 043E Cyrillic small letter O
            tempMap.put("\u043E", "o");
            // 043F Cyrillic small letter PE
            tempMap.put("\u043F", "p");
            // 0440 Cyrillic small letter ER
            tempMap.put("\u0440", "r");
            // 0441 Cyrillic small letter ES
            tempMap.put("\u0441", "s");
            // 0442 Cyrillic small letter TE
            tempMap.put("\u0442", "t");
            // 0443 Cyrillic small letter U
            tempMap.put("\u0443", "u");
            // 0444 Cyrillic small letter EF
            tempMap.put("\u0444", "f");
            // 0445 Cyrillic small letter HA
            tempMap.put("\u0445", "h");
            // 0446 Cyrillic small letter TSE
            tempMap.put("\u0446", "c");
            // 0447 Cyrillic small letter CHE - see notes on capital letter.
            tempMap.put("\u0447", "ch");
            // 0448 Cyrillic small letter SHA
            tempMap.put("\u0448", winAnsi(154));
            // 0449 Cyrillic small letter SHCHA
            tempMap.put("\u0449", "shch");
            // 044A Cyrillic small letter HARD SIGN
            tempMap.put("\u044A", winAnsi(148));
            // 044B Cyrillic small letter YERU
            //      A651 cyrillic small letter yeru with back yer
            tempMap.put("\u044B", "y");
            // 044C Cyrillic small letter SOFT SIGN
            //      0185 latin small letter tone six
            //      A64F cyrillic small letter neutral yer
            tempMap.put("\u044C", winAnsi(146));
            // 044D Cyrillic small letter E
            tempMap.put("\u044D", winAnsi(232));
            // 044E Cyrillic small letter YU
            //      A655 cyrillic small letter reversed yu
            tempMap.put("\u044E", winAnsi(251));
            tempMap.put("\uA655", winAnsi(251)); // is this right?
            // 044F Cyrillic small letter YA => a-circumflex
            tempMap.put("\u044F", winAnsi(226));

            // Cyrillic extensions
            // 0450 CYRILLIC SMALL LETTER IE WITH GRAVE
            //      Macedonian
            //      0435 0300
            tempMap.put("\u0450", winAnsi(232)); // e-grave => e-grave
            // 0451 CYRILLIC SMALL LETTER IO
            //      Russian, ...
            //      0435 0308
            tempMap.put("\u0451", winAnsi(235));
            // 0452 CYRILLIC SMALL LETTER DJE
            //      Serbian
            //      0111 latin small letter d with stroke
            tempMap.put("\u0452", winAnsi(240));
            // 0453 CYRILLIC SMALL LETTER GJE - only maps to g-acute, which is not in our charset.
            //      Macedonian
            //      0433 0301
            // 0454 CYRILLIC SMALL LETTER UKRAINIAN IE
            //      = Old Cyrillic yest
            tempMap.put("\u0454", winAnsi(234));
            // 0455 CYRILLIC SMALL LETTER DZE
            //      Macedonian
            //      A643 cyrillic small letter dzelo
            tempMap.put("\u0455", "s");
            // 0456 CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
            //      = Old Cyrillic i
            tempMap.put("\u0456", winAnsi(236));
            // 0457 CYRILLIC SMALL LETTER YI
            //      Ukrainian
            //      0456 0308
            tempMap.put("\u0457", winAnsi(239));
            // 0458 CYRILLIC SMALL LETTER JE
            //      Serbian, Azerbaijani, Altay
            // 0459 CYRILLIC SMALL LETTER LJE
            //      Serbian, Macedonian
            //      01C9 lj latin small letter lj
            // 045A CYRILLIC SMALL LETTER NJE
            //      Serbian, Macedonian
            //      01CC nj latin small letter nj
            // 045B CYRILLIC SMALL LETTER TSHE
            //      Serbian
            //      0107 latin small letter c with acute
            //      0127 latin small letter h with stroke
            //      040B cyrillic capital letter tshe
            //      210F planck constant over two pi
            //      A649 cyrillic small letter djerv
            // 045C CYRILLIC SMALL LETTER KJE
            //      Macedonian
            //      043A 0301
            // 045D CYRILLIC SMALL LETTER I WITH GRAVE
            //      Macedonian, Bulgarian
            //      0438 0300
            // 045E CYRILLIC SMALL LETTER SHORT U
            //      Byelorussian, Uzbek
            //      0443 0306
            // 045F CYRILLIC SMALL LETTER DZHE
            //      Serbian, Macedonian, Abkhasian
            //      01C6 d latin small letter dz with caron

            // Extended Cyrillic
            // ...
            // 0490 CYRILLIC CAPITAL LETTER GHE WITH UPTURN => G ?
            tempMap.put("\u0490", "G"); // Ghe with upturn
            // 0491 CYRILLIC SMALL LETTER GHE WITH UPTURN
            //      Ukrainian
            tempMap.put("\u0491", "g");

            // Other commonly-used unicode characters with exact WinAnsi equivalents
            tempMap.put("\u2013", winAnsi(150)); // En-dash
            tempMap.put("\u2014", winAnsi(151)); // Em-dash
            tempMap.put("\u2018", winAnsi(145)); // Curved single open quote
            tempMap.put("\u2019", winAnsi(146)); // Curved single close-quote
            tempMap.put("\u201A", winAnsi(130)); // Low single curved-quote
            tempMap.put("\u201C", winAnsi(147)); // Curved double open quote
            tempMap.put("\u201D", winAnsi(148)); // Curved double close-quote
            tempMap.put("\u201E", winAnsi(132)); // Low right double quote.
            tempMap.put("\u2020", winAnsi(134)); // Dagger
            tempMap.put("\u2021", winAnsi(135)); // Double dagger
            tempMap.put(UNICODE_BULLET, winAnsi(149)); // Bullet - use this as replacement character.
            tempMap.put("\u2026", winAnsi(133)); // Ellipsis
            tempMap.put("\u2030", winAnsi(137)); // Permille
            tempMap.put("\u2039", winAnsi(139)); // Left angle-quote
            tempMap.put("\u203A", winAnsi(155)); // Right angle-quote
            tempMap.put("\u20ac", winAnsi(128)); // Euro symbol
            tempMap.put("\u2122", winAnsi(153)); // Trademark symbol

        } catch (UnsupportedEncodingException uee) {
            throw new IllegalStateException(
                    "Problem creating translation table due to Unsupported Encoding (coding error)", uee);
        }
        utf16ToWinAnsi = Collections.unmodifiableMap(tempMap);
    }

    // private static final Pattern whitespacePattern = Pattern.compile("\\p{Z}+");

    // What about \u00ba??
    // \u00a0-\u00a9 \u00ab-\u00b9 \u00bb-\u00bf \u00d7 \u00f7
    // Matches any single char above U+00FF, i.e. anything that needs a table lookup.
    private static final Pattern nonAsciiPattern = Pattern.compile("[^\u0000-\u00ff]");

    /**
     <p>PDF files are limited to the 217 characters of Windows-1252 which the PDF spec calls WinAnsi
     and Java calls ISO-8859-1.  This method transliterates the standard Java UTF-16 character
     representations to their Windows-1252 equivalents where such translation is possible.  Any
     character (e.g. Kanji) which does not have an appropriate substitute in Windows-1252 will be
     mapped to the bullet character (a round dot).</p>

     <p>This transliteration covers the modern alphabets of the following languages:<br>
     Afrikaans (af), Albanian (sq), Basque (eu), Catalan (ca), Danish (da), Dutch (nl),
     English (en), Faroese (fo), Finnish (fi), French (fr), Galician (gl), German (de),
     Icelandic (is), Irish (ga), Italian (it), Norwegian (no), Portuguese (pt), Scottish (gd),
     Spanish (es), Swedish (sv).</p>

     <p>Romanized substitutions are used for the Cyrillic characters of the modern Russian (ru)
     alphabet according to ISO 9:1995 with the following phonetic substitutions: 'Ch' for
     &#1063; and 'Shch' for &#1065;.</p>

     <p>The PdfLayoutMgr calls this method internally whenever it renders text (transliteration has
     to happen before line breaking), but is available externally in case you wish to use it
     directly with PDFBox.</p>

     @param in a string in the standard Java UTF-16 encoding
     @return a string in Windows-1252 (informally called ISO-8859-1 or WinAnsi)
     */
    public static String convertJavaStringToWinAnsi(String in) {
        //        ByteBuffer bb = StandardCharsets.UTF_16.encode(CharBuffer.wrap(in));
        //        // then decode those bytes as US-ASCII
        //        return StandardCharsets.ISO_8859_1.decode(bb).toString();
        //        return java.nio.charset.StandardCharsets.ISO_8859_1.encode(in);

        Matcher m = nonAsciiPattern.matcher(in);

        StringBuilder sB = new StringBuilder();
        int idx = 0;
        while (m.find()) {

            int start = m.start(); // first character of match.
            if (idx < start) {
                // Append everything from the last match up to this one.
                sB.append(in.subSequence(idx, start));
            }

            String s = utf16ToWinAnsi.get(m.group());

            // "In WinAnsiEncoding, all unused codes greater than 40 map to the bullet character."
            // source: PDF spec, Annex D.3 PDFDocEncoding Character Set p. 656 footnote about
            // WinAnsiEncoding.
            //
            // I think the bullet is the closest thing to a "replacement character" in the
            // WinAnsi character set, so that's what I'll use it for.  It looks tons better than
            // nullnullnull...
            if (s == null) {
                s = utf16ToWinAnsi.get(UNICODE_BULLET);
            }
            sB.append(s);

            idx = m.end(); // m.end() is exclusive
        }
        if (idx < in.length()) {
            // Copy the tail that follows the last match (or the whole input if nothing matched).
            sB.append(in.subSequence(idx, in.length()));
        }
        return sB.toString();
    }
}