// Java tutorial
/**
 * Copyright () 2013 Pablo Filetti Moreira & O.J. Sousa Rodrigues
 * <p>
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * <p>
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * <p>
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package com.github.ossdevs.jhocr.converter;

import com.github.ossdevs.jhocr.element.HocrDocument;
import com.github.ossdevs.jhocr.element.HocrPage;
import com.github.ossdevs.jhocr.parser.HocrParser;
import com.github.ossdevs.jhocr.util.LoggUtilException;
import com.github.ossdevs.jhocr.util.enums.PDFF;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.ICC_Profile;
import com.itextpdf.text.pdf.PdfAConformanceLevel;
import com.itextpdf.text.pdf.PdfAWriter;
import com.itextpdf.text.pdf.PdfWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/**
 * Converts one or more hOCR/image pairs into a single PDF written to an {@link OutputStream}.
 * <p>
 * Pairs are registered with {@link #addHocrDocument(InputStream, InputStream)} and rendered by
 * {@link #convert()}. Depending on {@link #setPdfFormat(PDFF)} the result is a plain PDF, a PDF/X
 * or a PDF/A document.
 * <p>
 * TODO improve the way the information is being passed to the document e.g.: com-googlecode-jhocr-info
 */
public class HocrToPdf {

    private static final Logger logger = LoggerFactory.getLogger(new LoggUtilException().toString());

    private static final String KEY_JHOCR_INFO = "com-googlecode-jhocr-info";
    private static final String KEY_JHOCR_INFO_VALUE = "This document were generated with jhocr, for more information visit: https://code.google.com/p/jhocr";

    private static final String MSG_NOT_IMPLEMENTED = "This operation is not yet implemented.";
    private static final String MSG_DOCUMENT_ERROR = "exception while genrating the PDF.";
    private static final String MSG_IO_ERROR = "FileSystem I/O Exception, please check the log and file system persmissions.";

    /**
     * Number of PDF points per inch (a point is 1/72 inch).
     * NOTE(review): not referenced in this class; presumably used by the page processor to convert
     * image DPI into page dimensions - confirm against its callers.
     */
    public static final float POINTS_PER_INCH = 72.0f;

    private final OutputStream outputStream;
    private final List<HocrDocumentItem> items = new ArrayList<HocrDocumentItem>();
    private List<HashMap<String, Object>> outlines = new ArrayList<HashMap<String, Object>>();
    private boolean useImageDpi = true;
    private PDFF pdfFormat = null;

    /**
     * @param outputStream of the PDF.
     */
    public HocrToPdf(OutputStream outputStream) {
        this.outputStream = outputStream;
    }

    /**
     * @return the {@link #outputStream} object.
     */
    public OutputStream getOutputStream() {
        return outputStream;
    }

    /**
     * @return the {@link #items} collection.
     */
    public List<HocrDocumentItem> getItems() {
        return items;
    }

    /**
     * Converts all registered items, dispatching on the value carried by {@link #pdfFormat}:
     * <ul>
     * <li>no format set: plain PDF;</li>
     * <li>an {@link Integer} value: PDF/X with that conformance constant;</li>
     * <li>a {@link PdfAConformanceLevel} value: PDF/A with that conformance level.</li>
     * </ul>
     * Any other value type is logged and reported as a failed conversion.
     *
     * @return true if the conversion was successful; false if there is nothing to convert,
     * the format is unsupported, or the conversion failed.
     */
    public boolean convert() {
        List<HocrDocumentItem> documentItems = getItems();
        // Null must be tested before the list is dereferenced (getItems() may be overridden).
        if (documentItems == null || documentItems.isEmpty()) {
            return false;
        }

        PDFF pdff = getPdfFormat();
        if (pdff == null) {
            return convertSimple();
        }

        Object formatValue = pdff.getValue();
        if (formatValue instanceof Integer) {
            return convertToPDFX((Integer) formatValue);
        }
        if (formatValue instanceof PdfAConformanceLevel) {
            return convertToPDFA((PdfAConformanceLevel) formatValue);
        }

        logger.error("Unsupported PDF format value: {}", formatValue);
        return false;
    }

    /**
     * Produces a plain PDF. This method is used by {@link #convert()} if {@link #pdfFormat} is not set.
     *
     * @return true if the conversion was successful.
     */
    private boolean convertSimple() {
        Document document = new Document();
        try {
            PdfWriter writer = PdfWriter.getInstance(document, getOutputStream());
            prepareDocument(document);
            writePages(document, writer);
            finishDocument(document);
            return true;
        } catch (UnsupportedOperationException e) {
            return fail(document, MSG_NOT_IMPLEMENTED, e);
        } catch (DocumentException e) {
            return fail(document, MSG_DOCUMENT_ERROR, e);
        } catch (IOException e) {
            return fail(document, MSG_IO_ERROR, e);
        }
    }

    /**
     * Produces a PDF/X document.
     *
     * @param pdfXConformance determines into which format the PDF-X will be converted.
     * @return true if the conversion was successful.
     */
    private boolean convertToPDFX(int pdfXConformance) {
        Document document = new Document();
        try {
            PdfWriter writer = PdfWriter.getInstance(document, getOutputStream());
            // The conformance is configured on the writer before the document is opened.
            writer.setPDFXConformance(pdfXConformance);
            prepareDocument(document);
            writePages(document, writer);
            finishDocument(document);
            return true;
        } catch (UnsupportedOperationException e) {
            return fail(document, MSG_NOT_IMPLEMENTED, e);
        } catch (DocumentException e) {
            return fail(document, MSG_DOCUMENT_ERROR, e);
        } catch (IOException e) {
            return fail(document, MSG_IO_ERROR, e);
        }
    }

    /**
     * Produces a PDF/A document, including XMP metadata and an sRGB output intent loaded from the
     * classpath resource <code>/sRGB.profile</code>.
     *
     * @param pdfConformanceLevel determines into which format the PDF-A&/B will be converted.
     * @return true if the conversion was successful.
     */
    private boolean convertToPDFA(PdfAConformanceLevel pdfConformanceLevel) {
        Document document = new Document();
        try {
            PdfAWriter writer = PdfAWriter.getInstance(document, getOutputStream(), pdfConformanceLevel);
            writer.createXmpMetadata();
            prepareDocument(document);
            writePages(document, writer);
            addSrgbOutputIntent(writer);
            finishDocument(document);
            return true;
        } catch (UnsupportedOperationException e) {
            return fail(document, MSG_NOT_IMPLEMENTED, e);
        } catch (DocumentException e) {
            return fail(document, MSG_DOCUMENT_ERROR, e);
        } catch (IOException e) {
            return fail(document, MSG_IO_ERROR, e);
        }
    }

    /**
     * Opens the document, tags it with the jhocr info header and removes all page margins.
     *
     * @param document the document to prepare.
     */
    private void prepareDocument(Document document) {
        document.open();
        document.addHeader(KEY_JHOCR_INFO, KEY_JHOCR_INFO_VALUE);
        document.setMargins(0, 0, 0, 0);
    }

    /**
     * Parses every registered item and renders its page into the document, then applies the outlines.
     *
     * @param document the opened target document.
     * @param writer   the writer attached to <code>document</code>.
     * @throws UnsupportedOperationException if an hOCR input describes more than one page.
     * @throws DocumentException             if the PDF content cannot be generated.
     * @throws IOException                   if an input cannot be read.
     */
    private void writePages(Document document, PdfWriter writer) throws DocumentException, IOException {
        for (HocrDocumentItem item : getItems()) {
            HocrDocument hocrDocument = new HocrParser(item.getHocrInputStream()).parse();

            // TODO add multipage image support
            if (hocrDocument.getPages().size() > 1) {
                throw new UnsupportedOperationException(
                        "Multipage tif are not yet implemented, please report: http://code.google.com/p/jhocr/issues/list");
            }

            for (HocrPage hocrPage : hocrDocument.getPages()) {
                HocrPageProcessor pageProcessor =
                        new HocrPageProcessor(hocrPage, item.getImageInputStream(), isUseImageDpi());
                // Only processors that report a successful initialisation are rendered.
                if (pageProcessor.isInitialized()) {
                    pageProcessor.process(document, writer);
                }
            }
        }

        // outlines may have been replaced by null through setOutlines(...).
        if (outlines != null && !outlines.isEmpty()) {
            writer.setOutlines(outlines);
        }
    }

    /**
     * Registers the bundled sRGB ICC profile as output intent of the PDF/A document.
     *
     * @param writer the PDF/A writer receiving the output intent.
     * @throws IOException if the profile resource is missing or cannot be read.
     */
    private void addSrgbOutputIntent(PdfAWriter writer) throws IOException {
        InputStream profileStream = this.getClass().getResourceAsStream("/sRGB.profile");
        if (profileStream == null) {
            throw new IOException("ICC profile resource /sRGB.profile was not found on the classpath.");
        }
        try {
            ICC_Profile icc = ICC_Profile.getInstance(profileStream);
            writer.setOutputIntents(KEY_JHOCR_INFO, KEY_JHOCR_INFO_VALUE, "http://www.color.org", "sRGB IEC61966-2.1", icc);
        } finally {
            profileStream.close();
        }
    }

    /**
     * Closes the document body and then the output stream.
     *
     * @param document the document to close.
     * @throws IOException if the output stream cannot be closed.
     */
    private void finishDocument(Document document) throws IOException {
        document.close();
        getOutputStream().close();
    }

    /**
     * Reports a failed conversion: logs the cause first, then closes the document.
     *
     * @param document the document that was being written.
     * @param message  the log message describing the failure category.
     * @param cause    the exception that aborted the conversion.
     * @return always false, so callers can <code>return fail(...)</code>.
     */
    private boolean fail(Document document, String message, Exception cause) {
        // Log before closing so the original cause is never lost if close() itself fails.
        logger.error(message, cause);
        try {
            document.close();
        } catch (RuntimeException closeFailure) {
            // NOTE(review): iText is believed to raise a runtime exception when closing a document
            // that has no pages yet (e.g. the first item was rejected) - confirm.
            logger.warn("Unable to close the PDF document after a failed conversion.", closeFailure);
        }
        return false;
    }

    /**
     * Adds a new {@link HocrDocumentItem} to the {@link #items} collection.
     *
     * @param hocrInputStream Html-OCR (HOCR) InputStream
     * @param imgInputStream  Image InputStream, this image will be used to create the pdf searchable
     */
    public void addHocrDocument(InputStream hocrInputStream, InputStream imgInputStream) {
        this.items.add(new HocrDocumentItem(hocrInputStream, imgInputStream));
    }

    /**
     * @return the {@link #outlines} collection.
     */
    public List<HashMap<String, Object>> getOutlines() {
        return outlines;
    }

    /**
     * @param outlines will be set.
     */
    public void setOutlines(List<HashMap<String, Object>> outlines) {
        this.outlines = outlines;
    }

    /**
     * Appends a single outline; a fresh collection is created if none is currently set.
     *
     * @param outline will be added to the {@link #outlines} collection.
     */
    public void addOutline(HashMap<String, Object> outline) {
        if (getOutlines() == null) {
            setOutlines(new ArrayList<HashMap<String, Object>>());
        }
        getOutlines().add(outline);
    }

    /**
     * @return the {@link #useImageDpi} value.
     */
    public boolean isUseImageDpi() {
        return useImageDpi;
    }

    /**
     * @param useImageDpi will be set.
     */
    public void setUseImageDpi(boolean useImageDpi) {
        this.useImageDpi = useImageDpi;
    }

    /**
     * @return the {@link #pdfFormat} that was set for the current document.
     */
    public PDFF getPdfFormat() {
        return pdfFormat;
    }

    /**
     * The PDF can be converted to a certain format, you can find all current supported and tested formats in the <code>PDFF.java</code> class.
     *
     * @param pdfFormat sets the PDF format for the current document to be converted.
     */
    public void setPdfFormat(PDFF pdfFormat) {
        this.pdfFormat = pdfFormat;
    }
}