Example usage for org.apache.pdfbox.cos COSName CHAR

Introduction

In this page you can find the example usage for org.apache.pdfbox.cos COSName CHAR_SET.

Prototype

COSName CHAR_SET

To view the source code for org.apache.pdfbox.cos COSName CHAR_SET.

Click Source Link

Usage

From source file:modules.PDFFontDependencyExtractorModule.java

License:Apache License

public PDFFontResults extractFontList(File f) throws IOException, InvalidParameterException {
    PDDocument document;/*from  w  w w  .j  a v a2  s  .  c om*/
    try {
        document = PDDocument.load(f);
    } catch (IOException x) {
        throw new InvalidParameterException("Not a PDF file");
    }
    SortedSet<FontInformation> ret = new TreeSet<FontInformation>(new Comparator<FontInformation>() {

        @Override
        public int compare(FontInformation o1, FontInformation o2) {
            int a = o1.fontName.compareTo(o2.fontName);
            if (a != 0)
                return a;
            else
                return o1.fontType.compareTo(o2.fontType);
        }

    });

    document.getDocumentCatalog().getAllPages();
    // The code down here is easier as it gets all the fonts used in the
    // document. Still, this would inlcude unused fonts, so we get the fonts
    // page by page and add them to a Hash table.
    for (COSObject c : document.getDocument().getObjectsByType(COSName.FONT)) {
        if (c == null || !(c.getObject() instanceof COSDictionary)) {
            continue;
            // System.out.println(c.getObject());
        }

        COSDictionary fontDictionary = (COSDictionary) c.getObject();
        // System.out.println(dic.getNameAsString(COSName.BASE_FONT));
        // }
        // }
        // int pagen = document.getNumberOfPages();
        // i=0;
        // for (int p=0;p<pagen;p++){
        // PDPage page = (PDPage)pages.get(p);
        // PDResources res = page.findResources();
        // //for each page resources
        // if (res==null) continue;
        // // get the font dictionary
        // COSDictionary fonts = (COSDictionary)
        // res.getCOSDictionary().getDictionaryObject( COSName.FONT );
        // for( COSName fontName : fonts.keySet() ) {
        // COSObject font = (COSObject) fonts.getItem( fontName );
        // // if the font has already been visited we ingore it
        // long objectId = font.getObjectNumber().longValue();
        // if (ret.get(objectId)!=null)
        // continue;
        // if( font==null || ! (font.getObject() instanceof COSDictionary) )
        // continue;
        // COSDictionary fontDictionary = (COSDictionary)font.getObject();

        // Type MUSt be font
        if (!fontDictionary.getNameAsString(COSName.TYPE).equals("Font")) {
            continue;
        }
        // get the variables
        FontInformation fi = new FontInformation();
        fi.fontType = fontDictionary.getNameAsString(COSName.SUBTYPE);

        String baseFont = fontDictionary.getNameAsString(COSName.BASE_FONT);
        if (baseFont == null) {
            continue;
        }
        if (Arrays.binarySearch(standard14, baseFont) >= 0) {
            continue;
        }
        COSDictionary fontDescriptor = (COSDictionary) fontDictionary.getDictionaryObject(COSName.FONT_DESC);
        COSBase enc = fontDictionary.getItem(COSName.ENCODING);
        COSBase uni = fontDictionary.getItem(COSName.TO_UNICODE);
        fontDictionary.getInt(COSName.FIRST_CHAR);
        fontDictionary.getInt(COSName.LAST_CHAR);
        String encoding;
        boolean toUnicode = uni != null;
        if (enc == null) {
            encoding = "standard14";
        }
        if (enc instanceof COSString) {
            encoding = ((COSString) enc).getString();
        } else {
            encoding = "table";
        }
        fi.isSubset = false;
        boolean t = true;
        // Type one and TT can have subsets defineing the basename see 5.5.3
        // pdfref 1.6
        // if (fi.fontType.lastIndexOf(COSName.TYPE1.getName())!=-1 ||
        // fi.fontType.equals(COSName.TRUE_TYPE.getName()) )
        if (baseFont != null) {
            if (baseFont.length() > 6) {
                for (int k = 0; k < 6; k++)
                    if (!Character.isUpperCase(baseFont.charAt(k))) {
                        t = false;
                    }
                if (baseFont.charAt(6) != '+') {
                    t = false;
                }
            } else {
                t = false;
            }
            fi.isSubset = t;
            if (fi.isSubset) {
                fi.baseName = baseFont.substring(0, 6);
                baseFont = baseFont.substring(7);
            }
        }
        fi.fontFlags = 0;
        if (fi.fontType.equals(COSName.TYPE0.getName()) || fi.fontType.equals(COSName.TYPE3.getName())) {
            fi.isEmbedded = true;
        }

        if (fontDescriptor != null) {
            // in Type1 charset indicates font is subsetted
            if (fontDescriptor.getItem(COSName.CHAR_SET) != null) {
                fi.isSubset = true;
            }
            if (fontDescriptor.getItem(COSName.FONT_FILE) != null
                    || fontDescriptor.getItem(COSName.FONT_FILE3) != null
                    || fontDescriptor.getItem(COSName.FONT_FILE2) != null) {
                fi.isEmbedded = true;
            }
            fi.fontFlags = fontDescriptor.getInt(COSName.getPDFName("Flags"));
            fi.fontFamily = fontDescriptor.getString(COSName.FONT_FAMILY);
            fi.fontStretch = fontDescriptor.getString(COSName.FONT_STRETCH);

        }
        fi.charset = encoding;
        fi.fontName = baseFont;
        fi.isToUnicode = toUnicode;
        fi.encoding = fontDictionary.getNameAsString(COSName.CID_TO_GID_MAP);

        ret.add(fi);

    } // for all fonts

    HashMultimap<String, FontInformation> m = HashMultimap.create();

    for (FontInformation ff : ret) {
        m.put(ff.fontName, ff);
    }
    LinkedList<FontInformation> missing = new LinkedList<FontInformation>();
    Set<String> k = m.keySet();
    for (String kk : k) {
        Set<FontInformation> s = m.get(kk);
        if (s.size() < 1) {
            continue;
        }
        if (s.size() > 1) {
            boolean found = false;
            FontInformation ff = null;
            for (FontInformation fonti : s) {
                if (!fonti.isEmbedded) {
                    ff = fonti;
                } else {
                    found = true;
                }
            }
            if (!found) {
                missing.add(ff);
            }
        } else {
            FontInformation ff = s.iterator().next();
            if (!ff.isEmbedded) {
                missing.add(ff);
            }
        }

    }

    // } // for all pages
    // Iterator<FontInformation> it = ret.iterator();
    // FontInformation prev = null;
    // LinkedList<FontInformation> toDelete = new
    // LinkedList<FontInformation>();
    // while (it.hasNext()) {
    // FontInformation current = it.next();
    //
    // if (prev!= null && prev.fontName.equals(current.fontName) &&
    // (prev.fontType.startsWith("CIDFontType") ||
    // current.fontType.startsWith("CIDFontType")))
    // toDelete.add(current);
    // prev = current;
    // }
    //
    // //ret.removeAll(toDelete);
    // FontInformation[] retArray =toDelete.toArray(new FontInformation[0]);
    //

    if (missing.size() == 0) {
        missing = null;
    } else {
        System.out.println("Found missing fonts: " + f);
        System.out.println(missing);
    }
    return new PDFFontResults(new LinkedList<FontInformation>(ret), missing);
}

From source file:uk.ac.liverpool.thumbnails.PDFService.java

License:Open Source License

@Override
public FontInformation[] extractFontList(URI u, File fff) throws MalformedURLException, IOException {

    SortedSet<FontInformation> ret = new TreeSet<FontInformation>();
    PDDocument document = getPages(u, fff);
    List pages = document.getDocumentCatalog().getAllPages();
    int i = 0;/* w w w .j  a va 2s  .c om*/
    // The code down here is easier as it gets all the fonts used in the document. Still, this would inlcude unused fonts, so we get the fonts page by page and add them to a Hash table.
    for (COSObject c : document.getDocument().getObjectsByType(COSName.FONT)) {
        if (c == null || !(c.getObject() instanceof COSDictionary))
            continue;
        //System.out.println(c.getObject());

        COSDictionary fontDictionary = (COSDictionary) c.getObject();
        // System.out.println(dic.getNameAsString(COSName.BASE_FONT));
        //            }
        //        }
        //        int pagen = document.getNumberOfPages();
        //        i=0;
        //        for (int p=0;p<pagen;p++){
        //            PDPage page = (PDPage)pages.get(p);
        //            PDResources res = page.findResources();
        //            //for each page resources
        //            if (res==null) continue; 
        //            // get the font dictionary
        //            COSDictionary fonts = (COSDictionary) res.getCOSDictionary().getDictionaryObject( COSName.FONT );
        //            for( COSName fontName : fonts.keySet() ) {
        //                COSObject font = (COSObject) fonts.getItem( fontName );
        //                // if the font has already been visited we ingore it
        //                long objectId = font.getObjectNumber().longValue();
        //                if (ret.get(objectId)!=null)
        //                    continue;
        //                if( font==null ||  ! (font.getObject() instanceof COSDictionary) )
        //                    continue;
        //                COSDictionary fontDictionary = (COSDictionary)font.getObject();

        // Type MUSt be font
        if (!fontDictionary.getNameAsString(COSName.TYPE).equals("Font"))
            continue;
        // get the variables
        FontInformation fi = new FontInformation();
        fi.fontType = fontDictionary.getNameAsString(COSName.SUBTYPE);

        String baseFont = fontDictionary.getNameAsString(COSName.BASE_FONT);
        if (baseFont == null)
            continue;
        if (Arrays.binarySearch(standard14, baseFont) >= 0)
            continue;
        COSDictionary fontDescriptor = (COSDictionary) fontDictionary.getDictionaryObject(COSName.FONT_DESC);
        COSBase enc = fontDictionary.getItem(COSName.ENCODING);
        COSBase uni = fontDictionary.getItem(COSName.TO_UNICODE);
        int firstChar = fontDictionary.getInt(COSName.FIRST_CHAR);
        int lastChar = fontDictionary.getInt(COSName.LAST_CHAR);
        String encoding;
        boolean toUnicode = uni != null;
        if (enc == null) {
            encoding = "standard14";
        }
        if (enc instanceof COSString) {
            encoding = ((COSString) enc).getString();
        } else {
            encoding = "table";
        }
        fi.isSubset = false;
        boolean t = true;
        // Type one and TT can have subsets defineing the basename see 5.5.3 pdfref 1.6
        //  if (fi.fontType.lastIndexOf(COSName.TYPE1.getName())!=-1 || fi.fontType.equals(COSName.TRUE_TYPE.getName()) )
        if (baseFont != null) {
            if (baseFont.length() > 6) {
                for (int k = 0; k < 6; k++)
                    if (!Character.isUpperCase(baseFont.charAt(k)))
                        t = false;
                if (baseFont.charAt(6) != '+')
                    t = false;
            } else
                t = false;
            fi.isSubset = t;
            if (fi.isSubset)
                baseFont = baseFont.substring(7);
        }
        fi.fontFlags = 0;
        if (fi.fontType.equals(COSName.TYPE0) || fi.fontType.equals(COSName.TYPE3))
            fi.isEmbedded = true;

        if (fontDescriptor != null) {
            // in Type1 charset indicates font is subsetted
            if (fontDescriptor.getItem(COSName.CHAR_SET) != null)
                fi.isSubset = true;
            if (fontDescriptor.getItem(COSName.FONT_FILE) != null
                    || fontDescriptor.getItem(COSName.FONT_FILE3) != null
                    || fontDescriptor.getItem(COSName.FONT_FILE2) != null)
                fi.isEmbedded = true;
            fi.fontFlags = fontDescriptor.getInt(COSName.getPDFName("Flags"));
            fi.fontFamily = fontDescriptor.getString(COSName.FONT_FAMILY);
            fi.fontStretch = fontDescriptor.getString(COSName.FONT_STRETCH);
        }
        fi.charset = encoding;
        fi.fontName = baseFont;
        fi.isToUnicode = toUnicode;

        ret.add(fi);

    } // for all fonts 

    //    } // for all pages
    Iterator<FontInformation> it = ret.iterator();
    FontInformation prev = null;
    LinkedList<FontInformation> toDelete = new LinkedList<FontInformation>();
    while (it.hasNext()) {
        FontInformation current = it.next();

        if (prev != null && prev.fontName.equals(current.fontName) && prev.fontType.startsWith("CIDFontType"))
            toDelete.add(current);
        prev = current;
    }
    ret.removeAll(toDelete);
    FontInformation[] retArray = ret.toArray(new FontInformation[0]);

    return retArray;
}

Example usage for org.apache.pdfbox.cos COSName CHAR_SET

Introduction

Prototype

Usage