org.apache.pdfbox.encoding.Encoding.java Source code

Introduction

Here is the source code for org.apache.pdfbox.encoding.Encoding.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.encoding;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.MissingResourceException;
import java.util.StringTokenizer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.common.COSObjectable;
import org.apache.pdfbox.util.ResourceLoader;

/**
 * Abstract base class for text encodings. An encoding maps character codes to
 * glyph names (and back); glyph names are in turn resolved to printable
 * characters through a shared, statically loaded glyph list.
 *
 * <p>Note: the glyph-name table is a static {@link HashMap} that
 * {@link #getCharacter(String)} extends at runtime without synchronization,
 * so concurrent use from several threads is not guarded by this class.</p>
 *
 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
 * @version $Revision: 1.15 $
 */
public abstract class Encoding implements COSObjectable {

    /**
     * Log instance.
     */
    private static final Log LOG = LogFactory.getLog(Encoding.class);

    /** Identifies a non-mapped character. */
    public static final String NOTDEF = ".notdef";

    /**
     * Charset used to read glyph list files. The lists consist of ASCII names and
     * hexadecimal digits; ISO-8859-1 is an ASCII superset that never fails to decode,
     * and naming it explicitly keeps parsing independent of the platform default.
     */
    private static final String GLYPH_LIST_CHARSET = "ISO-8859-1";

    /**
     * This is a mapping from a character code to a character name.
     */
    protected final Map<Integer, String> codeToName = new HashMap<Integer, String>();

    /**
     * This is a mapping from a character name to a character code.
     */
    protected final Map<String, Integer> nameToCode = new HashMap<String, Integer>();

    /**
     * Glyph name to character(s). Filled by the static initializer and extended
     * lazily by {@link #getCharacter(String)} for uniXXXX / uXXXX names.
     */
    private static final Map<String, String> NAME_TO_CHARACTER = new HashMap<String, String>();

    /**
     * Character(s) to glyph name; built once from {@link #NAME_TO_CHARACTER} at class
     * initialization (entries cached later are not mirrored here).
     */
    private static final Map<String, String> CHARACTER_TO_NAME = new HashMap<String, String>();

    static {
        //Loads the official Adobe Glyph List
        loadGlyphList("org/apache/pdfbox/resources/glyphlist.txt");
        //Loads some additional glyph mappings
        loadGlyphList("org/apache/pdfbox/resources/additional_glyphlist.txt");

        // Load an external glyph list file that user can give as JVM property
        String location = System.getProperty("glyphlist_ext");
        if (location != null) {
            File external = new File(location);
            if (external.exists()) {
                loadGlyphList(location);
            }
        }

        // These puts run after the lists are loaded and therefore override them.
        NAME_TO_CHARACTER.put(NOTDEF, "");
        NAME_TO_CHARACTER.put("fi", "fi");
        NAME_TO_CHARACTER.put("fl", "fl");
        NAME_TO_CHARACTER.put("ffi", "ffi");
        NAME_TO_CHARACTER.put("ff", "ff");
        NAME_TO_CHARACTER.put("pi", "pi");

        for (Map.Entry<String, String> entry : NAME_TO_CHARACTER.entrySet()) {
            CHARACTER_TO_NAME.put(entry.getValue(), entry.getKey());
        }
    }

    /**
     * Loads a glyph list from a given location and populates the NAME_TO_CHARACTER hashmap
     * for character lookups. I/O failures are logged, not propagated.
     *
     * @param location - The string location of the glyphlist file
     * @throws MissingResourceException if the resource loader returns no stream for the location
     */
    private static void loadGlyphList(String location) {
        InputStream resource = null;
        BufferedReader glyphStream = null;
        try {
            resource = ResourceLoader.loadResource(location);
            if (resource == null) {
                throw new MissingResourceException("Glyphlist not found: " + location, Encoding.class.getName(),
                        location);
            }
            glyphStream = new BufferedReader(new InputStreamReader(resource, GLYPH_LIST_CHARSET));
            String line;
            while ((line = glyphStream.readLine()) != null) {
                parseGlyphListLine(line.trim());
            }
        } catch (IOException io) {
            LOG.error("error while reading the glyph list.", io);
        } finally {
            // Closing the reader closes the underlying stream; fall back to the raw
            // stream if the reader could not be created.
            try {
                if (glyphStream != null) {
                    glyphStream.close();
                } else if (resource != null) {
                    resource.close();
                }
            } catch (IOException e) {
                LOG.error("error when closing the glyph list.", e);
            }
        }
    }

    /**
     * Parses one trimmed glyph list line of the form {@code name;HEX [HEX ...]} and
     * registers it in NAME_TO_CHARACTER. Comment lines (starting with '#') and lines
     * without a semicolon are ignored; the first definition of a name wins.
     *
     * @param line the trimmed line
     */
    private static void parseGlyphListLine(String line) {
        //lines starting with # are comments which we can ignore.
        if (line.startsWith("#")) {
            return;
        }
        int semicolonIndex = line.indexOf(';');
        if (semicolonIndex < 0) {
            return;
        }
        String characterName = line.substring(0, semicolonIndex);
        String unicodeValue = line.substring(semicolonIndex + 1);
        String value;
        try {
            value = decodeHexCodePoints(unicodeValue);
        } catch (NumberFormatException nfe) {
            LOG.error("malformed unicode value " + unicodeValue, nfe);
            return;
        }
        if (NAME_TO_CHARACTER.containsKey(characterName)) {
            LOG.warn("duplicate value for characterName=" + characterName + "," + value);
        } else {
            NAME_TO_CHARACTER.put(characterName, value);
        }
    }

    /**
     * Converts a space separated list of hexadecimal code points into a string.
     * Code points above U+FFFF are encoded as surrogate pairs instead of being
     * truncated to 16 bits.
     *
     * @param hexCodes the space separated hexadecimal values
     * @return the decoded string (empty if there are no tokens)
     * @throws NumberFormatException if a token is not hexadecimal or is not a valid code point
     */
    private static String decodeHexCodePoints(String hexCodes) {
        StringTokenizer tokenizer = new StringTokenizer(hexCodes, " ", false);
        StringBuilder value = new StringBuilder();
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            int codePoint = Integer.parseInt(token, 16);
            if (!Character.isValidCodePoint(codePoint)) {
                throw new NumberFormatException("Code point out of range: " + token);
            }
            value.appendCodePoint(codePoint);
        }
        return value.toString();
    }

    /**
     * Tells whether the value lies in the surrogate range U+D800..U+DFFF, which
     * must not be used as a stand-alone character.
     *
     * @param codePoint the value to test
     * @return true if the value is a surrogate
     */
    private static boolean isSurrogate(int codePoint) {
        return codePoint > 0xD7FF && codePoint < 0xE000;
    }

    /**
     * Returns an unmodifiable view of the Code2Name mapping.
     * @return the Code2Name map
     */
    public Map<Integer, String> getCodeToNameMap() {
        return Collections.unmodifiableMap(codeToName);
    }

    /**
     * Returns an unmodifiable view of the Name2Code mapping.
     * @return the Name2Code map
     */
    public Map<String, Integer> getNameToCodeMap() {
        return Collections.unmodifiableMap(nameToCode);
    }

    /**
     * This will add a character encoding. An existing mapping for the same code
     * or the same name is replaced.
     *
     * @param code The character code that matches the character.
     * @param name The name of the character.
     */
    public void addCharacterEncoding(int code, String name) {
        codeToName.put(code, name);
        nameToCode.put(name, code);
    }

    /**
     * This will get the character code for the name.
     *
     * @param name The name of the character.
     *
     * @return The code for the character.
     *
     * @throws IOException If there is no character code for the name.
     */
    public int getCode(String name) throws IOException {
        Integer code = nameToCode.get(name);
        if (code == null) {
            throw new IOException("No character code for character name '" + name + "'");
        }
        return code;
    }

    /**
     * This will take a character code and get the name from the code.
     *
     * @param code The character code.
     *
     * @return The name of the character, or null if the code is not mapped.
     *
     * @throws IOException Not thrown by this implementation; the declaration is kept
     *         so that existing callers and overriding subclasses keep compiling.
     */
    public String getName(int code) throws IOException {
        return codeToName.get(code);
    }

    /**
     * This will take a character and get its glyph name from the glyph list.
     *
     * @param c The character.
     *
     * @return The name of the character.
     *
     * @throws IOException If there is no name for the character.
     */
    public String getNameFromCharacter(char c) throws IOException {
        String name = CHARACTER_TO_NAME.get(Character.toString(c));
        if (name == null) {
            throw new IOException("No name for character '" + c + "'");
        }
        return name;
    }

    /**
     * This will get the character from the code.
     *
     * @param code The character code.
     *
     * @return The printable character for the code, or null if the code has no name.
     *
     * @throws IOException If the name lookup fails.
     */
    public String getCharacter(int code) throws IOException {
        String name = getName(code);
        return name == null ? null : getCharacter(name);
    }

    /**
     * This will get the character from the name. Lookup order: the glyph list; the
     * name with its '.suffix' removed; a "uniXXXX..." name (groups of four hex digits);
     * a "u" + hex code point name; this encoding's own name-to-code table. If none
     * applies, the name itself is returned.
     *
     * @param name The name of the character.
     *
     * @return The printable character for the name; null if the name is null or
     *         denotes a code point that is not allowed (surrogate or out of range).
     */
    public String getCharacter(String name) {
        if (name == null) {
            return null;
        }
        String character = NAME_TO_CHARACTER.get(name);
        if (character != null) {
            return character;
        }

        int dotIndex = name.indexOf('.');
        if (dotIndex > 0) {
            // we have a suffix: remove it and resolve the base name
            character = getCharacter(name.substring(0, dotIndex));
        } else if (name.startsWith("uni")) {
            // Unicode name: uni followed by groups of four hexadecimal digits,
            // each group being one UTF-16 code unit
            int nameLength = name.length();
            StringBuilder uniStr = new StringBuilder();
            try {
                for (int chPos = 3; chPos + 4 <= nameLength; chPos += 4) {
                    int characterCode = Integer.parseInt(name.substring(chPos, chPos + 4), 16);
                    if (isSurrogate(characterCode)) {
                        LOG.warn("Unicode character name with not allowed code area: " + name);
                    } else {
                        uniStr.append((char) characterCode);
                    }
                }
                character = uniStr.toString();
                NAME_TO_CHARACTER.put(name, character);
            } catch (NumberFormatException nfe) {
                LOG.warn("Not a number in Unicode character name: " + name);
                character = name;
            }
        } else if (name.startsWith("u")) {
            // alternate Unicode name representation: u followed by a hexadecimal code point
            try {
                int codePoint = Integer.parseInt(name.substring(1), 16);
                if (!Character.isValidCodePoint(codePoint) || isSurrogate(codePoint)) {
                    LOG.warn("Unicode character name with not allowed code area: " + name);
                } else {
                    // toChars yields a surrogate pair for code points above U+FFFF
                    // instead of truncating them to 16 bits
                    character = new String(Character.toChars(codePoint));
                    NAME_TO_CHARACTER.put(name, character);
                }
            } catch (NumberFormatException nfe) {
                LOG.warn("Not a number in Unicode character name: " + name);
                character = name;
            }
        } else {
            Integer code = nameToCode.get(name);
            if (code != null) {
                character = Character.toString((char) code.intValue());
            } else {
                character = name;
            }
        }
        return character;
    }

}