org.zilverline.core.ExtractorFactory.java Source code

Introduction

Here is the source code for org.zilverline.core.ExtractorFactory.java
Source

/*
 * Copyright 2003-2004 Michael Franken, Zilverline.
 *
 * The contents of this file, or the files included with this file, are subject to
 * the current version of ZILVERLINE Collaborative Source License for the
 * Zilverline Search Engine (the "License"); You may not use this file except in
 * compliance with the License.
 *
 * You may obtain a copy of the License at
 *
 *     http://www.zilverline.org.
 *
 * See the License for the rights, obligations and
 * limitations governing use of the contents of the file.
 *
 * The Original and Upgraded Code is the Zilverline Search Engine. The developer of
 * the Original and Upgraded Code is Michael Franken. Michael Franken owns the
 * copyrights in the portions it created. All Rights Reserved.
 *
 */

package org.zilverline.core;

import java.io.File;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;

import javax.activation.MimetypesFileTypeMap;

import net.sf.jmimemagic.Magic;
import net.sf.jmimemagic.MagicException;
import net.sf.jmimemagic.MagicMatch;
import net.sf.jmimemagic.MagicMatchNotFoundException;
import net.sf.jmimemagic.MagicParseException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.util.StringUtils;
import org.zilverline.extractors.ExcelExtractor;
import org.zilverline.extractors.FileInfoExtractor;
import org.zilverline.extractors.HTMLExtractor;
import org.zilverline.extractors.PDFExtractor;
import org.zilverline.extractors.PowerPointExtractor;
import org.zilverline.extractors.RTFExtractor;
import org.zilverline.extractors.TextExtractor;
import org.zilverline.extractors.WordExtractor;
import org.zilverline.util.FileUtils;

/**
 * Factory for creating Extractors, looked up either by a file's extension or by a MIME type.
 * <p>
 * Both lookups map a key (extension or MIME type) to the fully qualified class name of an Extractor, which is instantiated
 * reflectively through its no-argument constructor. Unless the factory is set to be case sensitive, keys are lowercased before
 * they are stored by the setters and before they are looked up.
 * 
 * @author Michael Franken
 * @version $Revision: 1.19 $
 * 
 * @see Extractor
 */
public final class ExtractorFactory {
    /** logger for Commons logging. */
    private static final Log log = LogFactory.getLog(ExtractorFactory.class);

    /** Class name of the Extractor used for unmapped extensions when {@link #defaultFileinfo} is set. */
    private static final String FILE_INFO_EXTRACTOR = "org.zilverline.extractors.FileInfoExtractor";

    /** Map holding mappings from file extension to extractor classname. */
    private final Map mappings = new TreeMap();

    /** Map holding mappings from mime types to extractor classname. */
    private final Map mimeMappings = new TreeMap();

    /** The Factory ignores case by default. */
    private boolean caseSensitive = false;

    /** The Factory does not store file info for unknown formats by default. */
    private boolean defaultFileinfo = false;

    /**
     * Create a factory with defaults set with the extractors provided by Zilverline.
     */
    public ExtractorFactory() {
        mappings.put("pdf", "org.zilverline.extractors.PDFExtractor");
        mappings.put("doc", "org.zilverline.extractors.WordExtractor");
        mappings.put("rtf", "org.zilverline.extractors.RTFExtractor");
        mappings.put("html", "org.zilverline.extractors.HTMLExtractor");
        mappings.put("htm", "org.zilverline.extractors.HTMLExtractor");
        mappings.put("txt", "org.zilverline.extractors.TextExtractor");
        mappings.put("xls", "org.zilverline.extractors.ExcelExtractor");
        mappings.put("ppt", "org.zilverline.extractors.PowerPointExtractor");

        addMimeMappings();
    }

    /**
     * Registers the default MIME type to extractor classname mappings.
     */
    private void addMimeMappings() {
        mimeMappings.put("application/pdf", "org.zilverline.extractors.PDFExtractor");
        mimeMappings.put("*.pdf", "org.zilverline.extractors.PDFExtractor");
        mimeMappings.put("*.pdf/octet-stream", "org.zilverline.extractors.PDFExtractor");
        mimeMappings.put("application/msword", "org.zilverline.extractors.WordExtractor");
        mimeMappings.put("application/rtf", "org.zilverline.extractors.RTFExtractor");
        mimeMappings.put("text/html", "org.zilverline.extractors.HTMLExtractor");
        mimeMappings.put("text/plain", "org.zilverline.extractors.TextExtractor");
        mimeMappings.put("application/vnd.ms-excel", "org.zilverline.extractors.ExcelExtractor");
        mimeMappings.put("application/vnd.ms-powerpoint", "org.zilverline.extractors.PowerPointExtractor");
    }

    /**
     * Returns whether a given File can be extracted based on its extension.
     * <p>
     * Only the extension mappings are consulted; the {@link #isDefaultFileinfo() defaultFileinfo} fallback used by
     * {@link #createExtractor(File)} is not taken into account.
     * 
     * @param f File that needs an Extractor, must not be null
     * 
     * @return true if the file's extension is mapped to an Extractor, false otherwise
     */
    public boolean canExtract(final File f) {
        log.debug("Can we extract: " + f.getName() + "?");

        String extension = normalizeKey(FileUtils.getExtension(f));
        // a null key would make the TreeMap lookup throw, so treat it as "not mapped"
        boolean known = extension != null && mappings.containsKey(extension);

        log.debug("" + known);

        return known;
    }

    /**
     * Returns whether a MIME-type can be extracted. Any parameters following a ';' (such as a charset) are ignored.
     * 
     * @param type MIME-type that needs an Extractor, may be null
     * 
     * @return true if the MIME-type is mapped to an Extractor, false otherwise (including for a null type)
     */
    public boolean canExtract(final String type) {
        log.debug("Can we extract: " + type + "?");

        String theType = normalizeMimeType(type);
        boolean canExtract = theType != null && mimeMappings.containsKey(theType);

        log.debug("" + canExtract);

        return canExtract;
    }

    /**
     * Returns an Extractor for a given File, based on its extension. A new Extractor object is created on every call.
     * <p>
     * If the extension is not mapped and {@link #isDefaultFileinfo() defaultFileinfo} is set, a FileInfoExtractor is created
     * instead.
     * 
     * @param f File that needs an Extractor, must not be null
     * 
     * @return Extractor for File, or null if mapping is unknown or the mapped class can not be instantiated
     */
    public Extractor createExtractor(final File f) {
        log.debug("returning Extractor for: " + f.getName());

        String extension = normalizeKey(FileUtils.getExtension(f));
        String className = null;
        if (extension != null) {
            className = (String) mappings.get(extension);
        }

        // if we found nothing return the FileInfoExtractor if that's the default
        if (!StringUtils.hasText(className) && defaultFileinfo) {
            className = FILE_INFO_EXTRACTOR;
        }

        Extractor extractor = instantiate(className, f.getName());
        if (extractor == null) {
            log.debug("Unknown format: " + f.getName());
        }

        return extractor;
    }

    /**
     * Get the MIME-type of a given file.
     * <p>
     * The type is first determined with a MimetypesFileTypeMap. Only when that yields the generic
     * "application/octet-stream" is the file handed to jMimeMagic; if jMimeMagic fails or reports no type, the generic type
     * is returned.
     * 
     * @param f the File
     * @return the MIME-type as a String
     */
    public static String getMimeType(final File f) {
        String type = new MimetypesFileTypeMap().getContentType(f);
        if (!"application/octet-stream".equalsIgnoreCase(type)) {
            return type;
        }

        try {
            Magic parser = new Magic();
            // getMagicMatch accepts Files or byte[],
            // which is nice if you want to test streams
            MagicMatch match = parser.getMagicMatch(f);
            if (match != null) {
                String magicType = match.getMimeType();
                // keep the generic type rather than handing callers a null or blank one
                if (StringUtils.hasText(magicType)) {
                    return magicType;
                }
            }
        } catch (MagicParseException e) {
            log.warn("Can't parse " + f.getName(), e);
        } catch (MagicMatchNotFoundException e) {
            log.warn("Can't find type for " + f.getName(), e);
        } catch (MagicException e) {
            log.warn("Can't find type for " + f.getName(), e);
        }

        return type;
    }

    /**
     * Returns an Extractor for a given MIME-type. A new Extractor object is created on every call. Any parameters following a
     * ';' (such as a charset) are ignored.
     * 
     * @param type MIME-type that needs an Extractor, may be null
     * 
     * @return Extractor for MIME-type, or null if mapping is unknown or the mapped class can not be instantiated
     */
    public Extractor createExtractor(final String type) {
        log.debug("returning Extractor for: " + type);

        String theType = normalizeMimeType(type);
        String className = null;
        if (theType != null) {
            className = (String) mimeMappings.get(theType);
        }

        Extractor extractor = instantiate(className, theType);
        if (extractor == null) {
            log.warn("Unknown format: " + theType);
        }

        return extractor;
    }

    /**
     * Set mappings from a Map object. The mappings are file extensions with extractor classnames as values. For instance
     * 'pdf=org.zilverline.extractors.PDFExtractor'. Existing mappings are discarded; a null Map leaves the factory without
     * extension mappings.
     * 
     * @param props properties as a Map with extension as key and extractor classname as value
     */
    public void setMappings(final Map props) {
        replaceMappings(mappings, props);

        log.debug("Map now is: " + mappings);
    }

    /**
     * Check whether the Factory ignores case or not.
     * 
     * @return value indicating case sensitivity
     */
    public boolean isCaseSensitive() {
        return caseSensitive;
    }

    /**
     * Sets whether the Factory ignores case or not. Keys that are already stored are not converted; the setting applies to
     * subsequent lookups and to subsequent calls of the mapping setters.
     * 
     * @param b indicates whether to handle mappings casesensitively
     */
    public void setCaseSensitive(final boolean b) {
        caseSensitive = b;
    }

    /**
     * Get the mappings for the Factory.
     * 
     * @return the live Map from file extension to extractor classname
     */
    public Map getMappings() {
        return mappings;
    }

    /**
     * Returns the names of the Extractors that ship with Zilverline. Despite its name this method does not scan the
     * classpath; it reports a fixed list of extractor classes.
     * 
     * @return array of names of the known Extractors
     */
    public static String[] findExtractorsOnClasspath() {
        log.debug("Known Extractors on classpath");
        Class[] extractors = { FileInfoExtractor.class, PDFExtractor.class, WordExtractor.class, RTFExtractor.class,
                HTMLExtractor.class, TextExtractor.class, ExcelExtractor.class, PowerPointExtractor.class };
        String[] extractorNames = new String[extractors.length];
        for (int i = 0; i < extractors.length; i++) {
            extractorNames[i] = extractors[i].getName();
            log.debug("Extractor: " + extractorNames[i]);
        }
        return extractorNames;
    }

    /**
     * Get the MIME-type mappings for the Factory.
     * 
     * @return the live Map from MIME-type to extractor classname
     */
    public Map getMimeMappings() {
        return mimeMappings;
    }

    /**
     * Set MIME-type mappings from a Map object. Existing mappings are discarded and the entries are copied, with keys
     * lowercased unless the Factory is case sensitive, so that they match the way MIME-types are looked up. A null Map leaves
     * the factory without MIME-type mappings.
     * 
     * @param mimeMappings Map with MIME-type as key and extractor classname as value
     */
    public void setMimeMappings(Map mimeMappings) {
        replaceMappings(this.mimeMappings, mimeMappings);

        log.debug("MIME map now is: " + this.mimeMappings);
    }

    /**
     * @return the defaultFileinfo
     */
    public boolean isDefaultFileinfo() {
        return defaultFileinfo;
    }

    /**
     * @param defaultFileinfo the defaultFileinfo to set
     */
    public void setDefaultFileinfo(boolean defaultFileinfo) {
        this.defaultFileinfo = defaultFileinfo;
    }

    /**
     * Converts a lookup key to the form in which keys are stored: unchanged when case sensitive, lowercase otherwise.
     * 
     * @param key extension or MIME-type, may be null
     * @return the normalized key, or null if key was null
     */
    private String normalizeKey(final String key) {
        if (key == null || caseSensitive) {
            return key;
        }
        // fixed locale: the default locale may lowercase 'I' to a dotless i (Turkish), breaking lookups
        return key.toLowerCase(Locale.ENGLISH);
    }

    /**
     * Strips parameters (everything from the first ';') and surrounding whitespace from a MIME-type and normalizes its case.
     * 
     * @param type raw MIME-type, possibly with parameters such as "text/html; charset=UTF-8", may be null
     * @return the bare, normalized MIME-type, or null if type was null
     */
    private String normalizeMimeType(final String type) {
        if (type == null) {
            return null;
        }
        String bare = type;
        // indexOf instead of split(";")[0]: split returns an empty array for input like ";"
        int semicolon = bare.indexOf(';');
        if (semicolon >= 0) {
            bare = bare.substring(0, semicolon);
        }
        return normalizeKey(bare.trim());
    }

    /**
     * Replaces the contents of one of the Factory's maps with normalized copies of the given entries.
     * 
     * @param target the internal map to refill
     * @param source the new entries, may be null and may be the same object as target
     */
    private void replaceMappings(final Map target, final Map source) {
        // build the copy first: source may be the very map we are about to clear
        Map normalized = new TreeMap();
        if (source != null) {
            Iterator iter = source.entrySet().iterator();
            while (iter.hasNext()) {
                Map.Entry element = (Map.Entry) iter.next();
                Object key = element.getKey();
                if (key instanceof String) {
                    key = normalizeKey((String) key);
                }
                normalized.put(key, element.getValue());
            }
        }
        target.clear();
        target.putAll(normalized);
    }

    /**
     * Creates a new instance of the named Extractor class through its no-argument constructor.
     * 
     * @param className fully qualified classname of the Extractor, may be null or blank
     * @param subject name of the file or MIME-type the Extractor is for, used in log messages only
     * @return a new Extractor, or null if className is blank or the class can not be loaded, is not an Extractor, or can not be
     *         instantiated
     */
    private static Extractor instantiate(final String className, final String subject) {
        if (!StringUtils.hasText(className)) {
            return null;
        }
        String name = className.trim();
        try {
            Class c = Class.forName(name);
            if (!Extractor.class.isAssignableFrom(c)) {
                log.warn("Class '" + name + "' for " + subject + " is not an Extractor");
                return null;
            }
            log.debug("Returning Extractor: " + name);
            return (Extractor) c.newInstance();
        } catch (InstantiationException e) {
            log.warn("Can not instantiate Extractor '" + name + "' for " + subject, e);
        } catch (IllegalAccessException e) {
            log.warn("Can not access Extractor '" + name + "' for " + subject, e);
        } catch (ClassNotFoundException e) {
            log.warn("Class not found: '" + name + "' for " + subject, e);
        }
        return null;
    }

}