it.unimi.di.big.mg4j.document.DispatchingDocumentFactory.java Source code

Java tutorial

Introduction

Here is the source code for the class it.unimi.di.big.mg4j.document.DispatchingDocumentFactory (file DispatchingDocumentFactory.java).

Source

package it.unimi.di.big.mg4j.document;

/*       
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2005-2015 Paolo Boldi 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectLinkedOpenHashMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.NullReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.util.Properties;

import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;

import org.apache.commons.configuration.ConfigurationException;

/** A document factory that actually dispatches the task of building documents to various factories
 *  according to some strategy.
 * 
 * <p>The strategy is specified as (an object embedding) a method that determines which factory
 * should be used on the basis of the metadata that are provided to the {@link #getDocument(InputStream, Reference2ObjectMap)}
 * method. Since usually the strategy will have to resolve the name of metadata, it is also passed
 * this factory, so that the correct 
 * {@link it.unimi.di.big.mg4j.document.PropertyBasedDocumentFactory#resolve(Enum,Reference2ObjectMap)} method can be invoked. 
 * 
 * <p>Moreover, at construction one must specify, for each subfactory and for each field of this
 * factory, which field of the subfactory should be used. Note that to guarantee sequential access,
 * fields specified for each subfactory should appear in increasing order.
 */
public class DispatchingDocumentFactory extends PropertyBasedDocumentFactory {
    private static final long serialVersionUID = 1L;

    private static final boolean DEBUG = false;

    /** Case-insensitive keys for metadata. 
     * 
     *  <p>These are the names of the properties that configure a dispatching factory; each one is
     *  recognized by <code>parseProperty()</code> of the enclosing class.
     * 
     *  @see PropertyBasedDocumentFactory.MetadataKeys
     */
    public static enum MetadataKeys {
        /** The property containing the (comma-separated) sequence of field names; the number of values
         *  also determines the number of fields of the dispatching factory. */
        FIELDNAME,
        /** The property containing the key that should be checked (e.g., mimetype). The value is parsed
         *  as the name of an enum class, followed by a dot and by the name of one of its constants. */
        KEY,
        /** The property containing comma-separated sequence of colon-separated pairs value/document factory names.
         *  Each pair must contain exactly one colon, with a nonempty value before it and a nonempty
         *  factory class name after it. */
        RULE,
        /** The property containing a comma-separated list with as many items as there are factories; each item will be
         *  a colon-separated list of as many integers as there are fields. The <var>k</var>-th integer in the <var>f</var>-th
         *  list is the number of the field of the <var>f</var>-th factory that should be used to extract field number <var>k</var>,
         *  or -1 if the field should be empty. */
        MAP
    }

    /** The value to be used in <code>RULE</code> to introduce the default factory. Otherwise, no default factory is
     *  provided for documents that do not match. */
    public final static String OTHERWISE_IN_RULE = "?";

    /** A serializable policy that selects, on the basis of the metadata of a document,
     *  which subfactory should build that document. */
    public interface DispatchingStrategy extends Serializable {
        /** Returns the index of the subfactory that should handle a document with the given metadata.
         *
         * <p>The factory argument is provided so that implementations can resolve
         * property names through it.
         * 
         * @param metadata the metadata of the document to be produced.
         * @param factory the factory used to resolve metadata names.
         * @return the factory index.
         */
        int factoryNumber(Reference2ObjectMap<Enum<?>, Object> metadata, PropertyBasedDocumentFactory factory);
    }

    /** A strategy that is based on trying to match the value of the metadata with a given key with respect to a
     *  certain set of values.
     */
    public static class StringBasedDispatchingStrategy implements DispatchingStrategy {
        private static final long serialVersionUID = 1L;
        /** The key to be resolved. */
        private final Enum<?> key;
        /** The values that should be used for comparisons. */
        private final Object2IntMap<String> value;

        /** The strategy works as follows: the property named <code>key</code> is resolved; if this property
         *  is not set, the default return value of <var>value</var> is returned. 
         *  Otherwise, its value is compared, using the <code>equals</code>,
         *  method with the elements of the <code>value</code> set, and the corresponding integer is returned.
         * 
         * @param key the key to be resolved.
         * @param value the map of values.
         */
        public StringBasedDispatchingStrategy(final Enum<?> key, final Object2IntMap<String> value) {
            this.key = key;
            this.value = value;
        }

        public int factoryNumber(final Reference2ObjectMap<Enum<?>, Object> metadata,
                final PropertyBasedDocumentFactory factory) {
            final Object val = factory.resolve(key, metadata);
            if (DEBUG)
                System.out.println("key " + key + " resolved using " + metadata + " into " + val);
            return value.getInt(val);
        }

    };

    /** The number of subfactories used. */
    private int n;
    /** The subfactories used. */
    private DocumentFactory[] documentFactory;
    /** The number of fields of this factory. */
    private int numberOfFields;
    /** The names of the fields. */
    private String[] fieldName;
    /** The types of the fields. */
    private FieldType[] fieldType;
    /** The array specifying how subfactory fields should be mapped into fields of this factory. More precisely,
     *  <code>rename[f][k]</code> specifies which field of factory <code>documentFactory[f]</code> should be used
     *  to return the field named <code>fieldName[k]</code>: the type of the field in the subfactory must be
     *  the same (i.e., <code>documentFactory[f].fieldType(rename[f][k])==fieldType[k]</code>). The value -1 is used to
     *  return an empty textual field (i.e., a word reader on an empty string).
     */
    private int[][] rename;
    /** The strategy to be used. */
    private DispatchingStrategy strategy;
    /** If a {@link StringBasedDispatchingStrategy} should be used, this field represents the property key to be checked. 
     *  Otherwise, this is <code>null</code>. */
    private Enum<?> dispatchingKey;
    /** If a {@link StringBasedDispatchingStrategy} should be used, this field represents the map from values to factories. */
    private Object2ObjectLinkedOpenHashMap<String, Class<? extends DocumentFactory>> value2factoryClass;

    /** Stores the given components in the fields of this factory and derives the two counters
     *  (number of subfactories and number of fields) from the array lengths.
     *
     *  <p>The arrays are stored by reference; no validation is performed here.
     *
     * @param documentFactory the array of subfactories.
     * @param fieldName the names of this factory's fields.
     * @param fieldType the types of this factory's fields.
     * @param rename the mapping from fields of this factory to fields of the subfactories.
     * @param strategy the strategy to decide which factory should be used.
     */
    private void init(final DocumentFactory[] documentFactory, final String[] fieldName,
            final FieldType[] fieldType, final int[][] rename, final DispatchingStrategy strategy) {
        // Subfactories and their count.
        this.n = documentFactory.length;
        this.documentFactory = documentFactory;

        // Field names and their count.
        this.numberOfFields = fieldName.length;
        this.fieldName = fieldName;

        // Everything else is stored verbatim.
        this.strategy = strategy;
        this.rename = rename;
        this.fieldType = fieldType;
    }

    // TODO: All IllegalArgumentException were ConfigurationException; check that now it's OK
    /** Validates the configuration currently stored in the fields of this factory.
     *
     *  <p>The following conditions are checked, in this order: all configuration arrays are present;
     *  array lengths agree with each other and with the two counters; every subfactory is non-null
     *  and its mapping row has one entry per field; every mapping entry is either -1 or a valid field
     *  index of the subfactory whose type equals the type of the field it is mapped to; there is
     *  at least one subfactory and one field; a strategy is present.
     *
     * @throws IllegalArgumentException if any of the conditions above does not hold.
     */
    private void checkAttributes() {
        // Constructors that perform no configuration leave these fields null: report that
        // explicitly rather than failing with a NullPointerException below.
        if (documentFactory == null || fieldName == null || fieldType == null || rename == null)
            throw new IllegalArgumentException(
                    "The dispatching factory is not configured: subfactories, field names, field types and field mapping are all required");
        if (fieldName.length != fieldType.length || rename.length != documentFactory.length
                || documentFactory.length != n || fieldName.length != numberOfFields)
            throw new IllegalArgumentException("Length mismatch in defining the dispatching factory");
        for (int f = 0; f < n; f++) {
            final DocumentFactory subfactory = documentFactory[f];
            if (subfactory == null)
                throw new IllegalArgumentException("Subfactory number " + f + " is null");
            final int[] mapping = rename[f];
            if (mapping == null || mapping.length != numberOfFields)
                throw new IllegalArgumentException(
                        "The number of fields (" + numberOfFields + ") does not match the mapping rule for factory "
                                + subfactory.getClass().getName());
            for (int k = 0; k < numberOfFields; k++) {
                final int mapped = mapping[k];
                // -1 is the only legal negative value (it denotes an empty field).
                if (mapped < -1 || mapped >= subfactory.numberOfFields())
                    throw new IllegalArgumentException(mapped + " is not a field of factory " + subfactory);
                if (mapped >= 0 && fieldType[k] != subfactory.fieldType(mapped))
                    throw new IllegalArgumentException("Field " + mapped + " of factory " + subfactory
                            + " has a type different from the type of the field it is mapped to");
            }
        }
        if (n == 0 || numberOfFields == 0)
            throw new IllegalArgumentException("Zero factories or fields specified");
        if (strategy == null)
            throw new IllegalArgumentException("No strategy was specified");
    }

    /** Completes a property-based configuration: instantiates the subfactories named by the
     *  {@link MetadataKeys#RULE} property, deduces the type of each field from the
     *  {@link MetadataKeys#MAP} property, and builds a {@link StringBasedDispatchingStrategy}
     *  on the key given by the {@link MetadataKeys#KEY} property.
     *
     *  <p>The required properties and the shape of the map are verified before any subfactory
     *  is instantiated.
     *
     * @param xtraPars if <code>null</code>, each subfactory is created with its parameterless constructor;
     * otherwise, each subfactory is created with the public constructor whose only parameter has
     * the runtime class of this object, and this object is passed as argument.
     * @throws IllegalArgumentException if a required property is missing, the map does not have one list
     * per factory and one item per field, a subfactory cannot be instantiated, a mapped index is not a
     * field of its subfactory, two subfactories disagree on the type of a field, or a field is never mapped.
     */
    private void setExtraArguments(final Object xtraPars) throws IllegalArgumentException {
        if (value2factoryClass == null)
            throw new IllegalArgumentException(
                    "No " + MetadataKeys.RULE + " property was specified for the dispatching factory");
        if (fieldName == null)
            throw new IllegalArgumentException(
                    "No " + MetadataKeys.FIELDNAME + " property was specified for the dispatching factory");
        if (rename == null)
            throw new IllegalArgumentException(
                    "No " + MetadataKeys.MAP + " property was specified for the dispatching factory");
        if (dispatchingKey == null)
            throw new IllegalArgumentException(
                    "No " + MetadataKeys.KEY + " property was specified for the dispatching factory");

        n = value2factoryClass.size();

        // Validate the shape of the map before indexing it.
        if (rename.length != n)
            throw new IllegalArgumentException("The " + MetadataKeys.MAP + " property contains " + rename.length
                    + " lists, but " + n + " factories were specified by the " + MetadataKeys.RULE + " property");
        for (int f = 0; f < n; f++)
            if (rename[f].length != numberOfFields)
                throw new IllegalArgumentException("List number " + f + " of the " + MetadataKeys.MAP
                        + " property contains " + rename[f].length + " items, but there are " + numberOfFields
                        + " fields");

        // One subfactory per rule, in rule order.
        documentFactory = new DocumentFactory[n];
        int next = 0;
        for (final Class<? extends DocumentFactory> documentFactoryClass : value2factoryClass.values()) {
            try {
                if (xtraPars == null)
                    documentFactory[next] = documentFactoryClass.getDeclaredConstructor().newInstance();
                else
                    documentFactory[next] = documentFactoryClass.getConstructor(xtraPars.getClass())
                            .newInstance(xtraPars);
            } catch (Exception e) {
                throw new IllegalArgumentException(
                        "Cannot instantiate document factory " + documentFactoryClass.getName(), e);
            }
            next++;
        }

        // Deduce the type of each field from the subfactory fields mapped to it.
        fieldType = new FieldType[numberOfFields];
        for (int f = 0; f < n; f++) {
            for (int k = 0; k < numberOfFields; k++) {
                final int kk = rename[f][k];
                if (kk == -1)
                    continue; // Empty field for this subfactory: it says nothing about the type.
                if (kk < -1 || kk >= documentFactory[f].numberOfFields())
                    throw new IllegalArgumentException(kk + " is not a field of factory " + documentFactory[f]);
                final FieldType subfactoryType = documentFactory[f].fieldType(kk);
                if (fieldType[k] != null && fieldType[k] != subfactoryType)
                    throw new IllegalArgumentException("Mismatch between field types for field " + fieldName[k]
                            + ", relative to the remapping of factory " + documentFactory[f].getClass().getName()
                            + " (the type used to be " + fieldType[k] + ", but now we want it to be "
                            + subfactoryType + ")");
                fieldType[k] = subfactoryType;
            }
        }
        for (int k = 0; k < numberOfFields; k++)
            if (fieldType[k] == null)
                throw new IllegalArgumentException("The type of field " + fieldName[k]
                        + " could not be deduced, because it is never mapped to");

        // Build the map from metadata values to subfactory indices.
        final Object2IntMap<String> value2int = new Object2IntOpenHashMap<String>();
        value2int.defaultReturnValue(-1);
        for (Map.Entry<String, Class<? extends DocumentFactory>> e : value2factoryClass.entrySet()) {
            // A value is associated with the first subfactory having the class named in its rule.
            int k = 0;
            while (k < n && e.getValue() != documentFactory[k].getClass())
                k++;
            if (k == n)
                throw new IllegalArgumentException(
                        "Mismatch in the rule mapping " + e.getKey() + " to " + e.getValue());
            if (e.getKey().equals(OTHERWISE_IN_RULE))
                value2int.defaultReturnValue(k);
            else
                value2int.put(e.getKey(), k);
        }
        if (DEBUG)
            System.out.println("Building a strategy mapping " + dispatchingKey + " to " + value2int);
        strategy = new StringBasedDispatchingStrategy(dispatchingKey, value2int);
    }

    /** Creates a new dispatching factory from explicitly provided components.
     * 
     * <p>The arguments are stored by reference (no copy is made) and the resulting
     * configuration is then validated by <code>checkAttributes()</code>.
     * 
     * @param documentFactory the array of subfactories.
     * @param fieldName the names of this factory's fields.
     * @param fieldType the types of this factory's fields. 
     * @param rename the way fields of this class are mapped to fields of the subfactories.
     * @param strategy the strategy to decide which factory should be used.
     * @throws IllegalArgumentException if validation detects an inconsistent configuration.
     */
    public DispatchingDocumentFactory(final DocumentFactory[] documentFactory, final String[] fieldName,
            final FieldType[] fieldType, final int[][] rename, final DispatchingStrategy strategy) {
        init(documentFactory, fieldName, fieldType, rename, strategy);
        checkAttributes();
    }

    /** Returns a new dispatching factory whose subfactories are copies of the subfactories of this factory.
     *
     *  <p>Field names, field types, the field mapping and the strategy are shared with this factory.
     *
     * @return a new dispatching factory built on copies of the subfactories.
     */
    public DispatchingDocumentFactory copy() {
        final int count = this.documentFactory.length;
        final DocumentFactory[] copies = new DocumentFactory[count];
        // Subfactories are copied from the last to the first.
        for (int i = count - 1; i >= 0; i--) {
            copies[i] = this.documentFactory[i].copy();
        }
        return new DispatchingDocumentFactory(copies, fieldName, fieldType, rename, strategy);
    }

    /** Creates a new dispatching factory configured by the given properties.
     *
     *  <p>The properties are passed to the superclass constructor; then the subfactories are
     *  instantiated (each one receiving the same properties object), the field types and the
     *  dispatching strategy are built, and the resulting configuration is validated.
     *  NOTE(review): this relies on the superclass constructor feeding each property to
     *  <code>parseProperty()</code> — confirm in <code>PropertyBasedDocumentFactory</code>.
     *
     * @param properties the properties configuring this factory and its subfactories.
     * @throws ConfigurationException declared because the superclass constructor is invoked with the properties.
     */
    public DispatchingDocumentFactory(final Properties properties) throws ConfigurationException {
        super(properties);
        setExtraArguments(properties);
        checkAttributes();
    }

    /** Creates a new dispatching factory configured by the given array of property strings.
     *
     *  <p>The array is passed to the superclass constructor; then the subfactories are
     *  instantiated (each one receiving the same array), the field types and the dispatching
     *  strategy are built, and the resulting configuration is validated.
     *  NOTE(review): the expected format of each string is defined by the superclass — confirm
     *  in <code>PropertyBasedDocumentFactory</code>.
     *
     * @param property the property strings configuring this factory and its subfactories.
     * @throws ConfigurationException declared because the superclass constructor is invoked with the array.
     */
    public DispatchingDocumentFactory(final String[] property) throws ConfigurationException {
        super(property);
        setExtraArguments(property);
        checkAttributes();
    }

    /** Creates a new dispatching factory from default metadata only.
     *
     *  <p>No subfactory, field or strategy is configured by this constructor, so the final
     *  validation is expected to fail (see the comment in the body).
     *
     * @param defaultMetadata the default metadata, passed to the superclass constructor.
     */
    public DispatchingDocumentFactory(final Reference2ObjectMap<Enum<?>, Object> defaultMetadata) {
        super(defaultMetadata);
        checkAttributes(); // Will certainly fail because the configuration is actually missing
    }

    /** Creates a new dispatching factory without any configuration.
     *
     *  <p>No subfactory, field or strategy is configured by this constructor, so the final
     *  validation is expected to fail (see the comment in the body).
     */
    public DispatchingDocumentFactory() {
        super();
        checkAttributes(); // Will certainly fail because the configuration is actually missing
    }

    /** Parses one configuration property of this factory.
     *
     *  <p>The properties named by {@link MetadataKeys} are handled here and recorded in the fields
     *  of this factory; every other property is passed to the superclass.
     *
     * @param key the name of the property.
     * @param values the values of the property.
     * @param metadata the metadata map, passed on to the superclass for properties not handled here.
     * @return true if the property was handled here; otherwise, the result of the superclass method.
     * @throws ConfigurationException if the value of a property handled here is malformed.
     * @throws IllegalArgumentException if the enum class named by the {@link MetadataKeys#KEY} property cannot be found.
     */
    @Override
    protected boolean parseProperty(final String key, final String[] values,
            final Reference2ObjectMap<Enum<?>, Object> metadata) throws ConfigurationException {
        if (sameKey(MetadataKeys.FIELDNAME, key)) {
            fieldName = values;
            numberOfFields = fieldName.length;
            return true;
        }
        if (sameKey(MetadataKeys.KEY, key)) {
            dispatchingKey = parseDispatchingKey(ensureJustOne(key, values));
            return true;
        }
        if (sameKey(MetadataKeys.RULE, key)) {
            value2factoryClass = parseDispatchingRules(values);
            return true;
        }
        if (sameKey(MetadataKeys.MAP, key)) {
            rename = parseFieldMap(values);
            // Like the other properties handled here, the map must not reach the superclass.
            return true;
        }
        return super.parseProperty(key, values, metadata);
    }

    /** Turns a string of the form <code>EnumClassName.CONSTANT</code> into the corresponding enum constant. */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    private static Enum<?> parseDispatchingKey(final String dispatchingKeyName) throws ConfigurationException {
        final int lastDot = dispatchingKeyName.lastIndexOf('.');
        // Without a dot there is no class name to load.
        if (lastDot < 0)
            throw new ConfigurationException("The key " + dispatchingKeyName
                    + " is not the name of an enum class followed by a dot and by the name of a constant");
        try {
            return Enum.valueOf((Class<Enum>) Class.forName(dispatchingKeyName.substring(0, lastDot)),
                    dispatchingKeyName.substring(lastDot + 1));
        } catch (ClassNotFoundException e) {
            throw new IllegalArgumentException(
                    "The class specified in the key " + dispatchingKeyName + " cannot be found", e);
        }
    }

    /** Turns rules of the form <code>value:factoryClassName</code> into an insertion-ordered map from values to factory classes. */
    private static Object2ObjectLinkedOpenHashMap<String, Class<? extends DocumentFactory>> parseDispatchingRules(
            final String[] rules) throws ConfigurationException {
        final Object2ObjectLinkedOpenHashMap<String, Class<? extends DocumentFactory>> result = new Object2ObjectLinkedOpenHashMap<String, Class<? extends DocumentFactory>>();
        for (final String rule : rules) {
            final int pos = rule.indexOf(':');
            // Both the value and the factory name must be nonempty.
            if (pos <= 0 || pos == rule.length() - 1)
                throw new ConfigurationException("Rule " + rule + " does not contain a colon or it is malformed");
            if (rule.indexOf(':', pos + 1) >= 0)
                throw new ConfigurationException("Rule " + rule + " contains too many colons");
            final String factoryName = rule.substring(pos + 1);
            final Class<?> candidate;
            try {
                candidate = Class.forName(factoryName);
            } catch (ClassNotFoundException e) {
                throw new ConfigurationException(
                        "ParsingFactory " + factoryName + " is invalid; maybe the package name is missing", e);
            }
            if (!DocumentFactory.class.isAssignableFrom(candidate))
                throw new ConfigurationException(
                        "ParsingFactory " + factoryName + " is invalid; it is not a document factory");
            result.put(rule.substring(0, pos), candidate.asSubclass(DocumentFactory.class));
        }
        return result;
    }

    /** Turns lists of colon-separated integers (one list per factory) into the field-mapping matrix; all lists must have the same length. */
    private static int[][] parseFieldMap(final String[] values) throws ConfigurationException {
        final int[][] map = new int[values.length][];
        for (int i = 0; i < values.length; i++) {
            final String[] items = values[i].split(":");
            if (i > 0 && items.length != map[0].length)
                throw new ConfigurationException("Length mismatch in the map " + Arrays.toString(values));
            map[i] = new int[items.length];
            for (int k = 0; k < items.length; k++) {
                try {
                    map[i][k] = Integer.parseInt(items[k]);
                } catch (NumberFormatException e) {
                    throw new ConfigurationException(
                            "Number format exception in the map " + Arrays.toString(values), e);
                }
            }
        }
        return map;
    }

    /** Returns the number of fields of this factory.
     *
     * @return the number of fields of this factory.
     */
    public int numberOfFields() {
        return numberOfFields;
    }

    /** Returns the name of the field with the given index.
     *
     * @param field the index of a field of this factory.
     * @return the name of the field.
     */
    public String fieldName(final int field) {
        // Inherited check; presumably rejects indices outside the valid range — confirm in the superclass.
        ensureFieldIndex(field);
        return fieldName[field];
    }

    /** Returns the index of the first field of this factory with the given name.
     *
     * @param fieldName the name of a field.
     * @return the index of the first field whose name is equal to <code>fieldName</code>, or -1 if there is no such field.
     */
    public int fieldIndex(final String fieldName) {
        int index = 0;
        while (index < numberOfFields) {
            if (this.fieldName[index].equals(fieldName)) {
                return index;
            }
            index++;
        }
        return -1;
    }

    /** Returns the type of the field with the given index.
     *
     * @param field the index of a field of this factory.
     * @return the type of the field.
     */
    public FieldType fieldType(final int field) {
        // Inherited check; presumably rejects indices outside the valid range — confirm in the superclass.
        ensureFieldIndex(field);
        return fieldType[field];
    }

    /** A word reader that is returned when a null field should be returned. */
    final private WordReader nullReader = new FastBufferedReader();

    /** Builds a document by delegating to the subfactory selected by the dispatching strategy.
     *
     *  <p>The returned document exposes the fields of this factory: accesses to field <var>k</var> are
     *  redirected to the field of the underlying document specified by the field mapping, and fields
     *  mapped to a negative index are returned as empty readers. Title, URI and string representation
     *  are those of the underlying document, which is closed when the returned document is closed.
     *
     * @param rawContent the raw content of the document, passed to the selected subfactory.
     * @param metadata the metadata of the document, used by the strategy and passed to the selected subfactory.
     * @return a document wrapping the one built by the selected subfactory.
     * @throws IOException if the selected subfactory throws it.
     * @throws IllegalArgumentException if the strategy returns an index that is not the index of a subfactory.
     */
    public Document getDocument(final InputStream rawContent, final Reference2ObjectMap<Enum<?>, Object> metadata)
            throws IOException {

        final int factoryIndex = strategy.factoryNumber(metadata, this);
        if (DEBUG)
            System.out.println("The strategy returned " + factoryIndex);
        if (factoryIndex < 0 || factoryIndex >= n)
            throw new IllegalArgumentException("No subfactory can handle a document with metadata " + metadata
                    + ": the strategy returned index " + factoryIndex + ", but there are " + n + " subfactories");

        final DocumentFactory factory = documentFactory[factoryIndex];
        if (DEBUG)
            System.out.println(
                    "Going to parse a document with " + metadata + ", using " + factory.getClass().getName());

        final Document document = factory.getDocument(rawContent, metadata);
        // The row of the field mapping relative to the selected subfactory.
        final int[] fieldMap = rename[factoryIndex];

        return new AbstractDocument() {
            public CharSequence title() {
                return document.title();
            }

            @Override
            public String toString() {
                return document.toString();
            }

            public CharSequence uri() {
                return document.uri();
            }

            public Object content(final int field) throws IOException {
                ensureFieldIndex(field);
                final int mapped = fieldMap[field];
                if (mapped < 0)
                    return NullReader.getInstance();
                return document.content(mapped);
            }

            public WordReader wordReader(final int field) {
                ensureFieldIndex(field);
                final int mapped = fieldMap[field];
                if (mapped < 0)
                    return nullReader;
                return document.wordReader(mapped);
            }

            @Override
            public void close() throws IOException {
                super.close();
                document.close();
            }
        };

    }

    /** Sample driver: configures a dispatching factory through properties, builds a
     *  {@link FileSetDocumentCollection} on the command-line arguments (presumably file names — confirm
     *  against <code>FileSetDocumentCollection</code>) and serializes it to the file <code>test.collection</code>.
     *
     *  <p>The configuration dispatches on the MIME type: PDF documents go to a <code>PdfDocumentFactory</code>,
     *  HTML documents to an <code>HtmlDocumentFactory</code>, and everything else to an
     *  <code>IdentityDocumentFactory</code>. The same setup could be obtained programmatically by passing
     *  the three subfactories and a {@link StringBasedDispatchingStrategy} to the five-argument constructor.
     *
     * @param arg the arguments passed to the document collection.
     * @throws IOException if the collection cannot be stored.
     * @throws ConfigurationException if the factory cannot be configured.
     */
    public static void main(final String[] arg) throws IOException, ConfigurationException {
        final Properties p = new Properties();
        p.addProperty(MetadataKeys.FIELDNAME.name().toLowerCase(Locale.ROOT), "text,title");
        // The key must include the name of the enum class, followed by a dot and by the constant name.
        p.addProperty(MetadataKeys.KEY.name().toLowerCase(Locale.ROOT),
                PropertyBasedDocumentFactory.MetadataKeys.class.getName() + "."
                        + PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE.name());
        p.addProperty(MetadataKeys.RULE.name().toLowerCase(Locale.ROOT),
                "application/pdf:it.unimi.di.big.mg4j.document.PdfDocumentFactory,text/html:it.unimi.di.big.mg4j.document.HtmlDocumentFactory,?:it.unimi.di.big.mg4j.document.IdentityDocumentFactory");
        // One list per factory, one item per field; the property must be added just once,
        // as repeated additions would contribute further lists.
        p.addProperty(MetadataKeys.MAP.name().toLowerCase(Locale.ROOT), "0:-1,0:1,0:-1");
        p.addProperty(PropertyBasedDocumentFactory.MetadataKeys.ENCODING.name().toLowerCase(Locale.ROOT),
                "iso-8859-1");

        final DispatchingDocumentFactory factory = new DispatchingDocumentFactory(p);
        final DocumentCollection dc = new FileSetDocumentCollection(arg, factory);
        BinIO.storeObject(dc, "test.collection");
    }
}