org.eclipse.smila.connectivity.framework.crawler.web.metadata.Metadata.java Source code

Java tutorial

Introduction

Here is the source code for org.eclipse.smila.connectivity.framework.crawler.web.metadata.Metadata.java

Source

/***********************************************************************************************************************
 * Copyright (c) 2008 empolis GmbH and brox IT Solutions GmbH. All rights reserved. This program and the accompanying
 * materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this distribution,
 * and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Dmitry Hazin (brox IT Solutions GmbH) - initial creator Sebastian Voigt (brox IT Solutions GmbH)
 *
 * This file is based on MetaData.java from Nutch 0.8.1 (see the license below).
 * The original file was modified by the SMILA team.
 **********************************************************************************************************************/
/**
 * Copyright 2005 The Apache Software Foundation
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
package org.eclipse.smila.connectivity.framework.crawler.web.metadata;

import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.lang.StringUtils;

/**
 * A syntax tolerant and multi-valued meta data container.
 * 
 * All the public static final String fields visible on this class (including those inherited from the implemented
 * interfaces) are used as reference names for syntax correction on metadata naming.
 * 
 * Instances are backed by a plain {@link HashMap} without any synchronization, so an instance must not be modified
 * concurrently from several threads without external locking.
 * 
 * @author Chris Mattmann
 * @author Jérôme Charron
 * @author Dmitry Hazin (brox IT Solutions GmbH) - updates
 * @author Sebastian Voigt (brox IT Solutions GmbH) - updates
 */
public class Metadata implements HttpHeaders, Crawler {

    /**
     * Used to format DC dates for the DATE meta data field.
     * 
     * Note: {@link SimpleDateFormat} is not synchronized; callers sharing this instance between threads must
     * synchronize externally.
     */
    public static final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd");

    /** Divisor applied to the length of a normalized name to obtain the maximum tolerated edit distance. */
    public static final int THRESHOLD_FACTOR = 3;

    /** Index from normalized (lower-case, letters only) name to the well-known reference name. */
    private static Map<String, String> s_namesIdx = new HashMap<String, String>();

    /** All normalized well-known names, i.e. the keys of the names index. */
    private static String[] s_normalized;

    // Uses self introspection to fill the meta names index and the
    // meta names list.
    static {
        for (final Field field : Metadata.class.getFields()) {
            final int mods = field.getModifiers();
            if (Modifier.isFinal(mods) && Modifier.isPublic(mods) && Modifier.isStatic(mods)
                    && field.getType().equals(String.class)) {
                try {
                    final String val = (String) field.get(null);
                    s_namesIdx.put(normalize(val), val);
                } catch (final Exception ignored) {
                    // Best effort: a constant that cannot be read or normalized is simply not used as a
                    // reference name; class initialization must not fail because of it.
                    ;
                }
            }
        }
        s_normalized = s_namesIdx.keySet().toArray(new String[s_namesIdx.size()]);
    }

    /** A map of all meta data attributes; a value is either a single String or a List of Strings. */
    private final Map<String, Object> _metadata;

    /** Constructs a new, empty meta data. */
    public Metadata() {
        _metadata = new HashMap<String, Object>();
    }

    /**
     * Tells whether more than one value is associated to the given name.
     * 
     * @param name
     *          Name of the meta
     * @return true if the (normalized) name has more than one value
     */
    public boolean isMultiValued(final String name) {
        return getValues(name).length > 1;
    }

    /**
     * Returns an array of the names contained in the meta data.
     * 
     * @return String[] the names, in the iteration order of the underlying map
     */
    public String[] names() {
        final List<String> names = new ArrayList<String>(_metadata.size());
        for (final String key : _metadata.keySet()) {
            names.add(getNormalizedName(key));
        }
        return names.toArray(new String[names.size()]);
    }

    /**
     * Get the value associated to a _metadata name. If many values are associated to the specified name, then the first
     * one is returned.
     * 
     * @param name
     *          of the meta data.
     * @return the value associated to the specified meta data name, or null if there is none.
     */
    public String get(final String name) {
        final Object values = _metadata.get(getNormalizedName(name));
        // instanceof is false for null, so no separate null check is needed.
        if (values instanceof List) {
            return (String) ((List<?>) values).get(0);
        }
        return (String) values;
    }

    /**
     * Get the values associated to a meta data name.
     * 
     * @param name
     *          of the meta data.
     * @return the values associated to a meta data name; an empty array if there are none.
     */
    public String[] getValues(final String name) {
        final Object values = _metadata.get(getNormalizedName(name));
        if (values == null) {
            return new String[0];
        }
        if (values instanceof List) {
            final List<?> list = (List<?>) values;
            return list.toArray(new String[list.size()]);
        }
        return new String[] { (String) values };
    }

    /**
     * Add a meta data name/value mapping. Add the specified value to the list of values associated to the specified meta
     * data name.
     * 
     * @param name
     *          the meta data name.
     * @param value
     *          the meta data value.
     */
    public void add(final String name, final String value) {
        final String normalized = getNormalizedName(name);
        final Object values = _metadata.get(normalized);
        if (values == null) {
            // First value for this name: stored as a plain String.
            _metadata.put(normalized, value);
        } else if (values instanceof String) {
            // Second value: promote the single String to a list.
            final List<String> promoted = new ArrayList<String>();
            promoted.add((String) values);
            promoted.add(value);
            _metadata.put(normalized, promoted);
        } else if (values instanceof List) {
            // Safe: this class only ever stores List<String> instances in the map.
            @SuppressWarnings("unchecked")
            final List<String> existing = (List<String>) values;
            existing.add(value);
        }
    }

    /**
     * Assigns a meta data names/values mapping from the given properties.
     * 
     * @param properties
     *          set of properties representing name/value mapping.
     */
    public void setAll(final Properties properties) {
        final Enumeration<?> names = properties.propertyNames();
        while (names.hasMoreElements()) {
            final String name = (String) names.nextElement();
            set(name, properties.getProperty(name));
        }
    }

    /**
     * Set _metadata name/value. Associate the specified value to the specified _metadata name. If some previous values
     * were associated to this name, they are removed.
     * 
     * @param name
     *          the _metadata name.
     * @param value
     *          the _metadata value.
     */
    public void set(final String name, final String value) {
        remove(name);
        add(name, value);
    }

    /**
     * Remove a meta data and all its associated values.
     * 
     * @param name
     *          Name of the meta data element.
     */
    public void remove(final String name) {
        _metadata.remove(getNormalizedName(name));
    }

    /**
     * Returns the number of meta data names in this meta data.
     * 
     * @return size
     */
    public int size() {
        return _metadata.size();
    }

    /**
     * Two meta data objects are equal when they contain the same names and, for every name, the same values in the
     * same order.
     * 
     * {@inheritDoc}
     */
    @Override
    public boolean equals(final Object o) {
        if (o == this) {
            return true;
        }
        if (!(o instanceof Metadata)) {
            return false;
        }
        final Metadata other = (Metadata) o;
        if (other.size() != size()) {
            return false;
        }
        for (final String name : _metadata.keySet()) {
            // With equal sizes, requiring every name of this instance in the other one makes the name sets
            // identical, which keeps equals symmetric even when a name is mapped to null.
            if (!other._metadata.containsKey(name)) {
                return false;
            }
            // Arrays.equals is null-safe for individual values.
            if (!Arrays.equals(getValues(name), other.getValues(name))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Computes a hash from the names and their values that is independent of the iteration order of the underlying
     * map, so that equal instances always produce equal hash codes.
     * 
     * {@inheritDoc}
     */
    @Override
    public int hashCode() {
        int hash = 0;
        for (final String name : _metadata.keySet()) {
            // Summation is commutative, so the map's iteration order does not influence the result.
            hash += name.hashCode() ^ Arrays.hashCode(getValues(name));
        }
        return hash;
    }

    /**
     * Renders every name/value pair as "name=value " (one entry per value).
     * 
     * {@inheritDoc}
     */
    @Override
    public String toString() {
        final StringBuilder buf = new StringBuilder();
        for (final String name : names()) {
            for (final String value : getValues(name)) {
                buf.append(name).append("=").append(value).append(" ");
            }
        }
        return buf.toString();
    }

    /**
     * Returns ArrayList representation of the meta data for further indexing. Every value yields one "name:value"
     * entry.
     * 
     * @return ArrayList
     */
    public List<String> toArrayList() {
        final List<String> metadataArray = new ArrayList<String>();
        for (final String name : names()) {
            for (final String value : getValues(name)) {
                metadataArray.add(name + ":" + value);
            }
        }
        return metadataArray;
    }

    /**
     * Get the normalized name of meta data attribute name. This method tries to find a well-known meta data name (one of
     * the meta data names defined in this class) that matches the specified name. The matching is error tolerant. For
     * instance,
     * <ul>
     * <li>content-type gives Content-Type</li>
     * <li>CoNtEntType gives Content-Type</li>
     * <li>ConTnTtYpe gives Content-Type</li>
     * </ul>
     * If no matching with a well-known meta data name is found, then the original name is returned.
     * 
     * When there is no exact match, the first well-known name whose Levenshtein distance to the normalized input is
     * below {@code length / THRESHOLD_FACTOR} is used; this is the first match found, not necessarily the closest one.
     * 
     * @param name
     *          Meta data attribute name; must not be null.
     * @return String
     */
    public static String getNormalizedName(final String name) {
        final String searched = normalize(name);
        String value = s_namesIdx.get(searched);

        if ((value == null) && (s_normalized != null)) {
            final int threshold = searched.length() / THRESHOLD_FACTOR;
            for (int i = 0; i < s_normalized.length && value == null; i++) {
                if (StringUtils.getLevenshteinDistance(searched, s_normalized[i]) < threshold) {
                    value = s_namesIdx.get(s_normalized[i]);
                }
            }
        }
        return (value != null) ? value : name;
    }

    /**
     * Normalizes a name by dropping every non-letter character and lower-casing the remaining ones.
     * 
     * @param str
     *          the string to normalize
     * 
     * @return the string
     */
    private static String normalize(final String str) {
        final StringBuilder buf = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            final char c = str.charAt(i);
            if (Character.isLetter(c)) {
                buf.append(Character.toLowerCase(c));
            }
        }
        return buf.toString();
    }

}