com.yahoo.glimmer.indexing.RDFDocumentFactory.java Source code

Java tutorial

Introduction

Here is the source code for com.yahoo.glimmer.indexing.RDFDocumentFactory.java

Source

package com.yahoo.glimmer.indexing;

/*
 * Copyright (c) 2012 Yahoo! Inc. All rights reserved.
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 *  Unless required by applicable law or agreed to in writing, software distributed under the License is 
 *  distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and limitations under the License.
 *  See accompanying LICENSE file.
 */

import it.unimi.di.big.mg4j.document.DocumentFactory.FieldType;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.AbstractObject2LongFunction;

import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.semanticweb.owlapi.model.IRI;
import org.semanticweb.owlapi.model.OWLClass;
import org.semanticweb.owlapi.model.OWLClassExpression;
import org.semanticweb.owlapi.model.OWLOntology;

import com.yahoo.glimmer.util.Util;

/* Common superclass to HorizontalDocumentFactory and VerticalDocumentFactory.
 * 
 */
public abstract class RDFDocumentFactory {
    private static final Log LOG = LogFactory.getLog(RDFDocumentFactory.class);

    private static final String CONF_FIELDNAMES_KEY = "RdfFieldNames";
    private static final String CONF_INDEX_TYPE_KEY = "IndexType";
    private static final String CONF_WITH_CONTEXTS_KEY = "WithContexts";
    private static final String CONF_RESOURCES_HASH_KEY = "ResourcesFilename";
    private static final String CONF_RESOURCE_ID_PREFIX_KEY = "resourceIdPrefix";

    private static final Collection<String> PREDICATE_BLACKLIST = Arrays.asList("stag", "tagspace", "ctag", "rel",
            "mm");

    private String[] fields;
    private AbstractObject2LongFunction<CharSequence> resourcesHashFunction;
    private OWLOntology ontology;
    private String resourceIdPrefix = "";

    // TODO How to read these?
    private Counters counters = new Counters();

    // Include NQuad contexts in processing.
    private boolean withContexts;

    public static enum IndexType {
        VERTICAL(VerticalDocumentFactory.class), HORIZONTAL(HorizontalDocumentFactory.class), UNDEFINED(null);

        private final Class<?> factoryClass;

        private IndexType(Class<?> factoryClass) {
            this.factoryClass = factoryClass;
        }

        public Class<?> getFactoryClass() {
            return factoryClass;
        }
    }

    public abstract RDFDocument getDocument();

    protected static void setupConf(Configuration conf, IndexType type, boolean withContexts, String resourcesHash,
            String resourceIdPrefix, String... fields) {
        conf.setEnum(CONF_INDEX_TYPE_KEY, type);
        conf.setBoolean(CONF_WITH_CONTEXTS_KEY, withContexts);
        if (resourcesHash != null) {
            conf.set(CONF_RESOURCES_HASH_KEY, resourcesHash);
        }
        conf.set(CONF_RESOURCE_ID_PREFIX_KEY, resourceIdPrefix);
        conf.setStrings(CONF_FIELDNAMES_KEY, fields);
    }

    public static String[] getFieldsFromConf(Configuration conf) {
        String[] fields = conf.getStrings(CONF_FIELDNAMES_KEY);
        if (fields == null) {
            throw new IllegalStateException("Fields not set set in the config.");
        }
        return fields;
    }

    public static IndexType getIndexType(Configuration conf) {
        IndexType indexType = conf.getEnum(CONF_INDEX_TYPE_KEY, IndexType.UNDEFINED);
        if (indexType == IndexType.UNDEFINED) {
            throw new IllegalStateException("Index type not set in config.");
        }
        return indexType;
    }

    public static boolean getWithContexts(Configuration conf) {
        return conf.getBoolean(CONF_WITH_CONTEXTS_KEY, true);
    }

    public static String getHashValuePrefix(Configuration conf) {
        return conf.get(CONF_RESOURCE_ID_PREFIX_KEY, "");
    }

    public static RDFDocumentFactory buildFactory(Configuration conf) throws IOException {
        IndexType indexType = getIndexType(conf);

        RDFDocumentFactory factory;
        try {
            Constructor<?> factoryConstructor = indexType.factoryClass.getConstructor();
            factory = ((RDFDocumentFactory) factoryConstructor.newInstance());
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        factory.setFields(getFieldsFromConf(conf));
        factory.setWithContexts(getWithContexts(conf));
        factory.setResourceIdPrefix(getHashValuePrefix(conf));
        String resourcesHashFilename = conf.get(CONF_RESOURCES_HASH_KEY);
        if (resourcesHashFilename != null) {
            // Load the hash func.
            Path resourcesHashPath = new Path(resourcesHashFilename);
            FileSystem fs = FileSystem.get(conf);
            InputStream resourcesHashInputStream = fs.open(resourcesHashPath);
            try {
                @SuppressWarnings("unchecked")
                AbstractObject2LongFunction<CharSequence> hash = (AbstractObject2LongFunction<CharSequence>) BinIO
                        .loadObject(resourcesHashInputStream);
                factory.setResourcesHashFunction(hash);
                LOG.info("Loaded resource hash from " + resourcesHashFilename + " with " + hash.size()
                        + " entires.");
            } catch (Exception e) {
                throw new RuntimeException(e);
            } finally {
                resourcesHashInputStream.close();
            }
        } else {
            LOG.info("No resource hash filename set in conf.  No hash has been loaded.");
        }

        OWLOntology ontology = OntologyLoader.load(conf);
        if (ontology != null) {
            LOG.info("Loaded ontology with " + ontology.getAxiomCount() + " axioms from distrubuted cache.");
            factory.setOntology(ontology);
        } else {
            LOG.info("No ontology file found in distrubuted cache.");
        }

        return factory;
    }

    public void setResourcesHashFunction(AbstractObject2LongFunction<CharSequence> resourcesHashFunction) {
        this.resourcesHashFunction = resourcesHashFunction;
    }

    public void setOntology(OWLOntology ontology) {
        this.ontology = ontology;
    }

    public String getResourceIdPrefix() {
        return resourceIdPrefix;
    }

    public void setResourceIdPrefix(String resourceIdPrefix) {
        this.resourceIdPrefix = resourceIdPrefix;
    }

    /**
     * @param url
     * @return The hash value for the given URL/BNode or null. The exact behavior depends on the implementation of the hash function used.
     *  
     */
    public Long lookupResource(String key) {
        Long value = resourcesHashFunction.get(key);
        if (value != null && value < 0) {
            // throw new RuntimeException("Negative hash value for " + key);
            throw new ResourceHashLookupException("Resource lookup resulted in a negative value.", key, value);
        }
        return value;
    }

    public String lookupResource(String key, boolean prefixed) {
        Long value = lookupResource(key);
        if (value != null) {
            if (prefixed) {
                return resourceIdPrefix + value.toString();
            } else {
                return value.toString();
            }
        }
        return null;
    }

    public boolean isWithContexts() {
        return withContexts;
    }

    public void setWithContexts(Boolean withContexts) {
        this.withContexts = withContexts;
    }

    public static boolean isOnPredicateBlacklist(final String predicate) {
        return PREDICATE_BLACKLIST.contains(predicate);
    }

    public void setFields(String[] fields) {
        this.fields = fields;
    }

    public int getFieldCount() {
        ensureFieldIndex(0);
        return fields.length;
    }

    public String getFieldName(final int index) {
        ensureFieldIndex(index);
        return fields[index];
    }

    public int getFieldIndex(final String fieldName) {
        ensureFieldIndex(0);
        for (int i = 0; i < fields.length; i++) {
            if (fields[i].equals(fieldName)) {
                return i;
            }
        }
        return -1;
    }

    public FieldType getFieldType(final int index) {
        ensureFieldIndex(index);
        return FieldType.TEXT;
    }

    public void ensureFieldIndex(final int index) {
        if (fields == null) {
            throw new IllegalStateException("Fields not loaded.");
        }
        if (index < 0 || index >= fields.length) {
            throw new IndexOutOfBoundsException(
                    "For field index " + index + ". There are only " + fields.length + " fields.");
        }
    }

    /**
     * Get all the ancestors of the give class.
     * @param className
     * @return
     */
    public Collection<String> getAncestors(String className) {
        OWLClass owlClass = null;
        // Remove version if the class name contains a version
        // number
        if (ontology.containsClassInSignature(IRI.create(className))) {
            owlClass = ontology.getOWLOntologyManager().getOWLDataFactory().getOWLClass(IRI.create(className));
        } else {
            owlClass = ontology.getOWLOntologyManager().getOWLDataFactory()
                    .getOWLClass(IRI.create(Util.removeVersion(className)));
        }
        if (owlClass == null) {
            return Collections.emptySet();
        }

        Set<OWLClassExpression> superClasses = new HashSet<OWLClassExpression>();
        expandOntologyTypesR(superClasses, owlClass);

        if (superClasses.isEmpty()) {
            return Collections.emptySet();
        }

        ArrayList<String> classAndAncestors = new ArrayList<String>(1 + superClasses.size());
        for (OWLClassExpression superClass : superClasses) {
            classAndAncestors.add(((OWLClass) superClass).getIRI().toString());
        }
        return classAndAncestors;
    }

    private void expandOntologyTypesR(Set<OWLClassExpression> allSuperClasses, OWLClass owlClass) {
        for (OWLClassExpression owlClassExpression : owlClass.getSuperClasses(ontology)) {
            if (owlClassExpression instanceof OWLClass && allSuperClasses.add(owlClassExpression)) {
                expandOntologyTypesR(allSuperClasses, (OWLClass) owlClassExpression);
            }
        }
    }

    public static enum RdfCounters {
        EMPTY_LINES, EMPTY_DOCUMENTS, BLACKLISTED_TRIPLES, UNINDEXED_PREDICATE_TRIPLES, RDF_TYPE_TRIPLES, INDEXED_TRIPLES, PARSE_ERROR, ONTOLOGY_SUPER_TYPE_NOT_IN_HASH, PREDICATES_NOT_IN_HASH, CONTEXT_NOT_IN_HASH, OBJECT_NOT_IN_HASH, ANCESTOR_OBJECT_NOT_IN_HASH
    }

    public void incrementCounter(RdfCounters counter, int by) {
        counters.findCounter(counter).increment(by);
    }

    public Counter getCounter(RdfCounters counter) {
        return counters.findCounter(counter);
    }

    public String getInputStreamEncodeing() {
        return "UTF-8";
    }

    public static class ResourceHashLookupException extends RuntimeException {
        private static final long serialVersionUID = -7483398161700656105L;
        private final String key;
        private final Long value;

        public ResourceHashLookupException(String message, String key, Long value) {
            super(message);
            this.key = key;
            this.value = value;
        }

        public String getKey() {
            return key;
        }

        public Long getValue() {
            return value;
        }
    }
}