Java tutorial

The listing below is RDFDocumentFactory from the com.yahoo.glimmer.indexing package, the common superclass of HorizontalDocumentFactory and VerticalDocumentFactory. It reads its settings from a Hadoop Configuration, optionally loads a resources hash function and an OWL ontology, and provides the field, resource-lookup and counter helpers used while indexing RDF documents.
package com.yahoo.glimmer.indexing;

/*
 * Copyright (c) 2012 Yahoo! Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 * See accompanying LICENSE file.
 */

import it.unimi.di.big.mg4j.document.DocumentFactory.FieldType;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.AbstractObject2LongFunction;

import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.semanticweb.owlapi.model.IRI;
import org.semanticweb.owlapi.model.OWLClass;
import org.semanticweb.owlapi.model.OWLClassExpression;
import org.semanticweb.owlapi.model.OWLOntology;

import com.yahoo.glimmer.util.Util;

/**
 * Common superclass to HorizontalDocumentFactory and VerticalDocumentFactory.
 */
public abstract class RDFDocumentFactory {
    private static final Log LOG = LogFactory.getLog(RDFDocumentFactory.class);

    private static final String CONF_FIELDNAMES_KEY = "RdfFieldNames";
    private static final String CONF_INDEX_TYPE_KEY = "IndexType";
    private static final String CONF_WITH_CONTEXTS_KEY = "WithContexts";
    private static final String CONF_RESOURCES_HASH_KEY = "ResourcesFilename";
    private static final String CONF_RESOURCE_ID_PREFIX_KEY = "resourceIdPrefix";

    private static final Collection<String> PREDICATE_BLACKLIST = Arrays.asList("stag", "tagspace", "ctag", "rel", "mm");

    private String[] fields;
    private AbstractObject2LongFunction<CharSequence> resourcesHashFunction;
    private OWLOntology ontology;
    private String resourceIdPrefix = "";

    // TODO How to read these?
    private Counters counters = new Counters();

    // Include NQuad contexts in processing.
    private boolean withContexts;

    public static enum IndexType {
        VERTICAL(VerticalDocumentFactory.class), HORIZONTAL(HorizontalDocumentFactory.class), UNDEFINED(null);

        private final Class<?> factoryClass;

        private IndexType(Class<?> factoryClass) {
            this.factoryClass = factoryClass;
        }

        public Class<?> getFactoryClass() {
            return factoryClass;
        }
    }

    public abstract RDFDocument getDocument();

    // Writes the factory settings into the job Configuration so that buildFactory() can
    // reconstruct an equivalent factory on the task side.
    protected static void setupConf(Configuration conf, IndexType type, boolean withContexts, String resourcesHash, String resourceIdPrefix, String... fields) {
        conf.setEnum(CONF_INDEX_TYPE_KEY, type);
        conf.setBoolean(CONF_WITH_CONTEXTS_KEY, withContexts);
        if (resourcesHash != null) {
            conf.set(CONF_RESOURCES_HASH_KEY, resourcesHash);
        }
        conf.set(CONF_RESOURCE_ID_PREFIX_KEY, resourceIdPrefix);
        conf.setStrings(CONF_FIELDNAMES_KEY, fields);
    }

    public static String[] getFieldsFromConf(Configuration conf) {
        String[] fields = conf.getStrings(CONF_FIELDNAMES_KEY);
        if (fields == null) {
            throw new IllegalStateException("Fields not set in the config.");
        }
        return fields;
    }

    public static IndexType getIndexType(Configuration conf) {
        IndexType indexType = conf.getEnum(CONF_INDEX_TYPE_KEY, IndexType.UNDEFINED);
        if (indexType == IndexType.UNDEFINED) {
            throw new IllegalStateException("Index type not set in config.");
        }
        return indexType;
    }

    public static boolean getWithContexts(Configuration conf) {
        return conf.getBoolean(CONF_WITH_CONTEXTS_KEY, true);
    }

    public static String getHashValuePrefix(Configuration conf) {
        return conf.get(CONF_RESOURCE_ID_PREFIX_KEY, "");
    }

    // Instantiates the concrete factory (vertical or horizontal) named in the Configuration,
    // then initializes it from the same Configuration, optionally loading the resources hash
    // function and the OWL ontology.
    public static RDFDocumentFactory buildFactory(Configuration conf) throws IOException {
        IndexType indexType = getIndexType(conf);

        RDFDocumentFactory factory;
        try {
            Constructor<?> factoryConstructor = indexType.factoryClass.getConstructor();
            factory = ((RDFDocumentFactory) factoryConstructor.newInstance());
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        factory.setFields(getFieldsFromConf(conf));
        factory.setWithContexts(getWithContexts(conf));
        factory.setResourceIdPrefix(getHashValuePrefix(conf));

        String resourcesHashFilename = conf.get(CONF_RESOURCES_HASH_KEY);
        if (resourcesHashFilename != null) {
            // Load the hash func.
            Path resourcesHashPath = new Path(resourcesHashFilename);
            FileSystem fs = FileSystem.get(conf);
            InputStream resourcesHashInputStream = fs.open(resourcesHashPath);
            try {
                @SuppressWarnings("unchecked")
                AbstractObject2LongFunction<CharSequence> hash = (AbstractObject2LongFunction<CharSequence>) BinIO.loadObject(resourcesHashInputStream);
                factory.setResourcesHashFunction(hash);
                LOG.info("Loaded resource hash from " + resourcesHashFilename + " with " + hash.size() + " entries.");
            } catch (Exception e) {
                throw new RuntimeException(e);
            } finally {
                resourcesHashInputStream.close();
            }
        } else {
            LOG.info("No resource hash filename set in conf. No hash has been loaded.");
        }

        OWLOntology ontology = OntologyLoader.load(conf);
        if (ontology != null) {
            LOG.info("Loaded ontology with " + ontology.getAxiomCount() + " axioms from distributed cache.");
            factory.setOntology(ontology);
        } else {
            LOG.info("No ontology file found in distributed cache.");
        }

        return factory;
    }

    public void setResourcesHashFunction(AbstractObject2LongFunction<CharSequence> resourcesHashFunction) {
        this.resourcesHashFunction = resourcesHashFunction;
    }

    public void setOntology(OWLOntology ontology) {
        this.ontology = ontology;
    }

    public String getResourceIdPrefix() {
        return resourceIdPrefix;
    }

    public void setResourceIdPrefix(String resourceIdPrefix) {
        this.resourceIdPrefix = resourceIdPrefix;
    }

    /**
     * @param key the URL/BNode to look up.
     * @return The hash value for the given URL/BNode or null. The exact behavior depends on the implementation of the hash function used.
     */
    public Long lookupResource(String key) {
        Long value = resourcesHashFunction.get(key);
        if (value != null && value < 0) {
            // throw new RuntimeException("Negative hash value for " + key);
            throw new ResourceHashLookupException("Resource lookup resulted in a negative value.", key, value);
        }
        return value;
    }

    public String lookupResource(String key, boolean prefixed) {
        Long value = lookupResource(key);
        if (value != null) {
            if (prefixed) {
                return resourceIdPrefix + value.toString();
            } else {
                return value.toString();
            }
        }
        return null;
    }

    public boolean isWithContexts() {
        return withContexts;
    }

    public void setWithContexts(Boolean withContexts) {
        this.withContexts = withContexts;
    }

    public static boolean isOnPredicateBlacklist(final String predicate) {
        return PREDICATE_BLACKLIST.contains(predicate);
    }

    public void setFields(String[] fields) {
        this.fields = fields;
    }

    public int getFieldCount() {
        ensureFieldIndex(0);
        return fields.length;
    }

    public String getFieldName(final int index) {
        ensureFieldIndex(index);
        return fields[index];
    }

    public int getFieldIndex(final String fieldName) {
        ensureFieldIndex(0);
        for (int i = 0; i < fields.length; i++) {
            if (fields[i].equals(fieldName)) {
                return i;
            }
        }
        return -1;
    }

    public FieldType getFieldType(final int index) {
        ensureFieldIndex(index);
        return FieldType.TEXT;
    }

    public void ensureFieldIndex(final int index) {
        if (fields == null) {
            throw new IllegalStateException("Fields not loaded.");
        }
        if (index < 0 || index >= fields.length) {
            throw new IndexOutOfBoundsException("For field index " + index + ". There are only " + fields.length + " fields.");
        }
    }

    /**
     * Get all the ancestors of the given class.
     *
     * @param className
     * @return the IRIs of the ancestor classes, or an empty collection if there are none.
     */
    public Collection<String> getAncestors(String className) {
        OWLClass owlClass = null;
        // Remove the version if the class name contains a version number.
        if (ontology.containsClassInSignature(IRI.create(className))) {
            owlClass = ontology.getOWLOntologyManager().getOWLDataFactory().getOWLClass(IRI.create(className));
        } else {
            owlClass = ontology.getOWLOntologyManager().getOWLDataFactory().getOWLClass(IRI.create(Util.removeVersion(className)));
        }
        if (owlClass == null) {
            return Collections.emptySet();
        }

        Set<OWLClassExpression> superClasses = new HashSet<OWLClassExpression>();
        expandOntologyTypesR(superClasses, owlClass);
        if (superClasses.isEmpty()) {
            return Collections.emptySet();
        }

        ArrayList<String> classAndAncestors = new ArrayList<String>(1 + superClasses.size());
        for (OWLClassExpression superClass : superClasses) {
            classAndAncestors.add(((OWLClass) superClass).getIRI().toString());
        }
        return classAndAncestors;
    }

    private void expandOntologyTypesR(Set<OWLClassExpression> allSuperClasses, OWLClass owlClass) {
        for (OWLClassExpression owlClassExpression : owlClass.getSuperClasses(ontology)) {
            if (owlClassExpression instanceof OWLClass && allSuperClasses.add(owlClassExpression)) {
                expandOntologyTypesR(allSuperClasses, (OWLClass) owlClassExpression);
            }
        }
    }

    public static enum RdfCounters {
        EMPTY_LINES, EMPTY_DOCUMENTS, BLACKLISTED_TRIPLES, UNINDEXED_PREDICATE_TRIPLES, RDF_TYPE_TRIPLES, INDEXED_TRIPLES, PARSE_ERROR,
        ONTOLOGY_SUPER_TYPE_NOT_IN_HASH, PREDICATES_NOT_IN_HASH, CONTEXT_NOT_IN_HASH, OBJECT_NOT_IN_HASH, ANCESTOR_OBJECT_NOT_IN_HASH
    }

    public void incrementCounter(RdfCounters counter, int by) {
        counters.findCounter(counter).increment(by);
    }

    public Counter getCounter(RdfCounters counter) {
        return counters.findCounter(counter);
    }

    public String getInputStreamEncodeing() {
        return "UTF-8";
    }

    public static class ResourceHashLookupException extends RuntimeException {
        private static final long serialVersionUID = -7483398161700656105L;
        private final String key;
        private final Long value;

        public ResourceHashLookupException(String message, String key, Long value) {
            super(message);
            this.key = key;
            this.value = value;
        }

        public String getKey() {
            return key;
        }

        public Long getValue() {
            return value;
        }
    }
}
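To see how the Configuration round-trip is meant to be used, here is a minimal, hypothetical driver sketch. It is not part of the library: the class name RDFDocumentFactoryExample and the field names "subject", "predicate" and "object" are illustrative, the class is placed in the same package purely so the protected setupConf(...) is reachable, and it assumes that HorizontalDocumentFactory has a public no-argument constructor (buildFactory() creates it reflectively) and that OntologyLoader.load(conf) returns null when no ontology file is in the distributed cache, as the log messages above suggest.

package com.yahoo.glimmer.indexing;

import org.apache.hadoop.conf.Configuration;

// Hypothetical example class; not part of the Glimmer code base.
public class RDFDocumentFactoryExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Write the factory settings into the job Configuration. Passing null for the
        // resources hash filename means buildFactory() will skip loading a hash function.
        RDFDocumentFactory.setupConf(conf, RDFDocumentFactory.IndexType.HORIZONTAL, true, null, "", "subject", "predicate", "object");

        // Later (for example in a mapper's setup()), rebuild the factory from the same Configuration.
        RDFDocumentFactory factory = RDFDocumentFactory.buildFactory(conf);

        System.out.println("Index type: " + RDFDocumentFactory.getIndexType(conf));
        System.out.println("With contexts: " + factory.isWithContexts());
        for (int i = 0; i < factory.getFieldCount(); i++) {
            System.out.println("Field " + i + ": " + factory.getFieldName(i) + " (" + factory.getFieldType(i) + ")");
        }
    }
}

The point of the sketch is the split responsibility: setupConf(...) is called once on the job driver to serialize the factory's settings into the Configuration, and buildFactory(conf) is called on each task to reconstruct an equivalent factory, so the only state that travels between the two sides is the Configuration itself.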