mvm.rya.indexing.accumulo.ConfigUtils.java Source code

Java tutorial

Introduction

Here is the source code for mvm.rya.indexing.accumulo.ConfigUtils.java

Source

package mvm.rya.indexing.accumulo;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import mvm.rya.accumulo.AccumuloRdfConfiguration;
import mvm.rya.api.RdfCloudTripleStoreConfiguration;
import mvm.rya.indexing.FilterFunctionOptimizer;
import mvm.rya.indexing.accumulo.entity.EntityCentricIndex;
import mvm.rya.indexing.accumulo.entity.EntityOptimizer;
import mvm.rya.indexing.accumulo.freetext.AccumuloFreeTextIndexer;
import mvm.rya.indexing.accumulo.freetext.LuceneTokenizer;
import mvm.rya.indexing.accumulo.freetext.Tokenizer;
import mvm.rya.indexing.accumulo.geo.GeoMesaGeoIndexer;
import mvm.rya.indexing.accumulo.temporal.AccumuloTemporalIndexer;
import mvm.rya.indexing.external.PrecompJoinOptimizer;
import mvm.rya.indexing.mongodb.MongoGeoIndexer;

import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.BatchScanner;
import org.apache.accumulo.core.client.BatchWriter;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.Instance;
import org.apache.accumulo.core.client.MultiTableBatchWriter;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.TableExistsException;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.client.ZooKeeperInstance;
import org.apache.accumulo.core.client.admin.TableOperations;
import org.apache.accumulo.core.client.mock.MockInstance;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.commons.lang.Validate;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.log4j.Logger;
import org.openrdf.model.URI;
import org.openrdf.model.impl.URIImpl;

import com.google.common.collect.Lists;

/**
 * Static helpers that read a Hadoop {@link Configuration} and build the Accumulo client objects
 * (instance, connector, scanners, writers) and indexing settings described by it.
 * <p>
 * Many configuration keys keep their historical "cloudbase" names; the string values are part of the
 * external configuration contract and must not be changed.
 */
public class ConfigUtils {
    private static final Logger logger = Logger.getLogger(ConfigUtils.class);

    // ---- Connection settings -------------------------------------------------------------------
    // CLOUDBASE_TBL_PREFIX is declared for callers; it is not read inside this class.
    public static final String CLOUDBASE_TBL_PREFIX = "sc.cloudbase.tableprefix";
    public static final String CLOUDBASE_AUTHS = "sc.cloudbase.authorizations";
    public static final String CLOUDBASE_INSTANCE = "sc.cloudbase.instancename";
    public static final String CLOUDBASE_ZOOKEEPERS = "sc.cloudbase.zookeepers";
    public static final String CLOUDBASE_USER = "sc.cloudbase.username";
    public static final String CLOUDBASE_PASSWORD = "sc.cloudbase.password";

    // ---- Batch writer tuning -------------------------------------------------------------------
    public static final String CLOUDBASE_WRITER_MAX_WRITE_THREADS = "sc.cloudbase.writer.maxwritethreads";
    public static final String CLOUDBASE_WRITER_MAX_LATENCY = "sc.cloudbase.writer.maxlatency";
    public static final String CLOUDBASE_WRITER_MAX_MEMORY = "sc.cloudbase.writer.maxmemory";

    public static final String FREE_TEXT_QUERY_TERM_LIMIT = "sc.freetext.querytermlimit";

    // ---- Index table names and partition counts ------------------------------------------------
    public static final String FREE_TEXT_DOC_TABLENAME = "sc.freetext.doctable";
    public static final String FREE_TEXT_TERM_TABLENAME = "sc.freetext.termtable";
    public static final String GEO_TABLENAME = "sc.geo.table";
    public static final String GEO_NUM_PARTITIONS = "sc.geo.numPartitions";
    public static final String TEMPORAL_TABLENAME = "sc.temporal.index";
    public static final String ENTITY_TABLENAME = "sc.entity.index";

    // ---- Feature switches (all read as booleans defaulting to false) ---------------------------
    public static final String USE_GEO = "sc.use_geo";
    public static final String USE_FREETEXT = "sc.use_freetext";
    public static final String USE_TEMPORAL = "sc.use_temporal";
    public static final String USE_ENTITY = "sc.use_entity";
    public static final String USE_PCJ = "sc.use_pcj";
    public static final String USE_OPTIMAL_PCJ = "sc.use.optimal.pcj";

    // Declared for callers; not read inside this class.
    public static final String USE_INDEXING_SAIL = "sc.use.indexing.sail";
    public static final String USE_EXTERNAL_SAIL = "sc.use.external.sail";

    public static final String USE_MOCK_INSTANCE = ".useMockInstance";

    public static final String NUM_PARTITIONS = "sc.cloudbase.numPartitions";

    // Defaults applied when the corresponding CLOUDBASE_WRITER_* key is not set.
    private static final int WRITER_MAX_WRITE_THREADS = 1;
    private static final long WRITER_MAX_LATENCY = Long.MAX_VALUE;
    private static final long WRITER_MAX_MEMORY = 10000L;

    public static final String DISPLAY_QUERY_PLAN = "query.printqueryplan";

    public static final String FREETEXT_PREDICATES_LIST = "sc.freetext.predicates";
    public static final String FREETEXT_DOC_NUM_PARTITIONS = "sc.freetext.numPartitions.text";
    public static final String FREETEXT_TERM_NUM_PARTITIONS = "sc.freetext.numPartitions.term";

    public static final String TOKENIZER_CLASS = "sc.freetext.tokenizer.class";

    public static final String GEO_PREDICATES_LIST = "sc.geo.predicates";

    public static final String TEMPORAL_PREDICATES_LIST = "sc.temporal.predicates";

    public static final String USE_MONGO = "sc.useMongo";

    /**
     * @param conf the configuration to read
     * @return the value of {@link #DISPLAY_QUERY_PLAN}, or {@code false} when it is not set
     */
    public static boolean isDisplayQueryPlan(Configuration conf) {
        return conf.getBoolean(DISPLAY_QUERY_PLAN, false);
    }

    /**
     * Reads a value from the configuration and fails if it is absent.
     *
     * @param conf the configuration to read
     * @param key the property name
     * @return the non-null property value
     * @throws IllegalArgumentException if the property is not set
     */
    private static String getStringCheckSet(Configuration conf, String key) {
        String value = conf.get(key);
        Validate.notNull(value, key + " not set");
        return value;
    }

    /**
     * Creates the named table through the connector described by {@code conf} unless it already
     * exists.
     *
     * @param conf the configuration used to obtain the connector
     * @param tablename the table to create
     * @return {@code true} if this call created the table, {@code false} if it already existed
     * @throws AccumuloException
     * @throws AccumuloSecurityException
     * @throws TableExistsException
     */
    public static boolean createTableIfNotExists(Configuration conf, String tablename)
            throws AccumuloException, AccumuloSecurityException, TableExistsException {
        TableOperations tops = getConnector(conf).tableOperations();
        if (tops.exists(tablename)) {
            return false;
        }
        logger.info("Creating table: " + tablename);
        tops.create(tablename);
        return true;
    }

    /**
     * Resolves an index table name: the explicitly configured name if present, otherwise the Rya
     * table prefix followed by {@code altSuffix}.
     *
     * @param conf the configuration to read
     * @param indexTableNameConf the property holding an explicit table name
     * @param altSuffix suffix appended to the table prefix when no explicit name is configured
     * @return the table name
     * @throws IllegalArgumentException if neither the explicit name nor the table prefix is set
     */
    private static String getIndexTableName(Configuration conf, String indexTableNameConf, String altSuffix) {
        String explicitName = conf.get(indexTableNameConf);
        if (explicitName != null) {
            return explicitName;
        }
        // Look the prefix up once and reuse the validated value.
        String tablePrefix = conf.get(RdfCloudTripleStoreConfiguration.CONF_TBL_PREFIX);
        Validate.notNull(tablePrefix, indexTableNameConf + " not set and "
                + RdfCloudTripleStoreConfiguration.CONF_TBL_PREFIX + " not set.  Cannot generate table name.");
        return tablePrefix + altSuffix;
    }

    /**
     * @param conf the configuration to read
     * @return the free text document table name ({@link #FREE_TEXT_DOC_TABLENAME}, or table prefix + "freetext")
     */
    public static String getFreeTextDocTablename(Configuration conf) {
        return getIndexTableName(conf, FREE_TEXT_DOC_TABLENAME, "freetext");
    }

    /**
     * @param conf the configuration to read
     * @return the free text term table name ({@link #FREE_TEXT_TERM_TABLENAME}, or table prefix + "freetext_term")
     */
    public static String getFreeTextTermTablename(Configuration conf) {
        return getIndexTableName(conf, FREE_TEXT_TERM_TABLENAME, "freetext_term");
    }

    /**
     * @param conf the configuration to read
     * @return the value of {@link #FREE_TEXT_QUERY_TERM_LIMIT}, or 100 when it is not set
     */
    public static int getFreeTextTermLimit(Configuration conf) {
        return conf.getInt(FREE_TEXT_QUERY_TERM_LIMIT, 100);
    }

    /**
     * @param conf the configuration to read
     * @return the geo index table name ({@link #GEO_TABLENAME}, or table prefix + "geo")
     */
    public static String getGeoTablename(Configuration conf) {
        return getIndexTableName(conf, GEO_TABLENAME, "geo");
    }

    /**
     * @param conf the configuration to read
     * @return the temporal index table name ({@link #TEMPORAL_TABLENAME}, or table prefix + "temporal")
     */
    public static String getTemporalTableName(Configuration conf) {
        return getIndexTableName(conf, TEMPORAL_TABLENAME, "temporal");
    }

    /**
     * @param conf the configuration to read
     * @return the entity index table name ({@link #ENTITY_TABLENAME}, or table prefix + "entity")
     */
    public static String getEntityTableName(Configuration conf) {
        return getIndexTableName(conf, ENTITY_TABLENAME, "entity");
    }

    /**
     * @param conf the configuration to read
     * @return the predicates listed under {@link #FREETEXT_PREDICATES_LIST}; empty if none
     */
    public static Set<URI> getFreeTextPredicates(Configuration conf) {
        return getPredicates(conf, FREETEXT_PREDICATES_LIST);
    }

    /**
     * @param conf the configuration to read
     * @return the predicates listed under {@link #GEO_PREDICATES_LIST}; empty if none
     */
    public static Set<URI> getGeoPredicates(Configuration conf) {
        return getPredicates(conf, GEO_PREDICATES_LIST);
    }

    /**
     * Used for indexing statements about date and time instances and intervals.
     *
     * @param conf the configuration to read
     * @return Set of predicate URI's whose objects should be date time literals.
     */
    public static Set<URI> getTemporalPredicates(Configuration conf) {
        return getPredicates(conf, TEMPORAL_PREDICATES_LIST);
    }

    /**
     * Parses a comma separated configuration value into a set of predicate URIs.
     * Surrounding whitespace on each entry is removed and blank entries are skipped.
     *
     * @param conf the configuration to read
     * @param confName the property holding the predicate list
     * @return a new, mutable set of the configured predicates; empty if the property is unset or blank
     */
    private static Set<URI> getPredicates(Configuration conf, String confName) {
        Set<URI> predicates = new HashSet<URI>();
        String[] predicateStrings = conf.getStrings(confName, new String[] {});
        // Hadoop's getStrings may return null rather than the supplied default when the property
        // is present but contains no entries, so guard before iterating.
        if (predicateStrings == null) {
            return predicates;
        }
        for (String predicateString : predicateStrings) {
            // getStrings does not trim its tokens; a value such as "a, b" would otherwise produce a
            // URI with leading whitespace that never matches a real predicate.
            String trimmed = predicateString.trim();
            if (!trimmed.isEmpty()) {
                predicates.add(new URIImpl(trimmed));
            }
        }
        return predicates;
    }

    /**
     * Instantiates the tokenizer class named by {@link #TOKENIZER_CLASS}, falling back to
     * {@link LuceneTokenizer} when the property is not set.
     *
     * @param conf the configuration to read; also handed to the new instance via {@link ReflectionUtils}
     * @return a new tokenizer instance
     */
    public static Tokenizer getFreeTextTokenizer(Configuration conf) {
        Class<? extends Tokenizer> tokenizerClass = conf.getClass(TOKENIZER_CLASS, LuceneTokenizer.class,
                Tokenizer.class);
        return ReflectionUtils.newInstance(tokenizerClass, conf);
    }

    /**
     * Creates a batch writer for one table using the writer settings found in {@code conf}
     * (or this class's defaults when they are absent).
     *
     * @param tablename the table to write to
     * @param conf the configuration supplying connection and writer settings
     * @return a new batch writer
     * @throws TableNotFoundException
     * @throws AccumuloException
     * @throws AccumuloSecurityException
     */
    public static BatchWriter createDefaultBatchWriter(String tablename, Configuration conf)
            throws TableNotFoundException, AccumuloException, AccumuloSecurityException {
        long maxMemory = getWriterMaxMemory(conf);
        long maxLatency = getWriterMaxLatency(conf);
        int maxWriteThreads = getWriterMaxWriteThreads(conf);
        Connector connector = ConfigUtils.getConnector(conf);
        return connector.createBatchWriter(tablename, maxMemory, maxLatency, maxWriteThreads);
    }

    /**
     * Creates a multi-table batch writer using the writer settings found in {@code conf}
     * (or this class's defaults when they are absent).
     *
     * @param conf the configuration supplying connection and writer settings
     * @return a new multi-table batch writer
     * @throws AccumuloException
     * @throws AccumuloSecurityException
     */
    public static MultiTableBatchWriter createMultitableBatchWriter(Configuration conf)
            throws AccumuloException, AccumuloSecurityException {
        long maxMemory = getWriterMaxMemory(conf);
        long maxLatency = getWriterMaxLatency(conf);
        int maxWriteThreads = getWriterMaxWriteThreads(conf);
        Connector connector = ConfigUtils.getConnector(conf);
        return connector.createMultiTableBatchWriter(maxMemory, maxLatency, maxWriteThreads);
    }

    /**
     * Creates a scanner over one table using the connection and authorizations found in {@code conf}.
     *
     * @param tablename the table to scan
     * @param conf the configuration supplying connection settings and authorizations
     * @return a new scanner
     * @throws AccumuloException
     * @throws AccumuloSecurityException
     * @throws TableNotFoundException
     */
    public static Scanner createScanner(String tablename, Configuration conf)
            throws AccumuloException, AccumuloSecurityException, TableNotFoundException {
        Connector connector = ConfigUtils.getConnector(conf);
        Authorizations auths = ConfigUtils.getAuthorizations(conf);
        return connector.createScanner(tablename, auths);
    }

    /**
     * Creates a batch scanner over one table. The thread count comes from
     * {@link RdfCloudTripleStoreConfiguration#getNumThreads()} when {@code conf} is of that type,
     * otherwise from {@link RdfCloudTripleStoreConfiguration#CONF_NUM_THREADS} with a default of 2.
     *
     * @param tablename the table to scan
     * @param conf the configuration supplying connection settings, authorizations and thread count
     * @return a new batch scanner
     * @throws AccumuloException
     * @throws AccumuloSecurityException
     * @throws TableNotFoundException
     */
    public static BatchScanner createBatchScanner(String tablename, Configuration conf)
            throws AccumuloException, AccumuloSecurityException, TableNotFoundException {
        Connector connector = ConfigUtils.getConnector(conf);
        Authorizations auths = ConfigUtils.getAuthorizations(conf);
        final int numThreads;
        if (conf instanceof RdfCloudTripleStoreConfiguration) {
            numThreads = ((RdfCloudTripleStoreConfiguration) conf).getNumThreads();
        } else {
            numThreads = conf.getInt(RdfCloudTripleStoreConfiguration.CONF_NUM_THREADS, 2);
        }
        return connector.createBatchScanner(tablename, auths, numThreads);
    }

    /**
     * @param conf the configuration to read
     * @return the value of {@link #CLOUDBASE_WRITER_MAX_WRITE_THREADS}, or 1 when it is not set
     */
    public static int getWriterMaxWriteThreads(Configuration conf) {
        return conf.getInt(CLOUDBASE_WRITER_MAX_WRITE_THREADS, WRITER_MAX_WRITE_THREADS);
    }

    /**
     * @param conf the configuration to read
     * @return the value of {@link #CLOUDBASE_WRITER_MAX_LATENCY}, or {@link Long#MAX_VALUE} when it is not set
     */
    public static long getWriterMaxLatency(Configuration conf) {
        return conf.getLong(CLOUDBASE_WRITER_MAX_LATENCY, WRITER_MAX_LATENCY);
    }

    /**
     * @param conf the configuration to read
     * @return the value of {@link #CLOUDBASE_WRITER_MAX_MEMORY}, or 10000 when it is not set
     */
    public static long getWriterMaxMemory(Configuration conf) {
        return conf.getLong(CLOUDBASE_WRITER_MAX_MEMORY, WRITER_MAX_MEMORY);
    }

    /**
     * @param job the job whose configuration is read
     * @return the configured user name, see {@link #getUsername(Configuration)}
     */
    public static String getUsername(JobContext job) {
        return getUsername(job.getConfiguration());
    }

    /**
     * @param conf the configuration to read
     * @return the value of {@link #CLOUDBASE_USER}; no default is supplied, so this may be {@code null}
     */
    public static String getUsername(Configuration conf) {
        return conf.get(CLOUDBASE_USER);
    }

    /**
     * @param job the job whose configuration is read
     * @return the configured authorizations, see {@link #getAuthorizations(Configuration)}
     */
    public static Authorizations getAuthorizations(JobContext job) {
        return getAuthorizations(job.getConfiguration());
    }

    /**
     * Builds authorizations from the comma separated value of {@link #CLOUDBASE_AUTHS}.
     *
     * @param conf the configuration to read
     * @return the configured authorizations, or empty authorizations when the property is unset or empty
     */
    public static Authorizations getAuthorizations(Configuration conf) {
        String authString = conf.get(CLOUDBASE_AUTHS, "");
        return authString.isEmpty() ? new Authorizations() : new Authorizations(authString.split(","));
    }

    /**
     * @param job the job whose configuration is read
     * @return the instance described by the job configuration, see {@link #getInstance(Configuration)}
     */
    public static Instance getInstance(JobContext job) {
        return getInstance(job.getConfiguration());
    }

    /**
     * Builds the instance named by {@link #CLOUDBASE_INSTANCE}: a {@link MockInstance} when
     * {@link #USE_MOCK_INSTANCE} is true, otherwise a {@link ZooKeeperInstance} using
     * {@link #CLOUDBASE_ZOOKEEPERS}. The property values are passed through as read, without a
     * null check in this method.
     *
     * @param conf the configuration to read
     * @return a new instance object
     */
    public static Instance getInstance(Configuration conf) {
        String instanceName = conf.get(CLOUDBASE_INSTANCE);
        if (useMockInstance(conf)) {
            return new MockInstance(instanceName);
        }
        return new ZooKeeperInstance(instanceName, conf.get(CLOUDBASE_ZOOKEEPERS));
    }

    /**
     * @param job the job whose configuration is read
     * @return the configured password, see {@link #getPassword(Configuration)}
     */
    public static String getPassword(JobContext job) {
        return getPassword(job.getConfiguration());
    }

    /**
     * @param conf the configuration to read
     * @return the value of {@link #CLOUDBASE_PASSWORD}, or the empty string when it is not set
     */
    public static String getPassword(Configuration conf) {
        return conf.get(CLOUDBASE_PASSWORD, "");
    }

    /**
     * @param job the job whose configuration is read
     * @return a connector built from the job configuration, see {@link #getConnector(Configuration)}
     * @throws AccumuloException
     * @throws AccumuloSecurityException
     */
    public static Connector getConnector(JobContext job) throws AccumuloException, AccumuloSecurityException {
        return getConnector(job.getConfiguration());
    }

    /**
     * Obtains a connector from {@link #getInstance(Configuration)} using the configured user name
     * and password. A new instance object is built on every call.
     *
     * @param conf the configuration supplying instance, user name and password
     * @return the connector
     * @throws AccumuloException
     * @throws AccumuloSecurityException
     */
    public static Connector getConnector(Configuration conf) throws AccumuloException, AccumuloSecurityException {
        Instance instance = ConfigUtils.getInstance(conf);
        return instance.getConnector(getUsername(conf), getPassword(conf));
    }

    /**
     * @param conf the configuration to read
     * @return the value of {@link #USE_MOCK_INSTANCE}, or {@code false} when it is not set
     */
    public static boolean useMockInstance(Configuration conf) {
        return conf.getBoolean(USE_MOCK_INSTANCE, false);
    }

    /**
     * @param conf the configuration to read
     * @return the value of {@link #NUM_PARTITIONS}, or 25 when it is not set; used as the fallback
     *         for the index specific partition counts
     */
    private static int getNumPartitions(Configuration conf) {
        return conf.getInt(NUM_PARTITIONS, 25);
    }

    /**
     * @param conf the configuration to read
     * @return the value of {@link #FREETEXT_DOC_NUM_PARTITIONS}, falling back to {@link #NUM_PARTITIONS}
     */
    public static int getFreeTextDocNumPartitions(Configuration conf) {
        return conf.getInt(FREETEXT_DOC_NUM_PARTITIONS, getNumPartitions(conf));
    }

    /**
     * @param conf the configuration to read
     * @return the value of {@link #FREETEXT_TERM_NUM_PARTITIONS}, falling back to {@link #NUM_PARTITIONS}
     */
    public static int getFreeTextTermNumPartitions(Configuration conf) {
        return conf.getInt(FREETEXT_TERM_NUM_PARTITIONS, getNumPartitions(conf));
    }

    /**
     * @param conf the configuration to read
     * @return the value of {@link #GEO_NUM_PARTITIONS}, falling back to {@link #NUM_PARTITIONS}
     */
    public static int getGeoNumPartitions(Configuration conf) {
        return conf.getInt(GEO_NUM_PARTITIONS, getNumPartitions(conf));
    }

    /** @return the value of {@link #USE_GEO}, or {@code false} when it is not set */
    public static boolean getUseGeo(Configuration conf) {
        return conf.getBoolean(USE_GEO, false);
    }

    /** @return the value of {@link #USE_FREETEXT}, or {@code false} when it is not set */
    public static boolean getUseFreeText(Configuration conf) {
        return conf.getBoolean(USE_FREETEXT, false);
    }

    /** @return the value of {@link #USE_TEMPORAL}, or {@code false} when it is not set */
    public static boolean getUseTemporal(Configuration conf) {
        return conf.getBoolean(USE_TEMPORAL, false);
    }

    /** @return the value of {@link #USE_ENTITY}, or {@code false} when it is not set */
    public static boolean getUseEntity(Configuration conf) {
        return conf.getBoolean(USE_ENTITY, false);
    }

    /** @return the value of {@link #USE_PCJ}, or {@code false} when it is not set */
    public static boolean getUsePCJ(Configuration conf) {
        return conf.getBoolean(USE_PCJ, false);
    }

    /** @return the value of {@link #USE_OPTIMAL_PCJ}, or {@code false} when it is not set */
    public static boolean getUseOptimalPCJ(Configuration conf) {
        return conf.getBoolean(USE_OPTIMAL_PCJ, false);
    }

    /** @return the value of {@link #USE_MONGO}, or {@code false} when it is not set */
    public static boolean getUseMongo(Configuration conf) {
        return conf.getBoolean(USE_MONGO, false);
    }

    /**
     * Translates the {@code USE_*} switches in {@code conf} into the additional indexer and
     * optimizer class lists, and writes them back under
     * {@link AccumuloRdfConfiguration#CONF_ADDITIONAL_INDEXERS} and
     * {@link AccumuloRdfConfiguration#CONF_OPTIMIZERS}. Both properties are always overwritten,
     * with an empty list when nothing is enabled.
     * <p>
     * When {@link #USE_MONGO} is true only the geo switch is consulted among the filter indexers
     * and the PCJ optimizer is not set; the free text and temporal switches are ignored in that
     * mode. The entity switch is honoured in both modes.
     *
     * @param conf the configuration to read the switches from and to update
     */
    public static void setIndexers(RdfCloudTripleStoreConfiguration conf) {
        List<String> indexList = Lists.newArrayList();
        List<String> optimizers = Lists.newArrayList();

        if (ConfigUtils.getUseMongo(conf)) {
            if (getUseGeo(conf)) {
                indexList.add(MongoGeoIndexer.class.getName());
            }
        } else {
            if (getUsePCJ(conf) || getUseOptimalPCJ(conf)) {
                conf.setPcjOptimizer(PrecompJoinOptimizer.class);
            }
            if (getUseGeo(conf)) {
                indexList.add(GeoMesaGeoIndexer.class.getName());
            }
            if (getUseFreeText(conf)) {
                indexList.add(AccumuloFreeTextIndexer.class.getName());
            }
            if (getUseTemporal(conf)) {
                indexList.add(AccumuloTemporalIndexer.class.getName());
            }
        }

        // Every indexer added so far is a filter-function index; any of them needs the filter
        // optimizer. The entity index below is handled by its own optimizer instead.
        if (!indexList.isEmpty()) {
            optimizers.add(FilterFunctionOptimizer.class.getName());
        }

        if (getUseEntity(conf)) {
            indexList.add(EntityCentricIndex.class.getName());
            optimizers.add(EntityOptimizer.class.getName());
        }

        conf.setStrings(AccumuloRdfConfiguration.CONF_ADDITIONAL_INDEXERS, indexList.toArray(new String[] {}));
        conf.setStrings(AccumuloRdfConfiguration.CONF_OPTIMIZERS, optimizers.toArray(new String[] {}));
    }

}