com.msd.gin.halyard.sail.HBaseSail.java Source code

Java tutorial

Introduction

Here is the source code for com.msd.gin.halyard.sail.HBaseSail.java

Source

/*
 * Copyright 2016 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co.,
 * Inc., Kenilworth, NJ, USA.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.msd.gin.halyard.sail;

import com.msd.gin.halyard.common.HalyardTableUtils;
import com.msd.gin.halyard.strategy.HalyardEvaluationStrategy;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.io.hfile.FixedFileTrailer;
import org.apache.hadoop.hbase.util.FSUtils;
import org.eclipse.rdf4j.IsolationLevel;
import org.eclipse.rdf4j.IsolationLevels;
import org.eclipse.rdf4j.common.iteration.CloseableIteration;
import org.eclipse.rdf4j.common.iteration.CloseableIteratorIteration;
import org.eclipse.rdf4j.common.iteration.EmptyIteration;
import org.eclipse.rdf4j.common.iteration.ExceptionConvertingIteration;
import org.eclipse.rdf4j.common.iteration.TimeLimitIteration;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Namespace;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.ValueFactory;
import org.eclipse.rdf4j.model.impl.SimpleNamespace;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.query.BindingSet;
import org.eclipse.rdf4j.query.Dataset;
import org.eclipse.rdf4j.query.QueryEvaluationException;
import org.eclipse.rdf4j.query.algebra.QueryRoot;
import org.eclipse.rdf4j.query.algebra.StatementPattern;
import org.eclipse.rdf4j.query.algebra.TupleExpr;
import org.eclipse.rdf4j.query.algebra.Var;
import org.eclipse.rdf4j.query.algebra.evaluation.EvaluationStrategy;
import org.eclipse.rdf4j.query.algebra.evaluation.TripleSource;
import org.eclipse.rdf4j.query.algebra.evaluation.impl.BindingAssigner;
import org.eclipse.rdf4j.query.algebra.evaluation.impl.CompareOptimizer;
import org.eclipse.rdf4j.query.algebra.evaluation.impl.ConjunctiveConstraintSplitter;
import org.eclipse.rdf4j.query.algebra.evaluation.impl.ConstantOptimizer;
import org.eclipse.rdf4j.query.algebra.evaluation.impl.DisjunctiveConstraintOptimizer;
import org.eclipse.rdf4j.query.algebra.evaluation.impl.EvaluationStatistics;
import org.eclipse.rdf4j.query.algebra.evaluation.impl.FilterOptimizer;
import org.eclipse.rdf4j.query.algebra.evaluation.impl.IterativeEvaluationOptimizer;
import org.eclipse.rdf4j.query.algebra.evaluation.impl.OrderLimitOptimizer;
import org.eclipse.rdf4j.query.algebra.evaluation.impl.QueryJoinOptimizer;
import org.eclipse.rdf4j.query.algebra.evaluation.impl.QueryModelNormalizer;
import org.eclipse.rdf4j.query.algebra.evaluation.impl.SameTermFilterOptimizer;
import org.eclipse.rdf4j.query.algebra.evaluation.impl.StrictEvaluationStrategy;
import org.eclipse.rdf4j.query.impl.EmptyBindingSet;
import org.eclipse.rdf4j.sail.Sail;
import org.eclipse.rdf4j.sail.SailConnection;
import org.eclipse.rdf4j.sail.SailException;
import org.eclipse.rdf4j.sail.UnknownSailTransactionStateException;
import org.eclipse.rdf4j.sail.UpdateContext;

/**
 * HBaseSail is RDF storage implementation on top of Apache HBase.
 * It implements both interfaces - Sail and SailConnection.
 * @author Adam Sotona (MSD)
 */
public final class HBaseSail implements Sail, SailConnection {

    /**
     * Ticker is a simple callback interface that is notified while data are being processed.
     * Its purpose is to let the caller (for example a MapReduce task) know that the execution is still alive.
     */
    public interface Ticker {

        /**
         * Called by the statement scanner each time it has asked HBase for another result.
         */
        void tick();
    }

    /** Predicate of the internal statements used to persist namespace prefix declarations. */
    private static final IRI NAMESPACE_PREFIX_PREDICATE = SimpleValueFactory.getInstance()
            .createIRI("http://gin.msd.com/halyard/namespace#prefix");
    private static final Logger LOG = Logger.getLogger(HBaseSail.class.getName());
    /** Maximum age, in milliseconds, of the cached read-only flag and of the cached size. */
    private static final long STATUS_CACHING_TIMEOUT = 60000L;

    private final Configuration config;
    final String tableName;
    final boolean create;
    final boolean pushStrategy;
    final int splitBits;
    private final EvaluationStatistics statistics;
    final int evaluationTimeout;
    // cached result of size() and the time it was computed (negative = never computed)
    private long size = 0;
    private long sizeTimestamp = -1;
    // cached read-only flag of the table and the time it was fetched
    private boolean readOnly = false;
    private long readOnlyTimestamp = -1;
    private final Ticker ticker;

    // assigned by initialize(), reset to null by shutDown()
    HTable table = null;

    //TODO non-persistent namespaces
    private final Map<String, Namespace> namespaces = new HashMap<>();

    /**
     * Construct HBaseSail object with given arguments. The constructor only stores the arguments;
     * the HBase table is obtained later by {@link #initialize()}.
     * @param config Hadoop Configuration to access HBase
     * @param tableName HBase table name
     * @param create boolean option to create the table if it does not exist
     * @param splitBits int number of bits used for calculation of HTable region pre-splits (applies for new tables only)
     * @param pushStrategy boolean option to use {@link com.msd.gin.halyard.strategy.HalyardEvaluationStrategy} instead of {@link org.eclipse.rdf4j.query.algebra.evaluation.impl.StrictEvaluationStrategy}
     * @param evaluationTimeout int timeout in seconds for each query evaluation, zero or negative values mean no timeout
     * @param ticker optional Ticker callback for keep-alive notifications, may be null
     */
    public HBaseSail(Configuration config, String tableName, boolean create, int splitBits, boolean pushStrategy,
            int evaluationTimeout, Ticker ticker) {
        this.config = config;
        this.tableName = tableName;
        this.create = create;
        this.splitBits = splitBits;
        this.pushStrategy = pushStrategy;
        this.evaluationTimeout = evaluationTimeout;
        this.ticker = ticker;
        this.statistics = new EvaluationStatistics() {
            @Override
            protected EvaluationStatistics.CardinalityCalculator createCardinalityCalculator() {
                return new CardinalityCalculator() {
                    @Override
                    protected double getCardinality(StatementPattern sp) {
                        List<Var> patternVars = sp.getVarList();
                        int total = patternVars.size();
                        int unbound = total - countConstantVars(patternVars);
                        // patterns with an rdf:type predicate are estimated slightly higher
                        double typePenalty;
                        if (RDF.TYPE.equals(sp.getPredicateVar().getValue())) {
                            typePenalty = 0.1;
                        } else {
                            typePenalty = 0.0;
                        }
                        // the estimate grows with the share of unbound variables, up to 1000
                        return Math.pow(1000.0, (unbound + typePenalty) / total);
                    }
                };
            }
        };
    }

    /**
     * Does nothing; the given directory is ignored.
     * @param dataDir ignored
     */
    @Override
    public void setDataDir(File dataDir) {
        // intentionally empty
    }

    /**
     * Not supported by this Sail.
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override
    public File getDataDir() {
        throw new UnsupportedOperationException();
    }

    /**
     * Obtains the HBase table and loads the persisted namespace declarations into the in-memory map.
     * A declaration is a statement with the namespace prefix predicate whose subject is the namespace
     * name and whose literal object is the prefix; statements with a non-literal object are skipped.
     * @throws SailException wrapping any IOException raised while obtaining the table
     */
    @Override
    public void initialize() throws SailException {
        try {
            table = HalyardTableUtils.getTable(config, tableName, create, splitBits, null);
            try (CloseableIteration<? extends Statement, SailException> declarations = getStatements(null,
                    NAMESPACE_PREFIX_PREDICATE, null, true)) {
                while (declarations.hasNext()) {
                    Statement declaration = declarations.next();
                    Value prefixValue = declaration.getObject();
                    if (!(prefixValue instanceof Literal)) {
                        continue;
                    }
                    String prefix = prefixValue.stringValue();
                    String name = declaration.getSubject().stringValue();
                    namespaces.put(prefix, new SimpleNamespace(prefix, name));
                }
            }
        } catch (IOException ex) {
            throw new SailException(ex);
        }
    }

    /**
     * Closes the HBase table, if one is open.
     * Calling this method on a Sail that was never initialized, or that has already been shut down,
     * has no effect.
     * @throws SailException wrapping any IOException raised while closing the table
     */
    @Override
    public void shutDown() throws SailException {
        HTable closing = table;
        if (closing == null) {
            return;
        }
        // drop the reference first so that isOpen() reports false even when close() fails
        table = null;
        try {
            closing.close();
        } catch (IOException ex) {
            throw new SailException(ex);
        }
    }

    /**
     * Tells whether the table accepts writes. The read-only flag of the table descriptor is cached
     * and fetched again only once the cached value is older than STATUS_CACHING_TIMEOUT.
     * @return true if the table is not flagged read-only
     * @throws SailException wrapping any IOException raised while reading the table descriptor
     */
    @Override
    public boolean isWritable() throws SailException {
        boolean cacheExpired = readOnlyTimestamp + STATUS_CACHING_TIMEOUT < System.currentTimeMillis();
        if (cacheExpired) {
            try {
                readOnly = table.getTableDescriptor().isReadOnly();
                readOnlyTimestamp = System.currentTimeMillis();
            } catch (IOException ex) {
                throw new SailException(ex);
            }
        }
        return !readOnly;
    }

    /**
     * Returns this object, which serves as both the Sail and its only connection.
     * @return this instance
     */
    @Override
    public SailConnection getConnection() throws SailException {
        return this;
    }

    /**
     * @return the shared SimpleValueFactory instance
     */
    @Override
    public ValueFactory getValueFactory() {
        ValueFactory factory = SimpleValueFactory.getInstance();
        return factory;
    }

    /**
     * @return a single-element list containing IsolationLevels.NONE
     */
    @Override
    public List<IsolationLevel> getSupportedIsolationLevels() {
        return Collections.<IsolationLevel>singletonList(IsolationLevels.NONE);
    }

    /**
     * @return IsolationLevels.NONE, the only level this Sail reports as supported
     */
    @Override
    public IsolationLevel getDefaultIsolationLevel() {
        return IsolationLevels.NONE;
    }

    /**
     * @return true while a table reference is held, i.e. between initialize() and shutDown()
     */
    @Override
    public boolean isOpen() throws SailException {
        boolean tablePresent = table != null;
        return tablePresent;
    }

    /**
     * Does nothing; the table is released by {@link #shutDown()}.
     */
    @Override
    public void close() throws SailException {
        // intentionally empty
    }

    /**
     * Replaces a missing context specification by an array holding a single null context.
     * @param contexts contexts as passed by the caller, may be null or empty
     * @return the given array, or a one-element array containing null when none were given
     */
    private static Resource[] normalizeContexts(Resource... contexts) {
        boolean unspecified = contexts == null || contexts.length == 0;
        return unspecified ? new Resource[] { null } : contexts;
    }

    /**
     * Optimizes and evaluates the given tuple expression against the HBase table.
     * The expression is cloned before it is modified. The supplied bindings are handed to the
     * optimizers only; the evaluation itself starts from an empty binding set. The
     * includeInferred flag is not used.
     * @param tupleExpr query algebra expression to evaluate (left untouched)
     * @param dataset dataset passed to the optimizers and to the evaluation strategy
     * @param bindings bindings passed to the optimizers
     * @param includeInferred ignored
     * @return iteration over the results, wrapped in a time-limited iteration when a positive
     * evaluation timeout is configured
     * @throws SailException wrapping a QueryEvaluationException raised when the evaluation is started
     */
    @Override
    public CloseableIteration<? extends BindingSet, QueryEvaluationException> evaluate(TupleExpr tupleExpr,
            Dataset dataset, BindingSet bindings, final boolean includeInferred) throws SailException {
        TupleExpr root = tupleExpr.clone();
        if (!(root instanceof QueryRoot)) {
            // Wrap the expression in a dummy root node so that the optimizers
            // are free to replace the actual root node
            root = new QueryRoot(root);
        }
        // all scanners created for this query share one start time for their timeout check
        final long startTime = System.currentTimeMillis();
        TripleSource source = new TripleSource() {
            @Override
            public CloseableIteration<? extends Statement, QueryEvaluationException> getStatements(Resource subj,
                    IRI pred, Value obj, Resource... contexts) throws QueryEvaluationException {
                try {
                    StatementScanner scanner = new StatementScanner(startTime, subj, pred, obj, contexts);
                    return new ExceptionConvertingIteration<Statement, QueryEvaluationException>(scanner) {
                        @Override
                        protected QueryEvaluationException convert(Exception e) {
                            return new QueryEvaluationException(e);
                        }

                    };
                } catch (SailException ex) {
                    throw new QueryEvaluationException(ex);
                }
            }

            @Override
            public ValueFactory getValueFactory() {
                return SimpleValueFactory.getInstance();
            }
        };

        EvaluationStrategy strategy;
        if (pushStrategy) {
            strategy = new HalyardEvaluationStrategy(source, dataset, evaluationTimeout);
        } else {
            strategy = new StrictEvaluationStrategy(source, dataset, null);
        }

        // the order of the optimizers is significant
        new BindingAssigner().optimize(root, dataset, bindings);
        new ConstantOptimizer(strategy).optimize(root, dataset, bindings);
        new CompareOptimizer().optimize(root, dataset, bindings);
        new ConjunctiveConstraintSplitter().optimize(root, dataset, bindings);
        new DisjunctiveConstraintOptimizer().optimize(root, dataset, bindings);
        new SameTermFilterOptimizer().optimize(root, dataset, bindings);
        new QueryModelNormalizer().optimize(root, dataset, bindings);
        new QueryJoinOptimizer(statistics).optimize(root, dataset, bindings);
        // new SubSelectJoinOptimizer().optimize(tupleExpr, dataset, bindings);
        new IterativeEvaluationOptimizer().optimize(root, dataset, bindings);
        new FilterOptimizer().optimize(root, dataset, bindings);
        new OrderLimitOptimizer().optimize(root, dataset, bindings);

        try {
            CloseableIteration<? extends BindingSet, QueryEvaluationException> results = strategy.evaluate(root,
                    EmptyBindingSet.getInstance());
            if (evaluationTimeout <= 0) {
                return results;
            }
            return new TimeLimitIteration<BindingSet, QueryEvaluationException>(results,
                    1000L * evaluationTimeout) {
                @Override
                protected void throwInterruptedException() throws QueryEvaluationException {
                    throw new QueryEvaluationException(
                            "Query evaluation exceeded specified timeout " + evaluationTimeout + "s");
                }
            };
        } catch (QueryEvaluationException ex) {
            throw new SailException(ex);
        }
    }

    /**
     * Context enumeration is not implemented.
     * @return an empty iteration, always
     */
    @Override
    public CloseableIteration<? extends Resource, SailException> getContextIDs() throws SailException {
        return new EmptyIteration<Resource, SailException>();
    }

    /**
     * Opens a scanner over the statements matching the given pattern; null acts as a wildcard.
     * The scanner's timeout is measured from the moment of this call. The includeInferred flag is not used.
     * @param subj subject to match, or null
     * @param pred predicate to match, or null
     * @param obj object to match, or null
     * @param includeInferred ignored
     * @param contexts contexts to scan; none given means a single scan with a null context
     * @return iteration over the matching statements; the caller is expected to close it
     */
    @Override
    public CloseableIteration<? extends Statement, SailException> getStatements(Resource subj, IRI pred, Value obj,
            boolean includeInferred, Resource... contexts) throws SailException {
        long scanStart = System.currentTimeMillis();
        return new StatementScanner(scanStart, subj, pred, obj, contexts);
    }

    /**
     * Estimates the number of statements by summing the entry counts stored in the trailers of the
     * table's HFiles. The result is cached: it is computed on the first call and recomputed only
     * when the table is writable and the cached value is older than STATUS_CACHING_TIMEOUT.
     * Only regions whose start key is empty or begins with the SPO prefix are counted.
     * @param contexts must be absent, empty, or start with a null context
     * @return the cached or freshly computed entry count
     * @throws SailException if the first given context is not null, or wrapping an IOException
     * raised while inspecting the table files
     */
    @Override
    public synchronized long size(Resource... contexts) throws SailException {
        if (contexts != null && contexts.length > 0 && contexts[0] != null) {
            throw new SailException("Size calculation is not supported for named graphs");
        }
        boolean refresh = sizeTimestamp < 0
                || (isWritable() && sizeTimestamp + STATUS_CACHING_TIMEOUT < System.currentTimeMillis());
        if (!refresh) {
            return size;
        }
        try {
            long entries = 0;
            FileSystem fs = FileSystem.get(config);
            Collection<HColumnDescriptor> families = table.getTableDescriptor().getFamilies();
            Set<String> familyNames = new HashSet<>(families.size());
            for (HColumnDescriptor family : families) {
                familyNames.add(family.getNameAsString());
            }
            Path tableDir = FSUtils.getTableDir(FSUtils.getRootDir(config), table.getName());
            PathFilter dirFilter = new FSUtils.DirFilter(fs);
            int divider = 1;
            for (HRegionLocation location : table.getRegionLocator().getAllRegionLocations()) {
                HRegionInfo region = location.getRegionInfo();
                byte[] startKey = region.getStartKey();
                if (startKey.length != 0 && startKey[0] != HalyardTableUtils.SPO_PREFIX) {
                    continue;
                }
                byte[] endKey = region.getEndKey();
                if (endKey.length == 0 || endKey[0] > HalyardTableUtils.POS_PREFIX) {
                    // NOTE(review): presumably such a region also holds the other two index
                    // prefixes, so every statement would be counted three times - confirm
                    divider = 3;
                }
                Path regionDir = new Path(tableDir, region.getEncodedName());
                for (FileStatus familyDir : fs.listStatus(regionDir, dirFilter)) {
                    if (!familyNames.contains(familyDir.getPath().getName())) {
                        continue;
                    }
                    for (FileStatus file : fs.listStatus(familyDir.getPath())) {
                        if (!file.isFile()) {
                            continue;
                        }
                        try (FSDataInputStream in = fs.open(file.getPath())) {
                            entries += FixedFileTrailer.readFromStream(in, file.getLen()).getEntryCount();
                        } catch (Exception e) {
                            // an unreadable file is logged and left out of the total
                            LOG.log(Level.WARNING,
                                    "Exception while reading trailer from hfile: " + file.getPath(),
                                    e);
                        }
                    }
                }
            }
            size = entries / divider;
            sizeTimestamp = System.currentTimeMillis();
        } catch (IOException e) {
            throw new SailException(e);
        }
        return size;
    }

    /**
     * Does nothing.
     */
    @Override
    public void begin() throws SailException {
        // intentionally empty
    }

    /**
     * Accepts only a null level or IsolationLevels.NONE and otherwise does nothing.
     * @param level requested isolation level, may be null
     * @throws UnknownSailTransactionStateException if any other isolation level is requested
     */
    @Override
    public void begin(IsolationLevel level) throws UnknownSailTransactionStateException, SailException {
        boolean supported = level == null || level == IsolationLevels.NONE;
        if (!supported) {
            throw new UnknownSailTransactionStateException(
                    "Isolation level " + level + " is not compatible with this HBaseSail");
        }
    }

    /**
     * Does nothing; see {@link #commit()} for flushing the table.
     */
    @Override
    public void flush() throws SailException {
        // intentionally empty
    }

    /**
     * Does nothing.
     */
    @Override
    public void prepare() throws SailException {
        // intentionally empty
    }

    /**
     * Flushes the pending commits of the table.
     * @throws SailException wrapping any IOException raised by the flush
     */
    @Override
    public void commit() throws SailException {
        try {
            table.flushCommits();
        } catch (IOException flushFailure) {
            throw new SailException(flushFailure);
        }
    }

    /**
     * Not supported by this Sail.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void rollback() throws SailException {
        throw new UnsupportedOperationException();
    }

    /**
     * @return true, always
     */
    @Override
    public boolean isActive() throws UnknownSailTransactionStateException {
        return true;
    }

    /**
     * Adds a statement; the update context is ignored and the call is delegated to
     * {@link #addStatement(Resource, IRI, Value, Resource...)}.
     * @param op ignored
     * @param subj statement subject
     * @param pred statement predicate
     * @param obj statement object
     * @param contexts target contexts
     */
    @Override
    public void addStatement(UpdateContext op, Resource subj, IRI pred, Value obj, Resource... contexts)
            throws SailException {
        this.addStatement(subj, pred, obj, contexts);
    }

    /**
     * Adds the statement once for each given context; without contexts it is added once with a null context.
     * @param subj statement subject
     * @param pred statement predicate
     * @param obj statement object
     * @param contexts target contexts, may be empty
     * @throws SailException if the table is read only or the write fails
     */
    @Override
    public void addStatement(Resource subj, IRI pred, Value obj, Resource... contexts) throws SailException {
        Resource[] targets = normalizeContexts(contexts);
        for (int i = 0; i < targets.length; i++) {
            addStatementInternal(subj, pred, obj, targets[i]);
        }
    }

    /**
     * Writes one statement in one context: every KeyValue produced for the statement is put
     * into the table under its own row key and timestamp.
     * @param subj statement subject
     * @param pred statement predicate
     * @param obj statement object
     * @param context statement context, may be null
     * @throws SailException if the table is read only, or wrapping any IOException raised by the put
     */
    private void addStatementInternal(Resource subj, IRI pred, Value obj, Resource context) throws SailException {
        if (!isWritable()) {
            throw new SailException(tableName + " is read only");
        }
        try {
            for (KeyValue kv : HalyardTableUtils.toKeyValues(subj, pred, obj, context)) {
                Put put = new Put(kv.getRowArray(), kv.getRowOffset(), kv.getRowLength(), kv.getTimestamp());
                put.add(kv);
                table.put(put);
            }
        } catch (IOException e) {
            throw new SailException(e);
        }
    }

    /**
     * Removes one fully specified statement from each given context (or from the null context
     * when none is given). All deletes are collected first and sent to the table in one call.
     * @param op ignored
     * @param subj statement subject
     * @param pred statement predicate
     * @param obj statement object
     * @param contexts contexts to remove the statement from, may be empty
     * @throws SailException if the table is read only, or wrapping any IOException raised by the delete
     */
    @Override
    public void removeStatement(UpdateContext op, Resource subj, IRI pred, Value obj, Resource... contexts)
            throws SailException {
        if (!isWritable()) {
            throw new SailException(tableName + " is read only");
        }
        try {
            List<Delete> deletes = new ArrayList<>();
            for (Resource ctx : normalizeContexts(contexts)) {
                for (KeyValue kv : HalyardTableUtils.toKeyValues(subj, pred, obj, ctx)) {
                    Delete delete = new Delete(kv.getRowArray(), kv.getRowOffset(), kv.getRowLength());
                    delete.addColumn(kv.getFamily(), kv.getQualifier());
                    deletes.add(delete);
                }
            }
            table.delete(deletes);
        } catch (IOException e) {
            throw new SailException(e);
        }
    }

    /**
     * Removes all statements matching the pattern; null acts as a wildcard.
     * When subject, predicate, object and the first context are all null the whole table is
     * cleared; otherwise the matching statements are scanned and removed one by one.
     * @param subj subject to match, or null
     * @param pred predicate to match, or null
     * @param obj object to match, or null
     * @param contexts contexts to match, may be empty
     * @throws SailException if the table is read only or a removal fails
     */
    @Override
    public void removeStatements(Resource subj, IRI pred, Value obj, Resource... contexts) throws SailException {
        if (!isWritable()) {
            throw new SailException(tableName + " is read only");
        }
        Resource[] targets = normalizeContexts(contexts);
        boolean everything = subj == null && pred == null && obj == null && targets[0] == null;
        if (everything) {
            clearAll();
            return;
        }
        try (CloseableIteration<? extends Statement, SailException> matches = getStatements(subj, pred, obj, true,
                targets)) {
            while (matches.hasNext()) {
                Statement match = matches.next();
                removeStatement(null, match.getSubject(), match.getPredicate(), match.getObject(),
                        match.getContext());
            }
        }
    }

    /**
     * Does nothing.
     * @param op ignored
     */
    @Override
    public void startUpdate(UpdateContext op) throws SailException {
        // intentionally empty
    }

    /**
     * Does nothing.
     * @param op ignored
     */
    @Override
    public void endUpdate(UpdateContext op) throws SailException {
        // intentionally empty
    }

    /**
     * Removes every statement of the given contexts by delegating to
     * {@link #removeStatements(Resource, IRI, Value, Resource...)} with a fully wildcarded pattern.
     * @param contexts contexts to clear, may be empty
     */
    @Override
    public void clear(Resource... contexts) throws SailException {
        this.removeStatements(null, null, null, contexts);
    }

    /**
     * Truncates the whole table and keeps the table reference returned by the truncation.
     * @throws SailException if the table is read only, or wrapping any IOException raised by the truncation
     */
    private void clearAll() throws SailException {
        if (!isWritable()) {
            throw new SailException(tableName + " is read only");
        }
        try {
            table = HalyardTableUtils.truncateTable(table);
        } catch (IOException truncateFailure) {
            throw new SailException(truncateFailure);
        }
    }

    /**
     * Looks up a namespace name in the in-memory namespace map.
     * @param prefix namespace prefix
     * @return the namespace name registered for the prefix, or null if there is none
     */
    @Override
    public String getNamespace(String prefix) throws SailException {
        Namespace registered = namespaces.get(prefix);
        if (registered == null) {
            return null;
        }
        return registered.getName();
    }

    /**
     * @return iteration over the values of the in-memory namespace map (not a copy)
     */
    @Override
    public CloseableIteration<? extends Namespace, SailException> getNamespaces() {
        Iterator<Namespace> registered = namespaces.values().iterator();
        return new CloseableIteratorIteration<Namespace, SailException>(registered);
    }

    /**
     * Registers a namespace prefix in memory and tries to persist it as a statement
     * (namespace name, namespace prefix predicate, prefix literal). A declaration previously
     * registered for the prefix is removed from the table first. Persistence is best effort:
     * a SailException is logged as a warning and the in-memory registration stays in place.
     * @param prefix namespace prefix
     * @param name namespace name
     */
    @Override
    public void setNamespace(String prefix, String name) throws SailException {
        Namespace previous = namespaces.put(prefix, new SimpleNamespace(prefix, name));
        ValueFactory factory = SimpleValueFactory.getInstance();
        try {
            if (previous != null) {
                removeStatement(null, factory.createIRI(previous.getName()), NAMESPACE_PREFIX_PREDICATE,
                        factory.createLiteral(prefix));
            }
            addStatementInternal(factory.createIRI(name), NAMESPACE_PREFIX_PREDICATE,
                    factory.createLiteral(prefix), null);
        } catch (SailException e) {
            LOG.log(Level.WARNING, "Namespace prefix could not be presisted due to an exception", e);
        }
    }

    /**
     * Unregisters a namespace prefix in memory and tries to remove its persisted declaration.
     * Nothing happens when the prefix is not registered. Removal from the table is best effort:
     * a SailException is logged as a warning.
     * @param prefix namespace prefix to remove
     */
    @Override
    public void removeNamespace(String prefix) throws SailException {
        ValueFactory factory = SimpleValueFactory.getInstance();
        Namespace removed = namespaces.remove(prefix);
        if (removed == null) {
            return;
        }
        try {
            removeStatement(null, factory.createIRI(removed.getName()), NAMESPACE_PREFIX_PREDICATE,
                    factory.createLiteral(prefix));
        } catch (SailException e) {
            LOG.log(Level.WARNING, "Namespace prefix could not be removed due to an exception", e);
        }
    }

    /**
     * Tries to remove all persisted namespace declarations and then empties the in-memory map.
     * A SailException raised by the removal is logged as a warning and does not prevent the
     * in-memory map from being cleared.
     */
    @Override
    public void clearNamespaces() throws SailException {
        try {
            this.removeStatements(null, NAMESPACE_PREFIX_PREDICATE, null);
        } catch (SailException removalFailure) {
            LOG.log(Level.WARNING, "Namespaces could not be cleared due to an exception", removalFailure);
        }
        namespaces.clear();
    }

    private class StatementScanner implements CloseableIteration<Statement, SailException> {

        private final Resource subj;
        private final IRI pred;
        private final Value obj;
        private final Iterator<Resource> contexts;
        private ResultScanner rs = null;
        private final long endTime;
        private Statement next = null;
        private Iterator<Statement> iter = null;

        public StatementScanner(long startTime, Resource subj, IRI pred, Value obj, Resource... contexts)
                throws SailException {
            this.subj = subj;
            this.pred = pred;
            this.obj = obj;
            this.contexts = Arrays.asList(normalizeContexts(contexts)).iterator();
            this.endTime = startTime + (1000l * evaluationTimeout);
        }

        private Result nextResult() throws IOException {
            while (true) {
                if (rs == null) {
                    if (contexts.hasNext()) {
                        rs = table.getScanner(HalyardTableUtils.scan(subj, pred, obj, contexts.next()));
                    } else {
                        return null;
                    }
                }
                Result res = rs.next();
                if (ticker != null)
                    ticker.tick();
                if (res == null) {
                    rs.close();
                    rs = null;
                } else {
                    return res;
                }
            }
        }

        @Override
        public void close() throws SailException {
            if (rs != null) {
                rs.close();
            }
        }

        @Override
        public synchronized boolean hasNext() throws SailException {
            if (evaluationTimeout > 0 && System.currentTimeMillis() > endTime) {
                throw new SailException(
                        "Statements scanning exceeded specified timeout " + evaluationTimeout + "s");
            }
            if (next == null)
                try {
                    while (true) {
                        if (iter == null) {
                            Result res = nextResult();
                            if (res == null) {
                                return false;
                            } else {
                                iter = HalyardTableUtils.parseStatements(res).iterator();
                            }
                        }
                        while (iter.hasNext()) {
                            Statement s = iter.next();
                            if ((subj == null || subj.equals(s.getSubject()))
                                    && (pred == null || pred.equals(s.getPredicate()))
                                    && (obj == null || obj.equals(s.getObject()))) {
                                next = s;
                                return true;
                            }
                        }
                        iter = null;
                    }
                } catch (IOException e) {
                    throw new SailException(e);
                }
            else {
                return true;
            }
        }

        @Override
        public synchronized Statement next() throws SailException {
            if (hasNext()) {
                Statement st = next;
                next = null;
                return st;
            } else {
                throw new NoSuchElementException();
            }
        }

        @Override
        public void remove() throws SailException {
            throw new UnsupportedOperationException();
        }
    }
}