/*
 * Source code for mvm.rya.accumulo.pig.SparqlQueryPigEngine.java
 *
 * (Java tutorial listing. The introductory page text is kept here as a comment
 * so that the file remains valid Java.)
 */

package mvm.rya.accumulo.pig;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import com.google.common.base.Preconditions;
import com.google.common.io.ByteStreams;
import mvm.rya.accumulo.AccumuloRdfConfiguration;
import mvm.rya.accumulo.AccumuloRdfEvalStatsDAO;
import mvm.rya.accumulo.AccumuloRyaDAO;
import mvm.rya.accumulo.pig.optimizer.SimilarVarJoinOptimizer;
import mvm.rya.rdftriplestore.evaluation.QueryJoinOptimizer;
import mvm.rya.rdftriplestore.evaluation.RdfCloudTripleStoreEvaluationStatistics;
import mvm.rya.rdftriplestore.inference.InferenceEngine;
import mvm.rya.rdftriplestore.inference.InverseOfVisitor;
import mvm.rya.rdftriplestore.inference.SymmetricPropertyVisitor;
import mvm.rya.rdftriplestore.inference.TransitivePropertyVisitor;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.ZooKeeperInstance;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.openrdf.query.algebra.QueryRoot;
import org.openrdf.query.parser.ParsedQuery;
import org.openrdf.query.parser.QueryParser;
import org.openrdf.query.parser.sparql.SPARQLParser;

import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

/**
 * Turns SPARQL queries into Pig scripts and runs them on a {@link PigServer}.
 *
 * <p>Typical lifecycle: configure the engine through its setters (a
 * {@link SparqlToPigTransformVisitor} is mandatory), call {@link #init()}, call
 * {@link #runQuery(String, String)} or {@link #generatePigScript(String)}, and
 * finally call {@link #destroy()}.
 *
 * <p>When {@code inference} is enabled the parsed query is rewritten with the
 * transitive-property, symmetric-property and inverse-of visitors; when
 * {@code stats} is enabled a {@link QueryJoinOptimizer} backed by evaluation
 * statistics is applied. Both options open an Accumulo connection in
 * {@link #init()} using the connection settings held by the transform visitor.
 */
public class SparqlQueryPigEngine {
    private static final Log logger = LogFactory.getLog(SparqlQueryPigEngine.class);

    /**
     * Alias passed to {@link PigServer#store(String, String)}.
     * NOTE(review): presumably the relation name the generated script assigns to
     * its final projection - confirm against SparqlToPigTransformVisitor.
     */
    private static final String RESULT_ALIAS = "PROJ";

    private String hadoopDir;
    private ExecType execType = ExecType.MAPREDUCE; //default to mapreduce
    private boolean inference = true;
    private boolean stats = true;
    private SparqlToPigTransformVisitor sparqlToPigTransformVisitor;
    private PigServer pigServer;
    private InferenceEngine inferenceEngine = null;
    private RdfCloudTripleStoreEvaluationStatistics rdfCloudTripleStoreEvaluationStatistics;
    private AccumuloRyaDAO ryaDAO;
    // Left package-private: narrowing the visibility could break same-package users.
    AccumuloRdfConfiguration conf = new AccumuloRdfConfiguration();

    private AccumuloRdfEvalStatsDAO rdfEvalStatsDAO;

    public AccumuloRdfConfiguration getConf() {
        return conf;
    }

    public void setConf(AccumuloRdfConfiguration conf) {
        this.conf = conf;
    }

    /**
     * Prepares the engine for use.
     *
     * <p>Sets the {@code HADOOPDIR} system property when a hadoop dir was
     * configured, creates a {@link PigServer} for the configured
     * {@link ExecType} unless one was injected, and - if inference and/or stats
     * are enabled - connects to Accumulo and initializes the corresponding DAOs.
     *
     * @throws NullPointerException if no transform visitor has been set, or if
     *                              an Accumulo connection is needed and the
     *                              visitor's password is null
     * @throws Exception            if the Pig server, the Accumulo connection or
     *                              any DAO fails to initialize
     */
    public void init() throws Exception {
        Preconditions.checkNotNull(sparqlToPigTransformVisitor, "Sparql To Pig Transform Visitor must not be null");
        logger.info("Initializing Sparql Query Pig Engine");
        if (hadoopDir != null) {
            //set hadoop dir property
            System.setProperty("HADOOPDIR", hadoopDir);
        }
        //TODO: Maybe have validation of the HadoopDir system property

        if (pigServer == null) {
            pigServer = new PigServer(execType);
        }

        if (!inference && !stats) {
            // Nothing below is needed without inference or stats.
            return;
        }

        String instance = sparqlToPigTransformVisitor.getInstance();
        String zoo = sparqlToPigTransformVisitor.getZk();
        String user = sparqlToPigTransformVisitor.getUser();
        String pass = sparqlToPigTransformVisitor.getPassword();
        Preconditions.checkNotNull(pass, "Accumulo password must not be null");

        // Encode the password explicitly instead of relying on the platform default
        // charset, so the bytes sent do not vary from machine to machine.
        Connector connector = new ZooKeeperInstance(instance, zoo)
                .getConnector(user, pass.getBytes(StandardCharsets.UTF_8));

        String tablePrefix = sparqlToPigTransformVisitor.getTablePrefix();
        conf.setTablePrefix(tablePrefix);
        if (inference) {
            logger.info("Using inference");
            inferenceEngine = new InferenceEngine();
            ryaDAO = new AccumuloRyaDAO();
            ryaDAO.setConf(conf);
            ryaDAO.setConnector(connector);
            ryaDAO.init();

            inferenceEngine.setRyaDAO(ryaDAO);
            inferenceEngine.setConf(conf);
            inferenceEngine.setSchedule(false);
            inferenceEngine.init();
        }
        if (stats) {
            logger.info("Using stats");
            rdfEvalStatsDAO = new AccumuloRdfEvalStatsDAO();
            rdfEvalStatsDAO.setConf(conf);
            rdfEvalStatsDAO.setConnector(connector);
            //                rdfEvalStatsDAO.setEvalTable(tablePrefix + RdfCloudTripleStoreConstants.TBL_EVAL_SUFFIX);
            rdfEvalStatsDAO.init();
            rdfCloudTripleStoreEvaluationStatistics = new RdfCloudTripleStoreEvaluationStatistics(conf,
                    rdfEvalStatsDAO);
        }
    }

    /**
     * Releases everything {@link #init()} created.
     *
     * <p>Every component is shut down even if an earlier one fails, and
     * components that were never created are skipped, so this method is safe to
     * call after a failed or skipped {@link #init()}.
     *
     * @throws Exception the first failure encountered while shutting down; later
     *                   failures are logged at warn level
     */
    public void destroy() throws Exception {
        logger.info("Shutting down Sparql Query Pig Engine");
        Exception firstFailure = null;

        if (pigServer != null) {
            try {
                pigServer.shutdown();
            } catch (Exception e) {
                firstFailure = recordFailure(firstFailure, e);
            }
        }
        if (ryaDAO != null) {
            try {
                ryaDAO.destroy();
            } catch (Exception e) {
                firstFailure = recordFailure(firstFailure, e);
            }
        }
        if (inferenceEngine != null) {
            try {
                inferenceEngine.destroy();
            } catch (Exception e) {
                firstFailure = recordFailure(firstFailure, e);
            }
        }
        if (rdfEvalStatsDAO != null) {
            try {
                rdfEvalStatsDAO.destroy();
            } catch (Exception e) {
                firstFailure = recordFailure(firstFailure, e);
            }
        }

        if (firstFailure != null) {
            throw firstFailure;
        }
    }

    /** Keeps the first shutdown failure for rethrowing and logs any subsequent one. */
    private static Exception recordFailure(Exception first, Exception current) {
        if (first == null) {
            return current;
        }
        logger.warn("Additional failure while shutting down Sparql Query Pig Engine", current);
        return first;
    }

    /**
     * Transform a sparql query into a pig script and execute it. Save results in hdfsSaveLocation.
     * Any existing file at {@code hdfsSaveLocation} is deleted first.
     *
     * @param sparql           to execute
     * @param hdfsSaveLocation to save the execution
     * @throws NullPointerException  if either argument is null
     * @throws IllegalStateException if the engine has no Pig server, i.e. {@link #init()} was not called
     * @throws java.io.IOException   if deleting the old output fails, or wrapping any failure
     *                               while generating, registering or storing the script
     */
    public void runQuery(String sparql, String hdfsSaveLocation) throws IOException {
        Preconditions.checkNotNull(sparql, "Sparql query cannot be null");
        Preconditions.checkNotNull(hdfsSaveLocation, "Hdfs save location cannot be null");
        Preconditions.checkState(pigServer != null, "Pig server is not available; call init() before runQuery()");
        logger.info("Running query[" + sparql + "]\n to Location[" + hdfsSaveLocation + "]");
        pigServer.deleteFile(hdfsSaveLocation);
        try {
            String pigScript = generatePigScript(sparql);
            if (logger.isDebugEnabled()) {
                logger.debug("Pig script [" + pigScript + "]");
            }
            // Platform-default charset kept on purpose: how PigServer decodes this
            // stream is not visible here - NOTE(review): confirm before changing.
            pigServer.registerScript(new ByteArrayInputStream(pigScript.getBytes()));
            pigServer.store(RESULT_ALIAS, hdfsSaveLocation);
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    /**
     * Parses the SPARQL query, applies the enabled rewrites (inference visitors,
     * then statistics-based join optimization) and returns the Pig script
     * produced by the configured transform visitor.
     *
     * @param sparql query text to translate
     * @return the Pig script reported by the transform visitor
     * @throws NullPointerException if {@code sparql} is null or no transform visitor has been set
     * @throws Exception            if parsing, rewriting or translation fails
     */
    public String generatePigScript(String sparql) throws Exception {
        Preconditions.checkNotNull(sparql, "Sparql query cannot be null");
        Preconditions.checkNotNull(sparqlToPigTransformVisitor, "Sparql To Pig Transform Visitor must not be null");
        QueryParser parser = new SPARQLParser();
        ParsedQuery parsedQuery = parser.parseQuery(sparql, null);
        QueryRoot tupleExpr = new QueryRoot(parsedQuery.getTupleExpr());

        //        SimilarVarJoinOptimizer similarVarJoinOptimizer = new SimilarVarJoinOptimizer();
        //        similarVarJoinOptimizer.optimize(tupleExpr, null, null);

        if (inference) {
            tupleExpr.visit(new TransitivePropertyVisitor(conf, inferenceEngine));
            tupleExpr.visit(new SymmetricPropertyVisitor(conf, inferenceEngine));
            tupleExpr.visit(new InverseOfVisitor(conf, inferenceEngine));
        }
        if (stats) {
            (new QueryJoinOptimizer(rdfCloudTripleStoreEvaluationStatistics)).optimize(tupleExpr, null, null);
        }

        sparqlToPigTransformVisitor.meet(tupleExpr);
        return sparqlToPigTransformVisitor.getPigScript();
    }

    /**
     * Command-line entry point: reads a SPARQL query from a file and runs it
     * with inference and stats disabled.
     *
     * <p>Arguments, in order: sparqlFile, hdfsSaveLocation, instance, zookeepers,
     * user, password, rdfTablePrefix. Failures are printed to standard error.
     *
     * @param args the seven command-line arguments described above
     */
    public static void main(String[] args) {
        try {
            Preconditions.checkArgument(args.length == 7,
                    "Usage: java -cp <jar>:$PIG_LIB <class> sparqlFile hdfsSaveLocation cbinstance cbzk cbuser cbpassword rdfTablePrefix.\n "
                            + "Sample command: java -cp java -cp cloudbase.pig-2.0.0-SNAPSHOT-shaded.jar:/usr/local/hadoop-etc/hadoop-0.20.2/hadoop-0.20.2-core.jar:/srv_old/hdfs-tmp/pig/pig-0.9.2/pig-0.9.2.jar:$HADOOP_HOME/conf mvm.rya.accumulo.pig.SparqlQueryPigEngine "
                            + "tstSpqrl.query temp/engineTest stratus stratus13:2181 root password l_");

            // Close the query file once it is read, and decode it as UTF-8 rather
            // than with whatever the platform default charset happens to be.
            String sparql;
            try (FileInputStream queryFile = new FileInputStream(args[0])) {
                sparql = new String(ByteStreams.toByteArray(queryFile), StandardCharsets.UTF_8);
            }
            String hdfsSaveLocation = args[1];

            SparqlToPigTransformVisitor visitor = new SparqlToPigTransformVisitor();
            visitor.setTablePrefix(args[6]);
            visitor.setInstance(args[2]);
            visitor.setZk(args[3]);
            visitor.setUser(args[4]);
            visitor.setPassword(args[5]);

            SparqlQueryPigEngine engine = new SparqlQueryPigEngine();
            engine.setSparqlToPigTransformVisitor(visitor);
            engine.setInference(false);
            engine.setStats(false);

            try {
                engine.init();
                engine.runQuery(sparql, hdfsSaveLocation);
            } finally {
                // Always release the Pig server, even when init or the query fails.
                engine.destroy();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public String getHadoopDir() {
        return hadoopDir;
    }

    public void setHadoopDir(String hadoopDir) {
        this.hadoopDir = hadoopDir;
    }

    public PigServer getPigServer() {
        return pigServer;
    }

    public void setPigServer(PigServer pigServer) {
        this.pigServer = pigServer;
    }

    public ExecType getExecType() {
        return execType;
    }

    public void setExecType(ExecType execType) {
        this.execType = execType;
    }

    public boolean isInference() {
        return inference;
    }

    public void setInference(boolean inference) {
        this.inference = inference;
    }

    public boolean isStats() {
        return stats;
    }

    public void setStats(boolean stats) {
        this.stats = stats;
    }

    public SparqlToPigTransformVisitor getSparqlToPigTransformVisitor() {
        return sparqlToPigTransformVisitor;
    }

    public void setSparqlToPigTransformVisitor(SparqlToPigTransformVisitor sparqlToPigTransformVisitor) {
        this.sparqlToPigTransformVisitor = sparqlToPigTransformVisitor;
    }
}