org.apache.pig.scripting.Pig.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.pig.scripting.Pig.java:

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.scripting;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FsShell;
import org.apache.pig.PigServer;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.impl.PigContext;
import org.apache.pig.tools.grunt.GruntParser;

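/**
 * The class used in scripts to interact with Pig.  Its static methods run
 * filesystem and sql commands, register jars and script UDFs, and compile
 * Pig Latin into <code>Pig</code> objects, which can then be bound to
 * parameter values.
 */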
public class Pig {

    // Logger shared by all static helpers of this class.
    private static final Log LOG = LogFactory.getLog(Pig.class);

    // "DEFINE <alias> <definition>;" clauses collected by addDefineClause().
    // Static, so the clauses are shared by every pipeline compiled afterwards;
    // nothing in this class ever removes an entry.
    private static List<String> defineCache = new ArrayList<String>();

    // "REGISTER '<path>' USING <lang> [AS <namespace>];" clauses collected by
    // addRegisterScriptUDFClause().  Static and append-only, like defineCache.
    private static List<String> scriptUDFCache = new ArrayList<String>();

    /**
     * Run a filesystem command.  Any output from this command is written to
     * stdout or stderr as appropriate.
     * @param cmd Filesystem command to run along with its arguments as one
     * string.  Leading and trailing whitespace is ignored, arguments are
     * separated by whitespace, and a "-" is prepended to the command name
     * when it does not already start with one.
     * @return the value returned by <code>FsShell.run</code>, or -1 if
     * <code>cmd</code> is null.
     * @throws IOException if the script context is not set or if running the
     * command raises an exception.
     */
    public static int fs(String cmd) throws IOException {
        ScriptPigContext ctx = getScriptContext();
        FsShell shell = new FsShell(ConfigurationUtil.toConfiguration(ctx.getPigContext().getProperties()));
        if (cmd == null) {
            return -1;
        }
        // Trim before splitting: with leading whitespace, split() yields an
        // empty first token and the command name would become a bare "-".
        String[] cmdTokens = cmd.trim().split("\\s+");
        if (!cmdTokens[0].startsWith("-")) {
            cmdTokens[0] = "-" + cmdTokens[0];
        }
        try {
            return shell.run(cmdTokens);
        } catch (Exception e) {
            // Keep the original failure as the cause so callers can diagnose it.
            throw new IOException("Run filesystem command failed", e);
        }
    }

    /**
     * Run a sql command.  Any output from this command is written to
     * stdout or stderr as appropriate.
     * @param cmd sql command to run along with its arguments as one
     * string. Currently only hcat is supported as a sql backend
     * @return the value returned by <code>GruntParser.runSQLCommand</code>.
     * @throws IOException if the script context is not set, if the
     * <code>pig.sql.type</code> property is missing or is not "hcat", if the
     * <code>hcat.bin</code> property is not defined, or if the file it names
     * does not exist.
     */
    public static int sql(String cmd) throws IOException {
        PigContext pigContext = getScriptContext().getPigContext();

        // Constant-first equals: an unset pig.sql.type is reported as an
        // unsupported backend instead of failing with a NullPointerException.
        String sqlType = pigContext.getProperties().getProperty("pig.sql.type");
        if (!"hcat".equals(sqlType)) {
            throw new IOException("sql command only support hcat currently");
        }

        String hcatBin = pigContext.getProperties().getProperty("hcat.bin");
        if (hcatBin == null) {
            throw new IOException(
                    "hcat.bin is not defined. Define it to be your hcat script (Usually $HCAT_HOME/bin/hcat)");
        }

        // Check the configured path itself (not a file literally named
        // "hcat.bin") and fail when it is missing.
        if (!new File(hcatBin).exists()) {
            throw new IOException(
                    hcatBin + " does not exist. Please check your 'hcat.bin' setting in pig.properties.");
        }

        return GruntParser.runSQLCommand(hcatBin, cmd, false);
    }

    /**
     * Register a jar for use in Pig.  The jar is registered through a
     * <code>PigServer</code> created on the script's Pig context, so it is
     * available to <b>all subsequent</b> Pig pipelines in this script.
     * To register a jar for a single pipeline only, use register inside
     * that pipeline's Pig Latin instead.
     * @param jarfile Path of jar to include.
     * @throws IOException if the script context is not set or the jar
     * cannot be registered.
     */
    public static void registerJar(String jarfile) throws IOException {
        LOG.info("Register jar: " + jarfile);
        PigContext pigContext = getScriptContext().getPigContext();
        new PigServer(pigContext, false).registerJar(jarfile);
    }

    /**
     * Register scripting UDFs for use in Pig.  The functions in the file are
     * registered with the script engine, and a matching REGISTER clause is
     * recorded so that they are available to <b>all subsequent</b> Pig
     * pipelines in this script.  To register UDFs for a single pipeline
     * only, use register inside that pipeline's Pig Latin instead.
     * @param udffile Path of the script UDF file
     * @param namespace namespace of the UDFs; null and the empty string
     * both mean "no namespace"
     * @throws IOException if the script context is not set or registration
     * fails.
     */
    public static void registerUDF(String udffile, String namespace) throws IOException {
        LOG.info("Register script UDF file: " + udffile);
        ScriptPigContext scriptCtx = getScriptContext();
        ScriptEngine scriptEngine = scriptCtx.getScriptEngine();
        // The script file contains only functions, so there is no need to
        // separate functions from control flow code.
        // An empty namespace is normalized to null before it is passed on.
        String effectiveNamespace = (namespace == null || namespace.isEmpty()) ? null : namespace;
        scriptEngine.registerFunctions(udffile, effectiveNamespace, scriptCtx.getPigContext());
        addRegisterScriptUDFClause(udffile, effectiveNamespace);
    }

    /**
     * Define an alias for a UDF or a streaming command.  The definition is
     * recorded as a DEFINE clause and is therefore present in <b>all
     * subsequent</b> Pig pipelines defined in this script.  To define an
     * alias for a single pipeline only, use define inside that pipeline's
     * Pig Latin instead.
     * @param alias name of the defined alias
     * @param definition string this alias is defined as
     * @throws IOException declared by the signature; the code visible here
     * does not throw it.
     */
    public static void define(String alias, String definition) throws IOException {
        String logLine = "Add define clause: " + alias + " -- " + definition;
        LOG.info(logLine);
        addDefineClause(alias, definition);
    }

    /**
     * Set a variable for use in Pig Latin.  The value is stored as a
     * property of the Pig context, so it is present for <b>all
     * subsequent</b> Pig pipelines defined in this script.  To set it for a
     * single pipeline only, use set inside that pipeline's Pig Latin instead.
     * @param var variable to set
     * @param value to set it to
     * @throws IOException if the script context is not set.
     */
    public static void set(String var, String value) throws IOException {
        PigContext scriptPigContext = getScriptContext().getPigContext();
        // The property is written through a PigServer built on that context.
        PigServer server = new PigServer(scriptPigContext, false);
        server.getPigContext().getProperties().setProperty(var, value);
    }

    /**
     * Define an unnamed Pig pipeline.  Equivalent to calling
     * {@link #compile(String, String)} with a null name.
     * @param pl Pig Latin definition of the pipeline.
     * @return Pig object representing this pipeline.
     * @throws IOException if the script context is not set.
     */
    public static Pig compile(String pl) throws IOException {
        final String noName = null;
        return compile(noName, pl);
    }

    /**
     * Define a named portion of a Pig pipeline.  This allows it
     * to be imported into another pipeline.  The resulting script text is
     * the recorded REGISTER clauses, then the recorded DEFINE clauses, then
     * the given Pig Latin followed by a newline.
     * @param name Name that will be used to define this pipeline.
     * The namespace is global.
     * @param pl Pig Latin definition of the pipeline.
     * @return Pig object representing this pipeline.
     * @throws IOException if the script context is not set.
     */
    public static Pig compile(String name, String pl) throws IOException {
        ScriptPigContext scriptCtx = getScriptContext();
        String fullScript = getRegisterScriptUDFClauses() + getDefineClauses() + pl + "\n";
        return new Pig(fullScript, scriptCtx, name);
    }

    /**
     * Define an unnamed Pig pipeline based on Pig Latin in a separate file.
     * Equivalent to calling {@link #compileFromFile(String, String)} with a
     * null name.
     * @param filename File to read Pig Latin from.  This must be a purely
     * Pig Latin file.  It cannot contain host language constructs in it.
     * @return Pig object representing this pipeline.
     * @throws IOException if the file cannot be read or the script context
     * is not set.
     */
    public static Pig compileFromFile(String filename) throws IOException {
        final String noName = null;
        return compileFromFile(noName, filename);
    }

    /**
     * Define a named Pig pipeline based on Pig Latin in a separate file.
     * This allows it to be imported into another pipeline.  The file is
     * read completely and its text is handed to
     * {@link #compile(String, String)}.
     * @param name Name that will be used to define this pipeline.
     * The namespace is global.
     * @param filename File to read Pig Latin from.  This must be a purely
     * Pig Latin file.  It cannot contain host language constructs in it.
     * @return Pig object representing this pipeline.
     * @throws IOException if the file cannot be read or the script context
     * is not set.
     */
    public static Pig compileFromFile(String name, String filename) throws IOException {
        String pigLatin = getScriptFromFile(filename);
        return compile(name, pigLatin);
    }

    //-------------------------------------------------------------------------

    /**
     * Bind this pipeline to a set of variables.  Values must be provided
     * for all Pig Latin parameters.
     * @param vars map of variables to bind.  Keys should be parameters defined
     * in the Pig Latin.  Values provide the text substituted for those
     * parameters (each value's <code>toString()</code> is used).  They can be
     * either constants or variables from the host language.
     * @return a {@link BoundScript} object holding the substituted script
     * @throws IOException if parameter substitution fails or the script
     * context is not set.
     */
    public BoundScript bind(Map<String, Object> vars) throws IOException {
        String boundScript = replaceParameters(script, vars);
        return new BoundScript(boundScript, scriptContext, name);
    }

    /**
     * Bind this pipeline to multiple sets of variables.  One substituted
     * script is produced per map, in list order, and all of them are handed
     * to a single {@link BoundScript}.
     * NOTE(review): the earlier documentation says the scripts are executed
     * in parallel; that is decided by BoundScript, which is not visible here.
     * @param vars list of maps of variables to bind.  Keys should be parameters defined
     * in the Pig Latin.  Values provide the text substituted for those
     * parameters.  They can be either constants or variables from the host
     * language.
     * @return a {@link BoundScript} object
     * @throws IOException if parameter substitution fails for any of the
     * maps or the script context is not set.
     */
    public BoundScript bind(List<Map<String, Object>> vars) throws IOException {
        List<String> boundScripts = new ArrayList<String>();
        for (Map<String, Object> bindings : vars) {
            String substituted = replaceParameters(script, bindings);
            boundScripts.add(substituted);
        }
        return new BoundScript(boundScripts, scriptContext, name);
    }

    /**
     * Bind a Pig object to variables in the host language (optional
     * operation).  This does an implicit mapping of variables in the host
     * language to parameters in Pig Latin.  For example, if the user
     * provides a Pig Latin statement
     * <tt> p = Pig.compile("A = load '$input';");</tt>
     * and then calls this function, the script engine is asked for the host
     * language variables and <tt>input</tt> is resolved from them.  Which
     * variables are visible is up to the script engine.  This method is
     * optional because not all host languages may support searching for
     * in-scope variables.
     * <p>
     * If the script contains no <tt>$</tt> character at all, no substitution
     * is attempted and the script is bound unchanged.
     * @return a {@link BoundScript} object
     * @throws IOException if parameter substitution fails or the script
     * context is not set.
     */
    public BoundScript bind() throws IOException {
        ScriptEngine scriptEngine = scriptContext.getScriptEngine();
        boolean hasParameters = script.indexOf('$') >= 0;
        if (!hasParameters) {
            // Nothing to substitute: bind the script text as it is.
            return new BoundScript(script, scriptContext, name);
        }
        Map<String, Object> hostVariables = scriptEngine.getParamsFromVariables();
        return bind(hostVariables);
    }

    //-------------------------------------------------------------------------

    /** Pig Latin text of this pipeline, with parameters not yet substituted. */
    private String script;

    /** Script context this pipeline was created in. */
    private ScriptPigContext scriptContext;

    /** Name of this pipeline; null when it was compiled without a name. */
    private String name;

    /**
     * Creates a pipeline object; it only stores its arguments.
     * @param script Pig Latin text of the pipeline
     * @param scriptContext script context the pipeline belongs to
     * @param name name of the pipeline, may be null
     */
    protected Pig(String script, ScriptPigContext scriptContext, String name) {
        this.name = name;
        this.scriptContext = scriptContext;
        this.script = script;
    }

    /**
     * Replaces each {@code $<identifier>} in the script with its actual value.
     * The given variables are turned into <code>key=value</code> parameters
     * (with unescaped <code>$</code> in values escaped first), the parameters
     * already present in the Pig context are appended, and the Pig context
     * performs the substitution.
     * @param qstr the pig script to rewrite
     * @param vars parameters and their values
     * @return the modified version
     * @throws IOException if a parameter has a null value, if the script
     * context is not set, or if the substitution itself fails.
     */
    private String replaceParameters(String qstr, Map<String, Object> vars) throws IOException {

        List<String> params = new ArrayList<String>();
        for (Entry<String, Object> entry : vars.entrySet()) {
            Object value = entry.getValue();
            if (value == null) {
                // Report a usable error instead of a bare NullPointerException.
                throw new IOException("No value provided for parameter '" + entry.getKey() + "'");
            }
            params.add(entry.getKey() + "=" + fixNonEscapedDollarSign(value.toString()));
        }

        PigContext context = getScriptContext().getPigContext();
        List<String> contextParams = context.getParams();
        if (contextParams != null) {
            params.addAll(contextParams);
        }

        BufferedReader reader = new BufferedReader(new StringReader(qstr));
        try {
            return context.doParamSubstitution(reader, params, context.getParamFiles());
        } finally {
            // Restore the params that were originally in the PigContext, even
            // when substitution fails (presumably doParamSubstitution replaces
            // them with the combined list - confirm against PigContext).
            context.setParams(contextParams);
        }
    }

    /**
     * Escapes every <code>$</code> that is not already preceded by a
     * backslash, so that parameter substitution can be used to perform the
     * bind operation (parameter substitution un-escapes <code>$</code>).
     * Two backslash characters are inserted in front of each such
     * <code>$</code>; a <code>$</code> directly preceded by a backslash is
     * left as it is.
     * @param s the value to escape
     * @return the escaped value, or <code>s</code> itself when it contains
     * no <code>$</code>
     */
    private static String fixNonEscapedDollarSign(String s) {
        if (s.indexOf('$') < 0) {
            return s;
        }

        StringBuilder escaped = new StringBuilder(s.length() + 8);
        for (int pos = 0; pos < s.length(); pos++) {
            char current = s.charAt(pos);
            boolean precededByBackslash = pos > 0 && s.charAt(pos - 1) == '\\';
            if (current == '$' && !precededByBackslash) {
                escaped.append("\\\\");
            }
            escaped.append(current);
        }
        return escaped.toString();
    }

    //-------------------------------------------------------------------------

    /**
     * Reads a whole file line by line and returns its text, with every
     * line (including the last one) terminated by a single "\n".
     * @param filename path of the file to read
     * @return the content of the file
     * @throws IOException if the file cannot be opened or read
     */
    private static String getScriptFromFile(String filename) throws IOException {
        LineNumberReader reader = new LineNumberReader(new FileReader(filename));
        StringBuilder contents = new StringBuilder();
        try {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                contents.append(line).append("\n");
            }
        } finally {
            // Always release the file handle, even when reading fails.
            reader.close();
        }
        return contents.toString();
    }

    /**
     * Records a DEFINE clause of the form
     * <code>DEFINE alias definition;</code> followed by a newline.
     * @param alias name of the defined alias
     * @param definition string this alias is defined as
     */
    private static void addDefineClause(String alias, String definition) {
        StringBuilder clause = new StringBuilder("DEFINE ");
        clause.append(alias).append(" ").append(definition).append(";\n");
        defineCache.add(clause.toString());
    }

    /**
     * Records a REGISTER clause for a script UDF file.  The clause names the
     * scripting language reported by the current script engine, and carries
     * an AS part only when a non-empty namespace is given.
     * @param path path of the script UDF file
     * @param namespace namespace of the UDFs, may be null or empty
     * @throws IOException if the script context is not set
     */
    private static void addRegisterScriptUDFClause(String path, String namespace) throws IOException {
        ScriptEngine scriptEngine = getScriptContext().getScriptEngine();
        boolean hasNamespace = namespace != null && !namespace.isEmpty();
        String asPart = hasNamespace ? " AS " + namespace : "";
        scriptUDFCache.add("REGISTER '" + path + "' USING " + scriptEngine.getScriptingLang() + asPart + ";\n");
    }

    /**
     * Concatenates all recorded DEFINE clauses in the order they were added.
     * @return the clauses as one string, empty when none were recorded
     */
    private static String getDefineClauses() {
        StringBuilder clauses = new StringBuilder();
        for (int i = 0; i < defineCache.size(); i++) {
            clauses.append(defineCache.get(i));
        }
        return clauses.toString();
    }

    /**
     * Concatenates all recorded script UDF REGISTER clauses in the order
     * they were added.
     * @return the clauses as one string, empty when none were recorded
     */
    private static String getRegisterScriptUDFClauses() {
        StringBuilder clauses = new StringBuilder();
        for (int i = 0; i < scriptUDFCache.size(); i++) {
            clauses.append(scriptUDFCache.get(i));
        }
        return clauses.toString();
    }

    /**
     * Returns the current script context.
     * @return the context supplied by <code>ScriptPigContext.get()</code>,
     * never null
     * @throws IOException if no script context is set
     */
    private static ScriptPigContext getScriptContext() throws IOException {
        ScriptPigContext current = ScriptPigContext.get();
        if (current != null) {
            return current;
        }
        throw new IOException("Script context is not set");
    }

}