org.apache.pig.PigServer.java Source code

Introduction

Here is the source code for org.apache.pig.PigServer.java
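
Before the full listing, here is a minimal usage sketch based on the class Javadoc below: create a PigServer, register queries, read results with openIterator() or persist them with store(), then call shutdown(). The local execution mode, the input path, and the query text are illustrative assumptions rather than anything taken from the class itself.

import java.util.Iterator;

import org.apache.pig.PigServer;
import org.apache.pig.data.Tuple;

public class PigServerUsageSketch {
    public static void main(String[] args) throws Exception {
        // "local" runs with Hadoop's local job runner; "mapreduce" would submit to a cluster.
        PigServer server = new PigServer("local");
        try {
            // Queries are parsed when registered but only executed when results are requested.
            server.registerQuery("A = load 'input.txt' as (f1:chararray, f2:int);");
            server.registerQuery("B = filter A by f2 > 0;");

            // openIterator() runs the plan up to and including alias B and streams back its tuples.
            Iterator<Tuple> it = server.openIterator("B");
            while (it.hasNext()) {
                System.out.println(it.next());
            }

            // Alternatively, persist B with the default PigStorage store function:
            // server.store("B", "output_dir");
        } finally {
            server.shutdown(); // deletes temporary files created during the session
        }
    }
}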

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.StringReader;
import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Deque;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.pig.backend.datastorage.ContainerDescriptor;
import org.apache.pig.backend.datastorage.DataStorage;
import org.apache.pig.backend.datastorage.ElementDescriptor;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.executionengine.ExecJob;
import org.apache.pig.backend.executionengine.ExecJob.JOB_STATUS;
import org.apache.pig.backend.hadoop.PigATSClient;
import org.apache.pig.backend.hadoop.executionengine.HJob;
import org.apache.pig.builtin.PigStorage;
import org.apache.pig.classification.InterfaceAudience;
import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.FileLocalizer;
import org.apache.pig.impl.io.FileLocalizer.FetchFileRet;
import org.apache.pig.impl.io.compress.BZip2CodecWithExtensionBZ;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.streaming.StreamingCommand;
import org.apache.pig.impl.util.LogUtils;
import org.apache.pig.impl.util.PropertiesUtil;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.UriUtil;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.newplan.DependencyOrderWalker;
import org.apache.pig.newplan.Operator;
import org.apache.pig.newplan.logical.Util;
import org.apache.pig.newplan.logical.expression.LogicalExpressionPlan;
import org.apache.pig.newplan.logical.expression.LogicalExpressionVisitor;
import org.apache.pig.newplan.logical.expression.ScalarExpression;
import org.apache.pig.newplan.logical.optimizer.AllExpressionVisitor;
import org.apache.pig.newplan.logical.relational.LOForEach;
import org.apache.pig.newplan.logical.relational.LOLoad;
import org.apache.pig.newplan.logical.relational.LOStore;
import org.apache.pig.newplan.logical.relational.LogicalPlan;
import org.apache.pig.newplan.logical.relational.LogicalPlanData;
import org.apache.pig.newplan.logical.relational.LogicalRelationalOperator;
import org.apache.pig.newplan.logical.relational.LogicalSchema;
import org.apache.pig.parser.QueryParserDriver;
import org.apache.pig.parser.QueryParserUtils;
import org.apache.pig.pen.ExampleGenerator;
import org.apache.pig.scripting.ScriptEngine;
import org.apache.pig.tools.grunt.GruntParser;
import org.apache.pig.tools.pigstats.EmptyPigStats;
import org.apache.pig.tools.pigstats.JobStats;
import org.apache.pig.tools.pigstats.OutputStats;
import org.apache.pig.tools.pigstats.PigStats;
import org.apache.pig.tools.pigstats.PigStats.JobGraph;
import org.apache.pig.tools.pigstats.ScriptState;
import org.apache.pig.validator.BlackAndWhitelistFilter;
import org.apache.pig.validator.PigCommandFilter;

import com.google.common.annotations.VisibleForTesting;

/**
 *
 * A class for Java programs to connect to Pig. Typically a program will create a PigServer
 * instance. The programmer then registers queries using registerQuery() and
 * retrieves results using openIterator() or store(). After doing so, the
 * shutdown() method should be called to free any resources used by the current
 * PigServer instance. Not doing so could result in a memory leak.
 *
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class PigServer {

    protected final Log log = LogFactory.getLog(getClass());

    public static final String PRETTY_PRINT_SCHEMA_PROPERTY = "pig.pretty.print.schema";
    private static final String PIG_LOCATION_CHECK_STRICT = "pig.location.check.strict";

    /*
     * The data structure to support grunt shell operations.
     * The grunt shell can only work on one graph at a time.
     * If a script is contained inside another script, the grunt
     * shell first saves the current graph on the stack and works
     * on a new graph. After the nested script is done, the grunt
     * shell pops the saved graph off the stack and continues working on it.
     */
    protected final Deque<Graph> graphs = new LinkedList<Graph>();

    /*
     * The current Graph the grunt shell is working on.
     */
    private Graph currDAG;

    protected final PigContext pigContext;

    private String jobName;

    private String jobPriority;

    private final static AtomicInteger scopeCounter = new AtomicInteger(0);

    protected final String scope = constructScope();

    private boolean validateEachStatement = false;
    private boolean skipParseInRegisterForBatch = false;

    private final BlackAndWhitelistFilter filter;

    private String constructScope() {
        // scope serves for now as a session id

        // String user = System.getProperty("user.name", "DEFAULT_USER_ID");
        // String date = (new Date()).toString();

        // scope is not really used in the system right now. It will
        // however make your explain statements look lengthy if set to
        // username-date. For now let's simplify the scope, if a real
        // scope is needed again, we might need to update all the
        // operators to not include scope in their name().
        return "" + scopeCounter.incrementAndGet();
    }

    @VisibleForTesting
    public static void resetScope() {
        scopeCounter.set(0);
    }

    /**
     * @param execTypeString can be 'mapreduce' or 'local'.  Local mode will
     * use Hadoop's local job runner to execute the job on the local machine.
     * Mapreduce mode will connect to a cluster to execute the job. If
     * execTypeString is not one of these two, Pig will deduce the ExecutionEngine
     * if it is on the classpath and use it for the backend execution.
     * @throws ExecException
     * @throws IOException
     */
    public PigServer(String execTypeString) throws ExecException, IOException {
        this(addExecTypeProperty(PropertiesUtil.loadDefaultProperties(), execTypeString));
    }

    public PigServer(String execTypeString, Properties properties) throws ExecException, IOException {
        this(addExecTypeProperty(properties, execTypeString));
    }

    public PigServer(Properties properties) throws ExecException, IOException {
        this(new PigContext(properties));
    }

    private static Properties addExecTypeProperty(Properties properties, String execType) {
        properties.setProperty("exectype", execType);
        return properties;
    }

    /**
     * @param execType execution type to start the engine.  Local mode will
     * use Hadoop's local job runner to execute the job on the local machine.
     * Mapreduce mode will connect to a cluster to execute the job.
     * @throws ExecException
     */
    public PigServer(ExecType execType) throws ExecException {
        this(execType, PropertiesUtil.loadDefaultProperties());
    }

    public PigServer(ExecType execType, Properties properties) throws ExecException {
        this(new PigContext(execType, properties));
    }

    public PigServer(ExecType execType, Configuration conf) throws ExecException {
        this(new PigContext(execType, conf));
    }

    public PigServer(PigContext context) throws ExecException {
        this(context, true);
    }

    public PigServer(PigContext context, boolean connect) throws ExecException {
        this.pigContext = context;
        currDAG = new Graph(false);

        jobName = pigContext.getProperties().getProperty(PigContext.JOB_NAME,
                PigContext.JOB_NAME_PREFIX + ":DefaultJobName");

        if (connect) {
            pigContext.connect();
        }

        this.filter = new BlackAndWhitelistFilter(this);

        addHadoopProperties();
        addJarsFromProperties();
        markPredeployedJarsFromProperties();

        if (ScriptState.get() == null) {
            // If Pig was started via command line, ScriptState should have been
            // already initialized in Main. If so, we should not overwrite it.
            ScriptState.start(pigContext.getExecutionEngine().instantiateScriptState());
        }
        PigStats.start(pigContext.getExecutionEngine().instantiatePigStats());

        // log ATS event includes the caller context
        String auditId = PigATSClient.getPigAuditId(pigContext);
        String callerId = (String) pigContext.getProperties().get(PigConfiguration.CALLER_ID);
        log.info("Pig Script ID for the session: " + auditId);
        if (callerId != null) {
            log.info("Caller ID for session: " + callerId);
        }
        if (Boolean.parseBoolean(pigContext.getProperties().getProperty(PigConfiguration.ENABLE_ATS))) {
            if (Boolean.parseBoolean(
                    pigContext.getProperties().getProperty("yarn.timeline-service.enabled", "false"))) {
                PigATSClient.ATSEvent event = new PigATSClient.ATSEvent(auditId, callerId);
                try {
                    PigATSClient.getInstance().logEvent(event);
                } catch (Exception e) {
                    log.warn("Error posting to ATS: ", e);
                }
            } else {
                log.warn("ATS is disabled since" + " yarn.timeline-service.enabled set to false");
            }

        }

        // set hdfs caller context
        Class callerContextClass = null;
        try {
            callerContextClass = Class.forName("org.apache.hadoop.ipc.CallerContext");
        } catch (ClassNotFoundException e) {
            // If pre-Hadoop 2.8.0, skip setting CallerContext
        }
        if (callerContextClass != null) {
            try {
                // Use reflection for the following code, since it is only available in Hadoop 2.8.0+:
                // CallerContext hdfsContext = new CallerContext.Builder(auditId).build();
                // CallerContext.setCurrent(hdfsContext);
                Class callerContextBuilderClass = Class.forName("org.apache.hadoop.ipc.CallerContext$Builder");
                Constructor callerContextBuilderConstruct = callerContextBuilderClass.getConstructor(String.class);
                Object builder = callerContextBuilderConstruct.newInstance(auditId);
                Method builderBuildMethod = builder.getClass().getMethod("build");
                Object hdfsContext = builderBuildMethod.invoke(builder);
                Method callerContextSetCurrentMethod = callerContextClass.getMethod("setCurrent",
                        hdfsContext.getClass());
                callerContextSetCurrentMethod.invoke(callerContextClass, hdfsContext);
            } catch (Exception e) {
                // Shall not happen unless API change in future Hadoop commons
                throw new ExecException(e);
            }
        }
    }

    private void addHadoopProperties() throws ExecException {
        // For BZip input on hadoop 0.23/2.X
        // with PIG_BZIP_USE_HADOOP_INPUTFORMAT turned on,
        // PigTextInputFormat depends on hadoop's TextInputFormat
        // for handling bzip2 input. One problem is that it only recognizes 'bz2'
        // as an extension and not 'bz'.
        // Adding custom BZip2 codec that returns 'bz' as extension
        // for backward compatibility.
        String codecs = pigContext.getProperties().getProperty("io.compression.codecs");

        if (codecs != null && codecs.contains(BZip2Codec.class.getCanonicalName())) {
            pigContext.getProperties().setProperty("io.compression.codecs",
                    codecs + "," + BZip2CodecWithExtensionBZ.class.getCanonicalName());
        }
    }

    private void addJarsFromProperties() throws ExecException {
        //add jars from properties to extraJars
        String jar_str = pigContext.getProperties().getProperty("pig.additional.jars");
        if (jar_str == null) {
            jar_str = "";
        }
        jar_str = jar_str.replaceAll(File.pathSeparator, ",");
        if (!jar_str.isEmpty()) {
            jar_str += ",";
        }

        String jar_str_comma = pigContext.getProperties().getProperty("pig.additional.jars.uris");
        if (jar_str_comma != null && !jar_str_comma.isEmpty()) {
            jar_str = jar_str + jar_str_comma;
        }

        if (jar_str != null && !jar_str.isEmpty()) {
            // pig.additional.jars was normalized above (File.pathSeparator replaced
            // with ',') and pig.additional.jars.uris is already comma separated,
            // so split the aggregate list on ','.
            for (String jar : jar_str.split(",")) {
                try {
                    registerJar(jar);
                } catch (IOException e) {
                    int errCode = 4010;
                    String msg = "Failed to register jar :" + jar + ". Caught exception.";
                    throw new ExecException(msg, errCode, PigException.USER_ENVIRONMENT, e);
                }
            }
        }
    }

    private void markPredeployedJarsFromProperties() throws ExecException {
        // mark jars as predeployed from properties
        String jar_str = pigContext.getProperties().getProperty("pig.predeployed.jars");

        if (jar_str != null) {
            // Use File.pathSeparator (":" on Linux, ";" on Windows)
            // to correctly handle path aggregates as they are represented
            // on the Operating System.
            for (String jar : jar_str.split(File.pathSeparator)) {
                if (jar.length() > 0) {
                    pigContext.markJarAsPredeployed(jar);
                }
            }
        }
    }

    public PigContext getPigContext() {
        return pigContext;
    }

    /**
     * Get the current DAG the grunt shell is working on.
     *
     * @return the current {@link Graph}
     */
    public Graph getCurrentDAG() {
        return this.currDAG;
    }

    /**
     * Set the logging level to DEBUG.
     */
    public void debugOn() {
        Logger.getLogger("org.apache.pig").setLevel(Level.DEBUG);
        pigContext.getLog4jProperties().setProperty("log4j.logger.org.apache.pig", Level.DEBUG.toString());
    }

    /**
     * Set the logging level to the default.
     */
    public void debugOff() {
        Logger.getLogger("org.apache.pig").setLevel(pigContext.getDefaultLogLevel());
        pigContext.getLog4jProperties().setProperty("log4j.logger.org.apache.pig",
                pigContext.getDefaultLogLevel().toString());
    }

    /**
     * Set the default parallelism for this job
     * @param p default number of reducers to use for this job.
     */
    public void setDefaultParallel(int p) {
        pigContext.defaultParallel = p;
    }

    /**
     * Starts batch execution mode.
     */
    public void setBatchOn() {
        log.debug("Create a new graph.");

        if (currDAG != null) {
            graphs.push(currDAG);
        }
        currDAG = new Graph(true);
    }

    /**
     * Retrieve the current execution mode.
     *
     * @return true if the execution mode is batch; false otherwise.
     */
    public boolean isBatchOn() {
        // Batch is on when there is at least one saved graph on the
        // stack. That gives the right response even if multiquery was
        // turned off.
        return graphs.size() > 0;
    }

    /**
     * Returns whether there is anything to process in the current batch.
     * @throws FrontendException
     * @return true if there are no stores to process in the current
     * batch, false otherwise.
     */
    public boolean isBatchEmpty() throws FrontendException {
        if (currDAG == null) {
            int errCode = 1083;
            String msg = "setBatchOn() must be called first.";
            throw new FrontendException(msg, errCode, PigException.INPUT);
        }

        return currDAG.isBatchEmpty();
    }

    /**
     * This method parses the script and builds the LogicalPlan. It should be
     * followed by {@link PigServer#executeBatch(boolean)} with the argument set
     * to false. Do not use {@link PigServer#executeBatch()} after calling this
     * method, as that would re-parse and re-build the script.
     *
     * @throws IOException
     */
    public void parseAndBuild() throws IOException {
        if (currDAG == null || !isBatchOn()) {
            int errCode = 1083;
            String msg = "setBatchOn() must be called first.";
            throw new FrontendException(msg, errCode, PigException.INPUT);
        }
        currDAG.parseQuery();
        currDAG.buildPlan(null);
    }

    /**
     * Submits a batch of Pig commands for execution.
     *
     * @return list of jobs being executed
     * @throws IOException
     */
    public List<ExecJob> executeBatch() throws IOException {
        return executeBatch(true);
    }

    /**
     * Submits a batch of Pig commands for execution. If {@link PigServer#parseAndBuild()}
     * has already been called, parsing and building should be skipped by passing
     * false as the argument.
     *
     * @param parseAndBuild whether to parse and build the script before executing it
     * @return list of jobs being executed
     * @throws IOException
     */
    public List<ExecJob> executeBatch(boolean parseAndBuild) throws IOException {
        if (parseAndBuild) {
            parseAndBuild();
        }

        PigStats stats = execute();

        return getJobs(stats);
    }
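
    // --- Illustrative sketch, not part of the original PigServer source. ---
    // Shows how a caller would typically group several stores into one batch so
    // they run as a single multi-query execution; the input and output paths are
    // assumptions made for the example.
    @SuppressWarnings("unused")
    private void exampleBatchUsageSketch() throws IOException {
        setBatchOn();
        registerQuery("A = load 'in1';");
        registerQuery("store A into 'out1';");
        registerQuery("B = load 'in2';");
        registerQuery("store B into 'out2';");
        // Both stores are submitted together; one ExecJob is returned per output.
        List<ExecJob> jobs = executeBatch();
        log.info("Batch produced " + jobs.size() + " job(s)");
    }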

    /**
     * Retrieves a list of Job objects from the PigStats object
     * @param stats
     * @return A list of ExecJob objects
     */
    protected List<ExecJob> getJobs(PigStats stats) {
        LinkedList<ExecJob> jobs = new LinkedList<ExecJob>();
        if (stats instanceof EmptyPigStats) {
            HJob job = new HJob(HJob.JOB_STATUS.COMPLETED, pigContext, stats.result(null).getPOStore(), null);
            jobs.add(job);
            return jobs;
        }
        JobGraph jGraph = stats.getJobGraph();
        Iterator<JobStats> iter = jGraph.iterator();
        while (iter.hasNext()) {
            JobStats js = iter.next();
            for (OutputStats output : js.getOutputs()) {
                if (js.isSuccessful()) {
                    jobs.add(new HJob(HJob.JOB_STATUS.COMPLETED, pigContext, output.getPOStore(), output.getAlias(),
                            stats));
                } else {
                    HJob hjob = new HJob(HJob.JOB_STATUS.FAILED, pigContext, output.getPOStore(), output.getAlias(),
                            stats);
                    hjob.setException(js.getException());
                    jobs.add(hjob);
                }
            }
        }
        return jobs;
    }

    /**
     * Discards a batch of Pig commands.
     *
     * @throws FrontendException
     */
    public void discardBatch() throws FrontendException {
        if (currDAG == null || !isBatchOn()) {
            int errCode = 1083;
            String msg = "setBatchOn() must be called first.";
            throw new FrontendException(msg, errCode, PigException.INPUT);
        }

        currDAG = graphs.pop();
    }

    /**
     * Add a path to be skipped while automatically shipping binaries for
     * streaming.
     *
     * @param path path to be skipped
     */
    public void addPathToSkip(String path) {
        pigContext.addPathToSkip(path);
    }

    /**
     * Defines an alias for the given function spec. This
     * is useful for functions that require arguments to the
     * constructor.
     *
     * @param function - the new function alias to define.
     * @param funcSpec - the FuncSpec object representing the name of
     * the function class and any arguments to constructor.
     */
    public void registerFunction(String function, FuncSpec funcSpec) {
        pigContext.registerFunction(function, funcSpec);
    }
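
    // --- Illustrative sketch, not part of the original PigServer source. ---
    // Defines a script-visible alias for a UDF whose constructor takes an argument;
    // "org.example.MyLowerUDF" and its constructor argument are hypothetical.
    @SuppressWarnings("unused")
    private void exampleRegisterFunctionSketch() {
        registerFunction("myLower", new FuncSpec("org.example.MyLowerUDF('en_US')"));
        // A script can then use it as: B = foreach A generate myLower($0);
    }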

    /**
     * Defines an alias for the given streaming command.
     *
     * @param commandAlias - the new command alias to define
     * @param command - streaming command to be executed
     */
    public void registerStreamingCommand(String commandAlias, StreamingCommand command) {
        pigContext.registerStreamCmd(commandAlias, command);
    }

    private URL locateJarFromResources(String jarName) throws IOException {
        Enumeration<URL> urls = ClassLoader.getSystemResources(jarName);
        URL resourceLocation = null;

        if (urls.hasMoreElements()) {
            resourceLocation = urls.nextElement();
        }

        if (urls.hasMoreElements()) {
            StringBuffer sb = new StringBuffer("Found multiple resources that match ");
            sb.append(jarName);
            sb.append(": ");
            sb.append(resourceLocation);

            while (urls.hasMoreElements()) {
                sb.append(urls.nextElement());
                sb.append("; ");
            }

            log.debug(sb.toString());
        }

        return resourceLocation;
    }

    /**
     * Registers a jar file. Name of the jar file can be an absolute or
     * relative path.
     *
     * If multiple resources are found with the specified name, the
     * first one returned by getSystemResources is registered and a
     * message listing the matches is logged to inform the user.
     *
     * @param name of the jar file to register
     * @throws IOException
     */
    public void registerJar(String name) throws IOException {
        // Check if this operation is permitted
        filter.validate(PigCommandFilter.Command.REGISTER);

        if (pigContext.hasJar(name)) {
            log.debug("Ignoring duplicate registration for jar " + name);
            return;
        }

        // first try to locate jar via system resources
        // if this fails, try by using "name" as File (this preserves
        // compatibility with case when user passes absolute path or path
        // relative to current working directory.)
        if (name != null) {
            if (name.isEmpty()) {
                log.warn("Empty string specified for jar path");
                return;
            }

            URL resource = locateJarFromResources(name);

            if (resource == null) {
                FetchFileRet[] files = FileLocalizer.fetchFiles(pigContext.getProperties(), name);
                for (FetchFileRet file : files) {
                    File f = file.file;
                    if (!f.canRead()) {
                        int errCode = 4002;
                        String msg = "Can't read jar file: " + name;
                        throw new FrontendException(msg, errCode, PigException.USER_ENVIRONMENT);
                    }

                    pigContext.addJar(f.toURI().toURL(), name);
                }
            } else {
                pigContext.addJar(resource, name);
            }
        }
    }

    /**
     * Universal Scripting Language Support, see PIG-928
     *
     * @param path path of the script file
     * @param scriptingLang language keyword or scriptingEngine used to interpret the script
     * @param namespace namespace defined for functions of this script
     * @throws IOException
     */
    public void registerCode(String path, String scriptingLang, String namespace) throws IOException {
        if (pigContext.scriptingUDFs.containsKey(path) && pigContext.scriptingUDFs.get(path).equals(namespace)) {
            log.debug("Ignoring duplicate registration for scripting udf file " + path + " in namespace "
                    + namespace);
            return;
        } else {
            pigContext.scriptingUDFs.put(path, namespace);
        }

        FetchFileRet ret = FileLocalizer.fetchFile(pigContext.getProperties(), path);
        File f = ret.file;
        if (!f.canRead()) {
            int errCode = 4002;
            String msg = "Can't read file: " + path;
            throw new FrontendException(msg, errCode, PigException.USER_ENVIRONMENT);
        }
        String cwd = new File(".").getCanonicalPath();
        String filePath = f.getCanonicalPath();
        String nameInJar = filePath;
        // Use the relative path in the jar, if the path specified is relative
        if (!ret.didFetch) {
            if (!new File(path).isAbsolute() && path.indexOf("." + File.separator) == -1) {
                // In case of Oozie, the localized files are in a different
                // directory symlinked to the current directory. Canonical path will not point to cwd.
                nameInJar = path;
            } else if (filePath.equals(cwd + File.separator + path)) {
                // If user specified absolute path and it refers to cwd
                nameInJar = filePath.substring(cwd.length() + 1);
            }
        }

        pigContext.addScriptFile(nameInJar, filePath);
        if (scriptingLang != null) {
            ScriptEngine se = ScriptEngine.getInstance(scriptingLang);
            se.registerFunctions(nameInJar, namespace, pigContext);
        }
    }

    /**
     * Register a query with the Pig runtime. The query is parsed and registered, but it is not
     * executed until it is needed.
     *
     * @param query
     *            a Pig Latin expression to be evaluated.
     * @param startLine
     *            line number of the query within the whole script
     * @throws IOException
     */
    public void registerQuery(String query, int startLine) throws IOException {
        currDAG.registerQuery(query, startLine, validateEachStatement, skipParseInRegisterForBatch);
    }

    /**
     * Register a query with the Pig runtime. The query is parsed and registered, but it is not
     * executed until it is needed.  Equivalent to calling {@link #registerQuery(String, int)}
     * with startLine set to 1.
     *
     * @param query
     *            a Pig Latin expression to be evaluated.
     * @throws IOException
     */
    public void registerQuery(String query) throws IOException {
        registerQuery(query, 1);
    }

    /**
     * Register a pig script from an InputStream, which is more general and extensible.
     * If the pig script is in a local file, use a FileInputStream; if it is built
     * dynamically in memory, use a ByteArrayInputStream; if it lives on a remote
     * machine, wrap the connection as a SocketInputStream.
     * @param in InputStream containing the pig script
     * @throws IOException
     */
    public void registerScript(InputStream in) throws IOException {
        registerScript(in, null, null);
    }

    /**
     * Register a pig script from an InputStream, which is more general and extensible.
     * If the pig script is in a local file, use a FileInputStream; if it is built
     * dynamically in memory, use a ByteArrayInputStream; if it lives on a remote
     * machine, wrap the connection as a SocketInputStream.
     * The parameters in the pig script will be substituted with the values in params.
     * @param in InputStream containing the pig script
     * @param params the key is the parameter name, and the value is the parameter value
     * @throws IOException
     */
    public void registerScript(InputStream in, Map<String, String> params) throws IOException {
        registerScript(in, params, null);
    }

    /**
     * Register a pig script from an InputStream, which is more general and extensible.
     * If the pig script is in a local file, use a FileInputStream; if it is built
     * dynamically in memory, use a ByteArrayInputStream; if it lives on a remote
     * machine, wrap the connection as a SocketInputStream.
     * The parameters in the pig script will be substituted with the values in the parameter files.
     * @param in InputStream containing the pig script
     * @param paramsFiles  files which have the parameter setting
     * @throws IOException
     */
    public void registerScript(InputStream in, List<String> paramsFiles) throws IOException {
        registerScript(in, null, paramsFiles);
    }

    /**
     * Register a pig script from an InputStream.<br>
     * If the pig script is in a local file, use a FileInputStream; if it is built
     * dynamically in memory, use a ByteArrayInputStream; if it lives on a remote
     * machine, wrap the connection as a SocketInputStream.<br>
     * The parameters in the pig script will be substituted with the values in the map and the parameter files.
     * Values in the params map override values from the parameter files when the same parameter appears in both.
     * @param in InputStream containing the pig script
     * @param params the key is the parameter name, and the value is the parameter value
     * @param paramsFiles  files which have the parameter setting
     * @throws IOException
     */
    public void registerScript(InputStream in, Map<String, String> params, List<String> paramsFiles)
            throws IOException {
        try {
            String substituted = pigContext.doParamSubstitution(in, paramMapToList(params), paramsFiles);
            GruntParser grunt = new GruntParser(new StringReader(substituted), this);
            grunt.setInteractive(false);
            grunt.parseStopOnError(true);
        } catch (org.apache.pig.tools.pigscript.parser.ParseException e) {
            log.error(e.getLocalizedMessage());
            throw new IOException(e);
        }
    }

    protected List<String> paramMapToList(Map<String, String> params) {
        List<String> paramList = new ArrayList<String>();
        if (params != null) {
            for (Map.Entry<String, String> entry : params.entrySet()) {
                paramList.add(entry.getKey() + "=" + entry.getValue());
            }
        }
        return paramList;
    }

    /**
     * Creates a clone of the current DAG
     * @return A Graph object which is a clone of the current DAG
     * @throws IOException
     */
    protected Graph getClonedGraph() throws IOException {
        Graph graph = currDAG.duplicate();

        if (graph == null) {
            int errCode = 2127;
            String msg = "Cloning of plan failed.";
            throw new FrontendException(msg, errCode, PigException.BUG);
        }
        return graph;
    }

    /**
     * Register a query with the Pig runtime.  The query will be read from the indicated file.
     * @param fileName file to read query from.
     * @throws IOException
     */
    public void registerScript(String fileName) throws IOException {
        registerScript(fileName, null, null);
    }

    /**
     * Register a pig script file.  The parameters in the file will be substituted with the values in params
     * @param fileName  pig script file
     * @param params  the key is the parameter name, and the value is the parameter value
     * @throws IOException
     */
    public void registerScript(String fileName, Map<String, String> params) throws IOException {
        registerScript(fileName, params, null);
    }

    /**
     * Register a pig script file.  The parameters in the file will be substituted with the values in the parameter files
     * @param fileName pig script file
     * @param paramsFiles  files which have the parameter setting
     * @throws IOException
     */
    public void registerScript(String fileName, List<String> paramsFiles) throws IOException {
        registerScript(fileName, null, paramsFiles);
    }

    /**
     * Register a pig script file.  The parameters in the file will be substituted with the values in the map and the parameter files.
     * Values in the params map override values from the parameter files when the same parameter appears in both.
     * @param fileName  pig script file
     * @param params  the key is the parameter name, and the value is the parameter value
     * @param paramsFiles   files which have the parameter setting
     * @throws IOException
     */
    public void registerScript(String fileName, Map<String, String> params, List<String> paramsFiles)
            throws IOException {
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(fileName);
            registerScript(fis, params, paramsFiles);
        } catch (FileNotFoundException e) {
            log.error(e.getLocalizedMessage());
            throw new IOException(e);
        } finally {
            if (fis != null) {
                fis.close();
            }
        }
    }
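
    // --- Illustrative sketch, not part of the original PigServer source. ---
    // Registers a parameterized script file; the script name and parameter values
    // are assumptions. Values passed in the map are substituted for $input/$output
    // style parameters inside the script.
    @SuppressWarnings("unused")
    private void exampleRegisterScriptSketch() throws IOException {
        Map<String, String> params = new HashMap<String, String>();
        params.put("input", "/data/in");
        params.put("output", "/data/out");
        registerScript("daily_report.pig", params);
    }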

    /**
     * Intended to be used by unit tests only.
     * Print a list of all aliases in the current Pig Latin script.  Output is written to
     * System.out.
     * @throws FrontendException
     */
    public void printAliases() throws FrontendException {
        System.out.println("aliases: " + currDAG.getAliasOp().keySet());
    }

    /**
     * Write the schema for an alias to System.out.
     * @param alias Alias whose schema will be written out
     * @return Schema of alias dumped
     * @throws IOException
     */
    public Schema dumpSchema(String alias) throws IOException {
        try {
            pigContext.inDumpSchema = true;
            if ("@".equals(alias)) {
                alias = getLastRel();
            }
            LogicalRelationalOperator op = getOperatorForAlias(alias);
            LogicalSchema schema = op.getSchema();

            boolean pretty = "true".equals(pigContext.getProperties().getProperty(PRETTY_PRINT_SCHEMA_PROPERTY));

            if (schema != null) {
                Schema s = org.apache.pig.newplan.logical.Util.translateSchema(schema);
                System.out.println(alias + ": " + (pretty ? s.prettyPrint() : s.toString()));
                return s;
            } else {
                System.out.println("Schema for " + alias + " unknown.");
                return null;
            }
        } catch (FrontendException fee) {
            int errCode = 1001;
            String msg = "Unable to describe schema for alias " + alias;
            throw new FrontendException(msg, errCode, PigException.INPUT, false, null, fee);
        } finally {
            pigContext.inDumpSchema = false;
        }
    }

    /**
     * Write the schema for a nestedAlias to System.out. Denoted by
     * alias::nestedAlias.
     *
     * @param alias Alias whose schema has nestedAlias
     * @param nestedAlias Alias whose schema will be written out
     * @return Schema of alias dumped
     * @throws IOException
     */
    public Schema dumpSchemaNested(String alias, String nestedAlias) throws IOException {
        try {
            pigContext.inDumpSchema = true;
            if ("@".equals(alias)) {
                alias = getLastRel();
            }
            Operator op = getOperatorForAlias(alias);
            if (op instanceof LOForEach) {
                LogicalSchema nestedSc = ((LOForEach) op).dumpNestedSchema(alias, nestedAlias);
                if (nestedSc != null) {
                    Schema s = org.apache.pig.newplan.logical.Util.translateSchema(nestedSc);
                    System.out.println(alias + "::" + nestedAlias + ": " + s.toString());
                    return s;
                } else {
                    System.out.println("Schema for " + alias + "::" + nestedAlias + " unknown.");
                    return null;
                }
            } else {
                int errCode = 1001;
                String msg = "Unable to describe schema for " + alias + "::" + nestedAlias;
                throw new FrontendException(msg, errCode, PigException.INPUT, false, null);
            }
        } finally {
            pigContext.inDumpSchema = false;
        }
    }

    /**
     * Set the name of the job.  This name will get translated to mapred.job.name.
     * @param name of job
     */
    public void setJobName(String name) {
        jobName = PigContext.JOB_NAME_PREFIX + ":" + name;
    }

    /**
     * Set Hadoop job priority.  This value will get translated to mapred.job.priority.
     * @param priority valid values are found in {@link org.apache.hadoop.mapred.JobPriority}
     */
    public void setJobPriority(String priority) {
        jobPriority = priority;
    }

    /**
     * Executes a Pig Latin script up to and including indicated alias.  That is, if a user does:
     * <pre>
     * PigServer server = new PigServer();
     * server.registerQuery("A = load 'foo';");
     * server.registerQuery("B = filter A by $0 &gt; 0;");
     * server.registerQuery("C = order B by $1;");
     * </pre>
     * Then
     * <pre>
     * server.openIterator("B");
     * </pre>
     * filtered but unsorted data will be returned.  If instead a user does
     * <pre>
     * server.openIterator("C");
     * </pre>
     * filtered and sorted data will be returned.
     * @param id Alias to open iterator for
     * @return iterator of tuples returned from the script
     * @throws IOException
     */
    public Iterator<Tuple> openIterator(String id) throws IOException {
        try {
            pigContext.getProperties().setProperty(PigContext.JOB_NAME, jobName);
            if (jobPriority != null) {
                pigContext.getProperties().setProperty(PigContext.JOB_PRIORITY, jobPriority);
            }
            ExecJob job = store(id, FileLocalizer.getTemporaryPath(pigContext).toString(),
                    Utils.getTmpFileCompressorName(pigContext) + "()");

            // invocation of "execute" is synchronous!

            if (job.getStatus() == JOB_STATUS.COMPLETED) {
                return job.getResults();
            } else if (job.getStatus() == JOB_STATUS.FAILED && job.getException() != null) {
                // throw the backend exception in the failed case
                Exception e = job.getException();
                int errCode = 1066;
                String msg = "Unable to open iterator for alias " + id + ". Backend error : " + e.getMessage();
                throw new FrontendException(msg, errCode, PigException.INPUT, e);
            } else {
                throw new IOException("Job terminated with anomalous status " + job.getStatus().toString());
            }
        } catch (FrontendException e) {
            throw e;
        } catch (Exception e) {
            int errCode = 1066;
            String msg = "Unable to open iterator for alias " + id;
            throw new FrontendException(msg, errCode, PigException.INPUT, e);
        }
    }

    /**
     * Executes a Pig Latin script up to and including indicated alias and stores the resulting
     * records into a file.  That is, if a user does:
     * <pre>
     * PigServer server = new PigServer();
     * server.registerQuery("A = load 'foo';");
     * server.registerQuery("B = filter A by $0 &gt; 0;");
     * server.registerQuery("C = order B by $1;");
     * </pre>
     * Then
     * <pre>
     * server.store("B", "bar");
     * </pre>
     * filtered but unsorted data will be stored to the file <tt>bar</tt>.  If instead a user does
     * <pre>
     * server.store("C", "bar");
     * </pre>
     * filtered and sorted data will be stored to the file <tt>bar</tt>.
     * Equivalent to calling {@link #store(String, String, String)} with
     * <tt>org.apache.pig.PigStorage</tt> as the store function.
     * @param id The alias to store
     * @param filename The file to store to
     * @return {@link ExecJob} containing information about this job
     * @throws IOException
     */
    public ExecJob store(String id, String filename) throws IOException {
        return store(id, filename, PigStorage.class.getName() + "()"); // PigStorage is the default store function
    }

    /**
     * Executes a Pig Latin script up to and including indicated alias and stores the resulting
     * records into a file.  That is, if a user does:
     * <pre>
     * PigServer server = new PigServer();
     * server.registerQuery("A = load 'foo';");
     * server.registerQuery("B = filter A by $0 &gt; 0;");
     * server.registerQuery("C = order B by $1;");
     * </pre>
     * Then
     * <pre>
     * server.store("B", "bar", "mystorefunc");
     * </pre>
     * filtered but unsorted data will be stored to the file <tt>bar</tt> using
     * <tt>mystorefunc</tt>.  If instead a user does
     * <pre>
     * server.store("C", "bar", "mystorefunc");
     * </pre>
     * filtered and sorted data will be stored to the file <tt>bar</tt> using
     * <tt>mystorefunc</tt>.
     * <p>
     * @param id The alias to store
     * @param filename The file to store to
     * @param func store function to use
     * @return {@link ExecJob} containing information about this job
     * @throws IOException
     */
    public ExecJob store(String id, String filename, String func) throws IOException {
        PigStats stats = storeEx(id, filename, func);
        if (stats.getOutputStats().size() < 1) {
            throw new IOException("Couldn't retrieve job.");
        }
        OutputStats output = stats.getOutputStats().get(0);

        if (stats.isSuccessful()) {
            return new HJob(JOB_STATUS.COMPLETED, pigContext, output.getPOStore(), output.getAlias(), stats);
        } else {
            HJob job = new HJob(JOB_STATUS.FAILED, pigContext, output.getPOStore(), output.getAlias(), stats);

            //check for exception
            Exception ex = null;
            for (JobStats js : stats.getJobGraph()) {
                if (js.getException() != null) {
                    ex = js.getException();
                }
            }
            job.setException(ex);
            return job;
        }
    }

    private PigStats storeEx(String alias, String filename, String func) throws IOException {
        if ("@".equals(alias)) {
            alias = getLastRel();
        }
        currDAG.parseQuery();
        currDAG.skipStores(); // skip the stores that have already been processed
        currDAG.buildPlan(alias);

        try {
            QueryParserUtils.attachStorePlan(scope, currDAG.lp, filename, func, currDAG.getOperator(alias), alias,
                    pigContext);
            currDAG.compile();
            return executeCompiledLogicalPlan();
        } catch (PigException e) {
            int errCode = 1002;
            String msg = "Unable to store alias " + alias;
            throw new PigException(msg, errCode, PigException.INPUT, e);
        }
    }

    /**
     * Provide information on how a pig query will be executed.  For now
     * this information is very developer focussed, and probably not very
     * useful to the average user.
     * @param alias Name of alias to explain.
     * @param stream PrintStream to write explanation to.
     * @throws IOException if the requested alias cannot be found.
     */
    public void explain(String alias, PrintStream stream) throws IOException {
        explain(alias, "text", true, false, stream, stream, null, null);
    }

    /**
     * Provide information on how a pig query will be executed.
     * @param alias Name of alias to explain.
     * @param format Format in which the explain should be printed.  If text, then the plan will
     * be printed in plain text.  Otherwise, the execution plan will be printed in
     * <a href="http://en.wikipedia.org/wiki/DOT_language">DOT</a> format.
     * @param verbose Controls the amount of information printed
     * @param markAsExecute When set, the explain is treated like a
     * call to execute in the respect that all the pending stores are
     * marked as complete.
     * @param lps Stream to print the logical tree
     * @param eps Stream to print the ExecutionEngine trees. If null, then will print to files
     * @param dir Directory to print ExecutionEngine trees. If null, will use eps
     * @param suffix Suffix of file names
     * @throws IOException if the requested alias cannot be found.
     */
    public void explain(String alias, String format, boolean verbose, boolean markAsExecute, PrintStream lps,
            PrintStream eps, File dir, String suffix) throws IOException {
        try {
            pigContext.inExplain = true;
            buildStorePlan(alias);
            currDAG.lp.optimize(pigContext);

            //Only add root xml node if all plans are being written to same stream.
            if (format == "xml" && lps == eps) {
                lps.println("<plan>");
            }

            currDAG.lp.explain(lps, format, verbose);

            if (currDAG.lp.size() == 0) {
                if (format == "xml" && lps == eps) {
                    lps.println("</plan>");
                }
                return;
            }

            pigContext.getExecutionEngine().explain(currDAG.lp, pigContext, eps, format, verbose, dir, suffix);

            if (format.equals("xml") && lps == eps) {
                lps.println("</plan>");
            }

            if (markAsExecute) {
                currDAG.markAsExecuted();
            }
        } catch (Exception e) {
            int errCode = 1067;
            String msg = "Unable to explain alias " + alias;
            throw new FrontendException(msg, errCode, PigException.INPUT, e);
        } finally {
            pigContext.inExplain = false;
        }
    }
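
    // --- Illustrative sketch, not part of the original PigServer source. ---
    // Prints the plans for a hypothetical alias "C", first as plain text and then
    // in DOT format (both written to System.out here).
    @SuppressWarnings("unused")
    private void exampleExplainSketch() throws IOException {
        explain("C", System.out);
        explain("C", "dot", true, false, System.out, System.out, null, null);
    }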

    /**
     * Returns the unused byte capacity of an HDFS filesystem. This value does
     * not take into account a replication factor, as that can vary from file
     * to file. Thus if you are using this to determine whether your data set will fit
     * in HDFS, you need to divide the result of this call by your specific replication
     * setting.
     * @return unused byte capacity of the file system.
     * @throws IOException
     */
    public long capacity() throws IOException {
        if (pigContext.getExecType().isLocal()) {
            throw new IOException("capacity only supported for non-local execution");
        } else {
            DataStorage dds = pigContext.getDfs();

            Map<String, Object> stats = dds.getStatistics();

            String rawCapacityStr = (String) stats.get(DataStorage.RAW_CAPACITY_KEY);
            String rawUsedStr = (String) stats.get(DataStorage.RAW_USED_KEY);

            if ((rawCapacityStr == null) || (rawUsedStr == null)) {
                throw new IOException("Failed to retrieve capacity stats");
            }

            long rawCapacityBytes = Long.parseLong(rawCapacityStr);
            long rawUsedBytes = Long.parseLong(rawUsedStr);

            return rawCapacityBytes - rawUsedBytes;
        }
    }

    /**
     * Returns the length of a file in bytes which exists in the HDFS (accounts for replication).
     * @param filename
     * @return length of the file in bytes
     * @throws IOException
     */
    public long fileSize(String filename) throws IOException {
        DataStorage dfs = pigContext.getDfs();
        ElementDescriptor elem = dfs.asElement(filename);
        Map<String, Object> stats = elem.getStatistics();
        long length = (Long) stats.get(ElementDescriptor.LENGTH_KEY);
        int replication = (Short) stats.get(ElementDescriptor.BLOCK_REPLICATION_KEY);

        return length * replication;
    }

    /**
     * Test whether a file exists.
     * @param filename to test
     * @return true if file exists, false otherwise
     * @throws IOException
     */
    public boolean existsFile(String filename) throws IOException {
        ElementDescriptor elem = pigContext.getDfs().asElement(filename);
        return elem.exists();
    }

    /**
     * Delete a file.
     * @param filename to delete
     * @return true
     * @throws IOException
     */
    public boolean deleteFile(String filename) throws IOException {
        // Check if this operation is permitted
        filter.validate(PigCommandFilter.Command.RM);
        filter.validate(PigCommandFilter.Command.RMF);

        ElementDescriptor elem = pigContext.getDfs().asElement(filename);
        elem.delete();
        return true;
    }

    /**
     * Rename a file.
     * @param source file to rename
     * @param target new file name
     * @return true
     * @throws IOException
     */
    public boolean renameFile(String source, String target) throws IOException {
        // Check if this operation is permitted
        filter.validate(PigCommandFilter.Command.MV);

        pigContext.rename(source, target);
        return true;
    }

    /**
     * Make a directory.
     * @param dirs directory to make
     * @return true
     * @throws IOException
     */
    public boolean mkdirs(String dirs) throws IOException {
        // Check if this operation is permitted
        filter.validate(PigCommandFilter.Command.MKDIR);

        ContainerDescriptor container = pigContext.getDfs().asContainer(dirs);
        container.create();
        return true;
    }

    /**
     * List the contents of a directory.
     * @param dir name of directory to list
     * @return array of strings, one for each file name
     * @throws IOException
     */
    public String[] listPaths(String dir) throws IOException {
        // Check if this operation is permitted
        filter.validate(PigCommandFilter.Command.LS);

        Collection<String> allPaths = new ArrayList<String>();
        ContainerDescriptor container = pigContext.getDfs().asContainer(dir);
        Iterator<ElementDescriptor> iter = container.iterator();

        while (iter.hasNext()) {
            ElementDescriptor elem = iter.next();
            allPaths.add(elem.toString());
        }

        return allPaths.toArray(new String[0]);
    }

    /**
     * Return a map containing the logical plan associated with each alias.
     *
     * @return map
     */
    public Map<String, LogicalPlan> getAliases() {
        Map<String, LogicalPlan> aliasPlans = new HashMap<String, LogicalPlan>();
        for (LogicalRelationalOperator op : currDAG.getAliases().keySet()) {
            String alias = op.getAlias();
            if (null != alias) {
                aliasPlans.put(alias, currDAG.getAliases().get(op));
            }
        }
        return aliasPlans;
    }

    /**
     * Reclaims resources used by this instance of PigServer. This method
     * deletes all temporary files generated by the current thread while
     * executing Pig commands.
     */
    public void shutdown() {
        // clean-up activities
        // TODO: reclaim scope to free up resources. Currently
        // this is not implemented and throws an exception
        // hence, for now, we won't call it.
        //
        // pigContext.getExecutionEngine().reclaimScope(this.scope);

        FileLocalizer.deleteTempFiles();
    }

    /**
     * Get the set of all current aliases.
     * @return set
     */
    public Set<String> getAliasKeySet() {
        return currDAG.getAliasOp().keySet();
    }

    public Map<Operator, DataBag> getExamples(String alias) throws IOException {
        try {
            if (currDAG.isBatchOn() && alias != null) {
                currDAG.parseQuery();
                currDAG.buildPlan(null);
                execute();
            }
            currDAG.parseQuery();
            currDAG.skipStores();
            currDAG.buildPlan(alias);
            currDAG.compile();
        } catch (IOException e) {
            //Since the original script is parsed anyway, there should not be an
            //error in this parsing. The only reason there can be an error is if
            //the files referenced by a LOAD statement no longer exist.
            e.printStackTrace();
        }

        ExampleGenerator exgen = new ExampleGenerator(currDAG.lp, pigContext);
        try {
            return exgen.getExamples();
        } catch (ExecException e) {
            e.printStackTrace(System.out);
            throw new IOException("ExecException", e);
        } catch (Exception e) {
            e.printStackTrace(System.out);
            throw new IOException("Exception ", e);
        }

    }

    public void printHistory(boolean withNumbers) {

        List<String> sc = currDAG.getScriptCache();

        if (!sc.isEmpty()) {
            for (int i = 0; i < sc.size(); i++) {
                if (withNumbers)
                    System.out.print((i + 1) + "   ");
                System.out.println(sc.get(i));
            }
        }

    }

    private void buildStorePlan(String alias) throws IOException {
        currDAG.parseQuery();
        currDAG.buildPlan(alias);

        if (!isBatchOn() || alias != null) {
            // MRCompiler needs a store to be the leaf - hence
            // add a store to the plan to explain
            QueryParserUtils.attachStorePlan(scope, currDAG.lp, "fakefile", null, currDAG.getOperator(alias),
                    "fake", pigContext);
        }
        currDAG.compile();
    }

    /**
     * Compile and execute the current plan.
     * @return
     * @throws IOException
     */
    private PigStats execute() throws IOException {
        pigContext.getProperties().setProperty(PigContext.JOB_NAME, jobName);
        if (jobPriority != null) {
            pigContext.getProperties().setProperty(PigContext.JOB_PRIORITY, jobPriority);
        }

        // In this plan, all stores in the plan will be executed. They should be ignored if the plan is reused.
        currDAG.countExecutedStores();

        currDAG.compile();

        if (currDAG.lp.size() == 0) {
            return PigStats.get();
        }

        pigContext.getProperties().setProperty("pig.logical.plan.signature", currDAG.lp.getSignature());

        PigStats stats = executeCompiledLogicalPlan();

        return stats;
    }

    private PigStats executeCompiledLogicalPlan() throws ExecException, FrontendException {
        // discover pig features used in this script
        ScriptState.get().setScriptFeatures(currDAG.lp);
        currDAG.lp.optimize(pigContext);

        return launchPlan(currDAG.lp, "job_pigexec_");
    }

    /**
     * A common method for launching the jobs according to the logical plan
     * @param lp The logical plan
     * @param jobName A String containing the job name to be used
     * @return The PigStats object
     * @throws ExecException
     * @throws FrontendException
     */
    protected PigStats launchPlan(LogicalPlan lp, String jobName) throws ExecException, FrontendException {

        PigStats stats = null;
        try {
            stats = pigContext.getExecutionEngine().launchPig(lp, jobName, pigContext);
        } catch (ExecException e) {
            throw e;
        } catch (FrontendException e) {
            throw e;
        } catch (Exception e) {
            // There are a lot of exceptions thrown by the launcher.  If this
            // is an ExecException, just let it through.  Else wrap it.
            int errCode = 2043;
            String msg = "Unexpected error during execution.";
            throw new ExecException(msg, errCode, PigException.BUG, e);
        }

        return stats;
    }

    /**
     * NOTE: For testing only. Don't use.
     * @throws IOException
     */
    @SuppressWarnings("unused")
    private LogicalPlan buildLp() throws IOException {
        currDAG.buildPlan(null);
        currDAG.compile();
        return currDAG.lp;
    }

    private LogicalRelationalOperator getOperatorForAlias(String alias) throws IOException {
        buildStorePlan(alias);
        LogicalRelationalOperator op = (LogicalRelationalOperator) currDAG.getOperator(alias);
        if (op == null) {
            int errCode = 1005;
            String msg = "No plan for " + alias + " to describe";
            throw new FrontendException(msg, errCode, PigException.INPUT, false, null);
        }
        return op;
    }

    /**
     * Returns data associated with the current LogicalPlan. It only makes
     * sense to call this method after a query/script
     * has been registered with one of the {@link #registerQuery(String)}
     * or {@link #registerScript(InputStream)} methods.
     *
     * @return LogicalPlanData
     */
    public LogicalPlanData getLogicalPlanData() {
        return new LogicalPlanData(currDAG.lp);
    }
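
    /*
     * Usage sketch (illustrative only; the script, alias and file names are
     * made up). LogicalPlanData reflects whatever has been registered so far:
     *
     *   PigServer pigServer = new PigServer(ExecType.LOCAL);
     *   pigServer.registerQuery("A = load 'input' as (x:int, y:int);");
     *   pigServer.registerQuery("B = filter A by x > 5;");
     *   LogicalPlanData data = pigServer.getLogicalPlanData();
     */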

    /*
     * This class holds the internal state of a grunt shell session.
     */
    protected class Graph {

        private final Map<LogicalRelationalOperator, LogicalPlan> aliases = new HashMap<LogicalRelationalOperator, LogicalPlan>();

        private Map<String, Operator> operators = new HashMap<String, Operator>();
        private String lastRel;

        private final List<String> scriptCache = new ArrayList<String>();

        // the fileNameMap contains filename to canonical filename
        // mappings. This is done so we can reparse the cached script
        // and remember the translation (the current directory might only
        // be correct during the first parse).
        private Map<String, String> fileNameMap = new HashMap<String, String>();

        private final boolean batchMode;

        private int processedStores = 0;

        private LogicalPlan lp;

        private int currentLineNum = 0;

        public Graph(boolean batchMode) {
            this.batchMode = batchMode;
            this.lp = new LogicalPlan();
        }

        /**
         * Callback method for counting executed stores.
         */
        private void countExecutedStores() throws FrontendException {
            List<LOStore> sinks = Util.getLogicalRelationalOperators(lp, LOStore.class);
            processedStores += sinks.size();
        }

        Map<LogicalRelationalOperator, LogicalPlan> getAliases() {
            return aliases;
        }

        Map<String, Operator> getAliasOp() {
            return operators;
        }

        boolean isBatchOn() {
            return batchMode;
        }

        boolean isBatchEmpty() {
            for (Operator op : lp.getSinks()) {
                if (op instanceof LOStore)
                    return false;
            }
            return true;
        }

        void markAsExecuted() {
        }

        public LogicalPlan getLogicalPlan() {
            return this.lp;
        }

        /**
         * Get the operator with the given alias in the raw plan. Null if not
         * found.
         */
        Operator getOperator(String alias) throws FrontendException {
            return operators.get(alias);
        }

        public LogicalPlan getPlan(String alias) throws IOException {
            LogicalPlan plan = lp;

            if (alias != null) {
                LogicalRelationalOperator op = (LogicalRelationalOperator) operators.get(alias);
                if (op == null) {
                    int errCode = 1003;
                    String msg = "Unable to find an operator for alias " + alias;
                    throw new FrontendException(msg, errCode, PigException.INPUT);
                }
                plan = aliases.get(op);
            }
            return plan;
        }

        /**
         * Build a plan for the given alias. Unrelated branches and branches below
         * the alias are ignored; branches the alias depends on (e.g. scalars) are kept.
         * @throws IOException
         */
        void buildPlan(String alias) throws IOException {
            if (alias == null)
                skipStores();

            final Queue<Operator> queue = new LinkedList<Operator>();
            if (alias != null) {
                Operator op = getOperator(alias);
                if (op == null) {
                    String msg = "Unable to find an operator for alias " + alias;
                    throw new FrontendException(msg, 1003, PigException.INPUT);
                }
                queue.add(op);
            } else {
                List<LOStore> stores = Util.getLogicalRelationalOperators(lp, LOStore.class);
                for (LOStore op : stores) {
                    boolean addSink = true;
                    // Only add if all the successors are loads
                    List<Operator> succs = lp.getSuccessors(op);
                    if (succs != null && succs.size() > 0) {
                        for (Operator succ : succs) {
                            if (!(succ instanceof LOLoad)) {
                                addSink = false;
                                break;
                            }
                        }
                    }
                    if (addSink) {
                        queue.add(op);
                    }
                }
            }

            LogicalPlan plan = new LogicalPlan();

            while (!queue.isEmpty()) {
                Operator currOp = queue.poll();
                plan.add(currOp);

                List<Operator> preds = lp.getPredecessors(currOp);
                if (preds != null) {
                    List<Operator> ops = new ArrayList<Operator>(preds);
                    for (Operator pred : ops) {
                        if (!queue.contains(pred))
                            queue.add(pred);
                        plan.connect(pred, currOp);
                    }
                }

                // visit the expressions associated with currOp. If an expression
                // refers to another operator, that operator is enqueued as well.
                currOp.accept(new AllExpressionVisitor(plan, new DependencyOrderWalker(plan)) {
                    @Override
                    protected LogicalExpressionVisitor getVisitor(LogicalExpressionPlan exprPlan)
                            throws FrontendException {
                        return new LogicalExpressionVisitor(exprPlan, new DependencyOrderWalker(exprPlan)) {
                            @Override
                            public void visit(ScalarExpression expr) throws FrontendException {
                                Operator refOp = expr.getImplicitReferencedOperator();
                                if (!queue.contains(refOp))
                                    queue.add(refOp);
                            }
                        };
                    }
                });

                currOp.setPlan(plan);
            }
            lp = plan;
        }
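
        /*
         * Illustration (made-up script; aliases are arbitrary): given
         *   A = load 'students';
         *   B = group A all;
         *   C = foreach B generate COUNT(A);
         *   D = filter A by $1 > 10;
         * buildPlan("D") keeps only A and D (plus any operators D references as
         * scalars), while the unrelated branch through B and C is dropped.
         * buildPlan(null) instead starts from the not-yet-executed stores.
         */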

        /**
         *  Remove stores that have been executed previously from the overall plan.
         */
        private void skipStores() throws IOException {
            // Get stores specifically
            List<LOStore> sinks = Util.getLogicalRelationalOperators(lp, LOStore.class);
            List<Operator> sinksToRemove = new ArrayList<Operator>();
            int skipCount = processedStores;
            if (skipCount > 0) {
                for (LOStore sink : sinks) {
                    sinksToRemove.add(sink);
                    skipCount--;
                    if (skipCount == 0)
                        break;
                }
            }

            for (Operator op : sinksToRemove) {
                // In the multiquery case it is entirely possible that a store
                // is not a leaf (sink) and therefore has successors that also
                // need to be removed.
                removeToLoad(op);
                Operator pred = lp.getPredecessors(op).get(0);
                lp.disconnect(pred, op);
                lp.remove(op);
            }
        }

        private void removeToLoad(Operator toRemove) throws IOException {
            List<Operator> successors = lp.getSuccessors(toRemove);
            List<Operator> succToRemove = new ArrayList<Operator>();
            if (successors != null && successors.size() > 0) {
                succToRemove.addAll(successors);
                for (Operator succ : succToRemove) {
                    lp.disconnect(toRemove, succ);
                    if (!(succ instanceof LOLoad)) {
                        removeToLoad(succ);
                        lp.remove(succ);
                    }
                }
            }
        }

        /**
         * Accumulate the given statement with the previously registered statements
         * and generate an overall (raw) plan.
         */
        void registerQuery(String query, int startLine, boolean validateEachStatement, boolean skipParseForBatch)
                throws IOException {
            if (batchMode) {
                if (startLine == currentLineNum) {
                    String line = scriptCache.remove(scriptCache.size() - 1);
                    scriptCache.add(line + query);
                } else {
                    while (startLine > currentLineNum + 1) {
                        scriptCache.add("");
                        currentLineNum++;
                    }
                    BufferedReader br = new BufferedReader(new StringReader(query));
                    String line = br.readLine();
                    while (line != null) {
                        scriptCache.add(line);
                        currentLineNum++;
                        line = br.readLine();
                    }
                }
                if (skipParseForBatch) {
                    return;
                }
            } else {
                scriptCache.add(query);
            }

            if (validateEachStatement) {
                validateQuery();
            }
            parseQuery();

            if (!batchMode) {
                buildPlan(null);
                for (Operator sink : lp.getSinks()) {
                    if (sink instanceof LOStore) {
                        try {
                            execute();
                        } catch (Exception e) {
                            int errCode = 1002;
                            String msg = "Unable to store alias " + ((LOStore) sink).getAlias();
                            throw new FrontendException(msg, errCode, PigException.INPUT, e);
                        }
                        break; // We should have at most one store, so break here.
                    }
                }
            }
        }
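
        /*
         * Usage sketch (illustrative only; the script and file names are made
         * up). In interactive mode a STORE triggers execution immediately; in
         * batch mode statements are only cached until the batch is executed:
         *
         *   PigServer pig = new PigServer(ExecType.LOCAL);
         *   pig.setBatchOn();
         *   pig.registerQuery("A = load 'in';");
         *   pig.registerQuery("store A into 'out';"); // cached, not yet run
         *   pig.executeBatch();                       // stores run here
         */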

        private void validateQuery() throws FrontendException {
            String query = buildQuery();
            QueryParserDriver parserDriver = new QueryParserDriver(pigContext, scope, fileNameMap);
            try {
                LogicalPlan plan = parserDriver.parse(query);
                plan.validate(pigContext, scope, true);
            } catch (FrontendException ex) {
                scriptCache.remove(scriptCache.size() - 1);
                throw ex;
            }
        }

        public List<String> getScriptCache() {
            return scriptCache;
        }

        /**
         * Parse the accumulated Pig statements and generate an overall plan.
         */
        private void parseQuery() throws FrontendException {
            UDFContext.getUDFContext().reset();
            UDFContext.getUDFContext().setClientSystemProps(pigContext.getProperties());

            String query = buildQuery();

            if (query.isEmpty()) {
                lp = new LogicalPlan();
                return;
            }

            try {
                QueryParserDriver parserDriver = new QueryParserDriver(pigContext, scope, fileNameMap);
                lp = parserDriver.parse(query);
                operators = parserDriver.getOperators();
                lastRel = parserDriver.getLastRel();
            } catch (Exception ex) {
                scriptCache.remove(scriptCache.size() - 1); // remove the offending statement from the cache.
                PigException pe = LogUtils.getPigException(ex);
                int errCode = 1000;
                String msg = "Error during parsing. " + (pe == null ? ex.getMessage() : pe.getMessage());
                log.error("exception during parsing: " + msg, ex);
                if (null == pe) {
                    throw new FrontendException(msg, errCode, PigException.INPUT, ex);
                } else {
                    throw new FrontendException(msg, errCode, PigException.INPUT, ex, pe.getSourceLocation());
                }
            }
        }

        public String getLastRel() {
            return lastRel;
        }

        private String buildQuery() {
            StringBuilder accuQuery = new StringBuilder();
            for (String line : scriptCache) {
                accuQuery.append(line + "\n");
            }

            return accuQuery.toString();
        }

        private void compile() throws IOException {
            lp.validate(pigContext, scope, false);
            currDAG.postProcess();
        }

        private void postProcess() throws IOException {
            // The following code deals with store/load combinations over
            // intermediate files. In this case we replace the load operator
            // with an (implicit) split operator, iff the load/store
            // func is reversible (because that's when we can safely
            // skip the load and keep going with the split output). If
            // the load/store func is not reversible (or they are
            // different functions), we connect the store and the load
            // to remember the dependency.

            Set<LOLoad> loadOps = new HashSet<LOLoad>();
            List<Operator> sources = lp.getSources();
            for (Operator source : sources) {
                if (source instanceof LOLoad) {
                    loadOps.add((LOLoad) source);
                }
            }

            Set<LOStore> storeOps = new HashSet<LOStore>();
            List<Operator> sinks = lp.getSinks();
            for (Operator sink : sinks) {
                if (sink instanceof LOStore) {
                    storeOps.add((LOStore) sink);
                }
            }

            if ("true".equals(pigContext.getProperties().getProperty(PIG_LOCATION_CHECK_STRICT))) {
                log.info("Output location strick check enabled");
                checkDuplicateStoreLoc(storeOps);
            }

            for (LOLoad load : loadOps) {
                for (LOStore store : storeOps) {
                    String ifile = load.getFileSpec().getFileName();
                    String ofile = store.getFileSpec().getFileName();
                    if (ofile.equals(ifile)) {
                        // if there is no path from the load to the store,
                        // then connect the store to the load to record the
                        // load's dependency on the store. If there is
                        // a path from the load to the store, we should
                        // not connect the store to the load, as that
                        // would create a cycle
                        if (!store.getPlan().pathExists(load, store)) {
                            store.getPlan().connect(store, load);
                        }
                    }
                }
            }
        }
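
        /*
         * Illustration (made-up script): a sequence such as
         *   store B into 'intermediate';
         *   C = load 'intermediate';
         * uses the same file as both a store and a load target; connecting the
         * store to the load above records that C must not be read before B has
         * been written.
         */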

        /**
         * This method checks whether multiple sinks (STORE) use the same
         * "file-based" location. If so, it throws a RuntimeException.
         *
         * @param storeOps
         */
        private void checkDuplicateStoreLoc(Set<LOStore> storeOps) {
            Set<String> uniqueStoreLoc = new HashSet<String>();
            for (LOStore store : storeOps) {
                String fileName = store.getFileSpec().getFileName();
                if (!uniqueStoreLoc.add(fileName)
                        && UriUtil.isHDFSFileOrLocalOrS3N(fileName, new Configuration(true))) {
                    throw new RuntimeException(
                            "Script contains 2 or more STORE statements writing to same location : " + fileName);
                }
            }
        }
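
        /*
         * Illustration (made-up script): with the PIG_LOCATION_CHECK_STRICT
         * property set to "true", a script such as
         *   store A into 'out';
         *   store B into 'out';
         * is rejected here because both STORE statements write to the same
         * file-based location.
         */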

        protected Graph duplicate() {
            // There are two choices for how we duplicate the logical plan:
            // 1 - really clone each operator and connect up the cloned operators
            // 2 - cache away the script until the point we need to clone,
            //     and then simply re-parse the script.
            // The latter approach is used here.
            // FIXME: There is one open issue with this now:
            // Consider the following script:
            // A = load 'file:/somefile';
            // B = filter A by $0 > 10;
            // store B into 'bla';
            // rm 'file:/somefile';
            // A = load 'file:/someotherfile'
            // When we try to clone, we reparse from the beginning, and the
            // parser currently checks that files referenced in loads exist
            // when the file is a local one, i.e. one with the file: prefix.
            // This is a known issue for now and will need to be revisited later.

            // parse each line of the cached script
            int lineNumber = 1;

            // create data structures needed for parsing
            Graph graph = new Graph(isBatchOn());
            graph.processedStores = processedStores;
            graph.fileNameMap = new HashMap<String, String>(fileNameMap);

            try {
                for (Iterator<String> it = scriptCache.iterator(); it.hasNext(); lineNumber++) {
                    // Always do registerQuery irrespective of the batch mode.
                    // TODO: Need to figure out if anything different needs to happen if batch
                    // mode is not on.
                    // The validation does not need to be done again, so set the validateEachStatement param to false.
                    graph.registerQuery(it.next(), lineNumber, false, false);
                }
                graph.postProcess();
            } catch (IOException ioe) {
                ioe.printStackTrace();
                graph = null;
            }
            return graph;
        }
    }

    /**
     * This can be called to indicate if the query is being parsed/compiled
     * in a mode that expects each statement to be validated as it is
     * entered, instead of just doing it once for the whole script.
     * @param validateEachStatement
     */
    public void setValidateEachStatement(boolean validateEachStatement) {
        this.validateEachStatement = validateEachStatement;
    }

    /**
     * Set whether to skip parsing while registering the query in batch mode
     * @param skipParseInRegisterForBatch
     */
    public void setSkipParseInRegisterForBatch(boolean skipParseInRegisterForBatch) {
        this.skipParseInRegisterForBatch = skipParseInRegisterForBatch;
    }

    public String getLastRel() {
        return currDAG.getLastRel();
    }
}