org.apache.pig.tools.pigstats.ScriptState.java Source code

Introduction

Here is the source code for org.apache.pig.tools.pigstats.ScriptState.java
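
ScriptState tracks per-script state for a Pig run: the script text (both a truncated and a serialized copy), the command line, Pig and Hadoop versions, the set of language features used, and the registered progress listeners. The class is abstract, keeps one instance per thread in an InheritableThreadLocal, and its settings end up in the job XML of every MR job the script spawns. The snippet below is an illustrative sketch, not part of the file: it assumes MRScriptState(String id) is constructible (it is instantiated that way in the deprecated start(String, PigContext) overload), and myscript.pig is a hypothetical script file.

import java.io.File;
import java.io.IOException;
import java.util.UUID;

import org.apache.pig.tools.pigstats.ScriptState;
import org.apache.pig.tools.pigstats.mapreduce.MRScriptState;

// Illustrative usage sketch only; see the assumptions noted above.
public class ScriptStateUsageSketch {
    public static void main(String[] args) throws IOException {
        // Install a ScriptState for the current thread (and threads spawned later).
        ScriptState state = ScriptState.start(new MRScriptState(UUID.randomUUID().toString()));
        state.setCommandLine("pig -x mapreduce myscript.pig"); // stored Base64-encoded
        state.setScript(new File("myscript.pig"));             // keeps truncated and serialized copies
        // state.registerListener(listener);                   // any PigProgressNotificationListener

        // Anywhere later on the same thread (or a child thread), retrieve it:
        ScriptState current = ScriptState.get();
        System.out.println("script id: " + current.getId());
    }
}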

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.tools.pigstats;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.BitSet;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.jar.Attributes;
import java.util.jar.JarFile;
import java.util.jar.Manifest;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.util.VersionInfo;
import org.apache.pig.PigConfiguration;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator.OriginalLocation;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POCollectedGroup;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.PODemux;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.PODistinct;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POFRJoin;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POFilter;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLimit;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLoad;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLocalRearrange;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POMergeCogroup;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POMergeJoin;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPartialAgg;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POSkewedJoin;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POSort;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POSplit;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStream;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POUnion;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.plan.DepthFirstWalker;
import org.apache.pig.impl.plan.OperatorPlan;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.JarManager;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.newplan.logical.relational.LOCogroup;
import org.apache.pig.newplan.logical.relational.LOCogroup.GROUPTYPE;
import org.apache.pig.newplan.logical.relational.LOCross;
import org.apache.pig.newplan.logical.relational.LODistinct;
import org.apache.pig.newplan.logical.relational.LOFilter;
import org.apache.pig.newplan.logical.relational.LOForEach;
import org.apache.pig.newplan.logical.relational.LOJoin;
import org.apache.pig.newplan.logical.relational.LOJoin.JOINTYPE;
import org.apache.pig.newplan.logical.relational.LOLimit;
import org.apache.pig.newplan.logical.relational.LONative;
import org.apache.pig.newplan.logical.relational.LORank;
import org.apache.pig.newplan.logical.relational.LOSort;
import org.apache.pig.newplan.logical.relational.LOSplit;
import org.apache.pig.newplan.logical.relational.LOStream;
import org.apache.pig.newplan.logical.relational.LOUnion;
import org.apache.pig.newplan.logical.relational.LogicalPlan;
import org.apache.pig.newplan.logical.relational.LogicalRelationalNodesVisitor;
import org.apache.pig.tools.pigstats.mapreduce.MRScriptState;

import com.google.common.collect.Lists;

/**
 * ScriptState encapsulates settings for a Pig script that runs on a Hadoop
 * cluster. These settings are added to all MR jobs spawned by the script and
 * are in turn persisted in the Hadoop job XML. Because the properties are in
 * the job XML, users who want to know the relationship between the script and
 * its MR jobs can derive it from the job XMLs.
 */
public abstract class ScriptState {

    /**
     * Keys of Pig settings added to Jobs
     */
    protected enum PIG_PROPERTY {
        SCRIPT_ID("pig.script.id"), SCRIPT("pig.script"), COMMAND_LINE("pig.command.line"), HADOOP_VERSION(
                "pig.hadoop.version"), VERSION("pig.version"), INPUT_DIRS("pig.input.dirs"), MAP_OUTPUT_DIRS(
                        "pig.map.output.dirs"), REDUCE_OUTPUT_DIRS("pig.reduce.output.dirs"), JOB_PARENTS(
                                "pig.parent.jobid"), JOB_FEATURE("pig.job.feature"), SCRIPT_FEATURES(
                                        "pig.script.features"), JOB_ALIAS(
                                                "pig.alias"), JOB_ALIAS_LOCATION("pig.alias.location");

        private String displayStr;

        private PIG_PROPERTY(String s) {
            displayStr = s;
        }

        @Override
        public String toString() {
            return displayStr;
        }
    };

    /**
     * Features used in a Pig script
     */
    public static enum PIG_FEATURE {
        UNKNOWN, MERGE_JOIN, MERGE_SPARSE_JOIN, REPLICATED_JOIN, SKEWED_JOIN, HASH_JOIN,
        COLLECTED_GROUP, MERGE_COGROUP, COGROUP, GROUP_BY, ORDER_BY, RANK, DISTINCT,
        STREAMING, SAMPLER, INDEXER, MULTI_QUERY, FILTER, MAP_ONLY, CROSS, LIMIT, UNION,
        COMBINER, NATIVE, MAP_PARTIALAGG;
    };

    private static final Log LOG = LogFactory.getLog(ScriptState.class);

    /**
     * PIG-3844. Each thread should have its own copy of ScriptState. We initialize the ScriptState
     * for new threads with the ScriptState of their parent thread, using InheritableThreadLocal.
     * This is used, for example, by a PPNL (PigProgressNotificationListener) running in a separate thread.
     */
    private static InheritableThreadLocal<ScriptState> tss = new InheritableThreadLocal<ScriptState>();

    protected String id;

    protected String serializedScript;
    protected String truncatedScript;
    protected String commandLine;
    protected String fileName;

    protected String pigVersion;
    protected String hadoopVersion;

    protected long scriptFeatures;

    protected PigContext pigContext;

    protected List<PigProgressNotificationListener> listeners = Lists.newArrayList();

    protected ScriptState(String id) {
        this.id = id;
        this.serializedScript = "";
        this.truncatedScript = "";
    }

    public static ScriptState get() {
        return tss.get();
    }

    public static ScriptState start(ScriptState state) {
        tss.set(state);
        return tss.get();
    }

    /**
     * @deprecated use {@link org.apache.pig.tools.pigstats.ScriptState#start(ScriptState)} instead.
     */
    @Deprecated
    public static ScriptState start(String commandLine, PigContext pigContext) {
        ScriptState ss = new MRScriptState(UUID.randomUUID().toString());
        ss.setCommandLine(commandLine);
        ss.setPigContext(pigContext);
        tss.set(ss);
        return ss;
    }

    public void registerListener(PigProgressNotificationListener listener) {
        listeners.add(listener);
    }

    public List<PigProgressNotificationListener> getAllListeners() {
        return listeners;
    }

    public void emitInitialPlanNotification(OperatorPlan<?> plan) {
        for (PigProgressNotificationListener listener : listeners) {
            try {
                listener.initialPlanNotification(id, plan);
            } catch (NoSuchMethodError e) {
                LOG.warn(
                        "PigProgressNotificationListener implementation doesn't "
                                + "implement initialPlanNotification(..) method: " + listener.getClass().getName(),
                        e);
            }
        }
    }

    public void emitLaunchStartedNotification(int numJobsToLaunch) {
        for (PigProgressNotificationListener listener : listeners) {
            listener.launchStartedNotification(id, numJobsToLaunch);
        }
    }

    public void emitJobsSubmittedNotification(int numJobsSubmitted) {
        for (PigProgressNotificationListener listener : listeners) {
            listener.jobsSubmittedNotification(id, numJobsSubmitted);
        }
    }

    public void emitJobStartedNotification(String assignedJobId) {
        for (PigProgressNotificationListener listener : listeners) {
            listener.jobStartedNotification(id, assignedJobId);
        }
    }

    public void emitjobFinishedNotification(JobStats jobStats) {
        for (PigProgressNotificationListener listener : listeners) {
            listener.jobFinishedNotification(id, jobStats);
        }
    }

    public void emitJobFailedNotification(JobStats jobStats) {
        for (PigProgressNotificationListener listener : listeners) {
            listener.jobFailedNotification(id, jobStats);
        }
    }

    public void emitOutputCompletedNotification(OutputStats outputStats) {
        for (PigProgressNotificationListener listener : listeners) {
            listener.outputCompletedNotification(id, outputStats);
        }
    }

    public void emitProgressUpdatedNotification(int progress) {
        for (PigProgressNotificationListener listener : listeners) {
            listener.progressUpdatedNotification(id, progress);
        }
    }

    public void emitLaunchCompletedNotification(int numJobsSucceeded) {
        for (PigProgressNotificationListener listener : listeners) {
            listener.launchCompletedNotification(id, numJobsSucceeded);
        }
    }

    public void setScript(File file) throws IOException {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            setScript(reader);
        } catch (FileNotFoundException e) {
            LOG.warn("unable to find the file", e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                }
            }
        }
    }

    public void setScript(String script) throws IOException {
        if (script == null)
            return;

        //Retain the truncated script
        setTruncatedScript(script);

        //Serialize and encode the string.
        this.serializedScript = ObjectSerializer.serialize(script);
    }

    private void setTruncatedScript(String script) {
        // restrict the size of the script to be stored in job conf
        int maxScriptSize = 10240;
        if (pigContext != null) {
            String prop = pigContext.getProperties().getProperty(PigConfiguration.PIG_SCRIPT_MAX_SIZE);
            if (prop != null) {
                maxScriptSize = Integer.valueOf(prop);
            }
        }

        this.truncatedScript = (script.length() > maxScriptSize) ? script.substring(0, maxScriptSize) : script;
    }

    public void setScriptFeatures(LogicalPlan plan) {
        BitSet bs = new BitSet();
        try {
            new LogicalPlanFeatureVisitor(plan, bs).visit();
        } catch (FrontendException e) {
            LOG.warn("unable to get script feature", e);
        }
        scriptFeatures = bitSetToLong(bs);

        LOG.info("Pig features used in the script: " + featureLongToString(scriptFeatures));
    }

    public String getHadoopVersion() {
        if (hadoopVersion == null) {
            hadoopVersion = VersionInfo.getVersion();
        }
        return (hadoopVersion == null) ? "" : hadoopVersion;
    }

    public String getPigVersion() {
        if (pigVersion == null) {
            String findContainingJar = JarManager.findContainingJar(ScriptState.class);
            if (findContainingJar != null) {
                try {
                    JarFile jar = new JarFile(findContainingJar);
                    try {
                        final Manifest manifest = jar.getManifest();
                        final Map<String, Attributes> attrs = manifest.getEntries();
                        Attributes attr = attrs.get("org/apache/pig");
                        pigVersion = attr.getValue("Implementation-Version");
                    } finally {
                        jar.close();
                    }
                } catch (Exception e) {
                    LOG.warn("unable to read Pig's manifest file", e);
                }
            } else {
                LOG.warn("unable to read Pig's manifest file. Not running from the Pig jar");
            }
        }
        return (pigVersion == null) ? "" : pigVersion;
    }

    public String getFileName() {
        return (fileName == null) ? "" : fileName;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public String getId() {
        return id;
    }

    public void setCommandLine(String commandLine) {
        this.commandLine = new String(Base64.encodeBase64(commandLine.getBytes()));
    }

    public String getCommandLine() {
        return (commandLine == null) ? "" : commandLine;
    }

    public String getSerializedScript() {
        return (serializedScript == null) ? "" : serializedScript;
    }

    public String getScript() {
        return (truncatedScript == null) ? "" : truncatedScript;
    }

    protected void setScript(BufferedReader reader) throws IOException {
        StringBuilder sb = new StringBuilder();
        try {
            String line = reader.readLine();
            while (line != null) {
                sb.append(line).append("\n");
                line = reader.readLine();
            }
        } catch (IOException e) {
            LOG.warn("unable to parse the script", e);
        }
        setScript(sb.toString());
    }

    protected long bitSetToLong(BitSet bs) {
        long ret = 0;
        for (int i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i + 1)) {
            ret |= (1L << i);
        }
        return ret;
    }

    protected String featureLongToString(long l) {
        if (l == 0)
            return PIG_FEATURE.UNKNOWN.name();

        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < PIG_FEATURE.values().length; i++) {
            if (((l >> i) & 0x00000001) != 0) {
                if (sb.length() > 0)
                    sb.append(",");
                sb.append(PIG_FEATURE.values()[i].name());
            }
        }
        return sb.toString();
    }

    public void setPigContext(PigContext pigContext) {
        this.pigContext = pigContext;
    }

    public PigContext getPigContext() {
        return pigContext;
    }

    public String getScriptFeatures() {
        return featureLongToString(scriptFeatures);
    }

    static class LogicalPlanFeatureVisitor extends LogicalRelationalNodesVisitor {

        private BitSet feature;

        protected LogicalPlanFeatureVisitor(LogicalPlan plan, BitSet feature) throws FrontendException {
            super(plan, new org.apache.pig.newplan.DepthFirstWalker(plan));
            this.feature = feature;
        }

        @Override
        public void visit(LOCogroup op) {
            if (op.getGroupType() == GROUPTYPE.COLLECTED) {
                feature.set(PIG_FEATURE.COLLECTED_GROUP.ordinal());
            } else if (op.getGroupType() == GROUPTYPE.MERGE) {
                feature.set(PIG_FEATURE.MERGE_COGROUP.ordinal());
            } else if (op.getGroupType() == GROUPTYPE.REGULAR) {
                if (op.getExpressionPlans().size() > 1) {
                    feature.set(PIG_FEATURE.COGROUP.ordinal());
                } else {
                    feature.set(PIG_FEATURE.GROUP_BY.ordinal());
                }
            }
        }

        @Override
        public void visit(LOCross op) {
            feature.set(PIG_FEATURE.CROSS.ordinal());
        }

        @Override
        public void visit(LODistinct op) {
            feature.set(PIG_FEATURE.DISTINCT.ordinal());
        }

        @Override
        public void visit(LOFilter op) {
            feature.set(PIG_FEATURE.FILTER.ordinal());
        }

        @Override
        public void visit(LOForEach op) {

        }

        @Override
        public void visit(LOJoin op) {
            if (op.getJoinType() == JOINTYPE.HASH) {
                feature.set(PIG_FEATURE.HASH_JOIN.ordinal());
            } else if (op.getJoinType() == JOINTYPE.MERGE) {
                feature.set(PIG_FEATURE.MERGE_JOIN.ordinal());
            } else if (op.getJoinType() == JOINTYPE.MERGESPARSE) {
                feature.set(PIG_FEATURE.MERGE_SPARSE_JOIN.ordinal());
            } else if (op.getJoinType() == JOINTYPE.REPLICATED) {
                feature.set(PIG_FEATURE.REPLICATED_JOIN.ordinal());
            } else if (op.getJoinType() == JOINTYPE.SKEWED) {
                feature.set(PIG_FEATURE.SKEWED_JOIN.ordinal());
            }
        }

        @Override
        public void visit(LOLimit op) {
            feature.set(PIG_FEATURE.LIMIT.ordinal());
        }

        @Override
        public void visit(LORank op) {
            feature.set(PIG_FEATURE.RANK.ordinal());
        }

        @Override
        public void visit(LOSort op) {
            feature.set(PIG_FEATURE.ORDER_BY.ordinal());
        }

        @Override
        public void visit(LOStream op) {
            feature.set(PIG_FEATURE.STREAMING.ordinal());
        }

        @Override
        public void visit(LOSplit op) {

        }

        @Override
        public void visit(LOUnion op) {
            feature.set(PIG_FEATURE.UNION.ordinal());
        }

        @Override
        public void visit(LONative n) {
            feature.set(PIG_FEATURE.NATIVE.ordinal());
        }
    }

    protected static class FeatureVisitor extends PhyPlanVisitor {
        private BitSet feature;

        public FeatureVisitor(PhysicalPlan plan, BitSet feature) {
            super(plan, new DepthFirstWalker<PhysicalOperator, PhysicalPlan>(plan));
            this.feature = feature;
        }

        @Override
        public void visitFRJoin(POFRJoin join) throws VisitorException {
            feature.set(PIG_FEATURE.REPLICATED_JOIN.ordinal());
        }

        @Override
        public void visitMergeJoin(POMergeJoin join) throws VisitorException {
            if (join.getJoinType() == LOJoin.JOINTYPE.MERGESPARSE)
                feature.set(PIG_FEATURE.MERGE_SPARSE_JOIN.ordinal());
            else
                feature.set(PIG_FEATURE.MERGE_JOIN.ordinal());
        }

        @Override
        public void visitMergeCoGroup(POMergeCogroup mergeCoGrp) throws VisitorException {
            feature.set(PIG_FEATURE.MERGE_COGROUP.ordinal());
        }

        @Override
        public void visitCollectedGroup(POCollectedGroup mg) throws VisitorException {
            feature.set(PIG_FEATURE.COLLECTED_GROUP.ordinal());
        }

        @Override
        public void visitDistinct(PODistinct distinct) throws VisitorException {
            feature.set(PIG_FEATURE.DISTINCT.ordinal());
        }

        @Override
        public void visitStream(POStream stream) throws VisitorException {
            feature.set(PIG_FEATURE.STREAMING.ordinal());
        }

        @Override
        public void visitSplit(POSplit split) throws VisitorException {
            feature.set(PIG_FEATURE.MULTI_QUERY.ordinal());
        }

        @Override
        public void visitDemux(PODemux demux) throws VisitorException {
            feature.set(PIG_FEATURE.MULTI_QUERY.ordinal());
        }

        @Override
        public void visitPartialAgg(POPartialAgg partAgg) {
            feature.set(PIG_FEATURE.MAP_PARTIALAGG.ordinal());
        }
    }

    protected static class AliasVisitor extends PhyPlanVisitor {
        private HashSet<String> aliasSet;
        private List<String> alias;
        private final List<String> aliasLocation;

        public AliasVisitor(PhysicalPlan plan, List<String> alias, List<String> aliasLocation) {
            super(plan, new DepthFirstWalker<PhysicalOperator, PhysicalPlan>(plan));
            this.alias = alias;
            this.aliasLocation = aliasLocation;
            aliasSet = new HashSet<String>();
            if (!alias.isEmpty()) {
                for (String s : alias)
                    aliasSet.add(s);
            }
        }

        @Override
        public void visitLoad(POLoad load) throws VisitorException {
            setAlias(load);
            super.visitLoad(load);
        }

        @Override
        public void visitFRJoin(POFRJoin join) throws VisitorException {
            setAlias(join);
            super.visitFRJoin(join);
        }

        @Override
        public void visitMergeJoin(POMergeJoin join) throws VisitorException {
            setAlias(join);
            super.visitMergeJoin(join);
        }

        @Override
        public void visitMergeCoGroup(POMergeCogroup mergeCoGrp) throws VisitorException {
            setAlias(mergeCoGrp);
            super.visitMergeCoGroup(mergeCoGrp);
        }

        @Override
        public void visitCollectedGroup(POCollectedGroup mg) throws VisitorException {
            setAlias(mg);
            super.visitCollectedGroup(mg);
        }

        @Override
        public void visitDistinct(PODistinct distinct) throws VisitorException {
            setAlias(distinct);
            super.visitDistinct(distinct);
        }

        @Override
        public void visitStream(POStream stream) throws VisitorException {
            setAlias(stream);
            super.visitStream(stream);
        }

        @Override
        public void visitFilter(POFilter fl) throws VisitorException {
            setAlias(fl);
            super.visitFilter(fl);
        }

        @Override
        public void visitLocalRearrange(POLocalRearrange lr) throws VisitorException {
            setAlias(lr);
            super.visitLocalRearrange(lr);
        }

        @Override
        public void visitPOForEach(POForEach nfe) throws VisitorException {
            setAlias(nfe);
            super.visitPOForEach(nfe);
        }

        @Override
        public void visitUnion(POUnion un) throws VisitorException {
            setAlias(un);
            super.visitUnion(un);
        }

        @Override
        public void visitSort(POSort sort) throws VisitorException {
            setAlias(sort);
            super.visitSort(sort);
        }

        @Override
        public void visitLimit(POLimit lim) throws VisitorException {
            setAlias(lim);
            super.visitLimit(lim);
        }

        @Override
        public void visitSkewedJoin(POSkewedJoin sk) throws VisitorException {
            setAlias(sk);
            super.visitSkewedJoin(sk);
        }

        private void setAlias(PhysicalOperator op) {
            String s = op.getAlias();
            if (s != null) {
                if (!aliasSet.contains(s)) {
                    alias.add(s);
                    aliasSet.add(s);
                }
            }
            List<OriginalLocation> originalLocations = op.getOriginalLocations();
            for (OriginalLocation originalLocation : originalLocations) {
                aliasLocation.add(originalLocation.toString());
            }
        }
    }
}
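
The script's features are tracked internally as a single long: each PIG_FEATURE ordinal is a bit position, bitSetToLong() packs the BitSet filled by the plan visitors into that long, and featureLongToString() expands it back into the comma-separated name list exposed through getScriptFeatures() (the value behind PIG_PROPERTY.SCRIPT_FEATURES). The standalone sketch below is not part of Pig; it mirrors that encoding so the round trip is easy to follow, using bit positions that correspond to GROUP_BY (ordinal 9) and FILTER (ordinal 17) in the PIG_FEATURE declaration order shown above.

import java.util.BitSet;

// Standalone sketch of ScriptState's feature encoding; bit positions assume
// the PIG_FEATURE declaration order shown in the source above.
public class FeatureBitsSketch {

    // Same logic as ScriptState.bitSetToLong(): set bit i of the BitSet
    // becomes bit i of the resulting long.
    static long bitSetToLong(BitSet bs) {
        long ret = 0;
        for (int i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i + 1)) {
            ret |= (1L << i);
        }
        return ret;
    }

    public static void main(String[] args) {
        BitSet features = new BitSet();
        features.set(9);   // GROUP_BY
        features.set(17);  // FILTER

        long encoded = bitSetToLong(features);
        System.out.println("encoded = " + encoded); // 131584 = 2^9 + 2^17

        // Decoding, as featureLongToString() does: probe each bit position.
        for (int i = 0; i < 64; i++) {
            if (((encoded >> i) & 1L) != 0) {
                System.out.println("feature ordinal set: " + i);
            }
        }
    }
}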