org.apache.pig.impl.logicalLayer.LOLoad.java Source code

Introduction

Here is the source code for org.apache.pig.impl.logicalLayer.LOLoad.java.
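
LOLoad is the logical operator that represents a LOAD statement in Pig's logical layer. It records the input FileSpec, instantiates the LoadFunc named in the script, and resolves the relation's schema, preferring a user-enforced schema and otherwise asking the load function for one when it implements LoadMetadata. It also participates in column pruning by forwarding projection pushdown requests to load functions that implement LoadPushDown.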

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.impl.logicalLayer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.LoadPushDown;
import org.apache.pig.PigException;
import org.apache.pig.ResourceSchema;
import org.apache.pig.LoadPushDown.RequiredField;
import org.apache.pig.LoadPushDown.RequiredFieldList;
import org.apache.pig.LoadPushDown.RequiredFieldResponse;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.SchemaMergeException;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.ProjectionMap;
import org.apache.pig.impl.plan.RequiredFields;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.MultiMap;
import org.apache.pig.impl.util.Pair;
import org.apache.hadoop.mapreduce.Job;

public class LOLoad extends RelationalOperator {
    private static final long serialVersionUID = 2L;

    private FileSpec mInputFileSpec;
    transient private LoadFunc mLoadFunc;
    private String mSchemaFile;
    private Schema mEnforcedSchema = null;
    transient private Configuration conf;
    private static Log log = LogFactory.getLog(LOLoad.class);
    private Schema mDeterminedSchema = null;
    private RequiredFieldList requiredFieldList;

    private boolean mDeterminedSchemaCached = false;

    /**
     * @param plan
     *            LogicalPlan this operator is a part of.
     * @param key
     *            OperatorKey for this operator
     * @param inputFileSpec
 *            the file to be loaded
     * @param conf
     *            the read-only configuration object
     *            
     */
    public LOLoad(LogicalPlan plan, OperatorKey key, FileSpec inputFileSpec, Configuration conf)
            throws IOException {
        super(plan, key);
        mInputFileSpec = inputFileSpec;
        // The schema file is the input file itself, since we deduce the
        // schema by looking at the input data.
        mSchemaFile = inputFileSpec.getFileName();
        this.conf = conf;
        // Generate a pseudo alias: in a script like the following there is
        // no explicit alias for the LOLoad operator, yet an alias is required.
        this.mAlias = "" + key.getId();

        try {
            mLoadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(inputFileSpec.getFuncSpec());
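            // Hand the operator's alias to the load function so it can key
            // any state it stores in the UDFContext.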
            mLoadFunc.setUDFContextSignature(getAlias());
        } catch (ClassCastException cce) {
            log.error(inputFileSpec.getFuncSpec() + " should implement the LoadFunc interface.");
            throw new IOException(cce);
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    public FileSpec getInputFile() {
        return mInputFileSpec;
    }

    public void setInputFile(FileSpec inputFileSpec) throws IOException {
        try {
            mLoadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(inputFileSpec.getFuncSpec());
        } catch (ClassCastException cce) {
            log.error(inputFileSpec.getFuncSpec() + " should implement the LoadFunc interface.");
            // Wrap with the original exception as the cause so the full
            // stack trace and cause chain are preserved, as in the constructor.
            throw new IOException(cce);
        } catch (Exception e) {
            throw new IOException(e);
        }
        mInputFileSpec = inputFileSpec;
    }

    public String getSchemaFile() {
        return mSchemaFile;
    }

    public LoadFunc getLoadFunc() {
        return mLoadFunc;
    }

    @Override
    public String name() {
        return "Load " + mKey.scope + "-" + mKey.id;
    }

    @Override
    public Schema getSchema() throws FrontendException {
        if (!mIsSchemaComputed) {
            // get the schema of the load function
            try {
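                // An enforced schema (set by the user) takes precedence over
                // any schema determined from the load function's metadata.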
                if (mEnforcedSchema != null) {
                    mSchema = mEnforcedSchema;
                    return mSchema;
                }

                if (null == mDeterminedSchema) {
                    mSchema = determineSchema();
                }
                mIsSchemaComputed = true;
            } catch (IOException ioe) {
                int errCode = 1018;
                String msg = "Problem determining schema during load";
                FrontendException fee = new FrontendException(msg, errCode, PigException.INPUT, false, null, ioe);
                mIsSchemaComputed = false;
                mSchema = null;
                throw fee;
            }
        }
        return mSchema;
    }

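    /**
     * Asks the load function for a schema when it implements LoadMetadata;
     * the (possibly null) result is cached so the metadata lookup happens
     * at most once.
     */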
    private Schema determineSchema() throws IOException {
        if (!mDeterminedSchemaCached) {
            if (mLoadFunc instanceof LoadMetadata) {
                LoadMetadata loadMetadata = (LoadMetadata) mLoadFunc;
                ResourceSchema rSchema = loadMetadata.getSchema(mInputFileSpec.getFileName(), new Job(conf));
                mDeterminedSchema = Schema.getPigSchema(rSchema);
            }
            // set the flag so that future calls just use mDeterminedSchema
            mDeterminedSchemaCached = true;
        }
        return mDeterminedSchema;
    }

    /* (non-Javadoc)
     * @see org.apache.pig.impl.logicalLayer.LogicalOperator#setSchema(org.apache.pig.impl.logicalLayer.schema.Schema)
     */
    @Override
    public void setSchema(Schema schema) throws FrontendException {
        // In general, operators don't generate their schema until they're
        // asked, so ask them to do it.
        try {
            getSchema();
        } catch (FrontendException fe) {
            // It's fine, it just means we don't have a schema yet.
        }
        if (mSchema == null) {
            log.debug("Operator schema is null; Setting it to new schema");
            mSchema = schema;
        } else {
            log.debug("Reconciling schema");
            log.debug("mSchema: " + mSchema + " schema: " + schema);
            try {
                mSchema = mSchema.mergePrefixSchema(schema, true, true);
            } catch (SchemaMergeException e) {
                int errCode = 1019;
                String msg = "Unable to merge schemas";
                throw new FrontendException(msg, errCode, PigException.INPUT, false, null, e);
            }
        }
    }

    @Override
    public boolean supportsMultipleInputs() {
        return false;
    }

    @Override
    public void visit(LOVisitor v) throws VisitorException {
        v.visit(this);
    }

    public Schema getEnforcedSchema() {
        return mEnforcedSchema;
    }

    /**
     * Set this when the user enforces a schema.
     * @param enforcedSchema the schema enforced by the user
     */
    public void setEnforcedSchema(Schema enforcedSchema) {
        this.mEnforcedSchema = enforcedSchema;
    }

    @Override
    public byte getType() {
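        // A relation produced by a load is a bag of tuples.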
        return DataType.BAG;
    }

    /**
     * @return the determined schema, or null if none was determined
     */
    public Schema getDeterminedSchema() {
        return mDeterminedSchema;
    }

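    /**
     * A load is a source operator: when the determined input schema matches
     * the output schema there is nothing to map; otherwise the map records a
     * column-for-column prefix projection.
     */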
    @Override
    public ProjectionMap getProjectionMap() {
        if (mIsProjectionMapComputed)
            return mProjectionMap;
        mIsProjectionMapComputed = true;

        Schema outputSchema;

        try {
            outputSchema = getSchema();
        } catch (FrontendException fee) {
            mProjectionMap = null;
            return mProjectionMap;
        }

        Schema inputSchema = null;

        List<LogicalOperator> predecessors = mPlan.getPredecessors(this);
        if (predecessors != null) {
            try {
                inputSchema = predecessors.get(0).getSchema();
            } catch (FrontendException fee) {
                mProjectionMap = null;
                return mProjectionMap;
            }
        } else {
            try {
                inputSchema = getSchema();
            } catch (FrontendException fee) {
                mProjectionMap = null;
                return mProjectionMap;
            }
        }

        if (inputSchema == null) {
            if (outputSchema != null) {
                // determineSchema() returned null and the user specified a schema,
                // or the predecessor had no schema and the user specified one
                mProjectionMap = new ProjectionMap(false);
                return mProjectionMap;
            }
        }

        if (Schema.equals(inputSchema, outputSchema, false, true)) {
            // there is a one-to-one mapping between the input and output schemas
            mProjectionMap = new ProjectionMap(false);
            return mProjectionMap;
        } else {
            MultiMap<Integer, ProjectionMap.Column> mapFields = new MultiMap<Integer, ProjectionMap.Column>();
            // compute the mapping, assuming it is a prefix projection
            for (int i = 0; i < inputSchema.size(); ++i) {
                mapFields.put(i, new ProjectionMap.Column(new Pair<Integer, Integer>(0, i)));
            }

            mProjectionMap = new ProjectionMap(mapFields, null, null);
            return mProjectionMap;
        }
    }

    @Override
    public List<RequiredFields> getRequiredFields() {
        List<RequiredFields> requiredFields = new ArrayList<RequiredFields>();
        requiredFields.add(new RequiredFields(false, true));
        return requiredFields;
    }

    @Override
    public List<RequiredFields> getRelevantInputs(int output, int column) throws FrontendException {
        if (!mIsSchemaComputed)
            getSchema();

        if (output != 0)
            return null;

        if (column < 0)
            return null;

        // if we have schema information, check if output column is valid
        if (mSchema != null) {
            if (column >= mSchema.size())
                return null;
        }

        List<RequiredFields> result = new ArrayList<RequiredFields>();
        result.add(new RequiredFields(true));
        return result;
    }

    public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList) throws FrontendException {
        RequiredFieldResponse response = new RequiredFieldResponse(false);
        if (mSchema == null)
            return response;

        if (requiredFieldList.getFields() == null)
            return response;

        this.requiredFieldList = requiredFieldList;
        if (mLoadFunc instanceof LoadPushDown) {
            response = ((LoadPushDown) mLoadFunc).pushProjection(requiredFieldList);
        } else {
            // loadfunc does not support pushing projections
            response = new RequiredFieldResponse(false);
        }
        if (!response.getRequiredFieldResponse())
            return response;

        // Change LOLoad schema to reflect this pruning
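        // Start with every column index marked as pruned, remove the indexes
        // of the required fields, then drop whatever remains from the schema
        // in descending order (via pollLast()) so that removals do not shift
        // the positions of columns still waiting to be removed.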
        TreeSet<Integer> prunedIndexSet = new TreeSet<Integer>();
        for (int i = 0; i < mSchema.size(); i++)
            prunedIndexSet.add(i);

        for (int i = 0; i < requiredFieldList.getFields().size(); i++) {
            RequiredField requiredField = requiredFieldList.getFields().get(i);
            if (requiredField.getIndex() >= 0)
                prunedIndexSet.remove(requiredField.getIndex());
            else {

                try {
                    // getPosition() returns -1 when the alias is not found;
                    // index 0 is a valid column, so test for >= 0 here.
                    int index = mSchema.getPosition(requiredField.getAlias());
                    if (index >= 0)
                        prunedIndexSet.remove(index);
                } catch (FrontendException e) {
                    return new RequiredFieldResponse(false);
                }

            }
        }

        Integer index;
        while ((index = prunedIndexSet.pollLast()) != null)
            mSchema.getFields().remove(index.intValue());

        mIsProjectionMapComputed = false;
        getProjectionMap();
        return response;

    }

    @Override
    public void setAlias(String newAlias) {
        super.setAlias(newAlias);
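        // Keep the load function's UDFContext signature in sync with the new alias.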
        mLoadFunc.setUDFContextSignature(getAlias());
    }

    @Override
    public boolean pruneColumns(List<Pair<Integer, Integer>> columns) throws FrontendException {
        throw new FrontendException("Not implemented");
    }

    public RequiredFieldList getRequiredFieldList() {
        return requiredFieldList;
    }

    public Configuration getConfiguration() {
        return conf;
    }

}
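
Example

The following is a minimal, hypothetical sketch (not part of the original source) showing how an LOLoad operator might be constructed and registered in a LogicalPlan. The input path, scope name, and operator id are made up for illustration; PigStorage, Pig's default loader, is used as the load function.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.pig.FuncSpec;
import org.apache.pig.builtin.PigStorage;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.logicalLayer.LOLoad;
import org.apache.pig.impl.logicalLayer.LogicalPlan;
import org.apache.pig.impl.plan.OperatorKey;

public class LOLoadExample {
    public static void main(String[] args) throws IOException {
        LogicalPlan plan = new LogicalPlan();

        // Describe the input: a path plus the FuncSpec of the load function
        // that parses it.
        FileSpec input = new FileSpec("/tmp/input.txt",
                new FuncSpec(PigStorage.class.getName()));

        // Every logical operator is identified by a scope and an id.
        OperatorKey key = new OperatorKey("scope", 1L);

        LOLoad load = new LOLoad(plan, key, input, new Configuration());
        plan.add(load);

        System.out.println(load.name()); // prints "Load scope-1"
    }
}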