/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.pig.piggybank.storage;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.log4j.Logger;
import org.apache.pig.Expression;
import org.apache.pig.FileInputLoadFunc;
import org.apache.pig.FuncSpec;
import org.apache.pig.LoadCaster;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.LoadPushDown;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.StoreMetadata;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.builtin.Utf8StorageConverter;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.piggybank.storage.allloader.LoadFuncHelper;
import org.apache.pig.piggybank.storage.partition.PathPartitionHelper;

/**
 * The AllLoader provides the ability to point pig at a folder that contains
 * files in multiple formats e.g. PlainText, Gz, Bz, Lzo, HiveRC etc. and have
 * the LoadFunc(s) automatically selected based on the file extension.<br/>
 * <b>How this works:</b><br/>
 * The file extensions are mapped in the pig.properties via the property
 * file.extension.loaders.
 *
 * <p/>
 * <b>file.extension.loaders format</b>
 * <ul>
 * <li>[file extension]:[loader func spec]</li>
 * <li>[file-extension]:[optional path tag]:[loader func spec]</li>
 * <li>[file-extension]:[optional path tag]:[sequence file key value writer
 * class name]:[loader func spec]</li>
 * </ul>
 *
 * <p/>
 * The file.extension.loaders property associates pig loaders with file
 * extensions. If a file does not have an extension the AllLoader will look at
 * the first three bytes of the file and try to guess its format based on:
 * <ul>
 * <li>[ -119, 76, 90 ] = lzo</li>
 * <li>[ 31, -117, 8 ] = gz</li>
 * <li>[ 66, 90, 104 ] = bz2</li>
 * <li>[ 83, 69, 81 ] = seq</li>
 * </ul>
 * <br/>
 * The loader associated with that extension will then be used.
 * <p/>
 *
 * <b>Path partitioning</b> The AllLoader supports hive style path partitioning
 * e.g. /log/type1/daydate=2010-11-01<br/>
 * "daydate" will be considered a partition key and filters can be written
 * against this.<br/>
 * Note that the filter should go into the AllLoader constructor e.g.<br/>
 * a = LOAD 'input' using AllLoader('daydate<\"2010-11-01\"')<br/>
 *
 * <b>Path tags</b> AllLoader supports configuring different loaders for the
 * same extension based on their file path.<br/>
 * E.g.<br/>
 * We have the paths /log/type1, /log/type2<br/>
 * For each of these directories we'd like to use different loaders.<br/>
 * So we set up our loaders:<br/>
 * file.extension.loaders:gz:type1:MyType1Loader, gz:type2:MyType2Loader<br/>
 *
 * <p/>
 * <b>Sequence files</b> Sequence files also support using the Path tags for
 * loader selection but have an extra configuration option that relates to the
 * Key Class used to write the Sequence file.<br/>
 * E.g. for HiveRC this value is: org.apache.hadoop.hive.ql.io.RCFile so we can
 * set up our sequence file loaders:<br/>
 * file.extension.loaders:seq::org.apache.hadoop.hive.ql.io.RCFile:
 * MyHiveRCLoader, seq::DefaultSequenceFileLoader<br/>
 *
 * <p/>
 * <b>Schema</b> The JsonMetadata schema loader is supported and the schema
 * will be loaded using this loader.<br/>
 * In case this fails, the schema can be loaded using the default schema
 * provided.
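 * <p/>
 * <b>Example</b> A minimal sketch of a pig.properties entry; the mapping of the
 * txt extension to PigStorage is an illustrative assumption, and the type1/type2
 * loader names are placeholders taken from the examples above:
 *
 * <pre>
 * file.extension.loaders=txt:org.apache.pig.builtin.PigStorage, gz:type1:MyType1Loader, gz:type2:MyType2Loader
 * </pre>
 *
 * With this in place a script can simply do
 * <code>a = LOAD '/log' USING org.apache.pig.piggybank.storage.AllLoader();</code>
 * and the loader is chosen per file.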
 *
 */
public class AllLoader extends FileInputLoadFunc implements LoadMetadata,
        StoreMetadata, LoadPushDown {

    private static final Logger LOG = Logger.getLogger(AllLoader.class);

    private static final String PROJECTION_ID = AllLoader.class.getName()
            + ".projection";

    transient LoadFunc childLoadFunc;

    transient boolean supportPushDownProjection = false;
    transient RequiredFieldList requiredFieldList;
    transient SortedSet<Integer> requiredFieldHashSet;

    transient TupleFactory tupleFactory = TupleFactory.getInstance();

    transient ResourceSchema schema;

    String signature;

    /**
     * Implements the logic for searching partition keys and applying partition
     * filtering
     */
    transient PathPartitionHelper pathPartitionerHelper = new PathPartitionHelper();

    transient Map<String, String> currentPathPartitionKeyMap;
    transient String[] partitionColumns;

    transient JsonMetadata jsonMetadata;

    transient boolean partitionKeysSet = false;

    LoadFuncHelper loadFuncHelper = null;

    transient Configuration conf;
    transient Path currentPath;

    String constructorPassedPartitionFilter;

    public AllLoader() {
        jsonMetadata = new JsonMetadata();
    }

    public AllLoader(String partitionFilter) {
        this();
        LOG.debug("PartitionFilter: " + partitionFilter.toString());
        constructorPassedPartitionFilter = partitionFilter;
    }

    @Override
    public void setLocation(String location, Job job) throws IOException {
        FileInputFormat.setInputPaths(job, location);

        // called on the front end
        conf = job.getConfiguration();
        loadFuncHelper = new LoadFuncHelper(conf);

        if (constructorPassedPartitionFilter != null) {
            pathPartitionerHelper.setPartitionFilterExpression(
                    constructorPassedPartitionFilter, AllLoader.class, signature);
        }

        getPartitionKeys(location, job);
    }

    @Override
    public LoadCaster getLoadCaster() throws IOException {
        return new Utf8StorageConverter();
    }

    @Override
    public AllLoaderInputFormat getInputFormat() throws IOException {
        // this plugs the AllLoaderInputFormat into the system, which in turn
        // will plug in the AllRecordReader;
        // the AllRecordReader will select and create the correct LoadFunc
        return new AllLoaderInputFormat(signature);
    }

    @Override
    public void prepareToRead(@SuppressWarnings("rawtypes") RecordReader reader,
            PigSplit split) throws IOException {

        AllReader allReader = (AllReader) reader;

        if (currentPath == null || !(currentPath.equals(allReader.path))) {
            currentPathPartitionKeyMap = (partitionColumns == null)
                    ? null
                    : pathPartitionerHelper.getPathPartitionKeyValues(allReader.path.toString());
            currentPath = allReader.path;
        }

        childLoadFunc = allReader.prepareLoadFuncForReading(split);

        String projectProperty = getUDFContext().getProperty(PROJECTION_ID);

        if (projectProperty != null) {
            // load the required field list from the current UDF context
            ByteArrayInputStream input = new ByteArrayInputStream(
                    Base64.decodeBase64(projectProperty.getBytes("UTF-8")));

            ObjectInputStream objInput = new ObjectInputStream(input);

            try {
                requiredFieldList = (RequiredFieldList) objInput.readObject();
            } catch (ClassNotFoundException e) {
                throw new FrontendException(e.toString(), e);
            } finally {
                IOUtils.closeStream(objInput);
            }

            // delegate the projection to the child if it implements LoadPushDown
            if (LoadPushDown.class.isAssignableFrom(childLoadFunc.getClass())) {
                supportPushDownProjection = true;
                ((LoadPushDown) childLoadFunc).pushProjection(requiredFieldList);
            } else {
                if (requiredFieldList != null) {
                    requiredFieldHashSet = new TreeSet<Integer>();
                    for (RequiredField requiredField : requiredFieldList.getFields()) {
                        requiredFieldHashSet.add(requiredField.getIndex());
                    }
                }
            }
        }
    }

    @Override
    public Tuple getNext() throws IOException {
        // delegate work to the child load func selected based on the file type
        // and other criteria.
        // We support push down projection even if the child LoadFunc does not,
        // so in this method we need to look at the supportPushDownProjection
        // flag: if true we use the child's getNext as is, if not we remove the
        // fields that are not required by the pushed down projection here.
        Tuple tuple = null;

        if (supportPushDownProjection) {
            tuple = childLoadFunc.getNext();
        } else if ((tuple = childLoadFunc.getNext()) != null) {
            // ----- If the function does not support projection we do it here
            if (requiredFieldHashSet != null) {
                Tuple projectedTuple = tupleFactory.newTuple(requiredFieldHashSet.size());

                int i = 0;
                int tupleSize = tuple.size();

                for (int index : requiredFieldHashSet) {
                    if (index < tupleSize) {
                        // add the tuple columns
                        projectedTuple.set(i++, tuple.get(index));
                    } else {
                        // add the partition columns
                        projectedTuple.set(i++,
                                currentPathPartitionKeyMap.get(partitionColumns[index - tupleSize]));
                    }
                }

                tuple = projectedTuple;
            }
        }

        return tuple;
    }

    @Override
    public List<OperatorSet> getFeatures() {
        return Arrays.asList(LoadPushDown.OperatorSet.PROJECTION);
    }

    @Override
    public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList)
            throws FrontendException {
        // save the required field list to the UDFContext properties.
        Properties properties = getUDFContext();

        ByteArrayOutputStream byteArray = new ByteArrayOutputStream();
        ObjectOutputStream objOut = null;

        try {
            objOut = new ObjectOutputStream(byteArray);
            objOut.writeObject(requiredFieldList);
        } catch (IOException e) {
            throw new FrontendException(e.toString(), e);
        } finally {
            IOUtils.closeStream(objOut);
        }

        // write out the whole required fields list as a base64 string
        try {
            properties.setProperty(PROJECTION_ID,
                    new String(Base64.encodeBase64(byteArray.toByteArray()), "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            throw new FrontendException(e.toString(), e);
        }

        return new RequiredFieldResponse(true);
    }

    /**
     * Tries to determine the LoadFunc by using the LoadFuncHelper to identify a
     * loader for the first file in the location directory.<br/>
     * If no LoadFunc can be determined a FrontendException is thrown.<br/>
     * If the LoadFunc implements the LoadMetadata interface and returns a non
     * null schema this schema is returned.
     *
     * @param location
     * @param job
     * @return
     * @throws IOException
     */
    private ResourceSchema getSchemaFromLoadFunc(String location, Job job)
            throws IOException {

        ResourceSchema schema = null;

        if (loadFuncHelper == null) {
            loadFuncHelper = new LoadFuncHelper(job.getConfiguration());
        }

        Path firstFile = loadFuncHelper.determineFirstFile(location);

        if (childLoadFunc == null) {
            // choose loader
            FuncSpec funcSpec = loadFuncHelper.determineFunction(location, firstFile);

            if (funcSpec == null) {
                // throw front end exception, no loader could be determined.
                throw new FrontendException("No LoadFunction could be determined for "
                        + location);
            }

            childLoadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(funcSpec);
        }

        LOG.debug("Found LoadFunc: " + childLoadFunc.getClass().getName());

        if (childLoadFunc instanceof LoadMetadata) {
            schema = ((LoadMetadata) childLoadFunc).getSchema(firstFile.toUri().toString(), job);
            LOG.debug("Found schema " + schema + " from loadFunc: "
                    + childLoadFunc.getClass().getName());
        }

        return schema;
    }

    @Override
    public ResourceSchema getSchema(String location, Job job) throws IOException {

        if (schema == null) {
            ResourceSchema foundSchema = jsonMetadata.getSchema(location, job);

            // determine schema from files in location
            if (foundSchema == null) {
                foundSchema = getSchemaFromLoadFunc(location, job);
            }

            // only add the partition keys if the schema is not null;
            // we use the partitionKeysSet flag to only set partition keys once.
            if (!(partitionKeysSet || foundSchema == null)) {
                String[] keys = getPartitionColumns(location, job);

                if (!(keys == null || keys.length == 0)) {
                    // re-edit the pigSchema to contain the new partition keys.
                    ResourceFieldSchema[] fields = foundSchema.getFields();
                    LOG.debug("Schema: " + Arrays.toString(fields));

                    ResourceFieldSchema[] newFields = Arrays.copyOf(fields,
                            fields.length + keys.length);

                    int index = fields.length;

                    for (String key : keys) {
                        newFields[index++] = new ResourceFieldSchema(
                                new FieldSchema(key, DataType.CHARARRAY));
                    }

                    foundSchema.setFields(newFields);

                    LOG.debug("Added partition fields: " + Arrays.toString(keys)
                            + " to loader schema");
                    LOG.debug("Schema is: " + Arrays.toString(newFields));
                }

                partitionKeysSet = true;
            }

            schema = foundSchema;
        }

        return schema;
    }

    @Override
    public ResourceStatistics getStatistics(String location, Job job)
            throws IOException {
        return null;
    }

    @Override
    public void storeStatistics(ResourceStatistics stats, String location, Job job)
            throws IOException {
    }

    @Override
    public void storeSchema(ResourceSchema schema, String location, Job job)
            throws IOException {
        jsonMetadata.storeSchema(schema, location, job);
    }

    /**
     * Reads the partition columns
     *
     * @param location
     * @param job
     * @return
     */
    private String[] getPartitionColumns(String location, Job job) {

        if (partitionColumns == null) {
            // read the partition columns from the UDF Context first;
            // if not in the UDF context then read it using the PathPartitioner.
            Properties properties = getUDFContext();

            if (properties == null) {
                properties = new Properties();
            }

            String partitionColumnStr = properties
                    .getProperty(PathPartitionHelper.PARTITION_COLUMNS);

            if (partitionColumnStr == null && !(location == null || job == null)) {
                // if it hasn't been written yet.
                Set<String> partitionColumnSet;

                try {
                    partitionColumnSet = pathPartitionerHelper.getPartitionKeys(location,
                            job.getConfiguration());
                } catch (IOException e) {
                    RuntimeException rte = new RuntimeException(e);
                    rte.setStackTrace(e.getStackTrace());
                    throw rte;
                }

                if (partitionColumnSet != null) {

                    StringBuilder buff = new StringBuilder();

                    int i = 0;
                    for (String column : partitionColumnSet) {
                        if (i++ != 0) {
                            buff.append(',');
                        }

                        buff.append(column);
                    }

                    String buffStr = buff.toString().trim();

                    if (buffStr.length() > 0) {
                        properties.setProperty(PathPartitionHelper.PARTITION_COLUMNS,
                                buff.toString());
                    }

                    partitionColumns = partitionColumnSet.toArray(new String[] {});
                }

            } else {
                // the partition columns have already been set in the UDF Context
                if (partitionColumnStr != null) {
                    String[] split = partitionColumnStr.split(",");

                    Set<String> partitionColumnSet = new LinkedHashSet<String>();

                    if (split.length > 0) {
                        for (String splitItem : split) {
                            partitionColumnSet.add(splitItem);
                        }
                    }

                    partitionColumns = partitionColumnSet.toArray(new String[] {});
                }
            }

        }

        return partitionColumns;
    }

    @Override
    public String[] getPartitionKeys(String location, Job job) throws IOException {
        String[] partitionKeys = getPartitionColumns(location, job);

        LOG.info("Get Partition Keys for: " + location + " keys: "
                + Arrays.toString(partitionKeys));

        return partitionKeys;
    }

    // --------------- Save Signature and PartitionFilter Expression
    // ----------------- //

    @Override
    public void setUDFContextSignature(String signature) {
        this.signature = signature;
        super.setUDFContextSignature(signature);
    }

    private Properties getUDFContext() {
        return UDFContext.getUDFContext().getUDFProperties(this.getClass(),
                new String[] { signature });
    }

    @Override
    public void setPartitionFilter(Expression partitionFilter) throws IOException {
        LOG.debug("PartitionFilter: " + partitionFilter.toString());
        pathPartitionerHelper.setPartitionFilterExpression(partitionFilter.toString(),
                AllLoader.class, signature);
    }

    /**
     * InputFormat that encapsulates the correct input format based on the file
     * type.
     */
    public static class AllLoaderInputFormat extends
            FileInputFormat<Writable, Writable> {

        transient PathPartitionHelper partitionHelper = new PathPartitionHelper();

        String udfSignature;

        public AllLoaderInputFormat(String udfSignature) {
            super();
            this.udfSignature = udfSignature;
        }

        @Override
        protected List<FileStatus> listStatus(JobContext jobContext) throws IOException {

            List<FileStatus> files = partitionHelper.listStatus(jobContext,
                    AllLoader.class, udfSignature);

            if (files == null)
                files = super.listStatus(jobContext);

            return files;
        }

        @Override
        public RecordReader<Writable, Writable> createRecordReader(InputSplit inputSplit,
                TaskAttemptContext taskAttemptContext) throws IOException,
                InterruptedException {
            // this method plugs the AllReader into the system; the AllReader
            // will, when called, select the correct LoadFunc
            return new AllReader(udfSignature);
        }

    }

    /**
     * This is where the logic is for selecting the correct Loader.
     *
     */
    public static class AllReader extends RecordReader<Writable, Writable> {

        LoadFunc selectedLoadFunc;
        RecordReader<Writable, Writable> selectedReader;
        LoadFuncHelper loadFuncHelper = null;
        String udfSignature;
        Path path;

        public AllReader(String udfSignature) {
            this.udfSignature = udfSignature;
        }

        @SuppressWarnings("unchecked")
        @Override
        public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
                throws IOException, InterruptedException {

            FileSplit fileSplit = (FileSplit) inputSplit;

            path = fileSplit.getPath();
            String fileName = path.toUri().toString();

            // select the correct load function and initialise
            loadFuncHelper = new LoadFuncHelper(taskAttemptContext.getConfiguration());

            FuncSpec funcSpec = loadFuncHelper.determineFunction(fileName);

            if (funcSpec == null) {
                throw new IOException("Cannot determine LoadFunc for " + fileName);
            }

            selectedLoadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(funcSpec);

            selectedLoadFunc.setUDFContextSignature(udfSignature);
            selectedLoadFunc.setLocation(fileName,
                    new Job(taskAttemptContext.getConfiguration(),
                            taskAttemptContext.getJobName()));

            selectedReader = selectedLoadFunc.getInputFormat().createRecordReader(
                    fileSplit, taskAttemptContext);

            selectedReader.initialize(fileSplit, taskAttemptContext);

            LOG.info("Using LoadFunc " + selectedLoadFunc.getClass().getName() + " on "
                    + fileName);
        }

        // ---------------------- all functions below this line delegate work to
        // the selectedReader ------------//

        public LoadFunc prepareLoadFuncForReading(PigSplit split) throws IOException {
            selectedLoadFunc.prepareToRead(selectedReader, split);
            return selectedLoadFunc;
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return selectedReader.nextKeyValue();
        }

        @Override
        public Writable getCurrentKey() throws IOException, InterruptedException {
            return selectedReader.getCurrentKey();
        }

        @Override
        public Writable getCurrentValue() throws IOException, InterruptedException {
            return selectedReader.getCurrentValue();
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return selectedReader.getProgress();
        }

        @Override
        public void close() throws IOException {
            selectedReader.close();
        }

    }

}
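/*
 * Illustrative sketch only, not part of the original AllLoader API: the helper
 * below shows, in isolation, the same Java serialization + Base64 round trip
 * that pushProjection() uses to store the RequiredFieldList in the UDFContext
 * properties and that prepareToRead() uses to restore it. The class and method
 * names here are assumptions made for the example.
 */
class Base64ObjectRoundTripSketch {

    /** Serializes a value to a Base64 string, mirroring pushProjection(). */
    static String toBase64(java.io.Serializable value) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        ObjectOutputStream objOut = new ObjectOutputStream(bytes);
        try {
            objOut.writeObject(value);
        } finally {
            IOUtils.closeStream(objOut);
        }
        // the closed stream has flushed everything into the byte array
        return new String(Base64.encodeBase64(bytes.toByteArray()), "UTF-8");
    }

    /** Restores a value written by toBase64(), mirroring prepareToRead(). */
    static Object fromBase64(String encoded) throws IOException, ClassNotFoundException {
        ObjectInputStream objIn = new ObjectInputStream(new ByteArrayInputStream(
                Base64.decodeBase64(encoded.getBytes("UTF-8"))));
        try {
            return objIn.readObject();
        } finally {
            IOUtils.closeStream(objIn);
        }
    }
}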