// Source: com.twitter.elephanttwin.retrieval.IndexedPigLoader (Twitter Elephant Twin)

/**
 * Copyright 2012 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.twitter.elephanttwin.retrieval;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.log4j.Logger;
import org.apache.pig.LoadFunc;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.UDFContext;

import com.twitter.elephantbird.mapreduce.io.ThriftWritable;
import com.twitter.elephantbird.pig.load.FilterLoadFunc;
import com.twitter.elephantbird.util.TypeRef;
import com.twitter.elephanttwin.gen.FileIndexDescriptor;
import com.twitter.elephanttwin.gen.IndexedField;
import com.twitter.elephanttwin.util.HdfsUtils;

/**
 * This PigLoader "wraps" over a real PigLoader. It adds the
 * capability of reporting to Pig what filtering conditions can be pushed down
 * because of the existence of indexed files for the input files/paths to work
 * on. Example to use this new Loader:
 * <pre>
 * {@code
  T1 = LOAD '/user/mep/testdata' USING
          com.twitter.elephanttwin.retrieval.IndexedPigLoader(
              'com.twitter.elephantbird.pig.load.LzoThriftB64LinePigLoader',
              'com.mycompany.thrift.gen.LogEvent',
              '/user/test/index/hdfs/index001');
  }
 * </pre>
 * The last parameter (the index directory) is optional; if it is omitted, the value from
 * -Dindex.hdfs.directory or from the configuration file is used.
 */
public class IndexedPigLoader extends FilterLoadFunc {
    protected TypeRef<?> typeRef = null;
    protected String realPigLoaderName;
    protected String realInpuFormatName;
    protected String valueClassName;
    protected String indexDir;
    protected String contextSignature;

    public static final String INDEXDIRECTORY = "index.hdfs.directory";
    private static final Logger LOG = Logger.getLogger(IndexedPigLoader.class);

    public IndexedPigLoader(String realPigLoaderName, String valueClassName) {
        this(realPigLoaderName, valueClassName, "");
    }

    public IndexedPigLoader(String realPigLoaderName, String valueClassName, String indexDir) {
        super(null);
        this.realPigLoaderName = realPigLoaderName;
        this.valueClassName = valueClassName;
        this.indexDir = indexDir;
        loader = (LoadFunc) PigContext
                .instantiateFuncFromSpec(String.format("%s('%s')", realPigLoaderName, valueClassName));
        try {
            this.realInpuFormatName = loader.getInputFormat().getClass().getName();
        } catch (IOException e) {
            LOG.error(e);
            throw new RuntimeException(e);
        }
    }

    /**
     * set up LZOHDFSBlockIndexedInputFormat options to be used by the real
     * PigLoader/InputFormat
     */
    @Override
    public void setLocation(String location, Job job) throws IOException {
        loader.setLocation(location, job);
        Properties props = getUDFProperties();
        // Filter conditions were set up through the setPartitionFilter
        // and getPartitionFilter mechanism,

        String filterConditions = props.getProperty(BlockIndexedFileInputFormat.FILTERCONDITIONS);
        if (indexDir == null || indexDir.equals(""))
            indexDir = job.getConfiguration().get(IndexedPigLoader.INDEXDIRECTORY);
        BlockIndexedFileInputFormat.setSearchOptions(job, realInpuFormatName, valueClassName, indexDir,
                filterConditions);
    }

    /**
     * return the "wrap" InputFormat class LZOHDFSBlockIndexedInputFormat for all
     * LZO based PigLoader.
     */
    @SuppressWarnings("rawtypes")
    @Override
    public InputFormat getInputFormat() throws IOException {
        BlockIndexedFileInputFormat format = new BlockIndexedFileInputFormat();
        // deserialize using PigContext
        Properties props = getUDFProperties();
        Expression.BinaryExpression filter;
        String filterConditions = props.getProperty(BlockIndexedFileInputFormat.FILTERCONDITIONS);
        filter = Expression.getFilterCondition(filterConditions);
        format.setSearchFilter(filter);
        return format;
    }

    /**
     * report to pig what columns have been indexed before. The current
     * implementation only reports the columns indexed on all input files the
     * PigLoader need to work on.
     */
    @Override
    public String[] getPartitionKeys(String location, Job job) throws IOException {
        return getUnionedPartitionKeys(location, job);
    }

    private String[] getUnionedPartitionKeys(String location, Job job) throws IOException {
        /**
         * report what columns have been indexed before. The current implementation
         * only reports the columns indexed on all input files the PigLoader need to
         * work on. This is done by inspecting the FileIndexDesriptor of each input
         * file
         */

        if (location == null || location.equals(""))
            return null;

        Configuration conf = job.getConfiguration();
        FileSystem fs = FileSystem.get(conf);

        if (!fs.exists(new Path(indexDir))) {
            LOG.info("index dir:" + indexDir + " does not exist, no indexes will be used");
            return null;
        }
        LOG.info("checking directory:" + new Path(indexDir + new Path(location).toUri().getPath()));
        FileStatus[] fileStatues = fs.globStatus(new Path(indexDir + new Path(location).toUri().getPath()));

        if (fileStatues == null || fileStatues.length == 0) {
            LOG.info("index dir:" + indexDir + location + " does not have indexes, no indexes will be used");
            return null;
        }

        // return all indexed column names from all base file under location which have been previously indexed.
        HashSet<String> indexedColumns = new HashSet<String>();
        List<FileStatus> indexMetaFiles = new ArrayList<FileStatus>();
        for (FileStatus status : fileStatues) {
            HdfsUtils.addInputPathRecursively(indexMetaFiles, fs, status.getPath(), HdfsUtils.hiddenDirectoryFilter,
                    indexMetaPathFilter);
        }
        LOG.info("found " + indexMetaFiles.size() + " index descriptor files");

        for (FileStatus indexMetafile : indexMetaFiles) {
            FSDataInputStream in = fs.open(indexMetafile.getPath());
            ThriftWritable<FileIndexDescriptor> writable = ThriftWritable.newInstance(FileIndexDescriptor.class);
            writable.readFields(in);
            FileIndexDescriptor indexDescriptor = writable.get();

            List<IndexedField> indexedFields = indexDescriptor.getIndexedFields();
            in.close();
            for (IndexedField field : indexedFields) {
                String colName = field.getFieldName();
                indexedColumns.add(colName);
            }
        }

        if (indexedColumns.size() == 0) {
            return null;
        }

        return indexedColumns.toArray(new String[indexedColumns.size()]);
    }

    private final PathFilter indexMetaPathFilter = new PathFilter() {
        // avoid hidden files and directories.
        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            return name.equals(BlockIndexedFileInputFormat.INDEXMETAFILENAME);
        }
    };

    /**
     * get the filter conditions from Pig and set up
     */
    @Override
    public void setPartitionFilter(org.apache.pig.Expression partitionFilter) throws IOException {
        // supporting one index only for now
        if (!Expression.isSupported(partitionFilter))
            throw new IOException("not supported PIG filter condition " + partitionFilter);
        Expression.BinaryExpression filter = Expression.newInstance(partitionFilter);
        String filterCondString = ObjectSerializer.serialize(filter);
        Properties props = getUDFProperties();
        props.setProperty(BlockIndexedFileInputFormat.FILTERCONDITIONS, filterCondString);
    }

    /** UDF properties for this class based on context signature */
    protected Properties getUDFProperties() {
        return UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[] { contextSignature });
    }

    @Override
    public void setUDFContextSignature(String signature) {
        this.contextSignature = signature;
    }
}