brickhouse.udf.dcache.DistributedMapUDF.java Source code

Java tutorial

Introduction

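Here is the source code for brickhouse.udf.dcache.DistributedMapUDF.java, a Hive GenericUDF that looks up values in a map file that has been added to the distributed cache.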

Source

package brickhouse.udf.dcache;
/**
 * Copyright 2012 Klout, Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **/

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

/**
 *   UDF to access a distributed map file.
 *
 *   Assumes the file is a tab-separated file of name-value pairs,
 *   which has been placed in distributed cache using the "add file" command.
 *   NOTE(review): the line serde built in getLineSerde() is initialized with only
 *   the column names and types, so the actual field separator is whatever
 *   LazySimpleSerDe defaults to -- confirm it matches the files being loaded.
 *
 * Example
 *
 *  INSERT OVERWRITE LOCAL DIRECTORY mymap select key,value from my_map_table;
 *  ADD FILE mymap;
 *
 *  select key, val* distributed_map( key, 'mymap') from the_table;
 *
 *
 *   If one argument is passed in, it is assumed to be a filename, containing
 *    a map of type map&lt;string,double&gt;, and the entire map is returned.
 *
 *   If two arguments are passed in, it is either a filename and a string specifying the
 *    type of the map ( i.e. distributed_map('mymap','map&lt;string,bigint&gt;') ), which returns
 *    the entire map, or it is the key and the filename ( i.e. distributed_map( key, 'mymap') ),
 *    in which case only the key's value is returned (as a double).
 *
 *   If three arguments are passed in, they are assumed to be the key, the filename, and the
 *    maptype ( i.e. distributed_map( key, 'mymap', 'map&lt;string,bigint&gt;') ), and only the
 *    key's value is returned.
 *
 *   Loaded maps are cached in a static field keyed by file name, so each file
 *   (or directory of files) is parsed at most once per loaded class.
 */
@UDFType(deterministic = false)
public class DistributedMapUDF extends GenericUDF {
    private static final Logger LOG = Logger.getLogger(DistributedMapUDF.class);
    /** Cache of already-loaded maps, keyed by the file name passed to the UDF. Static, so shared by all instances. */
    private static final HashMap<String, HashMap<Object, Object>> localMapMap = new HashMap<String, HashMap<Object, Object>>();
    /** Inspector for the file-name argument; its position depends on the call form (see initialize). */
    private StringObjectInspector fileNameInspector;
    /** Inspector for the key argument; null when the whole map is returned. */
    private PrimitiveObjectInspector keyInspector;
    private TypeInfo keyType;
    private TypeInfo valType;
    /** Lazily created serde used to parse each line of the map file. */
    private LazySimpleSerDe serde;

    /**
     * Returns the serde used to parse map-file lines, creating it on first use.
     * The serde is configured with two columns, "key" and "value", typed
     * according to keyType and valType (which initialize() must have set).
     *
     * @return the shared LazySimpleSerDe for this UDF instance
     * @throws SerDeException if the serde cannot be initialized
     */
    private LazySimpleSerDe getLineSerde() throws SerDeException {
        if (serde == null) {
            // NOTE(review): this raises the log level of LazySimpleSerDe's logger for the
            // whole JVM; it looks like leftover debugging -- confirm before removing.
            Logger.getLogger(LazySimpleSerDe.class).setLevel(Level.DEBUG);
            serde = new LazySimpleSerDe();
            Configuration job = new Configuration();
            Properties tbl = new Properties();
            tbl.setProperty("columns", "key,value");
            tbl.setProperty("columns.types", keyType.getTypeName() + "," + valType.getTypeName());
            serde.initialize(job, tbl);
        }
        return serde;
    }

    /**
     * Loads every key/value line found under mapFilename into map.
     * Directories are walked recursively; any path ending in "crc" is skipped.
     *
     * @param map          destination map, mutated in place
     * @param mapFilename  path of a file or directory to load
     * @throws IOException    if a directory cannot be listed or a file cannot be read
     * @throws SerDeException if a line cannot be deserialized
     */
    private void addValues(HashMap<Object, Object> map, String mapFilename) throws IOException, SerDeException {
        if (mapFilename.endsWith("crc")) {
            LOG.info(" Ignoring CRC file " + mapFilename);
            return;
        }

        File mapFile = new File(mapFilename);
        if (mapFile.isDirectory()) {
            String[] subFiles = mapFile.list();
            // File.list() returns null on an I/O error; fail with a clear message instead of an NPE.
            if (subFiles == null) {
                throw new IOException("Unable to list directory " + mapFilename);
            }
            for (String subFile : subFiles) {
                LOG.info("Checking recursively " + subFile);
                addValues(map, mapFilename + "/" + subFile);
            }
            return;
        }

        // Resolve the serde and field inspectors before opening the file so that a
        // serde failure cannot leave an open stream behind.
        SerDe lazy = getLineSerde();
        StructObjectInspector lineInsp = (StructObjectInspector) lazy.getObjectInspector();
        StructField keyRef = lineInsp.getStructFieldRef("key");
        StructField valueRef = lineInsp.getStructFieldRef("value");
        PrimitiveObjectInspector keyFieldInsp = (PrimitiveObjectInspector) keyRef.getFieldObjectInspector();
        PrimitiveObjectInspector valFieldInsp = (PrimitiveObjectInspector) valueRef.getFieldObjectInspector();

        // Decode explicitly as UTF-8: each line is wrapped in a Hadoop Text (UTF-8) below,
        // so relying on the platform default charset could corrupt non-ASCII keys.
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(mapFile), "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                Writable lineText = new Text(line);
                Object lineObj = lazy.deserialize(lineText);
                List<Object> objList = lineInsp.getStructFieldsDataAsList(lineObj);
                Object key = keyFieldInsp.getPrimitiveJavaObject(objList.get(0));
                Object val = valFieldInsp.getPrimitiveJavaObject(objList.get(1));
                map.put(key, val);
            }
        } finally {
            // Always release the file handle, even when deserialization fails.
            reader.close();
        }
    }

    /**
     * Returns the map loaded from mapFileName, loading and caching it on first request.
     *
     * @param mapFileName path of the map file or directory, relative to the working directory
     * @return the cached map of Java keys to Java values
     * @throws RuntimeException wrapping any IOException or SerDeException raised while loading
     */
    private Map<Object, Object> getLocalMap(String mapFileName) {
        HashMap<Object, Object> map = localMapMap.get(mapFileName);
        if (map == null) {
            try {
                // Diagnostic listing of the working directory, to help locate the cached file.
                File localDir = new File(".");
                String[] files = localDir.list();
                if (files != null) {
                    for (String file : files) {
                        LOG.info(" In current dir is " + file);
                        File checkFile = new File(file);
                        if (checkFile.isDirectory()) {
                            LOG.info(" FILE " + file + " is a directory");
                        }
                    }
                }
                map = new HashMap<Object, Object>();
                addValues(map, mapFileName);

                // Only cache after a successful load, so a failed load is retried next time.
                localMapMap.put(mapFileName, map);
            } catch (IOException ioExc) {
                LOG.error("Unable to read distributed map file " + mapFileName, ioExc);
                throw new RuntimeException(ioExc);
            } catch (SerDeException serdeExc) {
                throw new RuntimeException(serdeExc);
            }
        }
        return map;
    }

    /**
     * Evaluates the UDF for one row.
     *
     * @param arg0 the deferred arguments, in the order validated by initialize()
     * @return the value for the key when a key argument was supplied (null if the key
     *         is absent), otherwise the whole map
     * @throws HiveException if a deferred argument cannot be evaluated
     */
    @Override
    public Object evaluate(DeferredObject[] arg0) throws HiveException {
        //// if keyInspector has been set, the call is (key, filename [, maptype])
        if (this.keyInspector != null) {
            Object key = keyInspector.getPrimitiveJavaObject(arg0[0].get());
            String mapFileName = this.fileNameInspector.getPrimitiveJavaObject(arg0[1].get());
            Map<Object, Object> map = getLocalMap(mapFileName);
            return map.get(key);
        }

        // No key argument: the call is either (filename) or (filename, maptype).
        // initialize() binds fileNameInspector to argument 0 in both forms, so the
        // file name must be read from argument 0 (argument 1, if present, is the map type).
        String mapFileName = this.fileNameInspector.getPrimitiveJavaObject(arg0[0].get());
        return getLocalMap(mapFileName);
    }

    @Override
    public String getDisplayString(String[] arg0) {
        return "distributed_map()";
    }

    /**
     * Builds the usage/error message reported for invalid arguments.
     *
     * @param err description of the violated expectation
     * @return the formatted message
     */
    public String usage(String err) {
        return " Distributed Map -- Case  " + err;
    }

    /**
     * Parses a Hive type string and returns a standard Java map inspector for it.
     *
     * @param typeStr a Hive type string, expected to describe a map with a primitive key
     * @return the map object inspector for the parsed type
     * @throws UDFArgumentException if the string is not a valid type, is not a map type,
     *                              or the map key is not primitive
     */
    private MapObjectInspector getMapType(String typeStr) throws UDFArgumentException, IllegalArgumentException {
        try {
            TypeInfo hiveType = TypeInfoUtils.getTypeInfoFromTypeString(typeStr);
            if (hiveType.getCategory() != Category.MAP) {
                throw new UDFArgumentException(usage("Type is not map"));
            }
            MapObjectInspector mapInsp = (MapObjectInspector) TypeInfoUtils
                    .getStandardJavaObjectInspectorFromTypeInfo(hiveType);
            if (mapInsp.getMapKeyObjectInspector().getCategory() != Category.PRIMITIVE) {
                throw new UDFArgumentException(usage("Key is not primitive"));
            }

            return mapInsp;
        } catch (IllegalArgumentException badTypeStr) {
            throw new UDFArgumentException(usage("String is not type"));
        }
    }

    /**
     *  Either one, two or three values can be passed in.
     *  If one argument is passed in, it is implied that the
     *   return value is a map&lt;string,double&gt;. If three arguments
     *   are passed in, then it is implied the arguments are the
     *    map key, the map filename, and the map type.
     *
     *    If two arguments are passed in, it is implied that either
     *     a map key and a filename are being passed in,
     *    or a filename and a map return type are being passed in.
     *    The second argument is first tried as a map type; if it does not
     *    parse as one, the (key, filename) form is assumed.
     *
     * @param arg0 inspectors for the call's arguments
     * @return the inspector describing the UDF's return value
     * @throws UDFArgumentException if the argument count or argument types are invalid
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arg0) throws UDFArgumentException {
        if (arg0.length == 0 || arg0.length > 3)
            throw new UDFArgumentException(usage("Between 1 and 3 arguments"));
        switch (arg0.length) {
        case 1:
            //// filename
            if (!(arg0[0] instanceof ConstantObjectInspector) || !(arg0[0] instanceof StringObjectInspector)) {
                throw new UDFArgumentException(usage(" 1 arguments is always name of directory"));
            }
            fileNameInspector = (StringObjectInspector) arg0[0];
            keyType = TypeInfoFactory.stringTypeInfo;
            valType = TypeInfoFactory.doubleTypeInfo;
            return ObjectInspectorFactory.getStandardMapObjectInspector(
                    PrimitiveObjectInspectorFactory.javaStringObjectInspector,
                    PrimitiveObjectInspectorFactory.javaDoubleObjectInspector);

        case 2:
            //// either key, filename ...
            ///  or filename, maptype
            if (!(arg0[1] instanceof ConstantObjectInspector) || !(arg0[1] instanceof StringObjectInspector)
                    || !(arg0[0] instanceof PrimitiveObjectInspector)) {
                throw new UDFArgumentException(
                        usage("2 arguments is eiter key and filename, or a filename and maptype"));
            }
            ConstantObjectInspector mapType = (ConstantObjectInspector) arg0[1];
            String typeStr = mapType.getWritableConstantValue().toString();

            // Only the type parse decides which form this is; keep the try narrow so
            // later argument errors are reported rather than mistaken for "not a map type".
            MapObjectInspector mapInsp;
            try {
                mapInsp = getMapType(typeStr);
            } catch (UDFArgumentException checkMapType) {
                mapInsp = null;
            }

            if (mapInsp == null) {
                /// Assume that it is key, filename
                this.keyInspector = (PrimitiveObjectInspector) arg0[0];
                keyType = TypeInfoFactory.getPrimitiveTypeInfo(keyInspector.getTypeName());
                valType = TypeInfoFactory.doubleTypeInfo;
                this.fileNameInspector = (StringObjectInspector) arg0[1];
                //// Default case is  string, double
                return PrimitiveObjectInspectorFactory.javaDoubleObjectInspector;
            }

            //// able to parse map type ... so it is filename, maptype
            if (!(arg0[0] instanceof StringObjectInspector)) {
                // Report a usage error instead of failing with a ClassCastException below.
                throw new UDFArgumentException(
                        usage("2 arguments is eiter key and filename, or a filename and maptype"));
            }
            keyType = TypeInfoFactory.getPrimitiveTypeInfo(mapInsp.getMapKeyObjectInspector().getTypeName());
            valType = TypeInfoFactory.getPrimitiveTypeInfo(mapInsp.getMapValueObjectInspector().getTypeName());
            fileNameInspector = (StringObjectInspector) arg0[0];
            return ObjectInspectorUtils.getStandardObjectInspector(mapInsp);

        case 3:
            //// key , filename , maptype
            if (!(arg0[1] instanceof ConstantObjectInspector) || !(arg0[1] instanceof StringObjectInspector)
                    || !(arg0[2] instanceof ConstantObjectInspector) || !(arg0[2] instanceof StringObjectInspector)
                    || !(arg0[0] instanceof PrimitiveObjectInspector)) {
                throw new UDFArgumentException(usage("3 arguments are key,filename and maptype"));
            }
            fileNameInspector = (StringObjectInspector) arg0[1];

            ConstantObjectInspector mapType3 = (ConstantObjectInspector) arg0[2];
            String typeStr3 = mapType3.getWritableConstantValue().toString();
            MapObjectInspector mapInspect = this.getMapType(typeStr3);

            keyInspector = (PrimitiveObjectInspector) arg0[0];
            // The key argument is already known to be primitive; what is checked here is
            // that its primitive category agrees with the declared map key type.
            if (keyInspector
                    .getPrimitiveCategory() != ((PrimitiveObjectInspector) mapInspect.getMapKeyObjectInspector())
                            .getPrimitiveCategory()) {
                throw new UDFArgumentException(usage("Key type must match map key type"));
            }

            keyType = TypeInfoFactory.getPrimitiveTypeInfo(keyInspector.getTypeName());

            ObjectInspector valInspector = ObjectInspectorUtils
                    .getStandardObjectInspector(mapInspect.getMapValueObjectInspector());
            /// XXX Can we have non primitives for the values ????
            valType = TypeInfoFactory.getPrimitiveTypeInfo(valInspector.getTypeName());
            return valInspector;
        }
        // Not reached: every length other than 1..3 is rejected above.
        return null;
    }

}