Java tutorial: MilanoLoadFunc, a Pig LoadFunc for reading Milano protobuf files
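The class below is the complete loader: it extends Pig's LoadFunc to read Milano protobuf files and implements LoadMetadata so that, when the files carry TypeMetadata, Pig can discover field names and types on its own and a LOAD statement needs no AS clause. Worked examples are interleaved as comments after buildTuple and getMessageSchema, and a usage sketch follows the listing.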
/**
 * Copyright (C) 2011 Metamarkets http://metamx.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.metamx.milano.pig;

import com.google.common.base.Joiner;
import com.google.common.base.Throwables;
import com.google.protobuf.ByteString;
import com.google.protobuf.Descriptors;
import com.google.protobuf.Message;
import com.metamx.milano.generated.io.MilanoTypeMetadata;
import com.metamx.milano.hadoop.MilanoProtoFileInputFormat;
import com.metamx.milano.hadoop.MilanoProtoFileRecordReader;
import com.metamx.milano.io.MilanoProtoFile;
import com.metamx.milano.proto.MilanoTool;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.log4j.Logger;
import org.apache.pig.Expression;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;

/**
 * A Pig LoadFunc that reads Milano protobuf files, exposing their schema to Pig via LoadMetadata.
 */
public class MilanoLoadFunc extends LoadFunc implements LoadMetadata //, LoadPushDown -- Not yet supported
{
    private static final Logger log = Logger.getLogger(MilanoLoadFunc.class);

    private String udfSignature;
    private MilanoProtoFileRecordReader recordReader;
    private TupleFactory tupleFactory = TupleFactory.getInstance();
    private BagFactory bagFactory = BagFactory.getInstance();
    private MilanoTypeMetadata.TypeMetadata typeMetadata;
    private Descriptors.Descriptor descriptor;

    /**
     * Set the UDF signature, which is used to store the ProtoBuf schema between the client and the mapper/reducer.
     * Called in both the client and the mapper/reducer context.
     *
     * @param signature The UDF signature.
     */
    @Override
    public void setUDFContextSignature(String signature)
    {
        udfSignature = signature;
    }

    @Override
    public void setLocation(String location, Job job) throws IOException
    {
        Path basePath = new Path(location);
        FileSystem fileSystem = basePath.getFileSystem(job.getConfiguration());
        Set<Path> paths = new TreeSet<Path>();

        if (fileSystem.getFileStatus(basePath).isDir()) {
            getPaths(basePath, paths, fileSystem);
        } else {
            paths.add(basePath);
        }

        log.info("Setting input to " + paths);
        FileInputFormat.setInputPaths(job, Joiner.on(',').join(paths));
    }

    private void getPaths(Path baseDirectory, Set<Path> paths, FileSystem fileSystem) throws IOException
    {
        FileStatus[] files = fileSystem.listStatus(baseDirectory);
        for (FileStatus file : files) {
            Path path = file.getPath();
            FileStatus fileStatus = fileSystem.getFileStatus(path);
            if (fileStatus.isDir()) {
                getPaths(path, paths, fileSystem);
            } else {
                // Add the directory that directly contains this file rather than the file
                // itself; FileInputFormat reads every file in each listed directory, and
                // the TreeSet collapses the duplicate adds.
                paths.add(baseDirectory);
            }
        }
    }
    @Override
    public InputFormat getInputFormat() throws IOException
    {
        log.debug("Getting InputFormat");
        return new MilanoProtoFileInputFormat();
    }

    @Override
    public void prepareToRead(RecordReader reader, PigSplit split) throws IOException
    {
        log.debug("Preparing to read");
        recordReader = (MilanoProtoFileRecordReader) reader;
        typeMetadata = recordReader.getMetadata();
        descriptor = MilanoTool.with(typeMetadata).getDescriptor();
    }

    @Override
    public Tuple getNext() throws IOException
    {
        try {
            if (!recordReader.nextKeyValue()) {
                return null;
            }
            return buildTuple(recordReader.getCurrentValue(), descriptor);
        }
        catch (InterruptedException e) {
            log.error("Interrupted", e);
            throw Throwables.propagate(e);
        }
    }

    /**
     * This recursively builds a Pig Tuple from a Message and a Descriptor.
     *
     * @param message    The Message to decode into a Tuple.
     * @param descriptor The Descriptor to use in decoding the Message.
     *
     * @return The new Tuple.
     *
     * @throws IOException Thrown when we receive an unsupported type in the Message/Descriptor (ENUM/BOOLEAN).
     */
    private Tuple buildTuple(Message message, final Descriptors.Descriptor descriptor) throws IOException
    {
        List<Object> tuple = new ArrayList<Object>();
        for (Descriptors.FieldDescriptor fieldDescriptor : descriptor.getFields()) {
            // HACK: For some reason the FieldDescriptor from the Descriptor doesn't match the one from the Message.
            // HACK: Looking the field up by name on the Message's own descriptor gets around that problem.
            Descriptors.FieldDescriptor messageFieldDescriptor = message.getDescriptorForType()
                                                                        .findFieldByName(fieldDescriptor.getName());
            switch (fieldDescriptor.getJavaType()) {
                case INT:
                case LONG:
                case FLOAT:
                case DOUBLE:
                case STRING:
                    tuple.add(message.getField(messageFieldDescriptor));
                    break;

                case BYTE_STRING:
                    // Pig doesn't understand ByteString. Here we convert to a byte[], which Pig does understand.
                    tuple.add(new DataByteArray(((ByteString) message.getField(messageFieldDescriptor)).toByteArray()));
                    break;

                // This functionality is totally untested.
                case MESSAGE:
                    if (fieldDescriptor.isRepeated()) {
                        // We have a bag.
                        List<Tuple> bag = new ArrayList<Tuple>();
                        int count = message.getRepeatedFieldCount(messageFieldDescriptor);
                        for (int i = 0; i < count; i++) {
                            bag.add(buildTuple(
                                (Message) message.getRepeatedField(messageFieldDescriptor, i),
                                fieldDescriptor.getMessageType()
                            ));
                        }
                        tuple.add(bagFactory.newDefaultBag(bag));
                    } else {
                        // Just a tuple.
                        tuple.add(buildTuple(
                            (Message) message.getField(messageFieldDescriptor),
                            messageFieldDescriptor.getMessageType()
                        ));
                    }
                    break;

                case ENUM:
                case BOOLEAN:
                    throw new IOException(String.format("Type %s not supported.", fieldDescriptor.getJavaType().toString()));
            }
        }
        return tupleFactory.newTuple(tuple);
    }
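    // Worked example for buildTuple (a hypothetical message, not part of this project):
    //
    //     message Point { optional int64 x = 1; optional int64 y = 2; }
    //     message Event {
    //         optional string name   = 1;
    //         repeated Point  points = 2;
    //     }
    //
    // An Event with name = "click" and points (1,2) and (3,4) becomes the Pig tuple
    //
    //     (click, {(1,2),(3,4)})
    //
    // Scalar fields map straight into tuple positions, a repeated message field becomes
    // a bag of tuples, a singular message field would become a nested tuple, and an
    // ENUM or BOOLEAN field makes buildTuple throw an IOException.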
    /**
     * This builds a Pig ResourceSchema from the input file(s). It relies on the existence of TypeMetadata,
     * and is how the schema types and names are passed to Pig without the user having to specify them by hand.
     *
     * @param location As passed to relativeToAbsolutePath.
     * @param job      The job.
     *
     * @return A ResourceSchema representing the incoming file(s), or null if TypeMetadata does not exist.
     *
     * @throws IOException Not thrown directly, but thrown from getMessageSchema, where it indicates an unsupported type.
     */
    @Override
    public ResourceSchema getSchema(String location, Job job) throws IOException
    {
        Configuration conf = job.getConfiguration();
        Properties props = ConfigurationUtil.toProperties(conf);

        // HACK: Here we open the file directly to read the TypeMetadata.
        // HACK: There may be a better, more direct way to do this, but it works for now.
        Path path = new Path(location);
        FileSystem fileSystem = path.getFileSystem(conf);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        if (fileStatus.isDir()) {
            log.debug("Path is a directory.");
            path = getFilePath(path, fileSystem);
            if (path == null) {
                return null;
            }
        } else if (!fileSystem.exists(path)) {
            return null;
        }

        MilanoProtoFile.Reader reader = MilanoProtoFile.createReader(fileSystem.open(path));
        typeMetadata = reader.getMetadata();
        reader.close();

        if (typeMetadata == null) {
            return null;
        }
        descriptor = MilanoTool.with(typeMetadata).getDescriptor();

        return new ResourceSchema(getMessageSchema(descriptor));
    }

    /**
     * Finds the first regular file under a path, descending into subdirectories as needed.
     */
    private Path getFilePath(Path path, FileSystem fileSystem) throws IOException
    {
        Path newPath = null;
        FileStatus[] files = fileSystem.listStatus(path);
        for (FileStatus file : files) {
            if (file.isDir()) {
                newPath = getFilePath(file.getPath(), fileSystem);
                if (newPath != null) {
                    break;
                }
            } else {
                newPath = file.getPath();
                break;
            }
        }
        return newPath;
    }

    /**
     * This takes a Descriptor and recursively creates a Pig Schema out of it.
     *
     * @param descriptor The descriptor to use.
     *
     * @return A Schema representing the structure of the descriptor.
     *
     * @throws IOException Thrown if an unsupported type is encountered.
     */
    private Schema getMessageSchema(final Descriptors.Descriptor descriptor) throws IOException
    {
        Schema schema = new Schema();
        for (Descriptors.FieldDescriptor fieldDescriptor : descriptor.getFields()) {
            String name = fieldDescriptor.getName();
            switch (fieldDescriptor.getJavaType()) {
                case INT:
                    schema.add(new Schema.FieldSchema(name, DataType.INTEGER));
                    break;

                case LONG:
                    schema.add(new Schema.FieldSchema(name, DataType.LONG));
                    break;

                case FLOAT:
                    schema.add(new Schema.FieldSchema(name, DataType.FLOAT));
                    break;

                case DOUBLE:
                    schema.add(new Schema.FieldSchema(name, DataType.DOUBLE));
                    break;

                case STRING:
                    schema.add(new Schema.FieldSchema(name, DataType.CHARARRAY));
                    break;

                case BYTE_STRING:
                    schema.add(new Schema.FieldSchema(name, DataType.BYTEARRAY));
                    break;

                case MESSAGE:
                    Descriptors.Descriptor messageType = fieldDescriptor.getMessageType();
                    if (fieldDescriptor.isRepeated()) {
                        // We have a bag.
                        schema.add(new Schema.FieldSchema(name, getMessageSchema(messageType), DataType.BAG));
                    } else {
                        // Just a tuple.
                        schema.add(new Schema.FieldSchema(name, getMessageSchema(messageType), DataType.TUPLE));
                    }
                    break;

                case ENUM:
                case BOOLEAN:
                default:
                    throw new IOException("Unsupported data type.");
            }
        }
        return schema;
    }
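    // For the same hypothetical Event message shown after buildTuple, getMessageSchema
    // reports a schema of roughly
    //
    //     name: chararray, points: {(x: long, y: long)}
    //
    // following the switch above: INT -> int, LONG -> long, FLOAT -> float,
    // DOUBLE -> double, STRING -> chararray, BYTE_STRING -> bytearray,
    // repeated MESSAGE -> bag, singular MESSAGE -> tuple; ENUM and BOOLEAN are rejected.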
    @Override
    public ResourceStatistics getStatistics(String location, Job job) throws IOException
    {
        return null; // Not supported.
    }

    @Override
    public String[] getPartitionKeys(String location, Job job) throws IOException
    {
        return null; // Not supported.
    }

    @Override
    public void setPartitionFilter(Expression partitionFilter) throws IOException
    {
        // No-op, not supported.
    }

/*
    // Not yet supported, scheduled for a later release.
    @Override
    public List<LoadPushDown.OperatorSet> getFeatures()
    {
        return null;
    }

    @Override
    public LoadPushDown.RequiredFieldResponse pushProjection(LoadPushDown.RequiredFieldList requiredFieldList)
        throws FrontendException
    {
        return null;
    }
*/
}
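To finish the tutorial, here is a minimal sketch of driving the loader from Java through Pig's PigServer, run in local mode. The path /tmp/events and the alias name are hypothetical, and the sketch assumes the files there were written with TypeMetadata so that getSchema can report a schema.

import java.util.Iterator;

import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.data.Tuple;

public class MilanoLoadFuncExample
{
    public static void main(String[] args) throws Exception
    {
        // Run Pig in-process against the local filesystem.
        PigServer pig = new PigServer(ExecType.LOCAL);

        // No AS clause: MilanoLoadFunc implements LoadMetadata, so Pig asks
        // getSchema() for the field names and types.
        pig.registerQuery("events = LOAD '/tmp/events' USING com.metamx.milano.pig.MilanoLoadFunc();");

        // Print the schema recovered from the files' TypeMetadata.
        pig.dumpSchema("events");

        // Iterate over the tuples produced by getNext()/buildTuple().
        Iterator<Tuple> it = pig.openIterator("events");
        while (it.hasNext()) {
            System.out.println(it.next());
        }
    }
}

A plain Pig script would use the same LOAD line; the Java wrapper is only there to keep this tutorial's examples in one language.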