Java tutorial: MapOperator (com.datatorrent.demos.mroperator)
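The listing below is MapOperator from the DataTorrent mroperator demo. It wraps the map side of a classic Hadoop MapReduce job as a streaming input operator: each partition reads one HDFS InputSplit through a mapred InputFormat/RecordReader, pushes every record through a user-supplied Mapper (plus an optional combiner), and emits the resulting key/value pairs on its output port. The operator also implements Partitioner so that it can split itself into one instance per input split.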
/*
 * Copyright (c) 2013 DataTorrent, Inc. ALL Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datatorrent.demos.mroperator;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

import com.datatorrent.lib.io.fs.AbstractHDFSInputOperator;
import com.datatorrent.lib.util.KeyHashValPair;
import com.datatorrent.api.Context.OperatorContext;
import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.api.DefaultPartition;
import com.datatorrent.api.Partitioner;
import com.datatorrent.demos.mroperator.ReporterImpl.ReporterType;

/**
 * <p>
 * MapOperator class.
 * </p>
 *
 * @since 0.9.0
 */
@SuppressWarnings({ "unchecked" })
public class MapOperator<K1, V1, K2, V2> extends AbstractHDFSInputOperator implements Partitioner<MapOperator<K1, V1, K2, V2>>
{
  private static final Logger logger = LoggerFactory.getLogger(MapOperator.class);

  private String dirName;
  private boolean emitPartitioningCountOnce = false;
  private boolean emitLastCountOnce = false;
  private int operatorId;
  private Class<? extends InputFormat<K1, V1>> inputFormatClass;
  private transient InputFormat<K1, V1> inputFormat;
  private transient InputSplit inputSplit;
  private Class<? extends InputSplit> inputSplitClass;
  private ByteArrayOutputStream outstream = new ByteArrayOutputStream();
  private transient RecordReader<K1, V1> reader;
  private boolean emittedAll = false;

  public final transient DefaultOutputPort<KeyHashValPair<Integer, Integer>> outputCount = new DefaultOutputPort<KeyHashValPair<Integer, Integer>>();
  public final transient DefaultOutputPort<KeyHashValPair<K2, V2>> output = new DefaultOutputPort<KeyHashValPair<K2, V2>>();

  private transient JobConf jobConf;

  public Class<? extends InputSplit> getInputSplitClass()
  {
    return inputSplitClass;
  }

  public void setInputSplitClass(Class<? extends InputSplit> inputSplitClass)
  {
    this.inputSplitClass = inputSplitClass;
  }

  public Class<? extends InputFormat<K1, V1>> getInputFormatClass()
  {
    return inputFormatClass;
  }

  public void setInputFormatClass(Class<? extends InputFormat<K1, V1>> inputFormatClass)
  {
    this.inputFormatClass = inputFormatClass;
  }

  public String getDirName()
  {
    return dirName;
  }

  public void setDirName(String dirName)
  {
    this.dirName = dirName;
    super.setFilePath(dirName);
  }
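  /*
   * Lifecycle notes: setup() deserializes the InputSplit assigned to this
   * partition and instantiates the configured Mapper (and optional combiner);
   * beginWindow() emits a one-time partition count on outputCount and lazily
   * re-creates the RecordReader if it has not been opened yet.
   */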
  @Override
  public void beginWindow(long windowId)
  {
    if (!emitPartitioningCountOnce) {
      outputCount.emit(new KeyHashValPair<Integer, Integer>(operatorId, 1));
      emitPartitioningCountOnce = true;
    }
    if (reader == null) {
      try {
        reader = inputFormat.getRecordReader(inputSplit, new JobConf(new Configuration()), reporter);
      } catch (IOException e) {
        logger.info("error getting record reader {}", e.getMessage());
      }
    }
    super.beginWindow(windowId);
  }

  @Override
  public void setup(OperatorContext context)
  {
    if (context != null) {
      operatorId = context.getId();
    }
    reporter = new ReporterImpl(ReporterType.Mapper, new Counters());
    outputCollector = new OutputCollectorImpl<K2, V2>();
    Configuration conf = new Configuration();
    try {
      inputFormat = inputFormatClass.newInstance();
      SerializationFactory serializationFactory = new SerializationFactory(conf);
      Deserializer keyDeserializer = serializationFactory.getDeserializer(inputSplitClass);
      keyDeserializer.open(new ByteArrayInputStream(outstream.toByteArray()));
      inputSplit = (InputSplit) keyDeserializer.deserialize(null);
      ((ReporterImpl) reporter).setInputSplit(inputSplit);
      reader = inputFormat.getRecordReader(inputSplit, new JobConf(conf), reporter);
    } catch (Exception e) {
      logger.info("failed to initialize input format {}", inputFormat);
      throw new RuntimeException(e);
    }
    InputStream stream = null;
    if (configFile != null && configFile.length() > 0) {
      stream = ClassLoader.getSystemResourceAsStream("/" + configFile);
      if (stream == null) {
        stream = ClassLoader.getSystemResourceAsStream(configFile);
      }
    }
    if (stream != null) {
      conf.addResource(stream);
    }
    jobConf = new JobConf(conf);
    if (mapClass != null) {
      try {
        mapObject = mapClass.newInstance();
      } catch (Exception e) {
        logger.info("can't instantiate object {}", e.getMessage());
      }
      mapObject.configure(jobConf);
    }
    if (combineClass != null) {
      try {
        combineObject = combineClass.newInstance();
      } catch (Exception e) {
        logger.info("can't instantiate object {}", e.getMessage());
      }
      combineObject.configure(jobConf);
    }
  }

  @Override
  public void activate(OperatorContext context)
  {
  }

  @Override
  public void deactivate()
  {
  }

  @Override
  public void emitTuples(FSDataInputStream stream)
  {
    if (!emittedAll) {
      try {
        K1 key = reader.createKey();
        V1 val = reader.createValue();
        emittedAll = !reader.next(key, val);
        if (!emittedAll) {
          KeyHashValPair<K1, V1> keyValue = new KeyHashValPair<K1, V1>(key, val);
          mapObject.map(keyValue.getKey(), keyValue.getValue(), outputCollector, reporter);
          if (combineObject == null) {
            List<KeyHashValPair<K2, V2>> list = ((OutputCollectorImpl<K2, V2>) outputCollector).getList();
            for (KeyHashValPair<K2, V2> e : list) {
              output.emit(e);
            }
            list.clear();
          }
        }
      } catch (IOException ex) {
        logger.debug(ex.toString());
        throw new RuntimeException(ex);
      }
    }
  }
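  /*
   * endWindow() drains the pairs collected by the Mapper: when a combiner is
   * configured, values are grouped by key and reduced locally before emission;
   * once the reader is exhausted, a -1 count is emitted on outputCount to mark
   * the end of this partition's input.
   */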
  @Override
  public void endWindow()
  {
    List<KeyHashValPair<K2, V2>> list = ((OutputCollectorImpl<K2, V2>) outputCollector).getList();
    if (combineObject != null) {
      Map<K2, List<V2>> cacheObject = new HashMap<K2, List<V2>>();
      for (KeyHashValPair<K2, V2> tuple : list) {
        List<V2> cacheList = cacheObject.get(tuple.getKey());
        if (cacheList == null) {
          cacheList = new ArrayList<V2>();
          cacheList.add(tuple.getValue());
          cacheObject.put(tuple.getKey(), cacheList);
        } else {
          cacheList.add(tuple.getValue());
        }
      }
      list.clear();
      OutputCollector<K2, V2> tempOutputCollector = new OutputCollectorImpl<K2, V2>();
      for (Map.Entry<K2, List<V2>> e : cacheObject.entrySet()) {
        try {
          combineObject.reduce(e.getKey(), e.getValue().iterator(), tempOutputCollector, reporter);
        } catch (IOException e1) {
          logger.info(e1.getMessage());
        }
      }
      list = ((OutputCollectorImpl<K2, V2>) tempOutputCollector).getList();
      for (KeyHashValPair<K2, V2> e : list) {
        output.emit(e);
      }
    }
    if (!emitLastCountOnce && emittedAll) {
      outputCount.emit(new KeyHashValPair<Integer, Integer>(operatorId, -1));
      logger.info("emitting end of file {}", new KeyHashValPair<Integer, Integer>(operatorId, -1));
      emitLastCountOnce = true;
    }
    list.clear();
  }

  private InputSplit[] getSplits(JobConf conf, int numSplits, String path) throws Exception
  {
    FileInputFormat.setInputPaths(conf, new Path(path));
    if (inputFormat == null) {
      inputFormat = inputFormatClass.newInstance();
      String inputFormatClassName = inputFormatClass.getName();
      if (inputFormatClassName.equals("org.apache.hadoop.mapred.TextInputFormat")) {
        ((TextInputFormat) inputFormat).configure(conf);
      } else if (inputFormatClassName.equals("org.apache.hadoop.mapred.KeyValueTextInputFormat")) {
        ((KeyValueTextInputFormat) inputFormat).configure(conf);
      }
    }
    return inputFormat.getSplits(conf, numSplits);
  }

  @Override
  public void partitioned(Map<Integer, Partition<MapOperator<K1, V1, K2, V2>>> partitions)
  {
  }

  @SuppressWarnings("rawtypes")
  @Override
  public Collection<Partition<MapOperator<K1, V1, K2, V2>>> definePartitions(Collection<Partition<MapOperator<K1, V1, K2, V2>>> partitions, int incrementalCapacity)
  {
    Collection c = partitions;
    Collection<Partition<MapOperator<K1, V1, K2, V2>>> operatorPartitions = c;
    Partition<MapOperator<K1, V1, K2, V2>> template = null;
    Iterator<Partition<MapOperator<K1, V1, K2, V2>>> itr = operatorPartitions.iterator();
    template = itr.next();
    Configuration conf = new Configuration();
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    if (outstream.size() == 0) {
      InputSplit[] splits;
      try {
        splits = getSplits(new JobConf(conf), incrementalCapacity + 1, template.getPartitionedInstance().getDirName());
      } catch (Exception e1) {
        logger.info("can't get splits {}", e1.getMessage());
        throw new RuntimeException(e1);
      }
      Collection<Partition<MapOperator<K1, V1, K2, V2>>> operList = new ArrayList<Partition<MapOperator<K1, V1, K2, V2>>>();
      itr = operatorPartitions.iterator();
      int size = splits.length;
      Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass());
      while (size > 0 && itr.hasNext()) {
        Partition<MapOperator<K1, V1, K2, V2>> p = itr.next();
        MapOperator<K1, V1, K2, V2> opr = p.getPartitionedInstance();
        opr.setInputFormatClass(inputFormatClass);
        opr.setMapClass(mapClass);
        opr.setCombineClass(combineClass);
        opr.setConfigFile(configFile);
        try {
          keySerializer.open(opr.getOutstream());
          keySerializer.serialize(splits[size - 1]);
          opr.setInputSplitClass(splits[size - 1].getClass());
        } catch (IOException e) {
          logger.info("error while serializing {}", e.getMessage());
        }
        size--;
        operList.add(p);
      }
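      /*
       * There are more splits than existing partitions: create additional
       * MapOperator instances, one per remaining split, each carrying its own
       * serialized InputSplit.
       */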
      while (size > 0) {
        MapOperator<K1, V1, K2, V2> opr = new MapOperator<K1, V1, K2, V2>();
        opr.setInputFormatClass(inputFormatClass);
        opr.setMapClass(mapClass);
        opr.setCombineClass(combineClass);
        opr.setConfigFile(configFile);
        try {
          keySerializer.open(opr.getOutstream());
          keySerializer.serialize(splits[size - 1]);
          opr.setInputSplitClass(splits[size - 1].getClass());
        } catch (IOException e) {
          logger.info("error while serializing {}", e.getMessage());
        }
        size--;
        operList.add(new DefaultPartition<MapOperator<K1, V1, K2, V2>>(opr));
      }
      try {
        keySerializer.close();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      return operList;
    }
    return null;
  }

  public ByteArrayOutputStream getOutstream()
  {
    return outstream;
  }

  public void setOutstream(ByteArrayOutputStream outstream)
  {
    this.outstream = outstream;
  }

  /**
   * adding map code
   */
  private Class<? extends Mapper<K1, V1, K2, V2>> mapClass;
  private Class<? extends Reducer<K2, V2, K2, V2>> combineClass;
  private transient Mapper<K1, V1, K2, V2> mapObject;
  private transient Reducer<K2, V2, K2, V2> combineObject;
  private transient Reporter reporter;
  private String configFile;

  public String getConfigFile()
  {
    return configFile;
  }

  public void setConfigFile(String configFile)
  {
    this.configFile = configFile;
  }

  private transient OutputCollector<K2, V2> outputCollector;

  public Class<? extends Mapper<K1, V1, K2, V2>> getMapClass()
  {
    return mapClass;
  }

  public void setMapClass(Class<? extends Mapper<K1, V1, K2, V2>> mapClass)
  {
    this.mapClass = mapClass;
  }

  public Class<? extends Reducer<K2, V2, K2, V2>> getCombineClass()
  {
    return combineClass;
  }

  public void setCombineClass(Class<? extends Reducer<K2, V2, K2, V2>> combineClass)
  {
    this.combineClass = combineClass;
  }
}
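How the operator is wired into an application is not part of this listing, so the following is only a minimal sketch, assuming the DataTorrent StreamingApplication/DAG API and the Malhar ConsoleOutputOperator as a sink. The application class MapOnlyWordCountApp, the TokenMap mapper, and the input path are illustrative assumptions; setDirName, setInputFormatClass, and setMapClass are the setters defined on MapOperator above.

package com.datatorrent.demos.mroperator;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

import com.datatorrent.api.DAG;
import com.datatorrent.api.StreamingApplication;
import com.datatorrent.lib.io.ConsoleOutputOperator;

// Hypothetical wiring of MapOperator into a DAG; this is not the demo's own
// application class, just a sketch of how the setters and ports fit together.
public class MapOnlyWordCountApp implements StreamingApplication
{
  // Minimal classic-API mapper used only for this sketch: tokenizes each line
  // and emits (word, 1) pairs.
  public static class TokenMap extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable>
  {
    private static final IntWritable ONE = new IntWritable(1);

    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> out, Reporter reporter) throws IOException
    {
      for (String word : value.toString().split("\\s+")) {
        if (!word.isEmpty()) {
          out.collect(new Text(word), ONE);
        }
      }
    }
  }

  @Override
  public void populateDAG(DAG dag, Configuration conf)
  {
    MapOperator<LongWritable, Text, Text, IntWritable> mapper =
        dag.addOperator("map", new MapOperator<LongWritable, Text, Text, IntWritable>());
    mapper.setDirName("/user/demo/wordcount/input");   // HDFS input directory (assumed path)
    mapper.setInputFormatClass(TextInputFormat.class); // drives split creation and record reading
    mapper.setMapClass(TokenMap.class);                // classic mapred Mapper applied to every record

    // For the sketch, just print the mapped (word, 1) pairs; a real application would
    // connect output (and outputCount) to a reduce-side operator instead.
    ConsoleOutputOperator console = dag.addOperator("console", new ConsoleOutputOperator());
    dag.addStream("mappedPairs", mapper.output, console.input);
  }
}

Setting a combiner via setCombineClass is optional: when it is left null, emitTuples() emits mapped pairs immediately, whereas with a combiner endWindow() groups them by key and reduces them locally first.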