Text2ColumntStorageMR.java Source code

Introduction

Here is the source code for Text2ColumntStorageMR.java, a MapReduce job from Tencent's TDW project that converts comma-separated text files into TDW's column storage format.

Source

/**
* Tencent is pleased to support the open source community by making TDW available.
* Copyright (C) 2014 THL A29 Limited, a Tencent company. All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use 
* this file except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed 
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
* OF ANY KIND, either express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;

import Comm.ConstVar;
import FormatStorage.FieldMap;
import FormatStorage.Head;
import FormatStorage.Unit;
import FormatStorage.Unit.FieldValue;
import FormatStorage.Unit.Record;
import FormatStorage.FieldMap.Field;
import StorageEngineClient.ColumnStorageHiveOutputFormat;
import StorageEngineClient.ColumnStorageInputFormat;
import StorageEngineClient.ColumnStorageSplit;

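/**
 * A MapReduce job that converts comma-separated text input into TDW's column
 * storage format: the mapper parses each line into a typed Record, and the
 * reducer writes the records out through ColumnStorageHiveOutputFormat.
 */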
public class Text2ColumntStorageMR {
    public static final Log LOG = LogFactory.getLog(Text2ColumntStorageMR.class);

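    /**
     * Parses each input line as seven comma-separated fields (byte, short, int,
     * long, float, double, string), packs them into a Record, and emits the
     * record under a random key so records spread across the reducers.
     */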
    public static class TextFileMapper extends MapReduceBase implements Mapper {
        public void configure(JobConf job) {
        }

        public void map(Object key, Object value, OutputCollector output, Reporter reporter) throws IOException {
            String content = ((Text) value).toString();

            String[] values = content.split(",");

            try {

                Record record = new Record((short) values.length);
                record.addValue(new FieldValue(Byte.valueOf(values[0]), (short) 0));
                record.addValue(new FieldValue(Short.valueOf(values[1]), (short) 1));
                record.addValue(new FieldValue(Integer.valueOf(values[2]), (short) 2));
                record.addValue(new FieldValue(Long.valueOf(values[3]), (short) 3));
                record.addValue(new FieldValue(Float.valueOf(values[4]), (short) 4));
                record.addValue(new FieldValue(Double.valueOf(values[5]), (short) 5));
                record.addValue(new FieldValue(values[6], (short) 6));

                LongWritable lw = new LongWritable((long) (Math.random() * 100));

                output.collect(lw, record);
            } catch (Exception e) {
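                // Lines that cannot be parsed into the expected seven typed
                // fields are silently skipped.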
            }
        }

        public void close() {

        }

    }

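    /**
     * Pass-through reducer: writes every incoming record to the output format
     * under a constant key.
     */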
    public static class ColumnStorageReducer extends MapReduceBase implements Reducer {

        public void configure(JobConf job) {
        }

        public void reduce(Object key, Iterator values, OutputCollector output, Reporter reporter)
                throws IOException {
            LongWritable lw = new LongWritable(0);
            while (values.hasNext()) {
                output.collect(lw, (Writable) values.next());
            }
        }

        public void close() {

        }
    }

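    /**
     * Builds the head metadata describing the seven-column schema that matches
     * the fields emitted by the mapper. The string column has no fixed size, so
     * the head is marked variable-length via ConstVar.VarFlag.
     */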
    public static void initHead(Head head) {
        FieldMap fieldMap = new FieldMap();
        fieldMap.addField(new Field(ConstVar.FieldType_Byte, ConstVar.Sizeof_Byte, (short) 0));
        fieldMap.addField(new Field(ConstVar.FieldType_Short, ConstVar.Sizeof_Short, (short) 1));
        fieldMap.addField(new Field(ConstVar.FieldType_Int, ConstVar.Sizeof_Int, (short) 2));
        fieldMap.addField(new Field(ConstVar.FieldType_Long, ConstVar.Sizeof_Long, (short) 3));
        fieldMap.addField(new Field(ConstVar.FieldType_Float, ConstVar.Sizeof_Float, (short) 4));
        fieldMap.addField(new Field(ConstVar.FieldType_Double, ConstVar.Sizeof_Double, (short) 5));
        fieldMap.addField(new Field(ConstVar.FieldType_String, 0, (short) 6));

        head.setFieldMap(fieldMap);

        head.setVar(ConstVar.VarFlag);
    }

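    /**
     * Debugging helper (not called from main): prints the splits that
     * ColumnStorageInputFormat would generate for the configured input path.
     */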
    @SuppressWarnings({ "unchecked", "deprecation" })
    public static void showSplits(JobConf conf) throws IOException {
        ColumnStorageInputFormat inputFormat = new ColumnStorageInputFormat();
        InputSplit[] splits = inputFormat.getSplits(conf, 1);
        int size = splits.length;
        System.out.println("getSplits return size:" + size);
        for (int i = 0; i < size; i++) {
            ColumnStorageSplit split = (ColumnStorageSplit) splits[i];
            System.out.println("split: " + i + ", offset: " + split.getStart() + ", len: " + split.getLength()
                    + ", path: " + conf.get(ConstVar.InputPath) + ", beginLine: " + split.getBeginLine()
                    + ", endLine: " + split.getEndLine());
        }
    }

    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {

        if (args.length != 3) {
            System.out.println("Text2ColumnStorageMR <input> <output> <columnStorageMode>");
            System.exit(-1);
        }

        JobConf conf = new JobConf(Text2ColumntStorageMR.class);

        conf.setJobName("Text2ColumnStorageMR");

        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(4);

        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Unit.Record.class);

        conf.setMapperClass(TextFileMapper.class);
        conf.setReducerClass(ColumnStorageReducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat((Class<? extends OutputFormat>) ColumnStorageHiveOutputFormat.class);
        conf.set("mapred.output.compress", "flase");

        Head head = new Head();
        initHead(head);

        head.toJobConf(conf);

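        // The third argument selects the column storage mode; it is parsed here
        // but not otherwise used in this example.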
        int bt = Integer.parseInt(args[2]);

        FileInputFormat.setInputPaths(conf, args[0]);
        Path outputPath = new Path(args[1]);
        FileOutputFormat.setOutputPath(conf, outputPath);

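        // Remove any previous output so the job can write to a clean directory.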
        FileSystem fs = outputPath.getFileSystem(conf);
        fs.delete(outputPath, true);

        JobClient jc = new JobClient(conf);
        RunningJob rj = jc.submitJob(conf);

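        // Poll the submitted job once a second and print map/reduce progress
        // whenever it changes, or at least every three seconds.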
        String lastReport = "";
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS");
        long reportTime = System.currentTimeMillis();
        long maxReportInterval = 3 * 1000;
        while (!rj.isComplete()) {
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
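                // An interrupted sleep is harmless here; just re-check the job.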
            }

            int mapProgress = Math.round(rj.mapProgress() * 100);
            int reduceProgress = Math.round(rj.reduceProgress() * 100);

            String report = " map = " + mapProgress + "%,  reduce = " + reduceProgress + "%";

            if (!report.equals(lastReport) || System.currentTimeMillis() >= reportTime + maxReportInterval) {

                String output = dateFormat.format(Calendar.getInstance().getTime()) + report;
                System.out.println(output);
                lastReport = report;
                reportTime = System.currentTimeMillis();
            }
        }

        System.exit(rj.isSuccessful() ? 0 : 1);

    }

}
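
To run the job, package the class together with its TDW dependencies (the Comm, FormatStorage, and StorageEngineClient packages) into a jar and submit it through the standard Hadoop launcher, for example: hadoop jar text2columnstorage.jar Text2ColumntStorageMR <input> <output> <columnStorageMode>. The jar name here is only a placeholder. Note that the job deletes the output path before it starts and then polls progress until completion.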