FormatStorage1.MergeFileUtil.java Source code

Introduction

Here is the source code for FormatStorage1.MergeFileUtil.java
Source

/**
* Tencent is pleased to support the open source community by making TDW available.
* Copyright (C) 2014 THL A29 Limited, a Tencent company. All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use 
* this file except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed 
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
* OF ANY KIND, either express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package FormatStorage1;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.util.Progressable;

import StorageEngineClient.CombineFormatStorageFileInputFormat;

@SuppressWarnings("deprecation")
public class MergeFileUtil {
    public static final Log LOG = LogFactory.getLog(MergeFileUtil.class);

    /**
     * Identity mapper for the merge jobs: each incoming pair is passed to the collector
     * exactly as it was received.
     */
    public static class MergeMap extends MapReduceBase
            implements Mapper<LongWritable, IRecord, LongWritable, IRecord> {

        /** Public no-argument constructor. */
        public MergeMap() {
        }

        /**
         * Emits the input pair unchanged.
         *
         * @param key      key supplied by the record reader
         * @param value    record supplied by the record reader
         * @param output   collector that receives the pair
         * @param reporter unused
         * @throws IOException if the collector fails
         */
        @Override
        public void map(final LongWritable key, final IRecord value,
                final OutputCollector<LongWritable, IRecord> output, final Reporter reporter)
                throws IOException {
            output.collect(key, value);
        }

    }

    /**
     * Identity reducer for the legacy merge job: every value grouped under a key is
     * re-emitted with that same key, in iteration order.
     */
    public static class MergeReduce extends MapReduceBase
            implements Reducer<LongWritable, IRecord, LongWritable, IRecord> {

        /**
         * Drains the value iterator and forwards each record to the collector.
         *
         * @param key      key shared by all values
         * @param values   records grouped under {@code key}
         * @param output   collector that receives each (key, record) pair
         * @param reporter unused
         * @throws IOException if the collector fails
         */
        @Override
        public void reduce(final LongWritable key, final Iterator<IRecord> values,
                final OutputCollector<LongWritable, IRecord> output, final Reporter reporter)
                throws IOException {
            for (; values.hasNext();) {
                IRecord record = values.next();
                output.collect(key, record);
            }
        }
    }

    /**
     * Input format for the legacy merge job: every entry listed under {@code mapred.input.dir}
     * becomes one {@link MergeFileSplit}, and each split is read record by record through an
     * {@link IFormatDataFile}.
     */
    public static class MergeIFormatInputFormat<K, V> extends FileInputFormat<LongWritable, IRecord> {

        /**
         * Opens the file named by the split, positions it at record 0 and returns a reader that
         * emits the records in order, keyed by their 1-based position in the file.
         *
         * @param split    must be a {@link MergeFileSplit}
         * @param job      configuration handed to the {@link IFormatDataFile}
         * @param reporter unused
         * @return a reader over the records of the split's file
         * @throws IOException if the file cannot be opened or positioned
         */
        @Override
        public RecordReader<LongWritable, IRecord> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
                throws IOException {
            MergeFileSplit sp = (MergeFileSplit) split;

            final IFormatDataFile ifdf = new IFormatDataFile(job);
            ifdf.open(sp.file);
            ifdf.seek(0);

            return new RecordReader<LongWritable, IRecord>() {

                // Records successfully read so far. Kept as a long so it matches the
                // LongWritable key / long position and cannot wrap on very large files.
                long num = 0;

                @Override
                public boolean next(LongWritable key, IRecord value) throws IOException {
                    // Only count (and key) records that were actually read, so that the
                    // position and progress do not overshoot after the final failed read.
                    if (!ifdf.next(value)) {
                        return false;
                    }
                    num++;
                    key.set(num);
                    return true;
                }

                @Override
                public float getProgress() throws IOException {
                    float total = ifdf.recnum();
                    if (total <= 0) {
                        // Nothing to read: avoid NaN/Infinity from a division by zero.
                        return 0.0f;
                    }
                    return Math.min(1.0f, num / total);
                }

                @Override
                public long getPos() throws IOException {
                    return num;
                }

                @Override
                public IRecord createValue() {
                    return ifdf.getIRecordObj();
                }

                @Override
                public LongWritable createKey() {
                    return new LongWritable(0);
                }

                @Override
                public void close() throws IOException {
                    ifdf.close();
                }
            };
        }

        /**
         * Builds one split per entry found directly under {@code mapred.input.dir}.
         * The {@code numSplits} hint is ignored.
         *
         * @param job       job configuration; {@code mapred.input.dir} names the directory to list
         * @param numSplits ignored
         * @return one {@link MergeFileSplit} per listed entry
         * @throws IOException if the directory cannot be listed
         */
        @Override
        public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
            String inputdir = job.get("mapred.input.dir");
            FileSystem fs = FileSystem.get(job);
            FileStatus[] fss = fs.listStatus(new Path(inputdir));
            if (fss == null) {
                // listStatus may yield null for a missing path; fail with a clear message
                // instead of a NullPointerException.
                throw new IOException("input dir: " + inputdir + " does not exist");
            }
            MergeFileSplit[] splits = new MergeFileSplit[fss.length];
            int i = 0;
            for (FileStatus status : fss) {
                BlockLocation[] blkLocations = fs.getFileBlockLocations(status, 0, status.getLen());

                // No block locations may be reported (e.g. for a zero-length file); fall back
                // to "no preferred host" rather than indexing into an empty array.
                String[] hosts = (blkLocations != null && blkLocations.length > 0)
                        ? blkLocations[0].getHosts()
                        : new String[0];

                splits[i++] = new MergeFileSplit(status.getPath().toString(), status.getLen(), hosts);
            }
            return splits;
        }
    }

    /**
     * Split describing one whole file: its path (as a string), its length in bytes and the
     * hosts preferred for reading it. Only the path and the length are serialized; the host
     * list is not written by {@link #write(DataOutput)}.
     */
    public static class MergeFileSplit implements InputSplit {
        String file;
        long length;
        private String[] hosts;

        /** No-argument constructor; fields are filled in later by {@link #readFields(DataInput)}. */
        public MergeFileSplit() {
        }

        /**
         * @param file   path of the file, as a string
         * @param length length of the file in bytes
         * @param hosts  hosts preferred for reading the file
         */
        public MergeFileSplit(String file, long length, String[] hosts) {
            this.file = file;
            this.length = length;
            this.hosts = hosts;
        }

        @Override
        public long getLength() throws IOException {
            return this.length;
        }

        /**
         * Returns the preferred hosts, or an empty array when none are known (for example on an
         * instance populated through {@link #readFields(DataInput)}), so callers never see null.
         */
        @Override
        public String[] getLocations() throws IOException {
            if (this.hosts == null) {
                return new String[0];
            }
            return this.hosts;
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            this.file = in.readUTF();
            this.length = in.readLong();
            // Hosts are not part of the serialized form; drop any stale value on a reused instance.
            this.hosts = null;
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(this.file);
            out.writeLong(this.length);
        }
    }

    /**
     * Output format that writes every record it receives into a single {@link IFormatDataFile}
     * per task. The file header is rebuilt from the {@code ifdf.head.info} job property.
     */
    public static class MergeIFormatOutputFormat<K, V> extends FileOutputFormat<LongWritable, IRecord> {

        /** Public no-argument constructor. */
        public MergeIFormatOutputFormat() {
        }

        /**
         * Creates the task's output file and returns a writer that appends records to it.
         *
         * @param ignored  unused
         * @param job      job configuration; supplies the output location and {@code ifdf.head.info}
         * @param name     name of the task output file
         * @param progress unused
         * @return a writer that adds each value to the created file and closes it on close
         * @throws IOException if the output file cannot be created
         */
        @Override
        public RecordWriter<LongWritable, IRecord> getRecordWriter(FileSystem ignored, JobConf job, String name,
                Progressable progress) throws IOException {
            final String target = FileOutputFormat.getTaskOutputPath(job, name).toString();
            final IFormatDataFile dataFile = new IFormatDataFile(job);

            final IHead header = new IHead();
            header.fromStr(job.get("ifdf.head.info"));
            dataFile.create(target, header);

            return new DataFileWriter(dataFile);
        }

        /** Writer that appends values to an already created data file; keys are not written. */
        private static final class DataFileWriter implements RecordWriter<LongWritable, IRecord> {
            private final IFormatDataFile dataFile;

            DataFileWriter(IFormatDataFile dataFile) {
                this.dataFile = dataFile;
            }

            @Override
            public void write(LongWritable key, IRecord value) throws IOException {
                dataFile.addRecord(value);
            }

            @Override
            public void close(Reporter reporter) throws IOException {
                dataFile.close();
            }
        }
    }

    /**
     * Merges the files under {@code inputdir} into {@code outputdir} with a map-only job
     * (zero reducers) that reads through {@link CombineFormatStorageFileInputFormat} and writes
     * through {@link MergeIFormatOutputFormat}. The call blocks until the job completes,
     * printing progress to standard error.
     *
     * <p>Errors while polling the job, and an unsuccessful job, are logged rather than thrown.
     *
     * @param inputdir  directory whose files are merged; must contain at least one entry
     * @param outputdir directory to create; must not exist yet
     * @param conf      base configuration for the job
     * @throws IOException if {@code outputdir} exists, {@code inputdir} has no entries, the
     *                     header of the first input file cannot be read, or the job cannot be
     *                     submitted
     */
    public static void run(String inputdir, String outputdir, Configuration conf) throws IOException {
        JobConf job = new JobConf(conf);
        job.setJobName("MergeFileUtil");
        job.setJarByClass(MergeFileUtil.class);

        FileStatus[] fss = listInputFiles(job, inputdir, outputdir);
        storeHeadInfo(job, fss[0]);

        job.setNumReduceTasks(0);

        FileInputFormat.setInputPaths(job, inputdir);
        FileOutputFormat.setOutputPath(job, new Path(outputdir));

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(IRecord.class);

        job.setMapperClass(MergeMap.class);

        job.setInputFormat(CombineFormatStorageFileInputFormat.class);
        job.setOutputFormat(MergeIFormatOutputFormat.class);

        submitAndMonitor(job);
    }

    /**
     * Legacy merge: one map task per input file ({@link MergeIFormatInputFormat}) followed by
     * identity reducers, with one reducer per 512 MB of total input plus one. The call blocks
     * until the job completes, printing progress to standard error.
     *
     * <p>Errors while polling the job, and an unsuccessful job, are logged rather than thrown.
     *
     * @param inputdir  directory whose files are merged; must contain files only
     * @param outputdir directory to create; must not exist yet
     * @param conf      base configuration for the job
     * @throws IOException if {@code outputdir} exists, {@code inputdir} has no entries or
     *                     contains a directory, the header of the first input file cannot be
     *                     read, or the job cannot be submitted
     */
    public static void runold(String inputdir, String outputdir, Configuration conf) throws IOException {
        JobConf job = new JobConf(conf);
        job.setJobName("MergeFileUtil");
        job.setJarByClass(MergeFileUtil.class);

        FileStatus[] fss = listInputFiles(job, inputdir, outputdir);

        for (FileStatus status : fss) {
            if (status.isDir()) {
                throw new IOException("!!!input dir contains directory:\t" + status.getPath().toString());
            }
        }

        storeHeadInfo(job, fss[0]);

        long wholesize = 0;
        for (FileStatus status : fss) {
            wholesize += status.getLen();
        }

        // One reducer per 512 MB of input, always at least one.
        final long bytesPerReducer = 512L * 1024 * 1024;
        int reduces = (int) (wholesize / bytesPerReducer + 1);
        job.setNumReduceTasks(reduces);

        FileInputFormat.setInputPaths(job, inputdir);
        FileOutputFormat.setOutputPath(job, new Path(outputdir));

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(IRecord.class);

        job.setMapperClass(MergeMap.class);
        job.setReducerClass(MergeReduce.class);

        job.setInputFormat(MergeIFormatInputFormat.class);
        job.setOutputFormat(MergeIFormatOutputFormat.class);

        submitAndMonitor(job);
    }

    /** Verifies that the output directory is absent and returns the non-empty listing of the input directory. */
    private static FileStatus[] listInputFiles(JobConf job, String inputdir, String outputdir)
            throws IOException {
        FileSystem fs = FileSystem.get(job);
        if (fs.exists(new Path(outputdir))) {
            throw new IOException("outputdir: " + outputdir + " exist!!!");
        }

        FileStatus[] fss = fs.listStatus(new Path(inputdir));
        if (fss == null || fss.length <= 0) {
            throw new IOException("no input files");
        }
        return fss;
    }

    /** Reads the header of {@code sample} and stores its string form in the job as {@code ifdf.head.info}. */
    private static void storeHeadInfo(JobConf job, FileStatus sample) throws IOException {
        IFormatDataFile ifdf = new IFormatDataFile(job);
        ifdf.open(sample.getPath().toString());
        try {
            job.set("ifdf.head.info", ifdf.fileInfo().head().toStr());
        } finally {
            // Release the file even when reading the header fails.
            ifdf.close();
        }
    }

    /**
     * Submits the job and polls it once per second until it completes, printing a progress line
     * to standard error whenever the percentages change or three seconds have passed.
     */
    private static void submitAndMonitor(JobConf job) throws IOException {
        JobClient jc = new JobClient(job);
        RunningJob rjob = jc.submitJob(job);
        try {
            String lastReport = "";
            // HH = 24-hour clock; the pattern has no AM/PM marker, so a 12-hour "hh" would be ambiguous.
            SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS");
            long reportTime = System.currentTimeMillis();
            long maxReportInterval = 3 * 1000;

            while (!rjob.isComplete()) {
                Thread.sleep(1000);

                int mapProgress = Math.round(rjob.mapProgress() * 100);
                int reduceProgress = Math.round(rjob.reduceProgress() * 100);

                String report = " map = " + mapProgress + "%,  reduce = " + reduceProgress + "%";

                if (!report.equals(lastReport) || System.currentTimeMillis() >= reportTime + maxReportInterval) {
                    String output = dateFormat.format(Calendar.getInstance().getTime()) + report;
                    System.err.println(output);
                    lastReport = report;
                    reportTime = System.currentTimeMillis();
                }
            }

            int state = rjob.getJobState();
            LOG.info(state);
            if (!rjob.isSuccessful()) {
                // Kept non-fatal to preserve the existing contract; make the failure visible.
                LOG.error("merge job did not complete successfully, job state: " + state);
            }
        } catch (IOException e) {
            LOG.error("failed to poll the merge job", e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
            LOG.error("interrupted while waiting for the merge job", e);
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>Arguments: {@code srchdfsdir desthdfsdir [old] [check]}. With fewer than two arguments
     * a usage message is printed and the JVM exits with status 0. A third argument equal to
     * {@code old} selects {@link #runold}; otherwise {@link #run} is used. Afterwards the
     * {@code _logs} directory under the destination is deleted, and when the last argument is
     * {@code check} the record counts of source and destination are compared.
     *
     * @param args command-line arguments as described above
     * @throws IOException propagated from the merge, the cleanup or the check
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 2) {
            System.out.println("please input the inputdir and outputdir in such format \r\n\t"
                    + "$HADOOP_HOME/hadoop jar $jarname FormatStorage1.MergeFileUtil srchdfsdir desthdfsdir");
            System.exit(0);
        }

        final Configuration conf = new Configuration();
        final String srcDir = args[0];
        final String destDir = args[1];

        final boolean legacyMode = args.length >= 3 && args[2].equals("old");
        if (!legacyMode) {
            run(srcDir, destDir, conf);
        } else {
            runold(srcDir, destDir, conf);
        }

        // Remove the job history directory left inside the destination.
        final Path logsDir = new Path(destDir + "/_logs");
        final FileSystem logsFs = logsDir.getFileSystem(conf);
        logsFs.delete(logsDir, true);

        final String lastArg = args[args.length - 1];
        if (lastArg.equals("check")) {
            check(args, conf);
        }
    }

    /**
     * Compares the total record count of the source directory ({@code args[0]}) with that of
     * the destination directory ({@code args[1]}) and prints the result to standard output.
     *
     * @param args command-line arguments; the first two entries are the source and destination
     * @param conf configuration used to reach the file system and open the data files
     * @throws IOException if a directory cannot be listed or a file cannot be read
     */
    private static void check(String[] args, Configuration conf) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        long orirecnum = countRecords(fs, conf, args[0]);
        long newrecnum = countRecords(fs, conf, args[1]);
        System.out.println(args[0] + ":\tmerge result:\torirecnum:\t" + orirecnum + "\tnewrecnum:\t" + newrecnum
                + "\tmerge " + ((orirecnum == newrecnum) ? "success" : "fail"));
    }

    /** Sums the record counts reported by the segment index of every entry listed under {@code dir}. */
    private static long countRecords(FileSystem fs, Configuration conf, String dir) throws IOException {
        FileStatus[] fss = fs.listStatus(new Path(dir));
        if (fss == null) {
            // A missing directory would otherwise surface as a NullPointerException.
            throw new IOException("cannot list directory: " + dir);
        }
        long total = 0;
        for (FileStatus status : fss) {
            IFormatDataFile ifdf = new IFormatDataFile(conf);
            ifdf.open(status.getPath().toString());
            try {
                total += ifdf.segIndex().recnum();
            } finally {
                // Release the file even when reading the record count fails.
                ifdf.close();
            }
        }
        return total;
    }
}