com.shmsoft.dmass.main.MRFreeEedProcess.java Source code

Introduction

Here is the source code for com.shmsoft.dmass.main.MRFreeEedProcess.java, the class that configures and starts the Hadoop processing job.

Source

/*
 *
 * Copyright SHMsoft, Inc. 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.shmsoft.dmass.main;

import com.google.common.io.Files;
import java.io.*;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Properties;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.shmsoft.dmass.data.index.SolrIndex;
import com.shmsoft.dmass.ec2.S3Agent;
import com.shmsoft.dmass.services.Project;
import com.shmsoft.dmass.services.Settings;
import com.shmsoft.dmass.services.Util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Configure and start Hadoop process
 */
public class MRFreeEedProcess extends Configured implements Tool {
    private static final Logger logger = LoggerFactory.getLogger(MRFreeEedProcess.class);

    private byte[] b = new byte[1024];

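    /**
     * Configures and runs the MapReduce job.
     *
     * @param args args[0] is the project file to load, args[1] is the Hadoop output path
     * @return 0 if the job completed successfully, 1 otherwise
     */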
    @Override
    public int run(String[] args) throws Exception {
        // the inventory directory holds all package (zip) files produced by the staging step
        String projectFileName = args[0];
        String outputPath = args[1];
        logger.info("Running Hadoop job");
        logger.info("Input project file = " + projectFileName);
        logger.info("Output path = " + outputPath);

        // Hadoop job configuration
        Configuration configuration = getConf();
        // disable speculative execution so that the same file is never processed twice
        configuration.set("mapred.reduce.tasks.speculative.execution", "false");
        // TODO: even in local mode, the first argument should be a complete project file,
        // not just the inventory
        Project project = Project.getProject();
        if (project == null || project.isEmpty()) {
            // load the project definition from the given project file
            System.out.println("Reading project file " + projectFileName);
            project = new Project().loadFromFile(new File(projectFileName));
            Project.setProject(project);
        }
        project.setProperty(ParameterProcessing.OUTPUT_DIR_HADOOP, outputPath);
        // send complete project information to all mappers and reducers
        configuration.set(ParameterProcessing.PROJECT, project.toString());

        Settings.load();
        configuration.set(ParameterProcessing.SETTINGS_STR, Settings.getSettings().toString());
        configuration.set(ParameterProcessing.METADATA_FILE,
                Files.toString(new File(ColumnMetadata.metadataNamesFile), Charset.defaultCharset()));
        Job job = new Job(configuration);
        job.setJarByClass(MRFreeEedProcess.class);
        job.setJobName("MRFreeEedProcess");

        // key and value types for the job output
        job.setOutputKeyClass(MD5Hash.class);
        job.setOutputValueClass(MapWritable.class);

        // set map and reduce classes
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        // input and output formats: read and write plain text lines
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        //        String delim = "\u0001";
        //        configuration.set("mapred.textoutputformat.separator", delim);
        //        configuration.set("mapreduce.output.textoutputformat.separator", delim);

        logger.debug("project.isEnvHadoop() = {} ", project.isEnvHadoop());
        String inputPath = projectFileName;
        if (project.isEnvHadoop() || Settings.getSettings().isHadoopDebug()) {
            inputPath = formInputPath(project);
        }

        logger.debug("Ready to run, inputPath = {}, outputPath = {}", inputPath, outputPath);
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        SHMcloudLogging.init(false);

        if (Settings.getSettings().isHadoopDebug()) {
            if (new File(outputPath).exists()) {
                Util.deleteDirectory(new File(outputPath));
            }
        }

        SolrIndex.getInstance().init();

        boolean success = job.waitForCompletion(true);
        if (project.isEnvHadoop() && project.isFsS3()) {
            transferResultsToS3(outputPath);
        }

        SolrIndex.getInstance().destroy();

        return success ? 0 : 1;
    }

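    /**
     * Entry point: prints the version and build, then runs the job through
     * Hadoop's ToolRunner on Unix-like systems, or through WindowsRunner on Windows.
     */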
    public static void main(String[] args) throws Exception {
        System.out.println(Version.getVersionAndBuild());
        if (PlatformUtil.isNix()) {
            ToolRunner.run(new MRFreeEedProcess(), args);
        } else {
            WindowsRunner.run(args);
        }
    }

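    /**
     * Builds the comma-separated list of input paths for the Hadoop job.
     * Each project input is written to its own small "inputN" file under the
     * temporary directory; in Hadoop (or Hadoop debug) mode these files are
     * then copied into the project's HDFS work area.
     *
     * @param props the project properties (project code and inputs)
     * @return comma-separated list of input paths to feed to FileInputFormat
     */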
    private String formInputPath(Properties props) throws IOException {
        String projectCode = props.getProperty(ParameterProcessing.PROJECT_CODE).trim();
        String cmd = "hadoop fs -rmr " + ParameterProcessing.WORK_AREA + "/" + projectCode;
        PlatformUtil.runUnixCommand(cmd);
        cmd = "hadoop fs -mkdir " + ParameterProcessing.WORK_AREA + "/" + projectCode;
        PlatformUtil.runUnixCommand(cmd);

        StringBuilder builder = new StringBuilder();
        String[] inputPaths = props.getProperty(ParameterProcessing.PROJECT_INPUTS).split(",");
        inputPaths = loadBalance(inputPaths);
        int inputNumber = 0;
        Project project = Project.getProject();
        Util.deleteDirectory(new File(ParameterProcessing.TMP_DIR_HADOOP + "/"));
        new File(ParameterProcessing.TMP_DIR_HADOOP).mkdirs();
        for (String inputPath : inputPaths) {
            ++inputNumber;
            String tmp = ParameterProcessing.TMP_DIR_HADOOP + "/input" + inputNumber;
            inputPath = inputPath.trim();
            FileUtils.writeStringToFile(new File(tmp), inputPath);

            if (project.isEnvHadoop() || Settings.getSettings().isHadoopDebug()) {
                builder.append(ParameterProcessing.WORK_AREA + "/").append(projectCode).append("/input")
                        .append(inputNumber).append(",");
            } else {
                builder.append(ParameterProcessing.TMP_DIR_HADOOP + "/input").append(inputNumber).append(",");
            }
        }
        if (project.isEnvHadoop() || Settings.getSettings().isHadoopDebug()) {
            File[] files = new File(ParameterProcessing.TMP_DIR_HADOOP).listFiles();
            cmd = "hadoop fs -put ";
            for (File file : files) {
                if (file.getName().startsWith("input")) {
                    cmd = cmd + file.getPath() + " ";
                }
            }
            cmd = cmd + ParameterProcessing.WORK_AREA + "/" + projectCode + "/";
            PlatformUtil.runUnixCommand(cmd);
        } else {
            // local mode: the input files are already in place
        }
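        // drop the trailing comma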
        builder.deleteCharAt(builder.length() - 1);
        return builder.toString();
    }

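    /**
     * Copies the job results from the Hadoop output path to a local staging
     * directory and uploads every "part-*" file to the project's S3 output area.
     *
     * @param hdfsOutputPath the HDFS (or local, in debug mode) output path of the job
     */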
    private void transferResultsToS3(String hdfsOutputPath) {
        try {
            String outputPath = "/mnt/tmp/results";
            File localOutput = new File(outputPath);
            if (localOutput.exists()) {
                Util.deleteDirectory(localOutput);
            }
            localOutput.mkdirs();
            if (!Settings.getSettings().isHadoopDebug()) {
                String cmd = "hadoop fs -copyToLocal " + hdfsOutputPath + "/* " + outputPath;
                PlatformUtil.runUnixCommand(cmd);
            } else {
                String cmd = "cp " + hdfsOutputPath + "/* " + outputPath;
                PlatformUtil.runUnixCommand(cmd);
            }

            File[] parts = localOutput.listFiles();
            S3Agent s3agent = new S3Agent();
            Project project = Project.getProject();
            String run = project.getRun();
            if (!run.isEmpty()) {
                run = run + "/";
            }
            for (File part : parts) {
                String s3key = project.getProjectCode() + "/" + "output/" + run + "results/" + part.getName();
                if (part.getName().startsWith("part")) {
                    s3agent.putFileInS3(part.getPath(), s3key);
                }
            }
        } catch (IOException e) {
            e.printStackTrace(System.out);
        }
    }

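    /**
     * Copies a local file to HDFS, skipping the copy if the destination
     * already exists. Not referenced elsewhere in this class.
     *
     * @param from local source file
     * @param to   destination path in HDFS
     */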
    private void copyToHdfs(String from, String to) throws IOException {
        Configuration configuration = getConf();
        FileSystem fileSystem = FileSystem.get(configuration);

        // Check if the file already exists
        Path path = new Path(to);
        if (fileSystem.exists(path)) {
            System.out.println("File " + to + " already exists");
            return;
        }

        // Create a new file and write data to it.
        FSDataOutputStream out = fileSystem.create(path);
        InputStream in = new BufferedInputStream(new FileInputStream(new File(from)));

        int numBytes = 0;
        while ((numBytes = in.read(b)) > 0) {
            out.write(b, 0, numBytes);
        }

        // Close all the file descriptors
        in.close();
        out.close();
        fileSystem.close();
    }

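    /**
     * When load balancing is enabled, splits each large s3:// input into
     * chunk descriptors of the form "file start end" so that one large file
     * can be spread across several mappers; all other inputs (and all inputs
     * when balancing is disabled) are returned unchanged.
     */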
    private String[] loadBalance(String[] inputPaths) {
        Settings settings = Settings.getSettings();
        if (!settings.isLoadBalance()) {
            return inputPaths;
        }
        S3Agent s3agent = new S3Agent();
        ArrayList<String> balancedPaths = new ArrayList<String>();
        for (String fileName : inputPaths) {
            // for now, only s3:// inputs are balanced;
            // inputs on the local cluster remain unbalanced
            if (fileName.startsWith("s3://")) {
                try {
                    long size = s3agent.getFileSize(fileName);
                    long chunks = size / settings.getBytesPerMapper() + 1;
                    if (chunks == 1) {
                        balancedPaths.add(fileName);
                    } else {
                        for (int chunk = 0; chunk < chunks; ++chunk) {
                            balancedPaths.add(fileName + " " + (chunk * settings.getItemsPerMapper() + 1) + " "
                                    + (chunk + 1) * settings.getItemsPerMapper());
                        }
                        balancedPaths.add(fileName + " " + (chunks + 1) * settings.getItemsPerMapper() + " -1");
                    }
                } catch (Exception e) {
                    e.printStackTrace(System.out);
                }

            } else {
                balancedPaths.add(fileName);
            }
        }
        return balancedPaths.toArray(new String[0]);
    }
}
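
Usage

For reference, here is a minimal sketch (not part of the original source) of how this Tool could be launched programmatically. The project file path and output directory below are hypothetical placeholders; in practice the class is started from its own main() via ToolRunner or WindowsRunner.

package com.shmsoft.dmass.main;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

/** Hypothetical launcher used only to illustrate the expected arguments. */
public class MRFreeEedProcessLauncher {

    public static void main(String[] args) throws Exception {
        String[] jobArgs = {
                "/path/to/project.properties", // args[0]: project file loaded by run()
                "/path/to/hadoop-output"       // args[1]: Hadoop output path for the job
        };
        // ToolRunner parses generic Hadoop options and then calls MRFreeEedProcess.run()
        int exitCode = ToolRunner.run(new Configuration(), new MRFreeEedProcess(), jobArgs);
        System.exit(exitCode);
    }
}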