com.aliyun.odps.graph.local.LocalGraphJobRunner.java Source code

Java tutorial

Introduction

Here is the source code for com.aliyun.odps.graph.local.LocalGraphJobRunner.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.aliyun.odps.graph.local;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import com.aliyun.odps.utils.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.aliyun.odps.Column;
import com.aliyun.odps.Odps;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.PartitionSpec;
import com.aliyun.odps.conf.Configuration;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.graph.GRAPH_CONF;
import com.aliyun.odps.graph.JobConf;
import com.aliyun.odps.graph.job.JobRunner;
import com.aliyun.odps.graph.local.master.Master;
import com.aliyun.odps.graph.local.utils.LocalGraphRunUtils;
import com.aliyun.odps.local.common.Constants;
import com.aliyun.odps.local.common.DownloadMode;
import com.aliyun.odps.local.common.ExceptionCode;
import com.aliyun.odps.local.common.FileSplit;
import com.aliyun.odps.local.common.JobDirecotry;
import com.aliyun.odps.local.common.TableMeta;
import com.aliyun.odps.local.common.WareHouse;
import com.aliyun.odps.local.common.utils.DownloadUtils;
import com.aliyun.odps.local.common.utils.LocalRunUtils;
import com.aliyun.odps.local.common.utils.PartitionUtils;
import com.aliyun.odps.local.common.utils.SchemaUtils;
import com.aliyun.odps.mapred.JobStatus;
import com.aliyun.odps.mapred.RunningJob;
import com.aliyun.odps.mapred.conf.SessionState;

public class LocalGraphJobRunner implements JobRunner {

    private static final Log LOG = LogFactory.getLog(LocalGraphJobRunner.class);
    private String jobId;
    private JobConf conf;

    private Odps odps;
    private RuntimeContext ctx;

    private List<FileSplit> inputs;
    private Map<String, TableInfo> outputs;

    private WareHouse wareHouse;
    private JobDirecotry jobDirecotry;

    private void initialize() throws IOException {
        odps = SessionState.get().getOdps();
        wareHouse = WareHouse.getInstance();
        wareHouse.init(odps, conf);

        jobId = LocalGraphRunUtils.generateLocalGraphTaskName();
        //conf.setJobName(jobId);
        conf.set(Constants.JOB_NAME, jobId);
        ctx = RuntimeContext.create(jobId, conf);
        jobDirecotry = new JobDirecotry();
        inputs = new ArrayList<FileSplit>();
    }

    @Override
    public RunningJob submit() throws OdpsException {
        try {
            initialize();
            LOG.info("run mapreduce job in local mode");
            LOG.info("job id: " + jobId);
            runJob();
        } catch (Exception e) {
            throw new OdpsException(e);
        }
        return new LocalRunningJob(jobId, JobStatus.SUCCEEDED, ctx.getCounters());
    }

    private void runJob() throws Exception {

        //jobDirecotry.writeJobConf();
        // write job configuration to temporary file job.xml
        FileOutputStream fos = new FileOutputStream(jobDirecotry.getJobFile());
        conf.writeXml(fos);
        fos.close();

        processInputs();

        processResources();

        processOutputs();

        @SuppressWarnings({ "unchecked", "rawtypes" })
        Master master = new Master(LocalGraphRunUtils.getGraphJobConf(conf), ctx, inputs, outputs);

        master.run();

        moveOutputs();
        System.out.println("graph task finish");

    }

    private void processInputs() throws OdpsException, IOException {

        int maxGraphWorkers = LocalGraphRunUtils.getMaxGraphTasks();
        TableInfo[] inputTableInfos = LocalGraphRunUtils.getInputTables(conf);

        if (inputTableInfos.length > 64) {
            throw new OdpsException(ExceptionCode.ODPS_0720301);
        }
        for (TableInfo tbl : inputTableInfos) {
            processInput(tbl);
        }

        if (inputs.isEmpty()) {
            inputs.add(FileSplit.NullSplit);
        }

        checkInputsSize(maxGraphWorkers);
    }

    private void processInput(TableInfo tableInfo) throws IOException, OdpsException {
        LOG.info("Processing input: " + tableInfo);

        String projName = tableInfo.getProjectName();
        if (projName == null) {
            projName = SessionState.get().getOdps().getDefaultProject();
        }
        String tblName = tableInfo.getTableName();
        String[] readCols = tableInfo.getCols();

        // ?MR??
        LinkedHashMap<String, String> expectPartsHashMap = tableInfo.getPartSpec();
        PartitionSpec expectParts = null;
        if (expectPartsHashMap != null && expectPartsHashMap.size() > 0) {
            StringBuffer sb = new StringBuffer();
            for (String key : expectPartsHashMap.keySet()) {
                if (sb.length() > 0) {
                    sb.append(",");
                }
                sb.append(key + "=" + expectPartsHashMap.get(key));
            }
            expectParts = new PartitionSpec(sb.toString());
        }

        // ?Table Scheme???
        if (!wareHouse.existsTable(projName, tblName) || wareHouse.getDownloadMode() == DownloadMode.ALWAYS) {

            DownloadUtils.downloadTableSchemeAndData(odps, tableInfo, wareHouse.getLimitDownloadRecordCount(),
                    wareHouse.getInputColumnSeperator());

            if (!wareHouse.existsTable(projName, tblName)) {
                throw new OdpsException("download table from remote host failure");
            }
        }

        // ////warehouse _scheme_????////
        TableMeta whTblMeta = wareHouse.getTableMeta(projName, tblName);
        Column[] whReadFields = LocalRunUtils.getInputTableFields(whTblMeta, readCols);
        List<PartitionSpec> whParts = wareHouse.getPartitions(projName, tblName);
        // //////////////////////

        if (whParts.size() > 0) {
            // partitioned table
            for (PartitionSpec partSpec : whParts) {
                // ?
                if (!match(expectParts, partSpec)) {
                    continue;
                }
                File whSrcDir = wareHouse.getPartitionDir(whTblMeta.getProjName(), whTblMeta.getTableName(),
                        partSpec);
                // add input split only when src dir has data file
                if (LocalRunUtils.listDataFiles(whSrcDir).size() > 0) {

                    // ??warehouse
                    File tempDataDir = jobDirecotry.getInputDir(
                            wareHouse.getRelativePath(whTblMeta.getProjName(), whTblMeta.getTableName(), partSpec));
                    File tempSchemeDir = jobDirecotry.getInputDir(
                            wareHouse.getRelativePath(whTblMeta.getProjName(), whTblMeta.getTableName(), null));
                    wareHouse.copyTable(whTblMeta.getProjName(), whTblMeta.getTableName(), partSpec, readCols,
                            tempSchemeDir, wareHouse.getLimitDownloadRecordCount(),
                            wareHouse.getInputColumnSeperator());
                    for (File file : LocalRunUtils.listDataFiles(tempDataDir)) {
                        inputs.add(new InputSplit(file, whReadFields, 0L, file.length(), tableInfo));
                    }
                }
            }
        } else {
            // not partitioned table
            if (tableInfo.getPartSpec() != null && tableInfo.getPartSpec().size() > 0) {
                throw new IOException(ExceptionCode.ODPS_0720121 + "table " + projName + "." + tblName
                        + " is not partitioned table");
            }

            File whSrcDir = wareHouse.getTableDir(whTblMeta.getProjName(), whTblMeta.getTableName());
            if (LocalRunUtils.listDataFiles(whSrcDir).size() > 0) {
                // ??warehouse
                File tempDataDir = jobDirecotry.getInputDir(
                        wareHouse.getRelativePath(whTblMeta.getProjName(), whTblMeta.getTableName(), null));
                File tempSchemeDir = tempDataDir;
                wareHouse.copyTable(whTblMeta.getProjName(), whTblMeta.getTableName(), null, readCols,
                        tempSchemeDir, wareHouse.getLimitDownloadRecordCount(),
                        wareHouse.getInputColumnSeperator());
                for (File file : LocalRunUtils.listDataFiles(tempDataDir)) {
                    inputs.add(new InputSplit(file, whReadFields, 0L, file.length(), tableInfo));
                }
            }
        }

    }

    private void checkInputsSize(int maxGraphWorkers) throws IOException {
        if (inputs.size() > maxGraphWorkers) {
            String msg = "ODPS-0740002: Too many local-run workers : %s, must be <= %s (specified by local_run parameter 'odps.graph.local.max.workers')";
            msg = String.format(msg, inputs.size(), maxGraphWorkers);
            throw new IOException(msg);
        }
        if (inputs.size() == 0) {
            throw new IOException("ODPS-0720231: Job input is not set");
        }
    }

    private void processResources() throws IOException, OdpsException {

        File resDir = jobDirecotry.getResourceDir();

        String curProjName = SessionState.get().getOdps().getDefaultProject();
        String[] resources = conf.getStrings(GRAPH_CONF.CACHE_RESOURCES);
        int maxResouceNum = 256;

        if (resources == null || resources.length == 0) {
            return;
        }

        Set<String> names = new HashSet<String>(Arrays.asList(resources));
        LOG.info("Start to process resources: " + StringUtils.join(resources, ','));
        if (names.size() > maxResouceNum) {
            throw new IOException(
                    ExceptionCode.ODPS_0720331 + " - define too many cache resources, must be <= " + maxResouceNum);
        }
        long resourceSize = 0;
        URLClassLoader loader = (URLClassLoader) Thread.currentThread().getContextClassLoader();
        ArrayList<URL> cp = new ArrayList<URL>(Arrays.asList(loader.getURLs()));
        for (String name : names) {

            List<String> res = LocalRunUtils.parseResourceName(name, curProjName);

            String projName = res.get(0);
            String resName = res.get(1);

            if (!wareHouse.existsResource(projName, resName)
                    || wareHouse.getDownloadMode() == DownloadMode.ALWAYS) {
                DownloadUtils.downloadResource(odps, projName, resName, wareHouse.getLimitDownloadRecordCount(),
                        wareHouse.getInputColumnSeperator());
                resourceSize += new File(resDir, resName).length();

            }
            wareHouse.copyResource(projName, resName, resDir, wareHouse.getLimitDownloadRecordCount(),
                    wareHouse.getInputColumnSeperator());
            cp.add(new File(resDir, resName).toURI().toURL());
            if (resourceSize > 512 * 1024 * 1024) {
                throw new IOException(ExceptionCode.ODPS_0720071 + " - must be <= 512M");
            }
        }
        URLClassLoader newLoader = new URLClassLoader(cp.toArray(new URL[0]), loader);
        Thread.currentThread().setContextClassLoader(newLoader);
        conf.setClassLoader(newLoader);
    }

    private void processOutputs() throws IOException, OdpsException {
        this.outputs = new HashMap<String, TableInfo>();
        TableInfo[] tableInfos = LocalGraphRunUtils.getOutputTables(conf);
        for (TableInfo table : tableInfos) {
            String label = table.getLabel();
            if (label.equals("__default__")) {
                label = "";
            }
            String projName = table.getProjectName();
            if (projName == null) {
                projName = SessionState.get().getOdps().getDefaultProject();
            }
            String tblName = table.getTableName();

            // FIXME: Support partition
            File tblDir = jobDirecotry.getOutputDir(table.getLabel());
            tblDir.mkdirs();
            TableMeta tblMeta = null;
            if (wareHouse.existsTable(projName, tblName) && wareHouse.getDownloadMode() != DownloadMode.ALWAYS) {
                tblMeta = wareHouse.getTableMeta(projName, tblName);
            } else {
                tblMeta = DownloadUtils.downloadTableInfo(odps, table);
            }
            SchemaUtils.generateSchemaFile(tblMeta, null, tblDir);
            //conf.setOutputSchema(tblMeta.getCols(), label);
            conf.set("odps.mapred.output.schema." + label, SchemaUtils.toString(tblMeta.getCols()));
            outputs.put(label, table);
        }
    }

    private void moveOutputs() throws IOException {
        for (TableInfo table : LocalGraphRunUtils.getOutputTables(conf)) {
            String label = table.getLabel();

            String projName = table.getProjectName();
            if (projName == null) {
                projName = SessionState.get().getOdps().getDefaultProject();
            }
            String tblName = table.getTableName();
            Map<String, String> partSpec = table.getPartSpec();

            File tempTblDir = jobDirecotry.getOutputDir(table.getLabel());
            File whOutputDir = wareHouse.createPartitionDir(projName, tblName, PartitionUtils.convert(partSpec));
            if (wareHouse.existsTable(projName, tblName)) {
                LOG.info("Reload warehouse table:" + tblName);
                if (!wareHouse.isRetainTempData()) {
                    LocalRunUtils.removeDataFiles(whOutputDir);
                }
                wareHouse.copyDataFiles(tempTblDir, null, whOutputDir, wareHouse.getInputColumnSeperator());
            } else {
                LOG.info("Copy output to warehouse: label=" + label + " -> " + whOutputDir.getAbsolutePath());
                FileUtils.copyDirectory(tempTblDir, whOutputDir);
            }
        }
    }

    private boolean match(PartitionSpec expectedParts, PartitionSpec parts) {
        if (expectedParts == null) {
            return true;
        }
        if (parts == null || expectedParts.keys().size() > parts.keys().size()) {
            return false;
        }
        for (String key : expectedParts.keys()) {
            String expectedValue = expectedParts.get(key);
            if (expectedValue == null) {
                continue;
            }
            String value = parts.get(key);
            if (!expectedValue.equals(value)) {
                return false;
            }
        }
        return true;
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = new JobConf(conf);
    }

}