com.aliyun.odps.mapred.BridgeJobRunner.java Source code

Java tutorial

Introduction

Here is the source code for com.aliyun.odps.mapred.BridgeJobRunner.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.aliyun.odps.mapred;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.jar.JarEntry;
import java.util.jar.JarOutputStream;
import java.util.jar.Manifest;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.parser.Feature;
import com.aliyun.odps.Column;
import com.aliyun.odps.Instance;
import com.aliyun.odps.Odps;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.OdpsType;
import com.aliyun.odps.PartitionSpec;
import com.aliyun.odps.Resource;
import com.aliyun.odps.Table;
import com.aliyun.odps.conf.Configured;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.data.VolumeInfo;
import com.aliyun.odps.mapred.bridge.BridgeRunningJob;
import com.aliyun.odps.mapred.bridge.ErrorCode;
import com.aliyun.odps.mapred.bridge.MetaExplorer;
import com.aliyun.odps.mapred.bridge.MetaExplorerImpl;
import com.aliyun.odps.mapred.bridge.utils.Validator;
import com.aliyun.odps.mapred.bridge.utils.ValidatorFactory;
import com.aliyun.odps.mapred.conf.BridgeJobConf;
import com.aliyun.odps.mapred.conf.SessionState;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;

public abstract class BridgeJobRunner extends Configured implements JobRunner, EventListener {

    /** Logger shared by this runner and its subclasses. */
    protected static final Log LOG = LogFactory.getLog(BridgeJobRunner.class);

    /**
     * Prefix that marks an entry of the job's resource list as a file URL; such entries are
     * registered as temporary resources by {@code processTempResources()}.
     */
    private static final String TEMP_RESOURCE_PREFIX = "file:";

    /** Job configuration; created from {@code getConf()} at the start of {@link #submit()}. */
    protected BridgeJobConf job;
    /** Identifier generated in {@link #submit()}: current time in milliseconds followed by eight random digits. */
    protected String jobId;
    /** Set to {@code true} at the end of {@link #tearDown()}; {@link #onComplete()} skips cleanup once it is set. */
    protected boolean isClean;
    /** Task name assigned in {@link #submit()}: {@code "MRonSQL_" + jobId}. */
    protected String taskName;

    /**
     * Maps a resource alias to a resource name (returned by the meta explorer, or taken from the
     * {@code stream.temp.resource.alias} setting). The keys are appended to the job's resource
     * list in {@link #setUp()}; the values are deleted in {@link #tearDown()}.
     */
    protected Map<String, String> aliasToTempResource = new HashMap<String, String>();

    /** Metadata and resource accessor; assigned from {@link #getMetaExplorer()} in {@link #submit()}. */
    protected MetaExplorer metaExplorer;

    /**
     * Builds an in-memory jar archive that contains the current job configuration.
     *
     * <p>The archive consists of an empty default manifest and a single entry,
     * {@code jobconf.xml}, holding the output of {@code job.writeXml(...)}.
     *
     * @return the buffer holding the finished jar archive
     * @throws OdpsException
     *             wrapping any {@link IOException} raised while serializing the configuration or
     *             writing the archive
     */
    private ByteArrayOutputStream createJarArchive() throws OdpsException {
        ByteArrayOutputStream archiveOut = new ByteArrayOutputStream();
        JarOutputStream out = null;
        try {
            // Open archive file
            out = new JarOutputStream(archiveOut, new Manifest());

            // Serialize into a separate buffer first, then copy it into the jar entry
            ByteArrayOutputStream jobOut = new ByteArrayOutputStream();
            job.writeXml(jobOut);

            // Add jobconf entry
            out.putNextEntry(new JarEntry("jobconf.xml"));
            out.write(jobOut.toByteArray());
            out.closeEntry();

            // Close on the success path so that a failure to finish the archive is reported
            out.close();
            return archiveOut;
        } catch (IOException ex) {
            throw new OdpsException(ErrorCode.UNEXPECTED.toString(), ex);
        } finally {
            // Also release the jar stream (and the compressor it holds) when something failed
            // half-way; closing an already closed stream is a no-op.
            if (out != null) {
                try {
                    out.close();
                } catch (IOException ignored) {
                    // Either already reported by the catch block above or irrelevant on this path
                }
            }
        }
    }

    /**
     * Registers the code locations of the framework classes as job resources.
     *
     * <p>For each anchor class the location it was loaded from is registered under a fixed jar
     * alias, using {@code "-" + jobId} as padding. A location shared by several anchor classes is
     * registered only once (under the first alias that reaches it).
     *
     * @throws OdpsException
     *             if one of the locations cannot be registered
     */
    private void applyFrameworkResources() throws OdpsException {
        // Anchor classes and the alias each one is registered under, in registration order
        final Class<?>[] anchors = { Odps.class, Mapper.class, BridgeJobRunner.class, JSON.class };
        final String[] aliases = { "odps-sdk-core.jar", "odps-sdk-mapred.jar", "odps-mapred-bridge.jar",
                "fastjson.jar" };

        final String suffix = "-" + jobId;
        // Absolute paths that have been registered so far
        final Set<String> registered = new HashSet<String>();

        for (int i = 0; i < anchors.length; i++) {
            applyFrameworkResource(anchors[i], aliases[i], suffix, registered);
        }
    }

    /**
     * Registers the location {@code clz} was loaded from as a jar resource and records the
     * returned resource name under {@code alias} in {@link #aliasToTempResource}.
     *
     * <p>Nothing happens when the location is already contained in {@code added}. Registration is
     * attempted up to three times with a three second pause between attempts. If the thread is
     * interrupted while pausing, the interrupt status is restored and the method fails right away
     * instead of continuing to retry.
     *
     * @param clz
     *            class whose code source location is registered
     * @param alias
     *            alias under which the resource name is recorded
     * @param padding
     *            padding passed through to {@code metaExplorer.addFileResourceWithRetry}
     * @param added
     *            absolute paths registered so far; the path is added on success
     * @throws OdpsException
     *             if the code source cannot be determined, if all attempts fail, or if the thread
     *             is interrupted between attempts
     */
    private void applyFrameworkResource(Class<?> clz, String alias, String padding, Set<String> added)
            throws OdpsException {
        final int maxAttempts = 3;
        final long retryDelayMillis = 3000L;

        // Classes without a code source (or location) would otherwise fail with a bare NPE
        if (clz.getProtectionDomain().getCodeSource() == null
                || clz.getProtectionDomain().getCodeSource().getLocation() == null) {
            throw new OdpsException(
                    new IllegalStateException("Cannot determine the code source location of " + clz.getName()));
        }

        String jarFilePath;
        try {
            jarFilePath = new File(clz.getProtectionDomain().getCodeSource().getLocation().toURI())
                    .getAbsolutePath();
        } catch (URISyntaxException ex) {
            throw new OdpsException(ex);
        }
        if (added.contains(jarFilePath)) {
            return;
        }

        int attempts = 0;
        while (true) {
            try {
                aliasToTempResource.put(alias,
                        metaExplorer.addFileResourceWithRetry(jarFilePath, Resource.Type.JAR, padding, true));
                added.add(jarFilePath);
                return;
            } catch (Exception ex) {
                attempts++;
                if (attempts >= maxAttempts) {
                    throw new OdpsException(ex);
                }
                LOG.warn("Adding framework resource " + alias + " failed (attempt " + attempts + " of "
                        + maxAttempts + "), retrying", ex);
                try {
                    Thread.sleep(retryDelayMillis);
                } catch (InterruptedException interrupted) {
                    // Keep the interrupt visible to callers and stop retrying
                    Thread.currentThread().interrupt();
                    throw new OdpsException(ex);
                }
            }
        }
    }

    /**
     * Completes the job configuration and registers the resources the job needs.
     *
     * <p>Steps, in this order:
     * <ol>
     * <li>merge the aliases found in the {@code stream.temp.resource.alias} setting;</li>
     * <li>fill in missing project names and input schemas of the input tables and, when a
     * partitioner class is set but {@code odps.stage.reducer.num} is not, estimate the number of
     * reduce tasks from the input size;</li>
     * <li>fill in missing project names of input and output volumes;</li>
     * <li>fill in project names, column names and output schemas of the output tables;</li>
     * <li>register {@code file:} resources, the jobconf jar and the framework jars;</li>
     * <li>append all recorded aliases to the job's resource list.</li>
     * </ol>
     *
     * <p>The jobconf jar is created before the framework jars are registered and before the
     * final resource list is written, so it contains the configuration as it is at that point.
     *
     * @throws OdpsException
     *             if metadata cannot be read, a requested input column does not exist in its
     *             table, or a resource cannot be registered
     */
    protected void setUp() throws OdpsException {
        // Merge streaming job alias resources if they exist
        mergeStreamingAliases();

        // For a user defined partitioner, estimate the reduce number if it is not set
        boolean isEstimateReduceNum = (job.getPartitionerClass() != null)
                && (job.get("odps.stage.reducer.num") == null);

        TableInfo[] inputs = InputUtils.getTables(job);
        // Inner output in the mapper is not allowed for multiple inputs
        if (inputs != null && inputs.length > 1) {
            job.setMapperInnerOutputEnable(false);
        }
        String project = metaExplorer.getDefaultProject();

        long inputSize = expandInputTables(inputs, project, isEstimateReduceNum);
        if (isEstimateReduceNum) {
            job.setNumReduceTasks(estimateReduceNum(inputSize, job));
        }

        // Add project information for volumes if necessary
        VolumeInfo[] inputVolumes = InputUtils.getVolumes(job);
        if (fillMissingProject(inputVolumes, project)) {
            InputUtils.setVolumes(inputVolumes, job);
        }
        VolumeInfo[] outputVolumes = OutputUtils.getVolumes(job);
        if (fillMissingProject(outputVolumes, project)) {
            OutputUtils.setVolumes(outputVolumes, job);
        }

        expandOutputTables(project);

        processTempResources();

        // Adding jobconf jar
        byte[] jobConfJar = createJarArchive().toByteArray();
        String resName = metaExplorer.addTempResourceWithRetry(new ByteArrayInputStream(jobConfJar),
                jobId + ".jar", Resource.Type.JAR);
        aliasToTempResource.put("jobconf.jar", resName);

        applyFrameworkResources();

        // Resource list = resources still configured on the job + every recorded alias
        List<String> totalRes = new ArrayList<String>();
        String[] resources = job.getResources();
        if (resources != null) {
            Collections.addAll(totalRes, resources);
        }
        totalRes.addAll(aliasToTempResource.keySet());
        job.setResources(StringUtils.join(totalRes, ","));
    }

    /**
     * Parses the {@code stream.temp.resource.alias} setting (a JSON object) and adds its entries
     * to {@link #aliasToTempResource}. Does nothing when the setting is absent.
     */
    @SuppressWarnings("unchecked")
    private void mergeStreamingAliases() throws OdpsException {
        String aliasJson = job.get("stream.temp.resource.alias");
        if (aliasJson == null) {
            return;
        }
        try {
            aliasToTempResource
                    .putAll((Map<String, String>) JSON.parseObject(aliasJson, Map.class, Feature.OrderedField));
        } catch (JSONException e) {
            throw new OdpsException("parse stream temp resource alias json failed!", e);
        }
    }

    /**
     * Fills in project name and input schema for every input table and writes the table list back
     * to the job when a table description was changed.
     *
     * @return the summed size of the inputs when {@code accumulateSize} is set, otherwise 0
     */
    private long expandInputTables(TableInfo[] infos, String project, boolean accumulateSize)
            throws OdpsException {
        if (infos == null) {
            return 0;
        }
        long inputSize = 0;
        boolean changed = false;
        for (TableInfo info : infos) {
            if (info.getProjectName() == null) {
                changed = true;
                info.setProjectName(project);
            }

            Table tbl = metaExplorer.getTable(info.getProjectName(), info.getTableName());
            List<Column> schema = tbl.getSchema().getColumns();
            String[] inputCols = getInputColumnsFromCommandSettings(job, info);
            if (inputCols.length == 0 && info.getCols() == null) {
                // No column selection at all: use every column of the table schema
                changed = true;
                Column[] columns = schema.toArray(new Column[schema.size()]);
                job.setInputSchema(info, columns);
                info.setCols(SchemaUtils.getNames(columns));
            } else {
                // Columns named in the command settings win over the ones set on the table info
                if (inputCols.length == 0) {
                    inputCols = info.getCols();
                }
                job.setInputSchema(info, resolveInputColumns(info, schema, inputCols));
            }

            if (accumulateSize) {
                PartitionSpec part = info.getPartitionSpec();
                if (!part.isEmpty()) {
                    // for partition table input
                    inputSize += tbl.getPartition(part).getSize();
                } else {
                    inputSize += tbl.getSize();
                }
            }
        }
        if (changed) {
            InputUtils.setTables(infos, job);
        }
        return inputSize;
    }

    /**
     * Looks up each requested column name (case-insensitively) in the table schema, preserving
     * the requested order. Fails when a name has no match instead of leaving a {@code null}
     * entry in the result.
     */
    private static Column[] resolveInputColumns(TableInfo info, List<Column> schema, String[] names)
            throws OdpsException {
        Column[] columns = new Column[names.length];
        for (int k = 0; k < names.length; k++) {
            for (Column c : schema) {
                if (c.getName().equalsIgnoreCase(names[k])) {
                    columns[k] = c;
                    break;
                }
            }
            if (columns[k] == null) {
                throw new OdpsException(new IllegalArgumentException("Input column '" + names[k]
                        + "' does not exist in table " + info.getProjectName() + "." + info.getTableName()));
            }
        }
        return columns;
    }

    /**
     * Sets {@code project} on every volume that has no project name.
     *
     * @return {@code true} if at least one volume was modified
     */
    private static boolean fillMissingProject(VolumeInfo[] volumes, String project) {
        boolean changed = false;
        if (volumes != null) {
            for (VolumeInfo volume : volumes) {
                if (volume.getProjectName() == null) {
                    changed = true;
                    volume.setProjectName(project);
                }
            }
        }
        return changed;
    }

    /**
     * Fills in project name, column names and output schema for every output table and writes
     * the table list back to the job. Without output tables a single STRING column named
     * {@code nil} is set as the schema of the default label.
     */
    private void expandOutputTables(String project) throws OdpsException {
        TableInfo[] infos = OutputUtils.getTables(job);
        if (infos == null) {
            job.setOutputSchema(new Column[] { new Column("nil", OdpsType.STRING) }, TableInfo.DEFAULT_LABEL);
            return;
        }
        for (TableInfo info : infos) {
            if (info.getProjectName() == null) {
                info.setProjectName(project);
            }
            List<Column> schema = metaExplorer.getTable(info.getProjectName(), info.getTableName()).getSchema()
                    .getColumns();
            Column[] schemaArray = schema.toArray(new Column[schema.size()]);
            info.setCols(SchemaUtils.getNames(schemaArray));
            job.setOutputSchema(schemaArray, info.getLabel());
        }
        OutputUtils.setTables(infos, job);
    }

    /**
     * Reads the input column selection for a table from the setting
     * {@code odps.mapred.input.columns.<project>.<table>}.
     *
     * <p>The setting is a comma separated list of column names. Whitespace around a name is
     * removed and empty entries are skipped, so {@code "a, b"} yields {@code a} and {@code b},
     * and an empty setting is treated like an absent one.
     *
     * @param job
     *            configuration to read the setting from
     * @param info
     *            table whose project and table name select the setting
     * @return the configured column names; an empty array when none are configured
     */
    private String[] getInputColumnsFromCommandSettings(BridgeJobConf job, TableInfo info) {
        String fullTableName = info.getProjectName() + "." + info.getTableName();
        String colsSetting = job.get("odps.mapred.input.columns." + fullTableName);
        if (colsSetting == null) {
            return new String[0];
        }
        List<String> columns = new ArrayList<String>();
        for (String token : colsSetting.split(",")) {
            String name = token.trim();
            if (name.length() > 0) {
                columns.add(name);
            }
        }
        return columns.toArray(new String[columns.size()]);
    }

    /**
     * Estimates the number of reduce tasks as one third of the (estimated) number of map tasks,
     * plus one.
     *
     * <p>The number of map tasks is derived from {@code inputSize} and the configured split size
     * (taken as megabytes). When the input size is not positive, or the split size is not
     * positive, the configured number of map tasks is used instead.
     *
     * @param inputSize
     *            total input size in bytes
     * @param job
     *            configuration providing the split size and the number of map tasks
     * @return the estimated number of reduce tasks, at least 1
     */
    private int estimateReduceNum(long inputSize, BridgeJobConf job) {
        // Long literals first so the product is computed in 64 bits whatever getSplitSize() returns
        long splitSize = 1024L * 1024L * job.getSplitSize(); // bytes
        int numMapTasks;
        if (inputSize > 0 && splitSize > 0) {
            // Clamp before narrowing so that the "+ 1" cannot overflow
            numMapTasks = (int) Math.min(Integer.MAX_VALUE - 1, inputSize / splitSize) + 1;
        } else {
            numMapTasks = job.getNumMapTasks();
        }

        return Math.max(1, numMapTasks / 3 + 1);
    }

    /**
     * Registers the {@code file:} entries of the job's resource list as temporary resources.
     *
     * <p>Every resource whose name starts with {@code file:} (compared case-insensitively and
     * independently of the default locale) is parsed as a URL and its path is registered through
     * {@code metaExplorer.addFileResourceWithRetry} with padding {@code "_" + jobId}; the
     * returned name is recorded in {@link #aliasToTempResource} under the file name of the path.
     * All other resources are written back to the job as a comma separated list. Nothing happens
     * when the job has no resource list.
     *
     * @throws OdpsException
     *             if a {@code file:} entry is not a valid URL or cannot be registered
     */
    private void processTempResources() throws OdpsException {
        String[] res = job.getResources();
        if (res == null) {
            return;
        }
        StringBuilder remaining = new StringBuilder();
        for (String r : res) {
            // regionMatches(ignoreCase) compares per character and does not depend on the default
            // locale, unlike toLowerCase() (e.g. "FILE:" lower-cases to "fıle:" in a Turkish locale)
            boolean isFileUrl = r.regionMatches(true, 0, TEMP_RESOURCE_PREFIX, 0, TEMP_RESOURCE_PREFIX.length());
            if (!isFileUrl) {
                if (remaining.length() > 0) {
                    remaining.append(',');
                }
                remaining.append(r);
                continue;
            }

            // NOTE(review): the extension check is case-sensitive, so "x.JAR" is registered as FILE
            Resource.Type type = r.endsWith(".jar") ? Resource.Type.JAR : Resource.Type.FILE;
            URL url;
            try {
                url = new URL(r);
            } catch (MalformedURLException e) {
                throw new OdpsException(e);
            }
            aliasToTempResource.put(FilenameUtils.getName(url.getPath()),
                    metaExplorer.addFileResourceWithRetry(url.getPath(), type, "_" + jobId, true));
        }
        job.setResources(remaining.toString());
    }

    /**
     * Deletes every resource recorded in {@link #aliasToTempResource}.
     *
     * <p>A failure to delete one resource does not stop the deletion of the remaining ones; the
     * first failure is rethrown after all resources have been attempted. {@link #isClean} is set
     * only when every deletion succeeded.
     *
     * @throws OdpsException
     *             the first failure reported while deleting a resource
     */
    protected void tearDown() throws OdpsException {
        OdpsException firstFailure = null;
        // Remove resources
        for (String resource : aliasToTempResource.values()) {
            try {
                metaExplorer.deleteResource(resource);
            } catch (OdpsException e) {
                LOG.warn("Failed to delete resource " + resource, e);
                if (firstFailure == null) {
                    firstFailure = e;
                }
            }
        }
        if (firstFailure != null) {
            throw firstFailure;
        }
        isClean = true;
    }

    /**
     * Hook implemented by subclasses. {@link #submit()} calls it after {@link #setUp()} has
     * finished and wraps the returned instance, together with {@link #taskName}, in a
     * {@code BridgeRunningJob}.
     *
     * @return the instance handed to the {@code BridgeRunningJob}
     * @throws OdpsException
     *             propagated unchanged to the caller of {@link #submit()}
     */
    abstract protected Instance submitInternal() throws OdpsException;

    /**
     * Validates, prepares and submits the job.
     *
     * <p>Creates the job configuration from {@code getConf()}, generates a fresh job id and task
     * name, runs the validator, calls {@link #setUp()} and then {@link #submitInternal()}. When
     * preparation or submission fails, the resources recorded so far are removed on a best-effort
     * basis before the failure is propagated.
     *
     * @return a handle on the submitted job; this runner is passed to it along with the instance
     *         and the task name
     * @throws OdpsException
     *             if validation, preparation or submission fails
     */
    @Override
    public RunningJob submit() throws OdpsException {
        // JobConf field should be initialized here because ReflectionUtils is
        // designed to create object
        // and then set JobConf.
        job = new BridgeJobConf(getConf());
        metaExplorer = getMetaExplorer();
        jobId = System.currentTimeMillis() + RandomStringUtils.randomNumeric(8);
        // Set a prefix to job name because odps only accept job name whose pattern
        // is '([a-z]|[A-Z]){1,}([a-z]|[A-Z]|[\d]|_)*' ...
        taskName = "MRonSQL_" + jobId;

        Validator validator = ValidatorFactory.getValidator(job, metaExplorer);
        validator.validate();

        Instance instance = null;
        boolean submitted = false;
        try {
            setUp();
            instance = submitInternal();
            submitted = true;
        } finally {
            if (!submitted) {
                // No running job will ever trigger onComplete(), so clean up here. A cleanup
                // failure is only logged so that it cannot mask the original failure.
                try {
                    tearDown();
                } catch (Exception cleanupFailure) {
                    LOG.warn("Failed to clean up resources of job " + jobId + " after a failed submission",
                            cleanupFailure);
                }
            }
        }

        BridgeRunningJob runningJob = new BridgeRunningJob(instance, taskName, this);
        if (SessionState.get().isCostMode()) {
            runningJob.setIsCostMode(true);
        }
        return runningJob;
    }

    /**
     * Creates the meta explorer used by this runner, backed by the {@code Odps} object of the
     * current session state.
     *
     * @return a new {@code MetaExplorerImpl}
     */
    protected MetaExplorer getMetaExplorer() {
        final SessionState session = SessionState.get();
        return new MetaExplorerImpl(session.getOdps());
    }

    /**
     * Removes the job's resources unless {@link #tearDown()} has already completed.
     *
     * <p>Cleanup is best effort: a failure is logged and never propagated to the caller.
     */
    @Override
    public void onComplete() {
        if (isClean) {
            return;
        }
        try {
            tearDown();
        } catch (OdpsException e) {
            // Deliberately not rethrown, but no longer silent
            LOG.warn("Failed to clean up resources of job " + jobId, e);
        }
    }

}