org.apache.falcon.adfservice.ADFJob.java Source code

Introduction

Here is the source code for org.apache.falcon.adfservice.ADFJob.java, the abstract base class for Azure Data Factory (ADF) jobs in Apache Falcon.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.falcon.adfservice;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.io.IOUtils;
import org.apache.falcon.adfservice.util.ADFJsonConstants;
import org.apache.falcon.adfservice.util.FSUtils;
import org.apache.falcon.entity.EntityUtil;
import org.apache.falcon.entity.v0.Entity;
import org.apache.falcon.entity.v0.EntityType;
import org.apache.falcon.FalconException;
import org.apache.falcon.resource.AbstractSchedulableEntityManager;
import org.apache.falcon.security.CurrentUser;
import org.apache.hadoop.fs.Path;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Base class for Azure ADF jobs.
 */
public abstract class ADFJob {
    private static final Logger LOG = LoggerFactory.getLogger(ADFJob.class);

    // Name prefix for all ADF-related entities, e.g. an ADF Hive process and the feeds associated with it.
    public static final String ADF_ENTITY_NAME_PREFIX = "ADF-";
    public static final int ADF_ENTITY_NAME_PREFIX_LENGTH = ADF_ENTITY_NAME_PREFIX.length();
    // Name prefix for all ADF-related job entities, i.e. ADF Hive/Pig processes and replication feeds.
    public static final String ADF_JOB_ENTITY_NAME_PREFIX = ADF_ENTITY_NAME_PREFIX + "JOB-";
    public static final int ADF_JOB_ENTITY_NAME_PREFIX_LENGTH = ADF_JOB_ENTITY_NAME_PREFIX.length();

    public static final String TEMPLATE_PATH_PREFIX = "/apps/falcon/adf/";
    // TEMPLATE_PATH_PREFIX already ends with a separator, so append the directory name directly
    public static final String PROCESS_SCRIPTS_PATH = TEMPLATE_PATH_PREFIX + "generatedscripts";
    private static final String DEFAULT_FREQUENCY = "days(1)";

    public static boolean isADFJobEntity(String entityName) {
        return entityName.startsWith(ADF_JOB_ENTITY_NAME_PREFIX);
    }

    public static String getSessionID(String entityName) throws FalconException {
        if (!isADFJobEntity(entityName)) {
            throw new FalconException("The entity, " + entityName + ", is not an ADF Job Entity.");
        }
        return entityName.substring(ADF_JOB_ENTITY_NAME_PREFIX_LENGTH);
    }
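
    // Naming scheme, for illustration (the session id "12345" is hypothetical):
    //   jobEntityName() produces "ADF-JOB-12345"
    //   getSessionID("ADF-JOB-12345") returns "12345"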

    /**
     * Enum for job type.
     */
    public enum JobType {
        HIVE, PIG, REPLICATION
    }

    private enum RequestType {
        HADOOPMIRROR, HADOOPHIVE, HADOOPPIG
    }

    public static JobType getJobType(String msg) throws FalconException {
        try {
            JSONObject obj = new JSONObject(msg);
            JSONObject activity = obj.getJSONObject(ADFJsonConstants.ADF_REQUEST_ACTIVITY);
            if (activity == null) {
                throw new FalconException(
                        "JSON object " + ADFJsonConstants.ADF_REQUEST_ACTIVITY + " not found in ADF request.");
            }

            JSONObject activityProperties = activity.getJSONObject(ADFJsonConstants.ADF_REQUEST_TRANSFORMATION);
            if (activityProperties == null) {
                throw new FalconException("JSON object " + ADFJsonConstants.ADF_REQUEST_TRANSFORMATION
                        + " not found " + "in ADF request.");
            }

            String type = activityProperties.getString(ADFJsonConstants.ADF_REQUEST_TYPE);
            if (StringUtils.isBlank(type)) {
                throw new FalconException(ADFJsonConstants.ADF_REQUEST_TYPE + " not found in ADF request.");
            }

            switch (RequestType.valueOf(type.toUpperCase())) {
            case HADOOPMIRROR:
                return JobType.REPLICATION;
            case HADOOPHIVE:
                return JobType.HIVE;
            case HADOOPPIG:
                return JobType.PIG;
            default:
                throw new FalconException("Unrecognized ADF job type: " + type);
            }
        } catch (JSONException e) {
            throw new FalconException("Error while parsing ADF JSON message: " + msg, e);
        }
    }
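
    // For illustration, getJobType() expects a request shaped roughly like the
    // sketch below; the literal key names are assumptions, the real ones come
    // from ADFJsonConstants:
    //
    //   { "activity": { "transformation": { "type": "HadoopHive", ... } } }
    //
    // A "type" of "HadoopHive" maps to JobType.HIVE, "HadoopPig" to JobType.PIG,
    // and "HadoopMirror" to JobType.REPLICATION.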

    public abstract void startJob() throws FalconException;

    public abstract void cleanup() throws FalconException;

    protected JSONObject message;
    protected JSONObject activity;
    protected JSONObject activityExtendedProperties;
    protected String id;
    protected JobType type;
    protected String startTime;
    protected String endTime;
    protected String frequency;
    protected String proxyUser;
    protected long timeout;
    protected ADFJobManager jobManager = new ADFJobManager();

    private Map<String, JSONObject> linkedServicesMap = new HashMap<String, JSONObject>();
    protected Map<String, JSONObject> tablesMap = new HashMap<String, JSONObject>();

    public ADFJob(String msg, String id) throws FalconException {
        this.id = id;
        FSUtils.createDir(new Path(PROCESS_SCRIPTS_PATH));
        try {
            message = new JSONObject(msg);

            frequency = DEFAULT_FREQUENCY;
            startTime = transformTimeFormat(message.getString(ADFJsonConstants.ADF_REQUEST_START_TIME));
            endTime = transformTimeFormat(message.getString(ADFJsonConstants.ADF_REQUEST_END_TIME));

            JSONArray linkedServices = message.getJSONArray(ADFJsonConstants.ADF_REQUEST_LINKED_SERVICES);
            for (int i = 0; i < linkedServices.length(); i++) {
                JSONObject linkedService = linkedServices.getJSONObject(i);
                linkedServicesMap.put(linkedService.getString(ADFJsonConstants.ADF_REQUEST_NAME), linkedService);
            }

            JSONArray tables = message.getJSONArray(ADFJsonConstants.ADF_REQUEST_TABLES);
            for (int i = 0; i < tables.length(); i++) {
                JSONObject table = tables.getJSONObject(i);
                tablesMap.put(table.getString(ADFJsonConstants.ADF_REQUEST_NAME), table);
            }

            // Set the activity extended properties
            activity = message.getJSONObject(ADFJsonConstants.ADF_REQUEST_ACTIVITY);
            if (activity == null) {
                throw new FalconException(
                        "JSON object " + ADFJsonConstants.ADF_REQUEST_ACTIVITY + " not found in ADF request.");
            }

            // the policy element is mandatory in an ADF request
            JSONObject policy = activity.getJSONObject(ADFJsonConstants.ADF_REQUEST_POLICY);
            if (policy == null) {
                throw new FalconException(
                        "JSON object " + ADFJsonConstants.ADF_REQUEST_POLICY + " not found in ADF request.");
            }
            String adfTimeout = policy.getString(ADFJsonConstants.ADF_REQUEST_TIMEOUT);
            if (StringUtils.isBlank(adfTimeout)) {
                throw new FalconException(
                        "JSON object " + ADFJsonConstants.ADF_REQUEST_TIMEOUT + " not found in ADF request.");
            }
            timeout = parseADFRequestTimeout(adfTimeout);

            JSONObject activityProperties = activity.getJSONObject(ADFJsonConstants.ADF_REQUEST_TRANSFORMATION);
            if (activityProperties == null) {
                throw new FalconException("JSON object " + ADFJsonConstants.ADF_REQUEST_TRANSFORMATION
                        + " not found" + " in ADF request.");
            }

            activityExtendedProperties = activityProperties
                    .getJSONObject(ADFJsonConstants.ADF_REQUEST_EXTENDED_PROPERTIES);
            if (activityExtendedProperties == null) {
                throw new FalconException("JSON object " + ADFJsonConstants.ADF_REQUEST_EXTENDED_PROPERTIES + " not"
                        + " found in ADF request.");
            }

            // getRunAsUser() must be called after activityExtendedProperties has been set
            proxyUser = getRunAsUser();

            // log in as the proxy user
            CurrentUser.authenticate(proxyUser);
        } catch (JSONException e) {
            throw new FalconException("Error while parsing ADF JSON message: " + msg, e);
        }
    }

    public String jobEntityName() {
        return ADF_JOB_ENTITY_NAME_PREFIX + id;
    }

    public JobType jobType() {
        return type;
    }

    protected String getClusterName(String linkedServiceName) throws FalconException {
        JSONObject linkedService = linkedServicesMap.get(linkedServiceName);
        if (linkedService == null) {
            throw new FalconException("Linked service " + linkedServiceName + " not found in ADF request.");
        }

        try {
            return linkedService.getJSONObject(ADFJsonConstants.ADF_REQUEST_PROPERTIES)
                    .getJSONObject(ADFJsonConstants.ADF_REQUEST_EXTENDED_PROPERTIES)
                    .getString(ADFJsonConstants.ADF_REQUEST_CLUSTER_NAME);
        } catch (JSONException e) {
            throw new FalconException(
                    "Error while parsing linked service " + linkedServiceName + " in ADF request.", e);
        }
    }

    protected String getRunAsUser() throws FalconException {
        if (activityExtendedProperties.has(ADFJsonConstants.ADF_REQUEST_RUN_ON_BEHALF_USER)) {
            String runAsUser = null;
            try {
                runAsUser = activityExtendedProperties.getString(ADFJsonConstants.ADF_REQUEST_RUN_ON_BEHALF_USER);
            } catch (JSONException e) {
                throw new FalconException("JSON object " + ADFJsonConstants.ADF_REQUEST_RUN_ON_BEHALF_USER + " not"
                        + " found in ADF request.");
            }

            if (StringUtils.isBlank(runAsUser)) {
                throw new FalconException("JSON object " + ADFJsonConstants.ADF_REQUEST_RUN_ON_BEHALF_USER + " in"
                        + " ADF request activity extended properties cannot be empty.");
            }
            return runAsUser;
        } else {
            String hadoopLinkedService = getHadoopLinkedService();
            JSONObject linkedService = linkedServicesMap.get(hadoopLinkedService);
            if (linkedService == null) {
                throw new FalconException("Linked service " + hadoopLinkedService + " not found in ADF request.");
            }

            try {
                return linkedService.getJSONObject(ADFJsonConstants.ADF_REQUEST_PROPERTIES)
                        .getJSONObject(ADFJsonConstants.ADF_REQUEST_EXTENDED_PROPERTIES)
                        .getString(ADFJsonConstants.ADF_REQUEST_RUN_ON_BEHALF_USER);
            } catch (JSONException e) {
                throw new FalconException("JSON object " + ADFJsonConstants.ADF_REQUEST_RUN_ON_BEHALF_USER
                        + " not found in ADF request.", e);
            }
        }
    }
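
    // For illustration, the linked-service lookups above assume entries shaped
    // roughly like the sketch below; the literal key names and values shown are
    // hypothetical, the real keys come from ADFJsonConstants:
    //
    //   { "name": "MyHadoopCluster",
    //     "properties": { "extendedProperties": {
    //         "clusterName": "primary-cluster", "runOnBehalf": "falcon-user" } } }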

    protected List<String> getInputTables() throws FalconException {
        List<String> tables = new ArrayList<String>();
        try {
            JSONArray inputs = message.getJSONObject(ADFJsonConstants.ADF_REQUEST_ACTIVITY)
                    .getJSONArray(ADFJsonConstants.ADF_REQUEST_INPUTS);
            for (int i = 0; i < inputs.length(); i++) {
                tables.add(inputs.getJSONObject(i).getString(ADFJsonConstants.ADF_REQUEST_NAME));
            }
        } catch (JSONException e) {
            throw new FalconException("Error while reading input table names in ADF request.", e);
        }
        return tables;
    }

    protected List<String> getOutputTables() throws FalconException {
        List<String> tables = new ArrayList<String>();
        try {
            JSONArray outputs = message.getJSONObject(ADFJsonConstants.ADF_REQUEST_ACTIVITY)
                    .getJSONArray(ADFJsonConstants.ADF_REQUEST_OUTPUTS);
            for (int i = 0; i < outputs.length(); i++) {
                tables.add(outputs.getJSONObject(i).getString(ADFJsonConstants.ADF_REQUEST_NAME));
            }
        } catch (JSONException e) {
            throw new FalconException("Error while reading output table names in ADF request.", e);
        }
        return tables;
    }

    protected String getADFTablePath(String tableName) throws FalconException {
        JSONObject table = tablesMap.get(tableName);
        if (table == null) {
            throw new FalconException("Table " + tableName + " not found in ADF request.");
        }

        try {
            JSONObject location = table.getJSONObject(ADFJsonConstants.ADF_REQUEST_PROPERTIES)
                    .getJSONObject(ADFJsonConstants.ADF_REQUEST_LOCATION);
            String requestType = location.getString(ADFJsonConstants.ADF_REQUEST_TYPE);
            if (requestType.equals(ADFJsonConstants.ADF_REQUEST_LOCATION_TYPE_AZURE_BLOB)) {
                String blobPath = location.getString(ADFJsonConstants.ADF_REQUEST_FOLDER_PATH);
                int index = blobPath.indexOf('/');
                if (index == -1) {
                    throw new FalconException("Invalid azure blob path: " + blobPath);
                }

                String linkedServiceName = location.getString(ADFJsonConstants.ADF_REQUEST_LINKED_SERVICE_NAME);
                JSONObject linkedService = linkedServicesMap.get(linkedServiceName);
                if (linkedService == null) {
                    throw new FalconException("Can't find linked service " + linkedServiceName + " for azure blob");
                }
                String connectionString = linkedService.getJSONObject(ADFJsonConstants.ADF_REQUEST_PROPERTIES)
                        .getString(ADFJsonConstants.ADF_REQUEST_CONNECTION_STRING);
                int accountNameIndex = connectionString.indexOf(ADFJsonConstants.ADF_REQUEST_BLOB_ACCOUNT_NAME)
                        + ADFJsonConstants.ADF_REQUEST_BLOB_ACCOUNT_NAME.length();
                String accountName = connectionString.substring(accountNameIndex,
                        connectionString.indexOf(';', accountNameIndex));

                StringBuilder blobUrl = new StringBuilder("wasb://").append(blobPath.substring(0, index))
                        .append("@").append(accountName).append(".blob.core.windows.net")
                        .append(blobPath.substring(index));
                return blobUrl.toString();
            }
            return location.getJSONObject(ADFJsonConstants.ADF_REQUEST_EXTENDED_PROPERTIES)
                    .getString(ADFJsonConstants.ADF_REQUEST_FOLDER_PATH);
        } catch (JSONException e) {
            throw new FalconException("Error while parsing ADF JSON message: " + tableName, e);
        }
    }
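
    // Worked example (hypothetical values) of the Azure blob branch above:
    //   folderPath        = "mycontainer/input/data"
    //   connection string contains "...;AccountName=myaccount;..." (assuming the
    //                     account-name marker in ADFJsonConstants is "AccountName=")
    //   returned path     = "wasb://mycontainer@myaccount.blob.core.windows.net/input/data"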

    protected String getTableCluster(String tableName) throws FalconException {
        JSONObject table = tablesMap.get(tableName);
        if (table == null) {
            throw new FalconException("Table " + tableName + " not found in ADF request.");
        }

        try {
            String linkedServiceName = table.getJSONObject(ADFJsonConstants.ADF_REQUEST_PROPERTIES)
                    .getJSONObject(ADFJsonConstants.ADF_REQUEST_LOCATION)
                    .getString(ADFJsonConstants.ADF_REQUEST_LINKED_SERVICE_NAME);
            return getClusterName(linkedServiceName);
        } catch (JSONException e) {
            throw new FalconException("Error while parsing table cluster " + tableName + " in ADF request.", e);
        }
    }

    protected boolean activityHasScriptPath() throws FalconException {
        if (JobType.REPLICATION == jobType()) {
            return false;
        }

        return activityExtendedProperties.has(ADFJsonConstants.ADF_REQUEST_SCRIPT_PATH);
    }

    protected String getScriptPath() throws FalconException {
        if (!activityHasScriptPath()) {
            throw new FalconException(
                    "ADF request activity does not contain " + ADFJsonConstants.ADF_REQUEST_SCRIPT_PATH);
        }

        try {
            String scriptPath = activityExtendedProperties.getString(ADFJsonConstants.ADF_REQUEST_SCRIPT_PATH);
            if (StringUtils.isBlank(scriptPath)) {
                throw new FalconException("JSON object " + ADFJsonConstants.ADF_REQUEST_SCRIPT_PATH + " not"
                        + " found or empty in ADF request.");
            }
            return scriptPath;
        } catch (JSONException jsonException) {
            throw new FalconException("Error while parsing ADF JSON object: " + activityExtendedProperties,
                    jsonException);
        }
    }

    protected String getScriptContent() throws FalconException {
        if (activityHasScriptPath()) {
            throw new FalconException("ADF request activity provides " + ADFJsonConstants.ADF_REQUEST_SCRIPT_PATH
                    + " and no inline " + ADFJsonConstants.ADF_REQUEST_SCRIPT + " is expected.");
        }
        try {
            String script = activityExtendedProperties.getString(ADFJsonConstants.ADF_REQUEST_SCRIPT);
            if (StringUtils.isBlank(script)) {
                throw new FalconException("JSON object " + ADFJsonConstants.ADF_REQUEST_SCRIPT + " cannot"
                        + " be empty in ADF request.");
            }
            return script;
        } catch (JSONException jsonException) {
            throw new FalconException("Error while parsing ADF JSON object: " + activityExtendedProperties,
                    jsonException);
        }
    }

    protected String getClusterNameToRunProcessOn() throws FalconException {
        return getClusterName(getHadoopLinkedService());
    }

    protected Entity submitAndScheduleJob(String entityType, String msg) throws FalconException {
        Entity entity = jobManager.submitJob(entityType, msg);
        jobManager.scheduleJob(entityType, jobEntityName());
        return entity;
    }

    private String getHadoopLinkedService() throws FalconException {
        String hadoopLinkedService;
        try {
            hadoopLinkedService = activity.getString(ADFJsonConstants.ADF_REQUEST_LINKED_SERVICE_NAME);
        } catch (JSONException jsonException) {
            throw new FalconException("Error while parsing ADF JSON object: " + activity, jsonException);
        }

        if (StringUtils.isBlank(hadoopLinkedService)) {
            throw new FalconException("JSON object " + ADFJsonConstants.ADF_REQUEST_LINKED_SERVICE_NAME
                    + " in the activity cannot be empty in ADF request.");
        }
        return hadoopLinkedService;
    }

    protected void startProcess(Feed inputFeed, Feed outputFeed, String engineType, String scriptPath)
            throws FalconException {
        // submit input/output feeds
        LOG.info("submitting input feed {} for {} process", inputFeed.getName(), engineType);
        jobManager.submitJob(EntityType.FEED.name(), inputFeed.getEntityxml());

        LOG.info("submitting output feed {} for {} process", outputFeed.getName(), engineType);
        jobManager.submitJob(EntityType.FEED.name(), outputFeed.getEntityxml());

        // submit and schedule process
        String processRequest = new Process.Builder().withProcessName(jobEntityName()).withFrequency(frequency)
                .withStartTime(startTime).withEndTime(endTime).withClusterName(getClusterNameToRunProcessOn())
                .withInputFeedName(inputFeed.getName()).withOutputFeedName(outputFeed.getName())
                .withEngineType(engineType).withWFPath(scriptPath).withAclOwner(proxyUser).build().getEntityxml();

        LOG.info("submitting/scheduling {} process: {}", engineType, processRequest);
        submitAndScheduleJob(EntityType.PROCESS.name(), processRequest);
        LOG.info("submitted and scheduled {} process: {}", engineType, jobEntityName());
    }

    protected void cleanupProcess(Feed inputFeed, Feed outputFeed) throws FalconException {
        // delete the entities. Should be called after the job execution success/failure.
        jobManager.deleteEntity(EntityType.PROCESS.name(), jobEntityName());
        jobManager.deleteEntity(EntityType.FEED.name(), inputFeed.getName());
        jobManager.deleteEntity(EntityType.FEED.name(), outputFeed.getName());

        // delete script file
        FSUtils.removeDir(new Path(ADFJob.PROCESS_SCRIPTS_PATH, jobEntityName()));
    }

    protected String createScriptFile(String fileExt) throws FalconException {
        String content = getScriptContent();

        // create dir; dir path is unique as job name is always unique
        final Path dir = new Path(ADFJob.PROCESS_SCRIPTS_PATH, jobEntityName());
        FSUtils.createDir(dir);

        // create script file
        final Path path = new Path(dir, jobEntityName() + fileExt);
        return FSUtils.createFile(path, content);
    }
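
    // For a job with session id "12345" (hypothetical) and fileExt ".hql", the
    // script lands at /apps/falcon/adf/generatedscripts/ADF-JOB-12345/ADF-JOB-12345.hql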

    private static long parseADFRequestTimeout(String timeout) throws FalconException {
        timeout = timeout.trim();
        // .NET TimeSpan format: [ws][-]{ d | d.hh:mm[:ss[.ff]] | hh:mm[:ss[.ff]] }[ws]
        // a negative timespan (leading '-') is passed through as -1
        if (timeout.startsWith("-")) {
            return -1;
        }

        long totalMinutes = 0;
        String[] dotParts = timeout.split(Pattern.quote("."));
        if (dotParts.length == 1) {
            // neither d.hh nor .ff is present, so the format is d or hh:mm[:ss]
            String[] parts = timeout.split(":");
            if (parts.length == 1) {
                // days only; convert days to minutes
                return Integer.parseInt(parts[0]) * 1440;
            } else {
                // hh:mm[:ss]
                return computeMinutes(parts);
            }
        }

        // a '.' is present, so the format is d.hh:mm[:ss[.ff]] or hh:mm:ss.ff
        if (dotParts.length == 2) {
            // either d.hh:mm[:ss] or hh:mm:ss.ff; check whether the first part has colons
            String[] parts = dotParts[0].split(":");
            if (parts.length == 1) {
                // format is d.hh:mm[:ss]
                totalMinutes = Integer.parseInt(dotParts[0]) * 1440;
                parts = dotParts[1].split(":");
                totalMinutes += computeMinutes(parts);
                return totalMinutes;
            } else {
                // format is hh:mm:ss.ff
                totalMinutes += computeMinutes(parts);
                // round the fractional seconds up to a full minute
                totalMinutes += 1;
                return totalMinutes;
            }
        } else if (dotParts.length == 3) {
            // format is d.hh:mm:ss.ff
            totalMinutes = Integer.parseInt(dotParts[0]) * 1440;
            String[] parts = dotParts[1].split(":");
            totalMinutes += computeMinutes(parts);
            // round the fractional seconds up to a full minute
            totalMinutes += 1;
            return totalMinutes;
        } else {
            throw new FalconException("Error parsing policy timeout: " + timeout);
        }
    }
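
    // Worked examples (hypothetical inputs) for parseADFRequestTimeout:
    //   "1"          -> 1440 (days are converted to minutes)
    //   "02:30"      -> 150  (2h * 60 + 30m)
    //   "1.02:30:15" -> 1591 (1440 + 150 + a one-minute round-up for the seconds)
    //   "-00:05"     -> -1   (any leading '-' short-circuits to -1)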

    // computes minutes from a hh:mm[:ss] time already split on ':'
    private static long computeMinutes(String[] parts) {
        int totalMinutes = Integer.parseInt(parts[0]) * 60;
        totalMinutes += Integer.parseInt(parts[1]);
        if (parts.length == 3) {
            // round any seconds component up to a full minute
            totalMinutes += 1;
        }
        return totalMinutes;
    }

    // Drops the last four characters and re-appends 'Z'. Assuming ADF sends times
    // as yyyy-MM-dd'T'HH:mm:ss'Z', this trims the ":ssZ" suffix to produce the
    // yyyy-MM-dd'T'HH:mm'Z' form, e.g. "2015-11-18T16:00:00Z" -> "2015-11-18T16:00Z".
    private static String transformTimeFormat(String adfTime) {
        return adfTime.substring(0, adfTime.length() - 4) + "Z";
    }

    protected class ADFJobManager extends AbstractSchedulableEntityManager {
        public Entity submitJob(String entityType, String msg) throws FalconException {
            try {
                InputStream stream = IOUtils.toInputStream(msg, "UTF-8");
                return submitInternal(stream, entityType, proxyUser);
            } catch (Exception e) {
                LOG.error("Error when submitting job", e);
                throw new FalconException("Error when submitting job: " + e.getMessage(), e);
            }
        }

        public void scheduleJob(String entityType, String entityName) throws FalconException {
            try {
                scheduleInternal(entityType, entityName, null, EntityUtil.getPropertyMap(null));
            } catch (Exception e) {
                LOG.error("Error when scheduling job", e);
                throw new FalconException("Error when scheduling job: " + e.getMessage(), e);
            }
        }

        public void deleteEntity(String entityType, String entityName) throws FalconException {
            delete(entityType, entityName, null);
        }
    }
}
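
Example

As a usage illustration only: a concrete job extends ADFJob, parses the ADF request in its constructor, and implements startJob() and cleanup(). The class below is a minimal hypothetical sketch; the real Falcon module ships its own Hive, Pig, and replication implementations, and the helper calls shown in comments assume feeds built elsewhere.

package org.apache.falcon.adfservice;

import org.apache.falcon.FalconException;

/** Hypothetical subclass, shown only to illustrate the ADFJob contract. */
public class ExampleHiveJob extends ADFJob {

    public ExampleHiveJob(String msg, String id) throws FalconException {
        super(msg, id); // parses the ADF request JSON and authenticates the proxy user
        this.type = JobType.HIVE;
    }

    @Override
    public void startJob() throws FalconException {
        // Stage the Hive script: reuse the path from the request if present,
        // otherwise write the inline script content to HDFS.
        String scriptPath = activityHasScriptPath()
                ? getScriptPath()
                : createScriptFile(".hql");
        // A real implementation would now build input/output feeds from the
        // ADF tables and call startProcess(inputFeed, outputFeed, "hive", scriptPath).
    }

    @Override
    public void cleanup() throws FalconException {
        // A real implementation would delete the submitted entities and the
        // generated script directory, e.g. via cleanupProcess(inputFeed, outputFeed).
    }
}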