/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.eagle.jobrunning.crawler;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReadWriteLock;

import org.apache.commons.lang.time.DateUtils;
import org.apache.eagle.common.DateTimeUtil;
import org.apache.eagle.job.JobFilter;
import org.apache.eagle.jobrunning.callback.RunningJobCallback;
import org.apache.eagle.jobrunning.common.JobConstants.JobState;
import org.apache.eagle.jobrunning.common.JobConstants.ResourceType;
import org.apache.eagle.jobrunning.common.JobConstants.YarnApplicationType;
import org.apache.eagle.jobrunning.config.RunningJobCrawlConfig;
import org.apache.eagle.jobrunning.util.JobUtils;
import org.apache.eagle.jobrunning.yarn.model.AppInfo;
import org.apache.eagle.jobrunning.zkres.JobRunningZKStateManager;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class RunningJobCrawlerImpl implements RunningJobCrawler {

    protected RunningJobCrawlConfig.RunningJobEndpointConfig endpointConfig;
    protected RunningJobCrawlConfig.ControlConfig controlConfig;
    protected JobFilter jobFilter;
    private ResourceFetcher fetcher;
    private JobRunningZKStateManager zkStateManager;
    private Thread jobConfigProcessThread;
    private Thread jobCompleteInfoProcessThread;
    private Thread jobCompleteStatusCheckerThread;
    private Thread zkCleanupThread;
    private final RunningJobCallback callback;
    private ReadWriteLock readWriteLock;
    private Map<ResourceType, Map<String, JobContext>> processingJobMap =
        new ConcurrentHashMap<ResourceType, Map<String, JobContext>>();
    private BlockingQueue<JobContext> queueOfConfig;
    private BlockingQueue<JobContext> queueOfCompleteJobInfo;

    private static final int DEFAULT_CONFIG_THREAD_COUNT = 20;
    private static final long DELAY_TO_UPDATE_COMPLETION_JOB_INFO = 5 * DateUtils.MILLIS_PER_MINUTE;

    private static final Logger LOG = LoggerFactory.getLogger(RunningJobCrawlerImpl.class);
    private static final ObjectMapper OBJ_MAPPER = new ObjectMapper();

    static {
        OBJ_MAPPER.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, true);
    }
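
    /**
     * Wires the crawler together: the resource manager fetcher, the ZooKeeper
     * state manager, the caller-supplied callback and job filter, the daemon
     * worker threads, and the bounded hand-off queues between them.
     */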
    public RunningJobCrawlerImpl(RunningJobCrawlConfig config, JobRunningZKStateManager zkStateManager,
                                 RunningJobCallback callback, JobFilter jobFilter, ReadWriteLock readWriteLock) {
        this.endpointConfig = config.endPointConfig;
        this.controlConfig = config.controlConfig;
        this.callback = callback;
        this.fetcher = new RMResourceFetcher(endpointConfig);
        this.jobFilter = jobFilter;
        this.readWriteLock = readWriteLock;

        if (config.controlConfig.jobInfoEnabled) {
            jobCompleteInfoProcessThread = new Thread() {
                @Override
                public void run() {
                    startCompleteJobInfoProcessThread();
                }
            };
            jobCompleteInfoProcessThread.setName("JobCompleteInfo-process-thread");
            jobCompleteInfoProcessThread.setDaemon(true);

            jobCompleteStatusCheckerThread = new Thread() {
                @Override
                public void run() {
                    startCompleteStatusCheckerThread();
                }
            };
            jobCompleteStatusCheckerThread.setName("JobComplete-statusChecker-thread");
            jobCompleteStatusCheckerThread.setDaemon(true);
        }

        if (config.controlConfig.jobConfigEnabled) {
            jobConfigProcessThread = new Thread() {
                @Override
                public void run() {
                    startJobConfigProcessThread();
                }
            };
            jobConfigProcessThread.setName("JobConfig-process-thread");
            jobConfigProcessThread.setDaemon(true);
        }

        zkCleanupThread = new Thread() {
            @Override
            public void run() {
                startzkCleanupThread();
            }
        };
        zkCleanupThread.setName("zk-cleanup-thread");
        zkCleanupThread.setDaemon(true);

        this.zkStateManager = zkStateManager;
        this.processingJobMap.put(ResourceType.JOB_CONFIGURATION, new ConcurrentHashMap<String, JobContext>());
        this.processingJobMap.put(ResourceType.JOB_COMPLETE_INFO, new ConcurrentHashMap<String, JobContext>());
        this.queueOfConfig = new ArrayBlockingQueue<JobContext>(controlConfig.sizeOfJobConfigQueue);
        this.queueOfCompleteJobInfo = new ArrayBlockingQueue<JobContext>(controlConfig.sizeOfJobCompletedInfoQueue);
    }

    private void startJobConfigProcessThread() {
        int configThreadCount = DEFAULT_CONFIG_THREAD_COUNT;
        LOG.info("Job config crawler main thread started, pool size: " + DEFAULT_CONFIG_THREAD_COUNT);

        ThreadFactory factory = new ThreadFactory() {
            private final AtomicInteger count = new AtomicInteger(0);

            public Thread newThread(Runnable runnable) {
                Thread thread = Executors.defaultThreadFactory().newThread(runnable);
                // Use the value returned by incrementAndGet directly so that
                // concurrent calls cannot produce duplicate thread names.
                thread.setName("config-crawler-workthread-" + count.incrementAndGet());
                return thread;
            }
        };

        ThreadPoolExecutor pool = new ThreadPoolExecutor(configThreadCount, configThreadCount, 0L,
            TimeUnit.MILLISECONDS, new LinkedBlockingQueue<Runnable>(), factory);

        while (true) {
            JobContext context;
            try {
                context = queueOfConfig.take();
                LOG.info("queueOfConfig size: " + queueOfConfig.size());
                Runnable configCrawlerThread = new ConfigWorkTask(new JobContext(context), fetcher, callback, this);
                pool.execute(configCrawlerThread);
            } catch (InterruptedException e) {
                LOG.warn("Got an InterruptedException: " + e.getMessage());
            } catch (RejectedExecutionException e2) {
                LOG.warn("Got a RejectedExecutionException: " + e2.getMessage());
            } catch (Throwable t) {
                LOG.warn("Got a throwable: " + t.getMessage());
            }
        }
    }
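
    /**
     * Consumes completed jobs from queueOfCompleteJobInfo and fetches their
     * final job information. The fetch is delayed because the job history URL
     * needs some time to become accessible after a job completes.
     */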
    private void startCompleteJobInfoProcessThread() {
        while (true) {
            JobContext context;
            try {
                context = queueOfCompleteJobInfo.take();
            } catch (InterruptedException ex) {
                // Preserve the interrupt status and exit the worker loop
                // instead of continuing with a null context.
                Thread.currentThread().interrupt();
                return;
            }

            /* Delay before fetching job complete info, since the history URL needs
             * some time to become accessible. The default interval is 5 minutes.
             * TODO consider whether this work needs multiple threads. */
            while (System.currentTimeMillis() < context.fetchedTime + DELAY_TO_UPDATE_COMPLETION_JOB_INFO) {
                try {
                    Thread.sleep(50);
                } catch (InterruptedException e1) {
                    // Restore the interrupt flag and stop waiting early.
                    Thread.currentThread().interrupt();
                    break;
                }
            }

            try {
                List<Object> objs = fetcher.getResource(ResourceType.JOB_COMPLETE_INFO,
                    JobUtils.getAppIDByJobID(context.jobId));
                callback.onJobRunningInformation(context, ResourceType.JOB_COMPLETE_INFO, objs);
            } catch (Exception ex) {
                // Guard against a null message before matching on its content.
                if (ex.getMessage() != null && ex.getMessage().contains("Server returned HTTP response code: 500")) {
                    LOG.warn("The server returned a 500 error, probably caused by the job's ACL settings; skipping this job");
                    // The job remains in the processing list, so it will not be retried fruitlessly next round.
                    // TODO remove it from the processing list once the job finishes, to avoid a memory leak.
                } else {
                    LOG.error("Got an exception when fetching resource ", ex);
                }
            }
        }
    }

    public void startCompleteStatusCheckerThread() {
        while (true) {
            List<Object> list;
            try {
                list = fetcher.getResource(ResourceType.JOB_LIST, JobState.COMPLETED.name());
                if (list == null) {
                    LOG.warn("Current completed job list is empty");
                    // Sleep before retrying so an empty result does not become a busy loop.
                    Thread.sleep(20 * 1000);
                    continue;
                }
                @SuppressWarnings("unchecked")
                List<AppInfo> apps = (List<AppInfo>) list.get(0);
                Set<JobContext> completedJobSet = new HashSet<JobContext>();
                for (AppInfo app : apps) {
                    // Only fetch MapReduce jobs from accepted users
                    if (!YarnApplicationType.MAPREDUCE.name().equals(app.getApplicationType())
                        || !jobFilter.accept(app.getUser())) {
                        continue;
                    }
                    if (System.currentTimeMillis() - app.getFinishedTime()
                        < controlConfig.completedJobOutofDateTimeInMin * DateUtils.MILLIS_PER_MINUTE) {
                        completedJobSet.add(new JobContext(JobUtils.getJobIDByAppID(app.getId()),
                            app.getUser(), System.currentTimeMillis()));
                    }
                }

                if (controlConfig.jobConfigEnabled) {
                    addIntoProcessingQueueAndList(completedJobSet, queueOfConfig, ResourceType.JOB_CONFIGURATION);
                }
                if (controlConfig.jobInfoEnabled) {
                    addIntoProcessingQueueAndList(completedJobSet, queueOfCompleteJobInfo, ResourceType.JOB_COMPLETE_INFO);
                }
                Thread.sleep(20 * 1000);
            } catch (Throwable t) {
                LOG.error("Got a throwable in fetching job completed list: ", t);
            }
        }
    }

    public void startzkCleanupThread() {
        LOG.info("zk cleanup thread started");
        while (true) {
            try {
                long thresholdTime = System.currentTimeMillis()
                    - controlConfig.zkCleanupTimeInday * DateUtils.MILLIS_PER_DAY;
                String date = DateTimeUtil.format(thresholdTime, "yyyyMMdd");
                zkStateManager.truncateJobBefore(ResourceType.JOB_CONFIGURATION, date);
                zkStateManager.truncateJobBefore(ResourceType.JOB_COMPLETE_INFO, date);
                Thread.sleep(30 * 60 * 1000);
            } catch (Throwable t) {
                LOG.warn("Got a throwable: ", t);
            }
        }
    }

    public void addIntoProcessingQueueAndList(Set<JobContext> jobSet, BlockingQueue<JobContext> queue,
                                              ResourceType type) {
        // Acquire the lock outside the try block so that a failed lock() cannot
        // trigger an unlock() that was never paired with it.
        readWriteLock.writeLock().lock();
        LOG.info("Write lock acquired");
        try {
            List<String> processingList = zkStateManager.readProcessedJobs(type);
            processingList.addAll(extractJobList(type));
            for (JobContext context : jobSet) {
                String jobId = context.jobId;
                if (!processingList.contains(jobId)) {
                    addIntoProcessingList(type, context);
                    queue.add(context);
                }
            }
        } finally {
            try {
                readWriteLock.writeLock().unlock();
                LOG.info("Write lock released");
            } catch (Throwable t) {
                LOG.error("Failed to release write lock", t);
            }
        }
    }

    private List<String> extractJobList(ResourceType type) {
        Map<String, JobContext> contexts = processingJobMap.get(type);
        return Arrays.asList(contexts.keySet().toArray(new String[0]));
    }
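
    /**
     * Invoked periodically by the caller. The first invocation starts the
     * worker threads; every invocation fetches the currently running
     * MapReduce applications and dispatches them for config and
     * running-info processing.
     */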
    @Override
    public void crawl() throws Exception {
        // Bring up the crawler threads when crawl() is invoked for the first time
        if (jobConfigProcessThread != null && !jobConfigProcessThread.isAlive()) {
            jobConfigProcessThread.start();
        }
        if (jobCompleteInfoProcessThread != null && !jobCompleteInfoProcessThread.isAlive()) {
            jobCompleteInfoProcessThread.start();
        }
        if (jobCompleteStatusCheckerThread != null && !jobCompleteStatusCheckerThread.isAlive()) {
            jobCompleteStatusCheckerThread.start();
        }
        if (!zkCleanupThread.isAlive()) {
            zkCleanupThread.start();
        }

        List<Object> list = fetcher.getResource(ResourceType.JOB_LIST, JobState.RUNNING.name());
        if (list == null) {
            LOG.warn("Current running job list is empty");
            return;
        }
        @SuppressWarnings("unchecked")
        List<AppInfo> apps = (List<AppInfo>) list.get(0);
        LOG.info("Current running job list size: " + apps.size());

        Set<JobContext> currentRunningJobSet = new HashSet<JobContext>();
        for (AppInfo app : apps) {
            // Only fetch MapReduce jobs from accepted users
            if (!YarnApplicationType.MAPREDUCE.name().equals(app.getApplicationType())
                || !jobFilter.accept(app.getUser())) {
                continue;
            }
            currentRunningJobSet.add(new JobContext(JobUtils.getJobIDByAppID(app.getId()),
                app.getUser(), System.currentTimeMillis()));
        }

        if (controlConfig.jobConfigEnabled) {
            addIntoProcessingQueueAndList(currentRunningJobSet, queueOfConfig, ResourceType.JOB_CONFIGURATION);
        }
        if (controlConfig.jobInfoEnabled) {
            // Fetch job detail & job counters
            for (JobContext context : currentRunningJobSet) {
                try {
                    List<Object> objs = fetcher.getResource(ResourceType.JOB_RUNNING_INFO,
                        JobUtils.getAppIDByJobID(context.jobId));
                    callback.onJobRunningInformation(context, ResourceType.JOB_RUNNING_INFO, objs);
                } catch (Exception ex) {
                    // Guard against a null message before matching on its content.
                    if (ex.getMessage() != null && ex.getMessage().contains("Server returned HTTP response code: 500")) {
                        LOG.warn("The server returned a 500 error, probably caused by the job's ACL settings; skipping this job");
                        // The job remains in the processing list, so it will not be retried fruitlessly next round.
                        // TODO remove it from the processing list once the job finishes, to avoid a memory leak.
                    } else {
                        LOG.error("Got an exception when fetching resource, jobId: " + context.jobId, ex);
                    }
                }
            }
        }
    }

    @Override
    public void addIntoProcessingList(ResourceType type, JobContext context) {
        processingJobMap.get(type).put(context.jobId, context);
    }

    @Override
    public void removeFromProcessingList(ResourceType type, JobContext context) {
        processingJobMap.get(type).remove(context.jobId);
    }
}
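
/*
 * A minimal usage sketch (hypothetical wiring; the config source, callback,
 * and filter instances below are assumptions, not part of this class):
 *
 *   RunningJobCrawlConfig config = ...;       // endpoint + control configuration
 *   JobRunningZKStateManager zkStateManager = ...;
 *   RunningJobCrawler crawler = new RunningJobCrawlerImpl(config, zkStateManager,
 *       myCallback, myJobFilter, new java.util.concurrent.locks.ReentrantReadWriteLock());
 *   // Drive the crawler periodically; the worker threads start on the first call.
 *   while (true) {
 *       crawler.crawl();
 *       Thread.sleep(60 * 1000);              // polling interval is the caller's choice
 *   }
 */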