/*
 * Consonance - workflow software for multiple clouds
 * Copyright (C) 2016 OICR
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package info.pancancer.arch3.coordinator;

import com.rabbitmq.client.Channel;
import com.rabbitmq.client.ConsumerCancelledException;
import com.rabbitmq.client.MessageProperties;
import com.rabbitmq.client.QueueingConsumer;
import com.rabbitmq.client.ShutdownSignalException;
import info.pancancer.arch3.Base;
import info.pancancer.arch3.beans.Job;
import info.pancancer.arch3.beans.JobState;
import info.pancancer.arch3.beans.Order;
import info.pancancer.arch3.beans.Status;
import info.pancancer.arch3.beans.StatusState;
import info.pancancer.arch3.persistence.PostgreSQL;
import info.pancancer.arch3.utils.Constants;
import info.pancancer.arch3.utils.Utilities;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeoutException;
import org.apache.commons.configuration.HierarchicalINIConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Created by boconnor on 15-04-18.
 *
 * This consumes the jobs and prepares messages for the VM and Job Queues.
 *
 * It then monitors the results queue to see when jobs fail or finish.
 *
 * Finally, for failed or finished workflows, it informs the VM about finished VMs that can be terminated.
 *
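 * The work is split across three inner Callables: CoordinatorOrders reads the order queue and fans each order out to the VM and
 * job queues, CleanupJobs consumes the results queue and records job state changes in the database, and FlagJobs marks stale
 * RUNNING jobs as LOST.
 *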
 * TODO:
 *
 * This needs to have a new thread that periodically checks on the DB table for Jobs to identify jobs that are lost/failed
 *
 */
public class Coordinator extends Base {

    private static final int DEFAULT_THREADS = 3;

    public static void main(String[] argv) throws Exception {
        Coordinator coordinator = new Coordinator(argv);
        coordinator.doWork();
    }

    public Coordinator(String[] argv) throws IOException {
        super();
        parseOptions(argv);
    }

    public void doWork() throws InterruptedException, ExecutionException {
        ExecutorService pool = Executors.newFixedThreadPool(DEFAULT_THREADS);
        CoordinatorOrders coordinatorOrders = new CoordinatorOrders(this.configFile, this.options.has(this.endlessSpec));
        CleanupJobs cleanupJobs = new CleanupJobs(this.configFile, this.options.has(this.endlessSpec));
        FlagJobs flagJobs = new FlagJobs(this.configFile, this.options.has(this.endlessSpec));
        List<Future<?>> futures = new ArrayList<>();
        futures.add(pool.submit(coordinatorOrders));
        futures.add(pool.submit(cleanupJobs));
        futures.add(pool.submit(flagJobs));
        try {
            for (Future<?> future : futures) {
                future.get();
            }
        } catch (InterruptedException | ExecutionException ex) {
            log.error(ex.toString());
            throw new RuntimeException(ex);
        } finally {
            pool.shutdown();
        }
    }

    /**
     * Reads from the Order queue and breaks it up into VMs for the VM queue and jobs for the job queue.
     */
    private static class CoordinatorOrders implements Callable<Void> {

        private Channel jobChannel = null;
        private Channel vmChannel = null;
        private Channel orderChannel = null;
        private String queueName = null;
        private final boolean endless;
        private String configFile = null;
        private final Logger log = LoggerFactory.getLogger(getClass());

        public CoordinatorOrders(String config, boolean endless) throws InterruptedException {
            this.endless = endless;
            this.configFile = config;
        }

        @Override
        public Void call() throws Exception {
            try {
                HierarchicalINIConfiguration settings = Utilities.parseConfig(configFile);
                queueName = settings.getString(Constants.RABBIT_QUEUE_NAME);
                // read from
                orderChannel = Utilities.setupQueue(settings, queueName + "_orders");
                // write to
                jobChannel = Utilities.setupQueue(settings, queueName + "_jobs"); // TODO: actually this one needs to be built on demand with full info
                // write to
                vmChannel = Utilities.setupQueue(settings, queueName + "_vms");
                // read from
                QueueingConsumer consumer = new QueueingConsumer(orderChannel);
                orderChannel.basicConsume(queueName + "_orders", false, consumer);

                // TODO: need threads that each read from orders and another that reads results
                do {
                    QueueingConsumer.Delivery delivery = consumer.nextDelivery(FIVE_SECOND_IN_MILLISECONDS);
                    if (delivery == null) {
                        continue;
                    }
                    // jchannel.basicAck(delivery.getEnvelope().getDeliveryTag(), false);
                    String message = new String(delivery.getBody(), StandardCharsets.UTF_8);
                    log.info(" [x] RECEIVED ORDER:\n'" + message + "'\n");

                    // run the job
                    Order order = new Order().fromJSON(message);
                    requestVm(order.getProvision().toJSON());
                    requestJob(order.getJob().toJSON());

                    log.info("acknowledging " + delivery.getEnvelope().toString());
                    orderChannel.basicAck(delivery.getEnvelope().getDeliveryTag(), false);
                } while (endless);
            } catch (IOException ex) {
                log.error(ex.getMessage(), ex);
                throw new RuntimeException(ex);
            } catch (InterruptedException | ShutdownSignalException | ConsumerCancelledException | NullPointerException ex) {
                log.error(ex.getMessage(), ex);
            } finally {
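                // close the underlying connections; closing a RabbitMQ connection also closes any channels opened on it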
                // orderChannel.close();
                if (orderChannel != null) {
                    orderChannel.getConnection().close();
                }
                // jobChannel.close();
                if (jobChannel != null) {
                    jobChannel.getConnection().close();
                }
                // vmChannel.close();
                if (vmChannel != null) {
                    vmChannel.getConnection().close();
                }
            }
            return null;
        }

        /**
         * Requests a new VM from the VM queue.
         *
         * @param message
         *            a JSON representation of a Provision
         * @return
         */
        private String requestVm(String message) {
            // TODO: should save information to persistent storage
            try {
                log.info(" + SENDING VM ORDER! " + queueName + "_vms");

                int messages = vmChannel.queueDeclarePassive(queueName + "_vms").getMessageCount();
                log.info(" + VM QUEUE SIZE: " + messages);

                vmChannel.basicPublish("", queueName + "_vms", MessageProperties.PERSISTENT_TEXT_PLAIN,
                        message.getBytes(StandardCharsets.UTF_8));
                vmChannel.waitForConfirms();

                log.info(" + MESSAGE SENT!\n" + message + "\n");
            } catch (IOException | InterruptedException ex) {
                throw new RuntimeException(ex);
            }
            return null;
        }

        /**
         * This sends a Job message to the job queue.
         *
         * @param message
         * @return
         */
        private String requestJob(String message) {
            StringBuilder result = new StringBuilder();
            try {
                // TODO: future feature...
                // So this is strange, why does the queue name have all this info in it? It's
                // because we may have orders for the same workflow that actually need different resources
                // Channel vmchannel = u.setupQueue(settings,
                // queueName+"_job_requests_"+workflowName+"_"+workflowVersion+"_"+cores+"_"+memGb+"_"+storageGb);

                log.info(" + SENDING JOB ORDER! " + queueName + "_jobs");

                int messages = jobChannel.queueDeclarePassive(queueName + "_jobs").getMessageCount();
                log.info(" + JOB QUEUE SIZE: " + messages);

                jobChannel.basicPublish("", queueName + "_jobs", MessageProperties.PERSISTENT_TEXT_PLAIN,
                        message.getBytes(StandardCharsets.UTF_8));

                HierarchicalINIConfiguration settings = Utilities.parseConfig(configFile);
                PostgreSQL db = new PostgreSQL(settings);
                Job newJob = new Job().fromJSON(message);
                newJob.setState(JobState.PENDING);
                db.createJob(newJob);

                log.info(" + MESSAGE SENT!\n" + message + "\n");
            } catch (IOException ex) {
                throw new RuntimeException(ex);
            }
            return result.toString();
        }
    }

    /**
     * This dequeues the VM requests and stages them in the DB as pending so I can keep a count of what's running/pending/finished.
     *
     * This looks like a duplicate class from ContainerProvisionerThreads.
     *
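     * In practice this worker consumes the results queue: successful jobs are finished in the database, and RUNNING, FAILED or
     * PENDING status updates are written back to the corresponding job records.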
     */
    private static class CleanupJobs implements Callable<Void> {

        protected static final Logger LOG = LoggerFactory.getLogger(CleanupJobs.class);
        private final boolean endless;
        private String configFile = null;

        public CleanupJobs(String config, boolean endless) throws InterruptedException {
            this.endless = endless;
            this.configFile = config;
        }

        @Override
        public Void call() throws IOException, TimeoutException {
            Channel resultsChannel = null;
            try {
                HierarchicalINIConfiguration settings = Utilities.parseConfig(configFile);
                String queueName = settings.getString(Constants.RABBIT_QUEUE_NAME);
                final String resultQueueName = queueName + "_results";

                // read from
                resultsChannel = Utilities.setupExchange(settings, resultQueueName);
                // this declares a queue exchange where multiple consumers get the same message:
                // https://www.rabbitmq.com/tutorials/tutorial-three-java.html
                String resultsQueue = Utilities.setupQueueOnExchange(resultsChannel, queueName, "CleanupJobs");
                resultsChannel.queueBind(resultsQueue, resultQueueName, "");
                QueueingConsumer resultsConsumer = new QueueingConsumer(resultsChannel);
                resultsChannel.basicConsume(resultsQueue, false, resultsConsumer);

                // writes to DB as well
                PostgreSQL db = new PostgreSQL(settings);

                // TODO: need threads that each read from orders and another that reads results
                do {
                    QueueingConsumer.Delivery delivery = resultsConsumer.nextDelivery(FIVE_SECOND_IN_MILLISECONDS);
                    if (delivery == null) {
                        continue;
                    }
                    // jchannel.basicAck(delivery.getEnvelope().getDeliveryTag(), false);
                    String message = new String(delivery.getBody(), StandardCharsets.UTF_8);
                    LOG.debug(" [x] RECEIVED RESULT MESSAGE - Coordinator: '" + message + "'");

                    // now parse it as JSONObj
                    Status status = new Status().fromJSON(message);

                    // now update that DB record to be exited
                    // this is actually finishing the VM and not the work
                    if (status.getState() == StatusState.SUCCESS && Utilities.JOB_MESSAGE_TYPE.equals(status.getType())) {
                        // this is where it reaps, the job status message also contains the UUID for the VM
                        LOG.info("\n\n\nFINISHING THE JOB!!!!!!!!!!!!!!!\n\n");
                        db.finishJob(status.getJobUuid());
                    } else if ((status.getState() == StatusState.RUNNING || status.getState() == StatusState.FAILED
                            || status.getState() == StatusState.PENDING) && Utilities.JOB_MESSAGE_TYPE.equals(status.getType())) {
                        // this is where it reaps, the job status message also contains the UUID for the VM
                        // convert from StatusState to JobState
                        JobState valueOf = JobState.valueOf(status.getState().toString());
                        db.updateJob(status.getJobUuid(), status.getVmUuid(), valueOf);
                    }
                    // TODO: deal with other situations here like
                    /*
                     * try { // pause Thread.sleep(5000); } catch (InterruptedException ex) { //log.error(ex.toString()); }
                     */
                    resultsChannel.basicAck(delivery.getEnvelope().getDeliveryTag(), false);
                } while (endless);
            } catch (IOException ex) {
                throw new RuntimeException(ex);
            } catch (InterruptedException | ShutdownSignalException | ConsumerCancelledException ex) {
                throw new RuntimeException(ex);
            } finally {
                if (resultsChannel != null) {
                    resultsChannel.close();
                    resultsChannel.getConnection().close();
                }
            }
            // log.error(ex.toString());
            // log.error(ex.toString());
            return null;
        }
    }

    /**
     * This looks for jobs in the database that have not been updated in a while to determine if they are lost.
     *
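     * A RUNNING job whose last update is older than COORDINATOR_SECONDS_BEFORE_LOST seconds is marked as LOST; the check repeats
     * every five seconds.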
     */
    private static class FlagJobs implements Callable<Void> {

        private final boolean endless;
        private final String configFile;
        private final Logger log = LoggerFactory.getLogger(getClass());

        public FlagJobs(String config, boolean endless) {
            this.endless = endless;
            this.configFile = config;
        }

        @Override
        public Void call() {
            HierarchicalINIConfiguration settings = Utilities.parseConfig(configFile);
            // writes to DB as well
            PostgreSQL db = new PostgreSQL(settings);

            // TODO: need threads that each read from orders and another that reads results
            do {
                // checks the jobs in the database and sees if any have become "lost"
                List<Job> jobs = db.getJobs(JobState.RUNNING);

                // how long before we call something lost?
                // it is tempting to un-lose jobs here, but the problem is that we only have the update timestamp and that is modified when
                // jobs are lost, meaning they instantly flip back
                long secBeforeLost = settings.getLong(Constants.COORDINATOR_SECONDS_BEFORE_LOST);

                for (Job job : jobs) {
                    Timestamp nowTs = new Timestamp(new Date().getTime());
                    Timestamp updateTs = job.getUpdateTs();

                    long diff = nowTs.getTime() - updateTs.getTime();
                    long diffSec = Math.abs(diff / Base.ONE_SECOND_IN_MILLISECONDS);

                    log.info(job.getUuid() + " DIFF SEC: " + diffSec + " MAX: " + secBeforeLost);

                    JobState state = job.getState();
                    // if this is true need to mark the job as lost!
                    if (state == JobState.RUNNING && diffSec > secBeforeLost) {
                        // it must be lost
                        log.error("Running job " + job.getUuid() + " not seen in " + diffSec + " > " + secBeforeLost + " MARKING AS LOST!");
                        db.updateJob(job.getUuid(), job.getVmUuid(), JobState.LOST);
                    }
                }

                try {
                    // pause
                    Thread.sleep(Base.FIVE_SECOND_IN_MILLISECONDS);
                } catch (InterruptedException ex) {
                    throw new RuntimeException(ex);
                }
            } while (endless);
            return null;
        }
    }

    // public FlagJobs(String configFile) {
    //     inner = new Inner(configFile);
    // }
}
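
// A minimal launch sketch. The flag names below are assumptions inferred from the configFile and endlessSpec
// options inherited from Base; they are not defined in this file:
//
//     Coordinator coordinator = new Coordinator(new String[] { "--config", "conf/config.ini", "--endless" });
//     coordinator.doWork();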