org.apache.hadoop.mapred.TaskErrorCollector.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.mapred.TaskErrorCollector.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.server.jobtracker.TaskTracker;
import org.apache.hadoop.metrics.MetricsContext;
import org.apache.hadoop.metrics.MetricsRecord;
import org.apache.hadoop.metrics.MetricsUtil;
import org.apache.hadoop.metrics.Updater;
import org.apache.hadoop.metrics.util.MetricsBase;
import org.apache.hadoop.metrics.util.MetricsRegistry;
import org.apache.hadoop.metrics.util.MetricsTimeVaryingLong;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.SAXException;

public class TaskErrorCollector implements Updater {

    // Class-path resource that is searched for when CONFIG_FILE_KEY is not set.
    private static final String ERROR_XML = "error.xml";
    // Configuration key: number of time windows to retain.
    public static final String NUM_WINDOWS_KEY = "mapred.taskerrorcollector.window.number";
    // Configuration key: length of one time window, in milliseconds.
    public static final String WINDOW_LENGTH_KEY = "mapred.taskerrorcollector.window.milliseconds";
    // Configuration key: path of the XML file that defines the known errors.
    public static final String CONFIG_FILE_KEY = "mapred.taskerrorcollector.error.file";
    // Name of the counter group that holds one cumulative counter per known error.
    public static final String COUNTER_GROUP_NAME = "TaskError";

    public static final Log LOG = LogFactory.getLog(TaskErrorCollector.class);

    // Prefix prepended to every error name when deriving its metrics key.
    private static final String METRICS_KEY_PREFIX = "task_error_";
    private final MetricsRecord metricsRecord;
    private final MetricsRegistry registry;
    // Errors parsed from the configuration file, keyed by error name, in file order.
    private final Map<String, TaskError> knownErrors;
    // Bucket for failures that have no diagnostics or match no configured pattern.
    private final TaskError UNKNOWN_ERROR = new TaskError("UNKNOWN", "",
            "Task diagnostic info does not match any pattern defined in xml file");

    // Used by UI
    // Default window length, used when WINDOW_LENGTH_KEY is not configured.
    static final private int WINDOW_LENGTH = 10 * 60 * 1000; // 10 minutes
    // Default number of windows, used when NUM_WINDOWS_KEY is not configured.
    static final private int NUM_WINDOWS = 3 * 86400 * 1000 / WINDOW_LENGTH; // 3 days
    // Index (timestamp / windowLength) of the window most recently opened.
    private long lastWindowIndex = 0;
    private final int windowLength;
    private final int numWindows;
    // Per-window error counts, newest window first; parallel to startTimeQueue.
    private final LinkedList<Map<TaskError, Integer>> errorCountsQueue = new LinkedList<Map<TaskError, Integer>>();
    // Start timestamp (milliseconds) of each window in errorCountsQueue, newest first.
    private final LinkedList<Long> startTimeQueue = new LinkedList<Long>();
    // Error counts accumulated since this collector was constructed.
    private final Map<TaskError, Integer> sinceStartErrorCounts;

    // Used by metrics
    private final Map<TaskError, MetricsTimeVaryingLong> errorCountsMetrics = new HashMap<TaskError, MetricsTimeVaryingLong>();
    // Cumulative counters
    private final Counters errorCounters = new Counters();

    /**
     * Creates a collector whose window length and window count are read from
     * the configuration, falling back to the built-in defaults when the keys
     * are absent.
     *
     * @param conf configuration consulted for {@link #WINDOW_LENGTH_KEY},
     *        {@link #NUM_WINDOWS_KEY} and {@link #CONFIG_FILE_KEY}
     */
    public TaskErrorCollector(Configuration conf) {
        this(
                conf,
                conf.getInt(WINDOW_LENGTH_KEY, WINDOW_LENGTH),
                conf.getInt(NUM_WINDOWS_KEY, NUM_WINDOWS));
    }

    /**
     * Creates a collector with an explicit window geometry.
     *
     * The known errors are loaded from the file named by
     * {@link #CONFIG_FILE_KEY}; when that key is not set, the class path is
     * searched for {@code error.xml}. If no definition can be located, every
     * collected failure is counted as the unknown error.
     *
     * @param conf configuration consulted for {@link #CONFIG_FILE_KEY}
     * @param windowLength length of one window in milliseconds; a
     *        non-positive value is replaced by the built-in default
     * @param numWindows number of windows to retain; a non-positive value is
     *        replaced by the built-in default
     */
    public TaskErrorCollector(Configuration conf, int windowLength, int numWindows) {
        // A zero window length would make every collect() call fail with a
        // division by zero, and a non-positive window count would leave the
        // window queue permanently empty. Fall back to the defaults instead.
        if (windowLength <= 0) {
            LOG.warn("Invalid window length " + windowLength + " ms, using default " + WINDOW_LENGTH + " ms");
            windowLength = WINDOW_LENGTH;
        }
        if (numWindows <= 0) {
            LOG.warn("Invalid number of windows " + numWindows + ", using default " + NUM_WINDOWS);
            numWindows = NUM_WINDOWS;
        }
        this.windowLength = windowLength;
        this.numWindows = numWindows;

        MetricsContext context = MetricsUtil.getContext("mapred");
        metricsRecord = MetricsUtil.createRecord(context, "taskerror");
        registry = new MetricsRegistry();

        URL configURL = null;
        String configFilePath = conf.get(CONFIG_FILE_KEY);
        if (configFilePath == null) {
            // Search the class path if it is not configured
            configURL = TaskErrorCollector.class.getClassLoader().getResource(ERROR_XML);
        } else {
            try {
                // File.toURI() escapes special characters and handles
                // platform-specific path syntax, unlike string concatenation.
                configURL = new File(configFilePath).toURI().toURL();
            } catch (MalformedURLException e) {
                LOG.error("Error in creating config URL", e);
            }
        }

        if (configURL == null) {
            LOG.warn("Could not get error collector configuration. " + TaskErrorCollector.class.getSimpleName()
                    + " will see every error as UNKNOWN_ERROR.");
            knownErrors = Collections.emptyMap();
        } else {
            LOG.info("Parsing configuration from " + configURL);
            knownErrors = parseConfigFile(configURL);
        }
        createMetrics();
        sinceStartErrorCounts = createErrorCountsMap();

        // Register last so that doUpdates() can never observe a partially
        // constructed collector or a registry that is still being filled.
        context.registerUpdater(this);
    }

    /**
     * Creates one time-varying metric for every known error plus one for the
     * unknown-error bucket, and remembers each of them in
     * {@code errorCountsMetrics}.
     */
    private void createMetrics() {
        for (TaskError known : knownErrors.values()) {
            LOG.info("metricsKey:" + known.metricsKey);
            MetricsTimeVaryingLong metric =
                    new MetricsTimeVaryingLong(known.metricsKey, registry, known.description);
            errorCountsMetrics.put(known, metric);
        }

        MetricsTimeVaryingLong unknownMetric =
                new MetricsTimeVaryingLong(UNKNOWN_ERROR.metricsKey, registry, UNKNOWN_ERROR.description);
        errorCountsMetrics.put(UNKNOWN_ERROR, unknownMetric);
    }

    /**
     * Builds a fresh count map holding a zero entry for every known error,
     * followed by a zero entry for the unknown error.
     *
     * As a side effect, the cumulative counter of every known error is
     * touched so that it exists even while its value is still 0.
     *
     * @return a new, insertion-ordered map with all counts set to 0
     */
    private Map<TaskError, Integer> createErrorCountsMap() {
        final Integer zero = Integer.valueOf(0);
        Counters.Group group = errorCounters.getGroup(COUNTER_GROUP_NAME);
        Map<TaskError, Integer> counts = new LinkedHashMap<TaskError, Integer>();
        for (TaskError known : knownErrors.values()) {
            // Incrementing by 0 creates the counter without changing its value.
            group.getCounterForName(known.name).increment(0);
            counts.put(known, zero);
        }
        counts.put(UNKNOWN_ERROR, zero);
        return counts;
    }

    /**
     * Classifies a failed task attempt and records it.
     *
     * Only the most recent diagnostic string of the attempt is examined. Its
     * newlines are replaced by spaces and it is then matched, as a whole,
     * against the pattern of each known error in configuration order; the
     * first error that matches is counted. An attempt with no diagnostics, or
     * whose diagnostic matches no pattern, is counted as the unknown error.
     *
     * @param tip the task whose diagnostics are inspected
     * @param taskId the failed attempt of that task
     * @param now timestamp in milliseconds used to select the time window
     */
    public synchronized void collect(TaskInProgress tip, TaskAttemptID taskId, long now) {
        List<String> diagnostics = tip.getDiagnosticInfo(taskId);
        if (diagnostics == null || diagnostics.isEmpty()) {
            incErrorCounts(UNKNOWN_ERROR, now);
            return;
        }

        // Flatten to one line: '.' in the configured patterns does not match '\n'.
        String latestDiagnostic = diagnostics.get(diagnostics.size() - 1).replace("\n", " ");
        for (TaskError error : knownErrors.values()) {
            if (error.pattern.matcher(latestDiagnostic).matches()) {
                incErrorCounts(error, now);
                return;
            }
        }

        LOG.info("Undefined diagnostic info:" + latestDiagnostic);
        incErrorCounts(UNKNOWN_ERROR, now);
    }

    /**
     * Sums the per-window error counts of the most recent windows.
     *
     * Windows are visited newest first, and the walk stops at the first
     * window whose start time is not later than
     * {@code System.currentTimeMillis() - timeWindow}.
     *
     * @param timeWindow how far to look back, in milliseconds; for example
     *        {@code 24 * 60 * 60 * 1000} covers the last day
     * @return a new map with the summed count of every TaskError
     */
    public synchronized Map<TaskError, Integer> getRecentErrorCounts(long timeWindow) {
        final long cutoff = System.currentTimeMillis() - timeWindow;
        Map<TaskError, Integer> totals = createErrorCountsMap();

        // The two queues are parallel, so advance the start times in step.
        Iterator<Long> windowStarts = startTimeQueue.iterator();
        for (Map<TaskError, Integer> window : errorCountsQueue) {
            if (windowStarts.next() <= cutoff) {
                break;
            }
            for (Map.Entry<TaskError, Integer> entry : window.entrySet()) {
                TaskError error = entry.getKey();
                totals.put(error, totals.get(error) + entry.getValue());
            }
        }
        return totals;
    }

    /**
     * Returns the error counts accumulated since this collector was created.
     *
     * @return an unmodifiable view of the since-start counts; it is a view,
     *         not a copy, so later collections are visible through it
     */
    public synchronized Map<TaskError, Integer> getErrorCounts() {
        Map<TaskError, Integer> readOnlyView = Collections.unmodifiableMap(sinceStartErrorCounts);
        return readOnlyView;
    }

    /**
     * Adds one occurrence of the given error to every statistic kept by this
     * collector: the current time window, the metric, the cumulative counter
     * and the since-start totals.
     *
     * @param error the error that occurred
     * @param now timestamp in milliseconds used to select the time window
     */
    private void incErrorCounts(TaskError error, long now) {
        // Current time window
        Map<TaskError, Integer> window = getCurrentErrorCounts(now);
        int windowCount = window.get(error);
        window.put(error, windowCount + 1);

        // Metric
        MetricsTimeVaryingLong metric = errorCountsMetrics.get(error);
        metric.inc();

        // Cumulative counter
        errorCounters.getGroup(COUNTER_GROUP_NAME).getCounterForName(error.name).increment(1);

        // Totals since start
        int total = sinceStartErrorCounts.get(error);
        sinceStartErrorCounts.put(error, total + 1);
    }

    /**
     * Returns the count map of the window that contains {@code now}, opening
     * a new window when {@code now} falls outside the most recent one or when
     * no window exists yet. When the number of windows exceeds
     * {@code numWindows}, the oldest one is dropped.
     *
     * @param now timestamp in milliseconds
     * @return the mutable count map at the head of the window queue
     */
    private Map<TaskError, Integer> getCurrentErrorCounts(long now) {
        final long currentIndex = now / windowLength;
        if (currentIndex == lastWindowIndex && !errorCountsQueue.isEmpty()) {
            return errorCountsQueue.getFirst();
        }

        // Open a new window at the head of both parallel queues.
        lastWindowIndex = currentIndex;
        errorCountsQueue.addFirst(createErrorCountsMap());
        startTimeQueue.addFirst(currentIndex * windowLength);

        // Evict the oldest window once the retention limit is exceeded.
        if (errorCountsQueue.size() > numWindows) {
            errorCountsQueue.removeLast();
            startTimeQueue.removeLast();
        }
        return errorCountsQueue.getFirst();
    }

    /**
     * Returns the cumulative error counters.
     *
     * @return the {@code Counters} instance owned by this collector (not a copy)
     */
    public Counters getErrorCountsCounters() {
        return this.errorCounters;
    }

    /**
     * One category of task failure: a name, the regular expression that a
     * diagnostic string must match in full to fall into the category, the
     * metrics key derived from the name, and a human-readable description.
     */
    public class TaskError {
        final String name;
        final Pattern pattern;
        final String metricsKey;
        final String description;

        /**
         * @param name display name of the error
         * @param patternString regular expression matched against diagnostics
         * @param description human-readable explanation of the error
         * @throws java.util.regex.PatternSyntaxException if
         *         {@code patternString} is not a valid regular expression
         */
        TaskError(String name, String patternString, String description) {
            this.name = name;
            this.metricsKey = toMetricName(name);
            this.pattern = Pattern.compile(patternString);
            this.description = description;
        }

        /**
         * Derives the metrics key: the common prefix followed by the
         * lower-cased name with each run of whitespace replaced by '_'.
         */
        private String toMetricName(String name) {
            // Use a fixed locale so the key does not depend on the JVM default
            // (e.g. a Turkish default locale lower-cases 'I' to dotless 'ı').
            return METRICS_KEY_PREFIX + name.toLowerCase(Locale.ENGLISH).replaceAll("\\s+", "_");
        }

        @Override
        public String toString() {
            return "name:" + name + " pattern:" + pattern.toString() + " metricsKey:" + metricsKey + " description:"
                    + description;
        }
    }

    /**
     * Metrics callback: pushes every metric in the registry into the metrics
     * record, then updates the record.
     *
     * @param context the metrics context that invoked this updater (unused)
     */
    @Override
    public void doUpdates(MetricsContext context) {
        // Same monitor as the synchronized collect(), so a push never
        // interleaves with an increment.
        synchronized (this) {
            for (MetricsBase metric : registry.getMetricsList()) {
                metric.pushMetric(metricsRecord);
            }
        }
        metricsRecord.update();
    }

    /**
     * Parses the error definition file (error.xml) into the known errors.
     *
     * The following is an example of the error.xml:
     *
     * <pre>{@code
     * <?xml version="1.0"?>
     * <configuration>
     *   <error name="Map output lost">
     *     <pattern>Map output lost</pattern>
     *     <description>TaskTracker cannot find requested map output</description>
     *   </error>
     *   <error name="Memory killing">
     *     <pattern>Killing the top memory-consuming tasks.*</pattern>
     *     <description>When TaskTracker has no enough memory, it kills the task with highest memory</description>
     *   </error>
     *   <error name="Memory failing">
     *     <pattern>Failing the top memory-consuming tasks.*</pattern>
     *     <description>When TaskTracker has no enough memory, it check the task with highest memory. If it used more than configured memory, the task fails.</description>
     *   </error>
     *   <error name="Preemption">
     *     <pattern>Killed for preemption.*</pattern>
     *     <description>Task killed because of preemption</description>
     *   </error>
     *   <error name="Killed from JSP">
     *     <pattern>Killed from JSP page.*</pattern>
     *     <description>Someone kill the task from Web UI</description>
     *   </error>
     *   <error name="No space">
     *     <pattern>No space left on device.*</pattern>
     *     <description>Cannot find disk space on the TaskTracker</description>
     *   </error>
     * </configuration>
     * }</pre>
     *
     * Failures are logged rather than thrown: if the file cannot be read or
     * parsed, the returned map is empty; an {@code <error>} whose pattern is
     * not a valid regular expression is skipped.
     *
     * @param configURL location of the error definition file
     * @return the parsed errors keyed by name, in file order; never null
     */
    private Map<String, TaskError> parseConfigFile(URL configURL) {
        Map<String, TaskError> knownErrors = new LinkedHashMap<String, TaskError>();
        try {
            Element root = getRootElement(configURL);
            NodeList elements = root.getChildNodes();
            for (int i = 0; i < elements.getLength(); ++i) {
                Node node = elements.item(i);
                if (!(node instanceof Element)) {
                    continue;
                }
                Element element = (Element) node;
                if (!matched(element, "error")) {
                    continue;
                }
                TaskError taskError = parseErrorElement(element);
                if (taskError == null) {
                    continue;
                }
                LOG.info("Adding TaskError " + taskError);
                knownErrors.put(taskError.name, taskError);
            }
        } catch (IOException ie) {
            LOG.error("Error parsing config file " + configURL, ie);
        }
        return knownErrors;
    }

    /**
     * Builds a TaskError from one {@code <error>} element, or returns null
     * (after logging) when its pattern is not a valid regular expression.
     */
    private TaskError parseErrorElement(Element element) {
        String name = element.getAttribute("name");
        String pattern = "";
        String description = "";
        NodeList fields = element.getChildNodes();
        for (int j = 0; j < fields.getLength(); ++j) {
            Node fieldNode = fields.item(j);
            if (!(fieldNode instanceof Element)) {
                continue;
            }
            Element field = (Element) fieldNode;
            if (matched(field, "pattern")) {
                pattern = getText(field);
            } else if (matched(field, "description")) {
                description = getText(field);
            }
        }
        try {
            return new TaskError(name, pattern, description);
        } catch (PatternSyntaxException pse) {
            // One bad definition must not prevent the collector from starting.
            LOG.error("Skipping TaskError \"" + name + "\": invalid pattern \"" + pattern + "\"", pse);
            return null;
        }
    }

    /**
     * Reads the XML document at the given URL and returns its root element.
     *
     * @param configURL location of the error definition file
     * @return the {@code <configuration>} root element
     * @throws IOException if the URL cannot be read, the document cannot be
     *         parsed, or its root element is not {@code <configuration>}
     */
    private Element getRootElement(URL configURL) throws IOException {
        InputStream in = configURL.openStream();
        try {
            DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
            docBuilderFactory.setIgnoringComments(true);
            DocumentBuilder builder = docBuilderFactory.newDocumentBuilder();
            Document doc = builder.parse(in);
            Element root = doc.getDocumentElement();
            if (root == null || !matched(root, "configuration")) {
                throw new IOException("Bad task error config at " + configURL);
            }
            return root;
        } catch (SAXException se) {
            throw new IOException(se);
        } catch (ParserConfigurationException pe) {
            throw new IOException(pe);
        } finally {
            // The parser is not required to close the stream; do it here so
            // the file handle is released on every path.
            try {
                in.close();
            } catch (IOException ce) {
                LOG.warn("Error closing config stream " + configURL, ce);
            }
        }
    }

    /**
     * Tells whether the element's tag name equals the expected name.
     *
     * @param element element to inspect
     * @param tagName expected tag name
     * @return true if the tag names are equal
     */
    private static boolean matched(Element element, String tagName) {
        String actualTag = element.getTagName();
        return tagName.equals(actualTag);
    }

    /**
     * Returns the trimmed text directly contained in the element.
     *
     * All direct text children (including CDATA sections) are concatenated;
     * other child nodes are ignored. An element without text yields the
     * empty string instead of failing.
     *
     * @param element element whose text is wanted
     * @return the concatenated, trimmed text; never null
     */
    private static String getText(Element element) {
        StringBuilder text = new StringBuilder();
        for (Node child = element.getFirstChild(); child != null; child = child.getNextSibling()) {
            if (child instanceof Text) {
                text.append(((Text) child).getData());
            }
        }
        return text.toString().trim();
    }
}