org.apache.hadoop.mapreduce.v2.jobhistory.JobHistoryUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.mapreduce.v2.jobhistory.JobHistoryUtils.java

Source

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.mapreduce.v2.jobhistory;

import java.io.File;
import java.io.IOException;
import java.util.Calendar;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
import org.apache.hadoop.mapreduce.v2.util.MRApps;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;

@InterfaceAudience.Private
@InterfaceStability.Unstable
public class JobHistoryUtils {

    /**
     * Permissions for the history staging dir while JobInProgress.
     */
    public static final FsPermission HISTORY_STAGING_DIR_PERMISSIONS =

            FsPermission.createImmutable((short) 0700);

    /**
     * Permissions for the user directory under the staging directory.
     */
    public static final FsPermission HISTORY_STAGING_USER_DIR_PERMISSIONS = FsPermission
            .createImmutable((short) 0700);

    /**
     * Permissions for the history done dir and derivatives.
     */
    public static final FsPermission HISTORY_DONE_DIR_PERMISSION = FsPermission.createImmutable((short) 0770);

    public static final FsPermission HISTORY_DONE_FILE_PERMISSION = FsPermission.createImmutable((short) 0770); // rwx------

    /**
      * Umask for the done dir and derivatives.
      */
    public static final FsPermission HISTORY_DONE_DIR_UMASK = FsPermission.createImmutable((short) (0770 ^ 0777));

    /**
     * Permissions for the intermediate done directory.
     */
    public static final FsPermission HISTORY_INTERMEDIATE_DONE_DIR_PERMISSIONS = FsPermission
            .createImmutable((short) 01777);

    /**
     * Permissions for the user directory under the intermediate done directory.
     */
    public static final FsPermission HISTORY_INTERMEDIATE_USER_DIR_PERMISSIONS = FsPermission
            .createImmutable((short) 0770);

    public static final FsPermission HISTORY_INTERMEDIATE_FILE_PERMISSIONS = FsPermission
            .createImmutable((short) 0770); // rwx------

    /**
     * Suffix for configuration files.
     */
    public static final String CONF_FILE_NAME_SUFFIX = "_conf.xml";

    /**
     * Suffix for summary files.
     */
    public static final String SUMMARY_FILE_NAME_SUFFIX = ".summary";

    /**
     * Job History File extension.
     */
    public static final String JOB_HISTORY_FILE_EXTENSION = ".jhist";

    public static final int VERSION = 4;

    public static final int SERIAL_NUMBER_DIRECTORY_DIGITS = 6;

    public static final String TIMESTAMP_DIR_REGEX = "\\d{4}" + "\\" + Path.SEPARATOR + "\\d{2}" + "\\"
            + Path.SEPARATOR + "\\d{2}";
    public static final Pattern TIMESTAMP_DIR_PATTERN = Pattern.compile(TIMESTAMP_DIR_REGEX);
    private static final String TIMESTAMP_DIR_FORMAT = "%04d" + File.separator + "%02d" + File.separator + "%02d";
    private static final Log LOG = LogFactory.getLog(JobHistoryUtils.class);

    private static final PathFilter CONF_FILTER = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(CONF_FILE_NAME_SUFFIX);
        }
    };

    private static final PathFilter JOB_HISTORY_FILE_FILTER = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(JOB_HISTORY_FILE_EXTENSION);
        }
    };

    /**
     * Checks whether the provided path string is a valid job history file.
     * @param pathString the path to be checked.
     * @return true is the path is a valid job history filename else return false
     */
    public static boolean isValidJobHistoryFileName(String pathString) {
        return pathString.endsWith(JOB_HISTORY_FILE_EXTENSION);
    }

    /**
     * Returns the jobId from a job history file name.
     * @param pathString the path string.
     * @return the JobId
     * @throws IOException if the filename format is invalid.
     */
    public static JobID getJobIDFromHistoryFilePath(String pathString) throws IOException {
        String[] parts = pathString.split(Path.SEPARATOR);
        String fileNamePart = parts[parts.length - 1];
        JobIndexInfo jobIndexInfo = FileNameIndexUtils.getIndexInfo(fileNamePart);
        return TypeConverter.fromYarn(jobIndexInfo.getJobId());
    }

    /**
     * Gets a PathFilter which would match configuration files.
     * @return the patch filter {@link PathFilter} for matching conf files.
     */
    public static PathFilter getConfFileFilter() {
        return CONF_FILTER;
    }

    /**
     * Gets a PathFilter which would match job history file names.
     * @return the path filter {@link PathFilter} matching job history files.
     */
    public static PathFilter getHistoryFileFilter() {
        return JOB_HISTORY_FILE_FILTER;
    }

    /**
     * Gets the configured directory prefix for In Progress history files.
     * @param conf the configuration for hte job
     * @param jobId the id of the job the history file is for.
     * @return A string representation of the prefix.
     */
    public static String getConfiguredHistoryStagingDirPrefix(Configuration conf, String jobId) throws IOException {
        String user = UserGroupInformation.getCurrentUser().getShortUserName();
        Path stagingPath = MRApps.getStagingAreaDir(conf, user);
        Path path = new Path(stagingPath, jobId);
        String logDir = path.toString();
        return ensurePathInDefaultFileSystem(logDir, conf);
    }

    /**
     * Gets the configured directory prefix for intermediate done history files.
     * @param conf
     * @return A string representation of the prefix.
     */
    public static String getConfiguredHistoryIntermediateDoneDirPrefix(Configuration conf) {
        String doneDirPrefix = conf.get(JHAdminConfig.MR_HISTORY_INTERMEDIATE_DONE_DIR);
        if (doneDirPrefix == null) {
            doneDirPrefix = conf.get(MRJobConfig.MR_AM_STAGING_DIR, MRJobConfig.DEFAULT_MR_AM_STAGING_DIR)
                    + "/history/done_intermediate";
        }
        return ensurePathInDefaultFileSystem(doneDirPrefix, conf);
    }

    /**
     * Gets the configured directory prefix for Done history files.
     * @param conf the configuration object
     * @return the done history directory
     */
    public static String getConfiguredHistoryServerDoneDirPrefix(Configuration conf) {
        String doneDirPrefix = conf.get(JHAdminConfig.MR_HISTORY_DONE_DIR);
        if (doneDirPrefix == null) {
            doneDirPrefix = conf.get(MRJobConfig.MR_AM_STAGING_DIR, MRJobConfig.DEFAULT_MR_AM_STAGING_DIR)
                    + "/history/done";
        }
        return ensurePathInDefaultFileSystem(doneDirPrefix, conf);
    }

    /**
     * Get default file system URI for the cluster (used to ensure consistency
     * of history done/staging locations) over different context
     *
     * @return Default file context
     */
    private static FileContext getDefaultFileContext() {
        // If FS_DEFAULT_NAME_KEY was set solely by core-default.xml then we ignore
        // ignore it. This prevents defaulting history paths to file system specified
        // by core-default.xml which would not make sense in any case. For a test
        // case to exploit this functionality it should create core-site.xml
        FileContext fc = null;
        Configuration defaultConf = new Configuration();
        String[] sources;
        sources = defaultConf.getPropertySources(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY);
        if (sources != null && (!Arrays.asList(sources).contains("core-default.xml") || sources.length > 1)) {
            try {
                fc = FileContext.getFileContext(defaultConf);
                LOG.info("Default file system [" + fc.getDefaultFileSystem().getUri() + "]");
            } catch (UnsupportedFileSystemException e) {
                LOG.error("Unable to create default file context ["
                        + defaultConf.get(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY) + "]", e);
            }
        } else {
            LOG.info("Default file system is set solely " + "by core-default.xml therefore -  ignoring");
        }

        return fc;
    }

    /**
     * Ensure that path belongs to cluster's default file system unless
     * 1. it is already fully qualified.
     * 2. current job configuration uses default file system
     * 3. running from a test case without core-site.xml
     *
     * @param sourcePath source path
     * @param conf the job configuration
     * @return full qualified path (if necessary) in default file system
     */
    private static String ensurePathInDefaultFileSystem(String sourcePath, Configuration conf) {
        Path path = new Path(sourcePath);
        FileContext fc = getDefaultFileContext();
        if (fc == null
                || fc.getDefaultFileSystem().getUri().toString()
                        .equals(conf.get(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY, ""))
                || path.toUri().getAuthority() != null || path.toUri().getScheme() != null) {
            return sourcePath;
        }

        return fc.makeQualified(path).toString();
    }

    /**
     * Gets the user directory for intermediate done history files.
     * @param conf the configuration object
     * @return the intermediate done directory for jobhistory files.
     */
    public static String getHistoryIntermediateDoneDirForUser(Configuration conf) throws IOException {
        return new Path(getConfiguredHistoryIntermediateDoneDirPrefix(conf),
                UserGroupInformation.getCurrentUser().getShortUserName()).toString();
    }

    public static boolean shouldCreateNonUserDirectory(Configuration conf) {
        // Returning true by default to allow non secure single node clusters to work
        // without any configuration change.
        return conf.getBoolean(MRJobConfig.MR_AM_CREATE_JH_INTERMEDIATE_BASE_DIR, true);
    }

    /**
     * Get the job history file path for non Done history files.
     */
    public static Path getStagingJobHistoryFile(Path dir, JobId jobId, int attempt) {
        return getStagingJobHistoryFile(dir, TypeConverter.fromYarn(jobId).toString(), attempt);
    }

    /**
     * Get the job history file path for non Done history files.
     */
    public static Path getStagingJobHistoryFile(Path dir, String jobId, int attempt) {
        return new Path(dir, jobId + "_" + attempt + JOB_HISTORY_FILE_EXTENSION);
    }

    /**
     * Get the done configuration file name for a job.
     * @param jobId the jobId.
     * @return the conf file name.
     */
    public static String getIntermediateConfFileName(JobId jobId) {
        return TypeConverter.fromYarn(jobId).toString() + CONF_FILE_NAME_SUFFIX;
    }

    /**
     * Get the done summary file name for a job.
     * @param jobId the jobId.
     * @return the conf file name.
     */
    public static String getIntermediateSummaryFileName(JobId jobId) {
        return TypeConverter.fromYarn(jobId).toString() + SUMMARY_FILE_NAME_SUFFIX;
    }

    /**
     * Gets the conf file path for jobs in progress.
     * 
     * @param logDir the log directory prefix.
     * @param jobId the jobId.
     * @param attempt attempt number for this job.
     * @return the conf file path for jobs in progress.
     */
    public static Path getStagingConfFile(Path logDir, JobId jobId, int attempt) {
        Path jobFilePath = null;
        if (logDir != null) {
            jobFilePath = new Path(logDir,
                    TypeConverter.fromYarn(jobId).toString() + "_" + attempt + CONF_FILE_NAME_SUFFIX);
        }
        return jobFilePath;
    }

    /**
     * Gets the serial number part of the path based on the jobId and serialNumber format.
     * @param id
     * @param serialNumberFormat
     * @return the serial number part of the patch based on the jobId and serial number format.
     */
    public static String serialNumberDirectoryComponent(JobId id, String serialNumberFormat) {
        return String.format(serialNumberFormat, Integer.valueOf(jobSerialNumber(id))).substring(0,
                SERIAL_NUMBER_DIRECTORY_DIGITS);
    }

    /**Extracts the timstamp component from the path.
     * @param path
     * @return the timestamp component from the path
     */
    public static String getTimestampPartFromPath(String path) {
        Matcher matcher = TIMESTAMP_DIR_PATTERN.matcher(path);
        if (matcher.find()) {
            String matched = matcher.group();
            String ret = matched.intern();
            return ret;
        } else {
            return null;
        }
    }

    /**
     * Gets the history subdirectory based on the jobId, timestamp and serial number format.
     * @param id
     * @param timestampComponent
     * @param serialNumberFormat
     * @return the history sub directory based on the jobid, timestamp and serial number format
     */
    public static String historyLogSubdirectory(JobId id, String timestampComponent, String serialNumberFormat) {
        //    String result = LOG_VERSION_STRING;
        String result = "";
        String serialNumberDirectory = serialNumberDirectoryComponent(id, serialNumberFormat);

        result = result + timestampComponent + File.separator + serialNumberDirectory + File.separator;

        return result;
    }

    /**
     * Gets the timestamp component based on millisecond time.
     * @param millisecondTime
     * @return the timestamp component based on millisecond time
     */
    public static String timestampDirectoryComponent(long millisecondTime) {
        Calendar timestamp = Calendar.getInstance();
        timestamp.setTimeInMillis(millisecondTime);
        String dateString = null;
        dateString = String.format(TIMESTAMP_DIR_FORMAT, timestamp.get(Calendar.YEAR),
                // months are 0-based in Calendar, but people will expect January to
                // be month #1.
                timestamp.get(Calendar.MONTH) + 1, timestamp.get(Calendar.DAY_OF_MONTH));
        dateString = dateString.intern();
        return dateString;
    }

    public static String doneSubdirsBeforeSerialTail() {
        // date
        String result = "/*/*/*"; // YYYY/MM/DD ;
        return result;
    }

    /**
     * Computes a serial number used as part of directory naming for the given jobId.
     * @param id the jobId.
     * @return the serial number used as part of directory naming for the given jobid
     */
    public static int jobSerialNumber(JobId id) {
        return id.getId();
    }

    public static List<FileStatus> localGlobber(FileContext fc, Path root, String tail) throws IOException {
        return localGlobber(fc, root, tail, null);
    }

    public static List<FileStatus> localGlobber(FileContext fc, Path root, String tail, PathFilter filter)
            throws IOException {
        return localGlobber(fc, root, tail, filter, null);
    }

    // hasMismatches is just used to return a second value if you want
    // one. I would have used MutableBoxedBoolean if such had been provided.
    public static List<FileStatus> localGlobber(FileContext fc, Path root, String tail, PathFilter filter,
            AtomicBoolean hasFlatFiles) throws IOException {
        if (tail.equals("")) {
            return (listFilteredStatus(fc, root, filter));
        }

        if (tail.startsWith("/*")) {
            Path[] subdirs = filteredStat2Paths(remoteIterToList(fc.listStatus(root)), true, hasFlatFiles);

            List<List<FileStatus>> subsubdirs = new LinkedList<List<FileStatus>>();

            int subsubdirCount = 0;

            if (subdirs.length == 0) {
                return new LinkedList<FileStatus>();
            }

            String newTail = tail.substring(2);

            for (int i = 0; i < subdirs.length; ++i) {
                subsubdirs.add(localGlobber(fc, subdirs[i], newTail, filter, null));
                // subsubdirs.set(i, localGlobber(fc, subdirs[i], newTail, filter,
                // null));
                subsubdirCount += subsubdirs.get(i).size();
            }

            List<FileStatus> result = new LinkedList<FileStatus>();

            for (int i = 0; i < subsubdirs.size(); ++i) {
                result.addAll(subsubdirs.get(i));
            }

            return result;
        }

        if (tail.startsWith("/")) {
            int split = tail.indexOf('/', 1);

            if (split < 0) {
                return listFilteredStatus(fc, new Path(root, tail.substring(1)), filter);
            } else {
                String thisSegment = tail.substring(1, split);
                String newTail = tail.substring(split);
                return localGlobber(fc, new Path(root, thisSegment), newTail, filter, hasFlatFiles);
            }
        }

        IOException e = new IOException("localGlobber: bad tail");

        throw e;
    }

    private static List<FileStatus> listFilteredStatus(FileContext fc, Path root, PathFilter filter)
            throws IOException {
        List<FileStatus> fsList = remoteIterToList(fc.listStatus(root));
        if (filter == null) {
            return fsList;
        } else {
            List<FileStatus> filteredList = new LinkedList<FileStatus>();
            for (FileStatus fs : fsList) {
                if (filter.accept(fs.getPath())) {
                    filteredList.add(fs);
                }
            }
            return filteredList;
        }
    }

    private static List<FileStatus> remoteIterToList(RemoteIterator<FileStatus> rIter) throws IOException {
        List<FileStatus> fsList = new LinkedList<FileStatus>();
        if (rIter == null)
            return fsList;
        while (rIter.hasNext()) {
            fsList.add(rIter.next());
        }
        return fsList;
    }

    // hasMismatches is just used to return a second value if you want
    // one. I would have used MutableBoxedBoolean if such had been provided.
    private static Path[] filteredStat2Paths(List<FileStatus> stats, boolean dirs, AtomicBoolean hasMismatches) {
        int resultCount = 0;

        if (hasMismatches == null) {
            hasMismatches = new AtomicBoolean(false);
        }

        for (int i = 0; i < stats.size(); ++i) {
            if (stats.get(i).isDirectory() == dirs) {
                stats.set(resultCount++, stats.get(i));
            } else {
                hasMismatches.set(true);
            }
        }

        Path[] result = new Path[resultCount];
        for (int i = 0; i < resultCount; i++) {
            result[i] = stats.get(i).getPath();
        }

        return result;
    }

    public static Path getPreviousJobHistoryPath(Configuration conf, ApplicationAttemptId applicationAttemptId)
            throws IOException {
        String jobId = TypeConverter.fromYarn(applicationAttemptId.getApplicationId()).toString();
        String jobhistoryDir = JobHistoryUtils.getConfiguredHistoryStagingDirPrefix(conf, jobId);
        Path histDirPath = FileContext.getFileContext(conf).makeQualified(new Path(jobhistoryDir));
        FileContext fc = FileContext.getFileContext(histDirPath.toUri(), conf);
        return fc.makeQualified(JobHistoryUtils.getStagingJobHistoryFile(histDirPath, jobId,
                (applicationAttemptId.getAttemptId() - 1)));
    }

    /**
     * Looks for the dirs to clean.  The folder structure is YYYY/MM/DD/Serial so
     * we can use that to more efficiently find the directories to clean by
     * comparing the cutoff timestamp with the timestamp from the folder
     * structure.
     *
     * @param fc done dir FileContext
     * @param root folder for completed jobs
     * @param cutoff The cutoff for the max history age
     * @return The list of directories for cleaning
     * @throws IOException
     */
    public static List<FileStatus> getHistoryDirsForCleaning(FileContext fc, Path root, long cutoff)
            throws IOException {
        List<FileStatus> fsList = new ArrayList<FileStatus>();
        Calendar cCal = Calendar.getInstance();
        cCal.setTimeInMillis(cutoff);
        int cYear = cCal.get(Calendar.YEAR);
        int cMonth = cCal.get(Calendar.MONTH) + 1;
        int cDate = cCal.get(Calendar.DATE);

        RemoteIterator<FileStatus> yearDirIt = fc.listStatus(root);
        while (yearDirIt.hasNext()) {
            FileStatus yearDir = yearDirIt.next();
            try {
                int year = Integer.parseInt(yearDir.getPath().getName());
                if (year <= cYear) {
                    RemoteIterator<FileStatus> monthDirIt = fc.listStatus(yearDir.getPath());
                    while (monthDirIt.hasNext()) {
                        FileStatus monthDir = monthDirIt.next();
                        try {
                            int month = Integer.parseInt(monthDir.getPath().getName());
                            // If we only checked the month here, then something like 07/2013
                            // would incorrectly not pass when the cutoff is 06/2014
                            if (year < cYear || month <= cMonth) {
                                RemoteIterator<FileStatus> dateDirIt = fc.listStatus(monthDir.getPath());
                                while (dateDirIt.hasNext()) {
                                    FileStatus dateDir = dateDirIt.next();
                                    try {
                                        int date = Integer.parseInt(dateDir.getPath().getName());
                                        // If we only checked the date here, then something like
                                        // 07/21/2013 would incorrectly not pass when the cutoff is
                                        // 08/20/2013 or 07/20/2012
                                        if (year < cYear || month < cMonth || date <= cDate) {
                                            fsList.addAll(remoteIterToList(fc.listStatus(dateDir.getPath())));
                                        }
                                    } catch (NumberFormatException nfe) {
                                        // the directory didn't fit the format we're looking for so
                                        // skip the dir
                                    }
                                }
                            }
                        } catch (NumberFormatException nfe) {
                            // the directory didn't fit the format we're looking for so skip
                            // the dir
                        }
                    }
                }
            } catch (NumberFormatException nfe) {
                // the directory didn't fit the format we're looking for so skip the dir
            }
        }
        return fsList;
    }
}