com.ning.metrics.collector.hadoop.processing.LocalSpoolManager.java Source code


Introduction

Here is the source code for com.ning.metrics.collector.hadoop.processing.LocalSpoolManager.java, a utility class that manages the collector's local spool directories and computes the corresponding output paths in HDFS.

Source

/*
 * Copyright 2010-2011 Ning, Inc.
 *
 * Ning licenses this file to you under the Apache License, version 2.0
 * (the "License"); you may not use this file except in compliance with the
 * License.  You may obtain a copy of the License at:
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */

package com.ning.metrics.collector.hadoop.processing;

import com.ning.metrics.collector.binder.config.CollectorConfig;
import com.ning.metrics.serialization.event.Granularity;
import com.ning.metrics.serialization.event.GranularityPathMapper;
import org.apache.axis.utils.StringUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.lang.RandomStringUtils;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.ReadableDateTime;
import org.joda.time.ReadableInstant;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileFilter;
import java.util.Collection;
import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Utility class to manage local spools
 */
class LocalSpoolManager {
    private static final Logger log = LoggerFactory.getLogger(LocalSpoolManager.class);

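    // Matches spool directory names such as 127.0.0.1-8080-pYb2-2011-05-27T15.42.03.586.FrontDoorVisit.smile (group 1 captures the timestamp)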
    private static final Pattern filenamePattern = Pattern.compile(
            "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}-\\d{1,5}-[a-zA-Z0-9]{4}-(\\d{4}-\\d{2}-\\d{2}T\\d{2}\\.\\d{2}\\.\\d{2}\\.\\d{3})\\.[a-zA-Z0-9-]*\\.[a-zA-Z]*");

    // Can't use : in the pattern - Hadoop chokes on it when building the .crc Path
    protected static final DateTimeFormatter dateFormatter = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH.mm.ss.SSS")
            .withZone(DateTimeZone.UTC);

    private final CollectorConfig config;
    private String eventName;
    private SerializationType serializationType;
    private String hdfsDir;
    private ReadableDateTime timeStamp;

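    /**
     * Find spool directories under the given base path that were last modified
     * before {@code System.currentTimeMillis() - cutoff}.
     *
     * @param basePath local spool root to scan
     * @param cutoff   age threshold in milliseconds
     * @return the spool directories considered old (never null)
     */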
    public static Collection<File> findOldSpoolDirectories(final String basePath, final long cutoff) {
        final Collection<File> files = new java.util.LinkedList<File>();
        // Candidates ("old") directories are the ones last modified more than 2 hours ago
        final File[] found = new File(basePath)
                .listFiles((FileFilter) FileFilterUtils.and(FileFilterUtils.directoryFileFilter(),
                        FileFilterUtils.ageFileFilter(System.currentTimeMillis() - cutoff, true)));

        // Can't use FileUtils.listFiles as it doesn't yield directories
        if (found != null) {
            for (final File file : found) {
                if (file.isDirectory()) {
                    files.add(file);
                }
            }
        }

        return files;
    }

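    /**
     * List all files under the given spool directory, recursing into subdirectories
     * except those named "_tmp".
     *
     * @param spoolDirectory spool directory to scan
     * @return the files found, or an empty list if the argument is not a directory
     */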
    public static Collection<File> findFilesInSpoolDirectory(final File spoolDirectory) {
        if (!spoolDirectory.isDirectory()) {
            log.warn("Asked to find files in spool directory but [" + spoolDirectory + "] is not a directory!");
            return Collections.emptyList();
        }
        return FileUtils.listFiles(spoolDirectory, FileFilterUtils.trueFileFilter(),
                FileFilterUtils.notFileFilter(FileFilterUtils.nameFileFilter("_tmp")));
    }

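    /**
     * Delete empty subdirectories (e.g. /_tmp, /_quarantine and /_lock) of the given spool
     * directories, then delete the spool directories themselves if they are empty.
     *
     * @param oldSpoolDirectories spool directories to clean up
     */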
    public static void cleanupOldSpoolDirectories(final Iterable<File> oldSpoolDirectories) {
        // Cleanup empty subdirectories (/_tmp, /_quarantine and /_lock) first
        for (final File dir : oldSpoolDirectories) {
            final File[] subDirs = dir.listFiles();
            if (subDirs == null) {
                // Not a directory (or an I/O error occurred) - nothing to clean up here
                continue;
            }
            for (final File subDir : subDirs) {
                if (subDir.isDirectory() && subDir.listFiles().length == 0) {
                    log.info(String.format("Deleting empty directory: %s", subDir.toString()));
                    FileUtils.deleteQuietly(subDir);
                }
            }
        }

        for (final File dir : oldSpoolDirectories) {
            if (dir.isDirectory() && dir.listFiles().length == 0) {
                log.info(String.format("Deleting empty directory: %s", dir.toString()));
                FileUtils.deleteQuietly(dir);
            }
        }
    }

    /**
     * Main constructor, used on the way in (events headed to Hadoop)
     *
     * @param config            Configuration object
     * @param eventName         event name
     * @param serializationType event serialization type
     * @param hdfsDir           directory name in HDFS where the event will end up
     */
    public LocalSpoolManager(final CollectorConfig config, final String eventName,
            final SerializationType serializationType, final String hdfsDir) {
        this.config = config;
        this.eventName = eventName;
        this.serializationType = serializationType;
        this.hdfsDir = hdfsDir;
        this.timeStamp = new DateTime();
    }

    /**
     * Constructor for the "left below" use case (spool directories found on disk after a restart)
     *
     * @param config    Configuration object
     * @param directory Spool directory
     * @throws IllegalArgumentException if the directory is not valid
     */
    public LocalSpoolManager(final CollectorConfig config, final File directory) throws IllegalArgumentException {
        this.config = config;
        reverseEngineerFilename(directory.getName());
    }

    /**
     * Given a spool directory name, recover the timeStamp, eventName and serializationType
     *
     * @param directoryName directory candidate
     * @throws IllegalArgumentException if the directory is not valid
     */
    private void reverseEngineerFilename(final String directoryName) throws IllegalArgumentException {
        // directoryName is like 127.0.0.1-8080-pYb2-2011-05-27T15.42.03.586.FrontDoorVisit.smile
        final String[] directoryTokens = StringUtils.split(directoryName, '.');
        if (directoryTokens.length < 3) {
            throw new IllegalArgumentException();
        }
        final Matcher m = filenamePattern.matcher(directoryName);
        if (!m.matches()) {
            throw new IllegalArgumentException();
        }

        this.timeStamp = dateFormatter.parseDateTime(m.group(1));
        this.eventName = directoryTokens[directoryTokens.length - 2];
        this.serializationType = SerializationType.fromSuffix(directoryTokens[directoryTokens.length - 1]);
        // Build the final output directory in HDFS
        // TODO Use default granularity of HOURLY, we could have different granularity in the file though :(
        // This may be already broken in EventSpoolDispatcher
        final GranularityPathMapper pathMapper = new GranularityPathMapper(
                String.format("%s/%s", config.getEventOutputDirectory(), eventName), Granularity.HOURLY);
        this.hdfsDir = pathMapper.getPathForDateTime(timeStamp);
    }

    /**
     * @return the local spool directory name
     */
    public String getSpoolDirectoryName() {
        return String.format("%s-%d-%s-%s.%s.%s", config.getLocalIp(), config.getLocalPort(),
                RandomStringUtils.randomAlphanumeric(4), dateFormatter.print(timeStamp), eventName,
                serializationType.getFileSuffix());
    }

    /**
     * @return the full path to the local spool directory
     */
    public String getSpoolDirectoryPath() {
        return String.format("%s/%s", config.getSpoolDirectoryName(), getSpoolDirectoryName());
    }

    /**
     * Get the full file path in HDFS
     * <p/>
     * The Hadoop file name includes a number of things to avoid collisions:
     * <ul>
     * <li>ip, to avoid conflicts between machines</li>
     * <li>fileExtension (serialization-type-specific), so multiple file extensions can be written to the same directory</li>
     * <li>flushCount, so the same queue can write multiple times</li>
     * <li>queueCreationTimestamp, so that if, for instance, we shut down and restart the collector within an hour, the two runs' writes won't conflict</li>
     * </ul>
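     * <p/>
     * For example (hypothetical values), a flush could end up at
     * {@code <hdfsDir>/127.0.0.1-8080-pYb2-2011-05-27T15.42.03.586-f3.smile}.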
     *
     * @param flushCount number of flushes for this queue
     * @return output path for the spool in HDFS
     */
    public String toHadoopPath(final int flushCount) {
        return String.format("%s/%s-%d-%s-%s-f%d.%s", hdfsDir, config.getLocalIp(), config.getLocalPort(),
                RandomStringUtils.randomAlphanumeric(4), dateFormatter.print(timeStamp), flushCount,
                serializationType.getFileSuffix());
    }

    public String getEventName() {
        return eventName;
    }

    public SerializationType getSerializationType() {
        return serializationType;
    }

    public ReadableInstant getTimeStamp() {
        return timeStamp;
    }
}
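
Usage example

The snippet below is not part of the original source; it is a minimal sketch of how the static helpers above might be combined at collector startup to recover and clean up spool directories left behind by a previous run. The spool root and the two-hour cutoff are hypothetical values, and the example class must live in the same package because LocalSpoolManager is package-private.

package com.ning.metrics.collector.hadoop.processing;

import java.io.File;
import java.util.Collection;
import java.util.concurrent.TimeUnit;

public class LocalSpoolCleanupExample
{
    public static void main(final String[] args)
    {
        // Hypothetical spool root; the real value would come from CollectorConfig.getSpoolDirectoryName()
        final String spoolRoot = "/var/tmp/collector/spool";
        // Treat directories untouched for more than two hours as "old" (hypothetical cutoff)
        final long cutoff = TimeUnit.HOURS.toMillis(2);

        final Collection<File> oldDirectories = LocalSpoolManager.findOldSpoolDirectories(spoolRoot, cutoff);
        for (final File dir : oldDirectories) {
            // In the collector these leftover files would be re-queued for flushing to HDFS;
            // here we simply list them
            for (final File file : LocalSpoolManager.findFilesInSpoolDirectory(dir)) {
                System.out.println("Leftover spool file: " + file);
            }
        }

        // Finally, remove the directories that are now empty
        LocalSpoolManager.cleanupOldSpoolDirectories(oldDirectories);
    }
}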