com.alexholmes.hdfsslurper.WorkerThread.java Source code

Java tutorial

Introduction

Here is the source code for com.alexholmes.hdfsslurper.WorkerThread.java

Source

/*
 * Copyright 2011 Alex Holmes
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.alexholmes.hdfsslurper;

import com.hadoop.compression.lzo.LzoIndex;
import com.hadoop.compression.lzo.LzoIndexer;
import com.hadoop.compression.lzo.LzopCodec;
import org.apache.commons.io.output.NullOutputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.log4j.MDC;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.zip.CRC32;
import java.util.zip.CheckedInputStream;

/**
 * Daemon worker thread that repeatedly asks the {@link FileSystemManager} for an inbound file and
 * copies it to its destination.
 *
 * <p>Each file is first written to a staging path on the destination file system, optionally
 * compressed with the configured codec and optionally CRC-verified, and is then renamed to its
 * final destination. On success {@code fileCopyComplete} is invoked on the manager; on any failure
 * the staging file is removed (best effort) and {@code fileCopyError} is invoked instead.
 *
 * <p>{@link #copyFile(FileStatus)} and {@link #shutdown()} are both synchronized on this thread
 * object, so {@code shutdown()} cannot start while a copy is in progress.
 */
public class WorkerThread extends Thread {
    private static final Log log = LogFactory.getLog(WorkerThread.class);

    /** Buffer size, in bytes, used when streaming the source file to the staging file. */
    private static final int COPY_BUFFER_SIZE = 4096;

    /** Timeout, in seconds, handed to {@code ScriptExecutor} when running a user script. */
    private static final int SCRIPT_TIMEOUT_SECONDS = 60;

    private final AtomicBoolean shuttingDown = new AtomicBoolean(false);
    private final Config config;
    private final FileSystemManager fileSystemManager;
    private final TimeUnit pollSleepUnit;
    /** Non-null only when LZOP index creation is enabled in the configuration. */
    private final LzoIndexer indexer;
    /** Default LZOP file extension; non-null only when LZOP index creation is enabled. */
    private final String lzopExt;

    /**
     * Creates a daemon worker named {@code WorkerThread-<threadIndex>}.
     *
     * @param config            slurper configuration
     * @param fileSystemManager source of inbound files and sink for completion/error notifications
     * @param pollSleepUnit     time unit passed to the manager when polling for inbound files
     * @param threadIndex       index appended to the thread name
     */
    public WorkerThread(Config config, FileSystemManager fileSystemManager, TimeUnit pollSleepUnit,
            int threadIndex) {
        this.config = config;
        this.fileSystemManager = fileSystemManager;
        this.pollSleepUnit = pollSleepUnit;
        this.setDaemon(true);
        this.setName(WorkerThread.class.getSimpleName() + "-" + threadIndex);
        if (config.isCreateLzopIndex()) {
            this.indexer = new LzoIndexer(config.getConfig());
            this.lzopExt = new LzopCodec().getDefaultExtension();
        } else {
            this.indexer = null;
            this.lzopExt = null;
        }
    }

    /**
     * Main loop: keeps calling {@link #doWork()} until shutdown is requested, the thread's
     * interrupt flag is observed, or {@code doWork()} throws {@link InterruptedException}.
     */
    @Override
    public void run() {
        MDC.put("threadName", this.getName());
        try {
            // Thread.interrupted() clears the interrupt flag as a side effect of testing it.
            while (!shuttingDown.get() && !Thread.interrupted()) {
                doWork();
            }
        } catch (InterruptedException t) {
            log.warn("event#Caught interrupted exception, exiting");
        }
        log.info("event#Thread exiting");
    }

    /**
     * Polls for one inbound file and copies it. Every failure other than an interruption is
     * logged and swallowed so that the worker loop keeps running.
     *
     * @throws InterruptedException if the poll or the copy is interrupted
     */
    protected void doWork() throws InterruptedException {
        try {
            copyFile(fileSystemManager.pollForInboundFile(pollSleepUnit, config.getPollSleepPeriodMillis()));
        } catch (InterruptedException ie) {
            throw ie;
        } catch (Throwable t) {
            log.warn("event#Caught exception in doWork", t);
        }
    }

    /**
     * Processes the given file unless shutdown has been requested or the thread has been
     * interrupted. Synchronized on this thread object, as is {@link #shutdown()}.
     */
    private synchronized void copyFile(FileStatus fs) throws IOException, InterruptedException {
        // Note: Thread.interrupted() clears the interrupt flag when it returns true.
        if (!shuttingDown.get() && !Thread.interrupted()) {
            process(fs);
        }
    }

    /**
     * Copies one source file to its destination through a staging file.
     *
     * <p>Steps: optionally run the work script to obtain a replacement source file, resolve the
     * destination and staging paths, create missing parent directories, stream the bytes to the
     * staging file, check sizes (uncompressed copies only) and CRC (when verification is enabled),
     * replace any existing destination file by renaming the staging file onto it, optionally build
     * an LZOP index, and finally notify the manager. Any {@link Throwable} raised along the way is
     * logged, the staging file is removed if possible, and the manager is told the copy failed.
     *
     * @param srcFileStatus status of the file to copy
     */
    private void process(FileStatus srcFileStatus) throws IOException, InterruptedException {

        Path stagingFile = null;
        FileSystem destFs = null;
        String filenameBatchidDelimiter = config.getFileNameBatchIdDelimiter();

        try {
            FileSystem srcFs = srcFileStatus.getPath().getFileSystem(config.getConfig());

            // run a script which can change the name of the file as well as
            // write out a new version of the file
            //
            if (config.getWorkScript() != null) {
                Path newSrcFile = stageSource(srcFileStatus);
                srcFileStatus = srcFs.getFileStatus(newSrcFile);
            }

            Path srcFile = srcFileStatus.getPath();

            // get the target HDFS file, making sure it carries the codec extension
            //
            Path destFile = getHdfsTargetPath(srcFileStatus);

            if (config.getCodec() != null) {
                String ext = config.getCodec().getDefaultExtension();
                if (!destFile.getName().endsWith(ext)) {
                    destFile = new Path(destFile.toString() + ext);
                }
            }

            destFs = destFile.getFileSystem(config.getConfig());

            // get the staging HDFS file
            //
            stagingFile = fileSystemManager.getStagingFile(srcFileStatus, destFile);
            String batchId = extractBatchId(srcFile.toString(), filenameBatchidDelimiter);

            log.info("event#Copying source file '" + srcFile + "' to staging destination '" + stagingFile + "'"
                    + "$batchId#" + batchId);

            // make sure the parent directories of both the target and the
            // staging file exist
            //
            ensureDirectoryExists(destFs, destFile.getParent(), "target");
            ensureDirectoryExists(destFs, stagingFile.getParent(), "staging");

            // copy the file
            //
            CRC32 crc = new CRC32();
            copyToStaging(srcFs, srcFile, destFs, stagingFile, crc);

            // sizes can only be compared when no compression was applied
            long srcFileSize = srcFs.getFileStatus(srcFile).getLen();
            long destFileSize = destFs.getFileStatus(stagingFile).getLen();
            if (config.getCodec() == null && srcFileSize != destFileSize) {
                throw new IOException(
                        "event#File sizes don't match, source = " + srcFileSize + ", dest = " + destFileSize);
            }

            log.info("event#Local file size = " + srcFileSize + ", HDFS file size = " + destFileSize + "$batchId#"
                    + batchId);

            if (config.isVerify()) {
                verify(stagingFile, crc.getValue());
            }

            // an existing destination file is replaced
            if (destFs.exists(destFile)) {
                destFs.delete(destFile, false);
            }

            log.info("event#Moving staging file '" + stagingFile + "' to destination '" + destFile + "'"
                    + "$batchId#" + batchId);
            if (!destFs.rename(stagingFile, destFile)) {
                throw new IOException("event#Failed to rename file");
            }

            if (config.isCreateLzopIndex() && destFile.getName().endsWith(lzopExt)) {
                Path lzoIndexPath = new Path(destFile.toString() + LzoIndex.LZO_INDEX_SUFFIX);
                if (destFs.exists(lzoIndexPath)) {
                    log.info("event#Deleting index file as it already exists");
                    destFs.delete(lzoIndexPath, false);
                }
                indexer.index(destFile);
            }

            fileSystemManager.fileCopyComplete(srcFileStatus);

        } catch (Throwable t) {
            log.error("event#Caught exception working on file " + srcFileStatus.getPath(), t);

            // delete the staging file if it still exists; it may not have been
            // resolved yet if the failure happened early
            //
            try {
                if (destFs != null && stagingFile != null && destFs.exists(stagingFile)) {
                    destFs.delete(stagingFile, false);
                }
            } catch (Throwable t2) {
                log.error("event#Failed to delete staging file " + stagingFile, t2);
            }

            fileSystemManager.fileCopyError(srcFileStatus);
        }

    }

    /**
     * Returns the part of {@code path} that follows the last occurrence of {@code delimiter}.
     * The whole path is returned when the delimiter is null, empty or not present.
     */
    private static String extractBatchId(String path, String delimiter) {
        if (delimiter == null || delimiter.length() == 0) {
            return path;
        }
        int idx = path.lastIndexOf(delimiter);
        // skip the full delimiter, which may be longer than one character
        return idx < 0 ? path : path.substring(idx + delimiter.length());
    }

    /**
     * Creates {@code dir} on {@code fs} if it does not exist yet.
     *
     * @param kind word used in log and error messages to describe the directory
     * @throws IOException if the directory is missing and cannot be created
     */
    private void ensureDirectoryExists(FileSystem fs, Path dir, String kind) throws IOException {
        if (!fs.exists(dir)) {
            log.info("event#Attempting creation of " + kind + " directory: " + dir.toUri());
            if (!fs.mkdirs(dir)) {
                throw new IOException("event#Failed to create " + kind + " directory: " + dir.toUri());
            }
        }
    }

    /**
     * Streams {@code srcFile} into {@code stagingFile}, compressing with the configured codec when
     * one is set and feeding the raw source bytes through {@code crc} when verification is enabled.
     *
     * @throws IOException if reading, writing, or closing the staging output stream fails
     */
    private void copyToStaging(FileSystem srcFs, Path srcFile, FileSystem destFs, Path stagingFile, CRC32 crc)
            throws IOException {
        InputStream is = null;
        OutputStream os = null;
        try {
            is = new BufferedInputStream(srcFs.open(srcFile));
            if (config.isVerify()) {
                is = new CheckedInputStream(is, crc);
            }
            os = destFs.create(stagingFile);

            if (config.getCodec() != null) {
                os = config.getCodec().createOutputStream(os);
            }

            IOUtils.copyBytes(is, os, COPY_BUFFER_SIZE, false);

            // Close the output explicitly: IOUtils.closeStream ignores IOException, which would
            // hide a failed final flush. Clearing the reference first avoids a second close below.
            OutputStream completed = os;
            os = null;
            completed.close();
        } finally {
            IOUtils.closeStream(is);
            IOUtils.closeStream(os);
        }
    }

    /**
     * Runs the configured work script against the source file and returns the path printed by the
     * script.
     *
     * @throws IOException if the returned path has no URI scheme
     */
    private Path stageSource(FileStatus srcFile) throws IOException {
        String filenameBatchidDelimiter = config.getFileNameBatchIdDelimiter();
        Path p = new Path(ScriptExecutor.getStdOutFromScript(config.getWorkScript(), srcFile.getPath().toString(),
                SCRIPT_TIMEOUT_SECONDS, TimeUnit.SECONDS, filenameBatchidDelimiter));
        String batchId = extractBatchId(p.toString(), filenameBatchidDelimiter);
        if (p.toUri().getScheme() == null) {
            throw new IOException(
                    "event#Work path from script must be a URI with a scheme: '" + p + "'" + "$batchId#" + batchId);
        }
        log.info("event#Staging script returned new file '" + p + " for old " + srcFile.getPath() + "$batchId#"
                + batchId);
        return p;
    }

    /**
     * Compares the CRC32 collected while reading the source with the CRC32 of the file written to
     * HDFS.
     *
     * @param hdfs         file on the destination file system
     * @param localFileCRC CRC32 of the bytes read from the source file
     * @throws IOException if the checksums differ or the HDFS file cannot be read
     */
    private void verify(Path hdfs, long localFileCRC) throws IOException {
        log.info("event#Verifying files");
        long hdfsCRC = hdfsFileCRC32(hdfs);

        if (localFileCRC != hdfsCRC) {
            throw new IOException(
                    "event#CRC's don't match, local file is " + localFileCRC + " HDFS file is " + hdfsCRC);
        }
        log.info("event#CRC's match (" + localFileCRC + ")");
    }

    /**
     * Computes the CRC32 of the file's content, read through the configured codec's input stream
     * when a codec is set.
     */
    private long hdfsFileCRC32(Path path) throws IOException {
        InputStream in = null;
        CRC32 crc = new CRC32();
        try {
            // Track every wrapper in 'in' so the underlying stream is closed even if a later
            // wrapping step throws.
            in = new BufferedInputStream(path.getFileSystem(config.getConfig()).open(path));
            if (config.getCodec() != null) {
                in = config.getCodec().createInputStream(in);
            }
            in = new CheckedInputStream(in, crc);
            org.apache.commons.io.IOUtils.copy(in, new NullOutputStream());
        } finally {
            org.apache.commons.io.IOUtils.closeQuietly(in);
        }
        return crc.getValue();
    }

    /**
     * Resolves the destination path: the source file name (plus the codec extension when a codec
     * is configured) under the configured destination directory, or, when no destination
     * directory is configured, the path produced by the destination script.
     */
    private Path getHdfsTargetPath(FileStatus srcFile) throws IOException {
        if (config.getDestDir() == null) {
            return getDestPathFromScript(srcFile);
        }
        if (config.getCodec() != null) {
            return new Path(config.getDestDir(),
                    srcFile.getPath().getName() + config.getCodec().getDefaultExtension());
        }
        return new Path(config.getDestDir(), srcFile.getPath().getName());
    }

    /**
     * Runs the configured destination script against the source file and returns the path printed
     * by the script.
     *
     * @throws IOException if the returned path has no URI scheme
     */
    private Path getDestPathFromScript(FileStatus srcFile) throws IOException {
        String filenameBatchidDelimiter = config.getFileNameBatchIdDelimiter();
        Path p = new Path(ScriptExecutor.getStdOutFromScript(config.getScript(), srcFile.getPath().toString(),
                SCRIPT_TIMEOUT_SECONDS, TimeUnit.SECONDS, filenameBatchidDelimiter));
        if (p.toUri().getScheme() == null) {
            String batchId = extractBatchId(p.toString(), filenameBatchidDelimiter);
            throw new IOException("event#Destination path from script must be a URI with a scheme: '" + p + "'"
                    + "$batchId#" + batchId);
        }
        return p;
    }

    /**
     * Requests shutdown: on the first call only, sets the shutdown flag, interrupts this thread
     * and waits for it to terminate. Later calls return immediately.
     *
     * @throws InterruptedException if the calling thread is interrupted while joining
     */
    public synchronized void shutdown() throws InterruptedException {
        if (!shuttingDown.getAndSet(true)) {
            log.info("event#Interrupting: " + this.getName());
            this.interrupt();
            log.info("event#Joining: " + this.getName());
            this.join();
        }
    }
}