com.datatorrent.lib.io.fs.AbstractFSDirectoryInputOperator.java Source code

Introduction

Here is the source code for com.datatorrent.lib.io.fs.AbstractFSDirectoryInputOperator.java.

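AbstractFSDirectoryInputOperator is the abstract base class for file-reading input operators in the DataTorrent Malhar library. It scans a directory for new files through a pluggable DirectoryScanner, reads them, and checkpoints the set of processed files and the current offset so that a restarted operator can resume without re-emitting records. A concrete subclass only has to implement readEntity() and emit(); a minimal subclass sketch follows the listing.
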
Source

/*
 * Copyright (c) 2014 DataTorrent, Inc. ALL Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datatorrent.lib.io.fs;

import com.datatorrent.api.Context.OperatorContext;
import com.datatorrent.api.DefaultPartition;
import com.datatorrent.api.InputOperator;
import com.datatorrent.api.Partitioner;
import com.datatorrent.api.StatsListener;
import com.esotericsoftware.kryo.Kryo;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.validation.constraints.NotNull;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Input operator that reads files from a directory.
 * <p/>
 * The derived class defines how to read entries from the input stream and emit them to the port.
 * <p/>
 * The directory scanning logic is pluggable to support custom directory layouts and naming schemes. The default
 * implementation scans a single directory.
 * <p/>
 * Fault tolerance is achieved by tracking previously read files and the current offset as part of the checkpoint
 * state. In case of failure the operator will skip files that were already processed and fast-forward to the offset
 * of the current file.
 * <p/>
 * Supports partitioning and dynamic changes to the number of partitions through the property
 * {@link #partitionCount}. The directory scanner is responsible for accepting only the files that belong to its
 * partition.
 * <p/>
 * This class supports retrying of failed files by putting them into a failed list and retrying them after the
 * pending files are processed. Retrying is disabled when maxRetryCount is set to zero.
 *
 * @since 1.0.2
 */
public abstract class AbstractFSDirectoryInputOperator<T>
        implements InputOperator, Partitioner<AbstractFSDirectoryInputOperator<T>>, StatsListener {
    private static final Logger LOG = LoggerFactory.getLogger(AbstractFSDirectoryInputOperator.class);

    @NotNull
    protected String directory;
    @NotNull
    protected DirectoryScanner scanner = new DirectoryScanner();
    protected int scanIntervalMillis = 5000;
    protected int offset;
    protected String currentFile;
    protected Set<String> processedFiles = new HashSet<String>();
    protected int emitBatchSize = 1000;
    protected int currentPartitions = 1;
    protected int partitionCount = 1;
    private int retryCount = 0;
    private int maxRetryCount = 5;
    protected transient int skipCount = 0;

    /**
     * Class representing a failed file. When a read fails in the middle of a file, the file is
     * added to the failed list along with the last read offset.
     * Files from the failed list are processed after all pending files are processed, but
     * before checking for new files.
     * A failed file is retried up to maxRetryCount times; after that the file is ignored.
     */
    protected static class FailedFile {
        String path;
        int offset;
        int retryCount;
        long lastFailedTime;

        /* For kryo serialization */
        protected FailedFile() {
        }

        protected FailedFile(String path, int offset) {
            this.path = path;
            this.offset = offset;
            this.retryCount = 0;
        }

        protected FailedFile(String path, int offset, int retryCount) {
            this.path = path;
            this.offset = offset;
            this.retryCount = retryCount;
        }

        @Override
        public String toString() {
            return "FailedFile[" + "path='" + path + '\'' + ", offset=" + offset + ", retryCount=" + retryCount
                    + ", lastFailedTime=" + lastFailedTime + ']';
        }
    }

    protected long lastRepartition = 0;
    private transient boolean emit = true;
    protected boolean idempotentEmit = false;
    /* List of unfinished files */
    protected Queue<FailedFile> unfinishedFiles = new LinkedList<FailedFile>();
    /* List of failed files */
    protected Queue<FailedFile> failedFiles = new LinkedList<FailedFile>();

    protected transient FileSystem fs;
    protected transient Configuration configuration;
    protected transient long lastScanMillis;
    protected transient Path filePath;
    protected transient InputStream inputStream;
    protected Set<String> pendingFiles = new LinkedHashSet<String>();

    public String getDirectory() {
        return directory;
    }

    public void setDirectory(String directory) {
        this.directory = directory;
    }

    public DirectoryScanner getScanner() {
        return scanner;
    }

    public void setScanner(DirectoryScanner scanner) {
        this.scanner = scanner;
    }

    public int getScanIntervalMillis() {
        return scanIntervalMillis;
    }

    public void setScanIntervalMillis(int scanIntervalMillis) {
        this.scanIntervalMillis = scanIntervalMillis;
    }

    public int getEmitBatchSize() {
        return emitBatchSize;
    }

    public void setEmitBatchSize(int emitBatchSize) {
        this.emitBatchSize = emitBatchSize;
    }

    public void setIdempotentEmit(boolean idempotentEmit) {
        this.idempotentEmit = idempotentEmit;
    }

    public boolean isIdempotentEmit() {
        return idempotentEmit;
    }

    public int getPartitionCount() {
        return partitionCount;
    }

    public void setPartitionCount(int requiredPartitions) {
        this.partitionCount = requiredPartitions;
    }

    public int getCurrentPartitions() {
        return currentPartitions;
    }

    @Override
    public void setup(OperatorContext context) {
        try {
            filePath = new Path(directory);
            configuration = new Configuration();
            fs = FileSystem.newInstance(filePath.toUri(), configuration);
            if (!unfinishedFiles.isEmpty()) {
                retryFailedFile(unfinishedFiles.poll());
                skipCount = 0;
            } else if (!failedFiles.isEmpty()) {
                retryFailedFile(failedFiles.poll());
                skipCount = 0;
            }
            long startTime = System.currentTimeMillis();
            LOG.info("Continue reading {} from index {} time={}", currentFile, offset, startTime);
            // fast forward to previous offset
            if (inputStream != null) {
                for (int index = 0; index < offset; index++) {
                    readEntity();
                }
            }
            LOG.info("Read offset={} records in setup time={}", offset, System.currentTimeMillis() - startTime);
        } catch (IOException ex) {
            if (maxRetryCount <= 0) {
                throw new RuntimeException(ex);
            }
            LOG.error("FS reader error", ex);
            addToFailedList();
        }
    }

    @Override
    public void teardown() {
        IOUtils.closeQuietly(inputStream);
        IOUtils.closeQuietly(fs);
    }

    @Override
    public void beginWindow(long windowId) {
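        // Re-enable emission for the new window; when idempotentEmit is set,
        // emitTuples() disables it again after its first call in the window.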
        emit = true;
    }

    @Override
    public void endWindow() {
    }

    @Override
    public void emitTuples() {
        //emit will be true if the operator is not idempotent. If the operator is
        //idempotent then emit will be true the first time emitTuples is called
        //within a window and false on subsequent calls to emitTuples within the
        //same window
        if (emit) {
            if (inputStream == null) {
                try {
                    if (!unfinishedFiles.isEmpty()) {
                        retryFailedFile(unfinishedFiles.poll());
                    } else if (!pendingFiles.isEmpty()) {
                        String newPathString = pendingFiles.iterator().next();
                        pendingFiles.remove(newPathString);
                        this.inputStream = openFile(new Path(newPathString));
                    } else if (!failedFiles.isEmpty()) {
                        retryFailedFile(failedFiles.poll());
                    } else {
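                        // Rescan the directory only after scanIntervalMillis has elapsed since the last scan.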
                        if (System.currentTimeMillis() - scanIntervalMillis >= lastScanMillis) {
                            Set<Path> newPaths = scanner.scan(fs, filePath, processedFiles);

                            for (Path newPath : newPaths) {
                                String newPathString = newPath.toString();
                                pendingFiles.add(newPathString);
                                processedFiles.add(newPathString);
                            }
                            lastScanMillis = System.currentTimeMillis();
                        }
                    }
                } catch (IOException ex) {
                    if (maxRetryCount <= 0) {
                        throw new RuntimeException(ex);
                    }
                    LOG.error("FS reader error", ex);
                    addToFailedList();
                }
            }

            if (inputStream != null) {
                try {
                    int counterForTuple = 0;
                    while (counterForTuple++ < emitBatchSize) {
                        T line = readEntity();
                        if (line == null) {
                            LOG.info("done reading file ({} entries).", offset);
                            closeFile(inputStream);
                            break;
                        }

                        // If skipCount is non-zero, failed file recovery is in progress; skipCount is
                        // used to prevent already emitted records from being emitted again during recovery.
                        // When a failed file is opened, skipCount is set to the last read offset for that file.
                        //
                        if (skipCount == 0) {
                            offset++;
                            emit(line);
                        } else {
                            skipCount--;
                        }
                    }
                } catch (IOException e) {
                    if (maxRetryCount <= 0) {
                        throw new RuntimeException(e);
                    }
                    LOG.error("FS reader error", e);
                    addToFailedList();
                }
            }
            //If the operator is idempotent, do nothing on subsequent calls to emitTuples
            //within the same window
            if (idempotentEmit) {
                emit = false;
            }
        }
    }

    protected void addToFailedList() {

        FailedFile ff = new FailedFile(currentFile, offset, retryCount);

        try {
            // try to close file
            if (this.inputStream != null)
                this.inputStream.close();
        } catch (IOException e) {
            LOG.error("Could not close input stream on: " + currentFile);
        }

        ff.retryCount++;
        ff.lastFailedTime = System.currentTimeMillis();
        ff.offset = this.offset;

        // Clear current file state.
        this.currentFile = null;
        this.inputStream = null;
        this.offset = 0;

        if (ff.retryCount > maxRetryCount)
            return;

        LOG.info("adding to failed list path {} offset {} retry {}", ff.path, ff.offset, ff.retryCount);
        failedFiles.add(ff);
    }

    protected InputStream retryFailedFile(FailedFile ff) throws IOException {
        LOG.info("retrying failed file {} offset {} retry {}", ff.path, ff.offset, ff.retryCount);
        String path = ff.path;
        this.inputStream = openFile(new Path(path));
        this.offset = ff.offset;
        this.retryCount = ff.retryCount;
        this.skipCount = ff.offset;
        return this.inputStream;
    }

    protected InputStream openFile(Path path) throws IOException {
        LOG.info("opening file {}", path);
        InputStream input = fs.open(path);
        currentFile = path.toString();
        offset = 0;
        retryCount = 0;
        skipCount = 0;
        return input;
    }

    protected void closeFile(InputStream is) throws IOException {
        LOG.info("closing file {} offset {}", currentFile, offset);

        if (is != null)
            is.close();

        currentFile = null;
        inputStream = null;
    }

    @Override
    public Collection<Partition<AbstractFSDirectoryInputOperator<T>>> definePartitions(
            Collection<Partition<AbstractFSDirectoryInputOperator<T>>> partitions, int incrementalCapacity) {
        lastRepartition = System.currentTimeMillis();

        int totalCount = computedNewPartitionCount(partitions, incrementalCapacity);

        LOG.debug("Computed new partitions: {}", totalCount);

        if (totalCount == partitions.size()) {
            return partitions;
        }

        /*
         * Build collective state from all instances of the operator.
         */
        Set<String> totalProcessedFiles = new HashSet<String>();
        Set<FailedFile> currentFiles = new HashSet<FailedFile>();
        List<DirectoryScanner> oldscanners = new LinkedList<DirectoryScanner>();
        List<FailedFile> totalFailedFiles = new LinkedList<FailedFile>();
        List<String> totalPendingFiles = new LinkedList<String>();
        for (Partition<AbstractFSDirectoryInputOperator<T>> partition : partitions) {
            AbstractFSDirectoryInputOperator<T> oper = partition.getPartitionedInstance();
            totalProcessedFiles.addAll(oper.processedFiles);
            totalFailedFiles.addAll(oper.failedFiles);
            totalPendingFiles.addAll(oper.pendingFiles);
            currentFiles.addAll(oper.unfinishedFiles);
            if (oper.currentFile != null)
                currentFiles.add(new FailedFile(oper.currentFile, oper.offset));
            oldscanners.add(oper.getScanner());
        }

        /*
         * Create partitions of scanners, scanner's partition method will do state
         * transfer for DirectoryScanner objects.
         */
        List<DirectoryScanner> scanners = scanner.partition(totalCount, oldscanners);

        Kryo kryo = new Kryo();
        Collection<Partition<AbstractFSDirectoryInputOperator<T>>> newPartitions = Lists
                .newArrayListWithExpectedSize(totalCount);
        for (int i = 0; i < scanners.size(); i++) {
            AbstractFSDirectoryInputOperator<T> oper = kryo.copy(this);
            DirectoryScanner scn = scanners.get(i);
            oper.setScanner(scn);

            // Do state transfer for processed files.
            oper.processedFiles.addAll(totalProcessedFiles);

            /* redistribute unfinished files properly */
            oper.unfinishedFiles.clear();
            oper.currentFile = null;
            oper.offset = 0;
            Iterator<FailedFile> unfinishedIter = currentFiles.iterator();
            while (unfinishedIter.hasNext()) {
                FailedFile unfinishedFile = unfinishedIter.next();
                if (scn.acceptFile(unfinishedFile.path)) {
                    oper.unfinishedFiles.add(unfinishedFile);
                    unfinishedIter.remove();
                }
            }

            /* transfer failed files */
            oper.failedFiles.clear();
            Iterator<FailedFile> iter = totalFailedFiles.iterator();
            while (iter.hasNext()) {
                FailedFile ff = iter.next();
                if (scn.acceptFile(ff.path)) {
                    oper.failedFiles.add(ff);
                    iter.remove();
                }
            }

            /* redistribute pending files properly */
            oper.pendingFiles.clear();
            Iterator<String> pendingFilesIterator = totalPendingFiles.iterator();
            while (pendingFilesIterator.hasNext()) {
                String pathString = pendingFilesIterator.next();
                if (scn.acceptFile(pathString)) {
                    oper.pendingFiles.add(pathString);
                    pendingFilesIterator.remove();
                }
            }
            newPartitions.add(new DefaultPartition<AbstractFSDirectoryInputOperator<T>>(oper));
        }

        LOG.info("definePartitions called returning {} partitions", newPartitions.size());
        return newPartitions;
    }

    protected int computedNewPartitionCount(Collection<Partition<AbstractFSDirectoryInputOperator<T>>> partitions,
            int incrementalCapacity) {
        boolean isInitialPartition = partitions.iterator().next().getStats() == null;

        if (isInitialPartition && partitionCount == 1) {
            partitionCount = currentPartitions = partitions.size() + incrementalCapacity;
        } else {
            incrementalCapacity = partitionCount - currentPartitions;
        }

        int totalCount = partitions.size() + incrementalCapacity;
        LOG.info("definePartitions trying to create {} partitions, current {}  required {}", totalCount,
                partitionCount, currentPartitions);
        return totalCount;
    }

    @Override
    public void partitioned(Map<Integer, Partition<AbstractFSDirectoryInputOperator<T>>> partitions) {
        currentPartitions = partitions.size();
    }

    /**
     * Read the next item from the stream. Depending on the type of stream, this could be a byte array, line or object.
     * Upon return of null, the stream will be considered fully consumed.
     */
    abstract protected T readEntity() throws IOException;

    /**
     * Emit the tuple on the output port.
     *
     * @param tuple tuple to emit
     */
    abstract protected void emit(T tuple);

    /**
     * Repartition is required when the current number of partitions is not equal to the
     * required number of partitions.
     */
    @Override
    public Response processStats(BatchedOperatorStats batchedOperatorStats) {
        Response res = new Response();
        res.repartitionRequired = false;
        if (currentPartitions != partitionCount) {
            LOG.info("processStats: trying repartition of input operator current {} required {}", currentPartitions,
                    partitionCount);
            res.repartitionRequired = true;
        }
        return res;
    }

    public int getMaxRetryCount() {
        return maxRetryCount;
    }

    public void setMaxRetryCount(int maxRetryCount) {
        this.maxRetryCount = maxRetryCount;
    }

    public static class DirectoryScanner implements Serializable {
        private static final long serialVersionUID = 4535844463258899929L;
        private String filePatternRegexp;
        private transient Pattern regex = null;
        private int partitionIndex;
        private int partitionCount;
        private final transient HashSet<String> ignoredFiles = new HashSet<String>();

        public String getFilePatternRegexp() {
            return filePatternRegexp;
        }

        public void setFilePatternRegexp(String filePatternRegexp) {
            this.filePatternRegexp = filePatternRegexp;
            this.regex = null;
        }

        public Pattern getRegex() {
            if (this.regex == null && this.filePatternRegexp != null)
                this.regex = Pattern.compile(this.filePatternRegexp);
            return this.regex;
        }

        public int getPartitionCount() {
            return partitionCount;
        }

        public int getPartitionIndex() {
            return partitionIndex;
        }

        public LinkedHashSet<Path> scan(FileSystem fs, Path filePath, Set<String> consumedFiles) {
            if (filePatternRegexp != null && this.regex == null) {
                this.regex = Pattern.compile(this.filePatternRegexp);
            }

            LinkedHashSet<Path> pathSet = Sets.newLinkedHashSet();
            try {
                LOG.debug("Scanning {} with pattern {}", filePath, this.filePatternRegexp);
                FileStatus[] files = fs.listStatus(filePath);
                for (FileStatus status : files) {
                    Path path = status.getPath();
                    String filePathStr = path.toString();

                    if (consumedFiles.contains(filePathStr)) {
                        continue;
                    }

                    if (ignoredFiles.contains(filePathStr)) {
                        continue;
                    }

                    if (acceptFile(filePathStr)) {
                        LOG.debug("Found {}", filePathStr);
                        pathSet.add(path);
                    } else {
                        // don't look at it again
                        ignoredFiles.add(filePathStr);
                    }
                }
            } catch (FileNotFoundException e) {
                LOG.warn("Failed to list directory {}", filePath, e);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
            return pathSet;
        }

        protected boolean acceptFile(String filePathStr) {
            if (partitionCount > 1) {
                int i = filePathStr.hashCode();
                int mod = i % partitionCount;
                if (mod < 0) {
                    mod += partitionCount;
                }
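                // Hypothetical example: with partitionCount = 4 and a path whose hashCode() is -7,
                // Java's % yields -3, so mod is adjusted to 1 and only the scanner with
                // partitionIndex == 1 accepts the file.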
                LOG.debug("partition {} {} {} {}", partitionIndex, filePathStr, i, mod);

                if (mod != partitionIndex) {
                    return false;
                }
            }
            if (filePatternRegexp != null && this.regex == null) {
                regex = Pattern.compile(this.filePatternRegexp);
            }

            if (regex != null) {
                Matcher matcher = regex.matcher(filePathStr);
                if (!matcher.matches()) {
                    return false;
                }
            }
            return true;
        }

        public List<DirectoryScanner> partition(int count) {
            ArrayList<DirectoryScanner> partitions = Lists.newArrayListWithExpectedSize(count);
            for (int i = 0; i < count; i++) {
                partitions.add(this.createPartition(i, count));
            }
            return partitions;
        }

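        /* The default implementation ignores the previous scanners; a custom scanner could
           override this to transfer per-partition state from the old instances. */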
        public List<DirectoryScanner> partition(int count, Collection<DirectoryScanner> scanners) {
            return partition(count);
        }

        protected DirectoryScanner createPartition(int partitionIndex, int partitionCount) {
            DirectoryScanner that = new DirectoryScanner();
            that.filePatternRegexp = this.filePatternRegexp;
            that.regex = this.regex;
            that.partitionIndex = partitionIndex;
            that.partitionCount = partitionCount;
            return that;
        }

        @Override
        public String toString() {
            return "DirectoryScanner [filePatternRegexp=" + filePatternRegexp + " partitionIndex=" + partitionIndex
                    + " partitionCount=" + partitionCount + "]";
        }
    }
}
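
Usage example

The operator is abstract: a subclass supplies readEntity() to pull the next record from the open stream (returning null at end of file) and emit() to push it to a port. Below is a minimal sketch of a line-oriented reader. It assumes the com.datatorrent.api.DefaultOutputPort API from the same library; the class name LineReaderOperator and the port name are hypothetical, not part of the library.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.hadoop.fs.Path;

import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.lib.io.fs.AbstractFSDirectoryInputOperator;

/** Hypothetical subclass that emits each line of every scanned file as a String tuple. */
public class LineReaderOperator extends AbstractFSDirectoryInputOperator<String> {
    public final transient DefaultOutputPort<String> output = new DefaultOutputPort<String>();

    private transient BufferedReader reader;

    @Override
    protected InputStream openFile(Path path) throws IOException {
        // Let the base class track currentFile, offset and skipCount, then wrap the stream.
        InputStream is = super.openFile(path);
        reader = new BufferedReader(new InputStreamReader(is));
        return is;
    }

    @Override
    protected void closeFile(InputStream is) throws IOException {
        super.closeFile(is);
        reader.close();
        reader = null;
    }

    @Override
    protected String readEntity() throws IOException {
        // readLine() returns null at end of file, which tells the base class to close the file.
        return reader.readLine();
    }

    @Override
    protected void emit(String tuple) {
        output.emit(tuple);
    }
}

Configuring the operator is then a matter of pointing it at a directory and, optionally, restricting the scanner to a file-name pattern (the directory and regexp values below are placeholders):

LineReaderOperator reader = new LineReaderOperator();
reader.setDirectory("/tmp/input");                       // directory to scan
reader.getScanner().setFilePatternRegexp(".*\\.log");    // only pick up .log files
reader.setMaxRetryCount(3);                              // retry a failed file up to 3 times
reader.setPartitionCount(2);                             // request two partitions

Note that the regexp is matched against the full path string, and with more than one partition each file is routed to exactly one scanner by the hash-mod rule shown in acceptFile() above.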