org.apache.nifi.processors.hadoop.AbstractPutHDFSRecord.java Source code

Introduction

Here is the source code for org.apache.nifi.processors.hadoop.AbstractPutHDFSRecord.java, the abstract base class for NiFi processors that write Records to HDFS.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nifi.processors.hadoop;

import java.io.BufferedInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.security.PrivilegedAction;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.nifi.annotation.behavior.TriggerWhenEmpty;
import org.apache.nifi.annotation.configuration.DefaultSettings;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.components.AllowableValue;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.PropertyValue;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.FlowFileAccessException;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.processors.hadoop.exception.FailureException;
import org.apache.nifi.processors.hadoop.exception.RecordReaderFactoryException;
import org.apache.nifi.processors.hadoop.record.HDFSRecordWriter;
import org.apache.nifi.schema.access.SchemaNotFoundException;
import org.apache.nifi.serialization.RecordReader;
import org.apache.nifi.serialization.RecordReaderFactory;
import org.apache.nifi.serialization.WriteResult;
import org.apache.nifi.serialization.record.RecordSchema;
import org.apache.nifi.serialization.record.RecordSet;
import org.apache.nifi.util.StopWatch;

/**
 * Base class for processors that write Records to HDFS.
 */
@TriggerWhenEmpty // trigger when empty so we have a chance to perform a Kerberos re-login
@DefaultSettings(yieldDuration = "100 ms") // decrease the default yield since we are triggering when empty
public abstract class AbstractPutHDFSRecord extends AbstractHadoopProcessor {

    public static final PropertyDescriptor COMPRESSION_TYPE = new PropertyDescriptor.Builder()
            .name("compression-type").displayName("Compression Type")
            .description("The type of compression for the file being written.").required(true).build();

    public static final PropertyDescriptor OVERWRITE = new PropertyDescriptor.Builder().name("overwrite")
            .displayName("Overwrite Files")
            .description(
                    "Whether or not to overwrite existing files in the same directory with the same name. When set to false, "
                            + "flow files will be routed to failure when a file exists in the same directory with the same name.")
            .allowableValues("true", "false").defaultValue("false").required(true).build();

    public static final PropertyDescriptor UMASK = new PropertyDescriptor.Builder().name("permissions-umask")
            .displayName("Permissions umask")
            .description(
                    "A umask represented as an octal number which determines the permissions of files written to HDFS. "
                            + "This overrides the Hadoop Configuration dfs.umaskmode")
            .addValidator(HadoopValidators.UMASK_VALIDATOR).build();

    public static final PropertyDescriptor REMOTE_OWNER = new PropertyDescriptor.Builder().name("remote-owner")
            .displayName("Remote Owner")
            .description("Changes the owner of the HDFS file to this value after it is written. "
                    + "This only works if NiFi is running as a user that has HDFS super user privilege to change owner")
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR).build();

    public static final PropertyDescriptor REMOTE_GROUP = new PropertyDescriptor.Builder().name("remote-group")
            .displayName("Remote Group")
            .description("Changes the group of the HDFS file to this value after it is written. "
                    + "This only works if NiFi is running as a user that has HDFS super user privilege to change group")
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR).build();

    public static final PropertyDescriptor RECORD_READER = new PropertyDescriptor.Builder().name("record-reader")
            .displayName("Record Reader").description("The service for reading records from incoming flow files.")
            .identifiesControllerService(RecordReaderFactory.class).required(true).build();

    public static final Relationship REL_SUCCESS = new Relationship.Builder().name("success")
            .description("Flow Files that have been successfully processed are transferred to this relationship")
            .build();

    public static final Relationship REL_RETRY = new Relationship.Builder().name("retry").description(
            "Flow Files that could not be processed due to issues that can be retried are transferred to this relationship")
            .build();

    public static final Relationship REL_FAILURE = new Relationship.Builder().name("failure").description(
            "Flow Files that could not be processed due to issue that cannot be retried are transferred to this relationship")
            .build();

    public static final String RECORD_COUNT_ATTR = "record.count";

    private volatile String remoteOwner;
    private volatile String remoteGroup;

    private volatile Set<Relationship> putHdfsRecordRelationships;
    private volatile List<PropertyDescriptor> putHdfsRecordProperties;

    @Override
    protected final void init(final ProcessorInitializationContext context) {
        super.init(context);

        final Set<Relationship> rels = new HashSet<>();
        rels.add(REL_SUCCESS);
        rels.add(REL_RETRY);
        rels.add(REL_FAILURE);
        this.putHdfsRecordRelationships = Collections.unmodifiableSet(rels);

        final List<PropertyDescriptor> props = new ArrayList<>(properties);
        props.add(RECORD_READER);

        props.add(new PropertyDescriptor.Builder().fromPropertyDescriptor(DIRECTORY).description(
                "The parent directory to which files should be written. Will be created if it doesn't exist.")
                .build());

        final AllowableValue[] compressionTypes = getCompressionTypes(context).toArray(new AllowableValue[0]);

        props.add(new PropertyDescriptor.Builder().fromPropertyDescriptor(COMPRESSION_TYPE)
                .allowableValues(compressionTypes).defaultValue(getDefaultCompressionType(context)).build());

        props.add(OVERWRITE);
        props.add(UMASK);
        props.add(REMOTE_GROUP);
        props.add(REMOTE_OWNER);
        props.addAll(getAdditionalProperties());
        this.putHdfsRecordProperties = Collections.unmodifiableList(props);
    }

    /**
     * @param context the initialization context
     * @return the possible compression types
     */
    public abstract List<AllowableValue> getCompressionTypes(final ProcessorInitializationContext context);

    /**
     * @param context the initialization context
     * @return the default compression type
     */
    public abstract String getDefaultCompressionType(final ProcessorInitializationContext context);

    /**
     * Allows sub-classes to add additional properties, called from init().
     *
     * @return additional properties to add to the overall list
     */
    public List<PropertyDescriptor> getAdditionalProperties() {
        return Collections.emptyList();
    }

    @Override
    public final Set<Relationship> getRelationships() {
        return putHdfsRecordRelationships;
    }

    @Override
    public final List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return putHdfsRecordProperties;
    }

    @Override
    protected void preProcessConfiguration(Configuration config, ProcessContext context) {
        // Set umask once, to avoid thread safety issues doing it in onTrigger
        final PropertyValue umaskProp = context.getProperty(UMASK);
        final short dfsUmask;
        if (umaskProp.isSet()) {
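            // the umask is given as an octal string (e.g. "022"), so parse it with radix 8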
            dfsUmask = Short.parseShort(umaskProp.getValue(), 8);
        } else {
            dfsUmask = FsPermission.DEFAULT_UMASK;
        }

        FsPermission.setUMask(config, new FsPermission(dfsUmask));
    }

    @OnScheduled
    public final void onScheduled(final ProcessContext context) throws IOException {
        super.abstractOnScheduled(context);

        this.remoteOwner = context.getProperty(REMOTE_OWNER).getValue();
        this.remoteGroup = context.getProperty(REMOTE_GROUP).getValue();
    }

    /**
     * Sub-classes provide the appropriate HDFSRecordWriter.
     *
     * @param context the process context to obtain additional configuration
     * @param flowFile the flow file being written
     * @param conf the Configuration instance
     * @param path the path to write to
     * @param schema the schema for writing
     * @return the HDFSRecordWriter
     * @throws IOException if an error occurs creating the writer or processing the schema
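     * @throws SchemaNotFoundException if the schema for the records cannot be found or accessed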
     */
    public abstract HDFSRecordWriter createHDFSRecordWriter(final ProcessContext context, final FlowFile flowFile,
            final Configuration conf, final Path path, final RecordSchema schema)
            throws IOException, SchemaNotFoundException;

    @Override
    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
        // do this before getting a flow file so that we always get a chance to attempt Kerberos relogin
        final FileSystem fileSystem = getFileSystem();
        final Configuration configuration = getConfiguration();
        final UserGroupInformation ugi = getUserGroupInformation();

        if (configuration == null || fileSystem == null || ugi == null) {
            getLogger().error(
                    "Processor not configured properly because Configuration, FileSystem, or UserGroupInformation was null");
            context.yield();
            return;
        }

        final FlowFile flowFile = session.get();
        if (flowFile == null) {
            context.yield();
            return;
        }

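        // run all HDFS interaction as the user represented by the UserGroupInformation
        // (e.g. the Kerberos principal when security is enabled)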
        ugi.doAs((PrivilegedAction<Object>) () -> {
            Path tempDotCopyFile = null;
            FlowFile putFlowFile = flowFile;
            try {
                final String filenameValue = putFlowFile.getAttribute(CoreAttributes.FILENAME.key()); // TODO codec extension
                final String directoryValue = context.getProperty(DIRECTORY)
                        .evaluateAttributeExpressions(putFlowFile).getValue();

                // create the directory if it doesn't exist
                final Path directoryPath = new Path(directoryValue);
                createDirectory(fileSystem, directoryPath, remoteOwner, remoteGroup);

                // write to tempFile first and on success rename to destFile
                final Path tempFile = new Path(directoryPath, "." + filenameValue);
                final Path destFile = new Path(directoryPath, filenameValue);

                final boolean destinationExists = fileSystem.exists(destFile) || fileSystem.exists(tempFile);
                final boolean shouldOverwrite = context.getProperty(OVERWRITE).asBoolean();

                // if the tempFile or destFile already exist, and overwrite is set to false, then transfer to failure
                if (destinationExists && !shouldOverwrite) {
                    session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                    getLogger().warn(
                            "penalizing {} and routing to failure because file with same name already exists",
                            new Object[] { putFlowFile });
                    return null;
                }

                final AtomicReference<Throwable> exceptionHolder = new AtomicReference<>(null);
                final AtomicReference<WriteResult> writeResult = new AtomicReference<>();
                final RecordReaderFactory recordReaderFactory = context.getProperty(RECORD_READER)
                        .asControllerService(RecordReaderFactory.class);

                final FlowFile flowFileIn = putFlowFile;
                final StopWatch stopWatch = new StopWatch(true);

                // Read records from the incoming FlowFile and write them to the tempFile
                session.read(putFlowFile, (final InputStream rawIn) -> {
                    RecordReader recordReader = null;
                    HDFSRecordWriter recordWriter = null;

                    try (final BufferedInputStream in = new BufferedInputStream(rawIn)) {

                        // if we fail to create the RecordReader then we want to route to failure, so we need to
                        // handle this separately from the other IOExceptions which normally route to retry
                        try {
                            recordReader = recordReaderFactory.createRecordReader(flowFileIn, in, getLogger());
                        } catch (Exception e) {
                            final RecordReaderFactoryException rrfe = new RecordReaderFactoryException(
                                    "Unable to create RecordReader", e);
                            exceptionHolder.set(rrfe);
                            return;
                        }

                        final RecordSet recordSet = recordReader.createRecordSet();

                        recordWriter = createHDFSRecordWriter(context, flowFile, configuration, tempFile,
                                recordReader.getSchema());
                        writeResult.set(recordWriter.write(recordSet));
                    } catch (Exception e) {
                        exceptionHolder.set(e);
                    } finally {
                        IOUtils.closeQuietly(recordReader);
                        IOUtils.closeQuietly(recordWriter);
                    }
                });
                stopWatch.stop();

                final String dataRate = stopWatch.calculateDataRate(putFlowFile.getSize());
                final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);
                tempDotCopyFile = tempFile;

                // if any errors happened within the session.read then throw the exception so we jump
                // into one of the appropriate catch blocks below
                if (exceptionHolder.get() != null) {
                    throw exceptionHolder.get();
                }

                // Attempt to rename from the tempFile to destFile, and change owner if successfully renamed
                rename(fileSystem, tempFile, destFile);
                changeOwner(fileSystem, destFile, remoteOwner, remoteGroup);

                getLogger().info("Wrote {} to {} in {} milliseconds at a rate of {}",
                        new Object[] { putFlowFile, destFile, millis, dataRate });

                putFlowFile = postProcess(context, session, putFlowFile, destFile);

                final String newFilename = destFile.getName();
                final String hdfsPath = destFile.getParent().toString();

                // Update the filename and absolute path attributes
                final Map<String, String> attributes = new HashMap<>(writeResult.get().getAttributes());
                attributes.put(CoreAttributes.FILENAME.key(), newFilename);
                attributes.put(ABSOLUTE_HDFS_PATH_ATTRIBUTE, hdfsPath);
                attributes.put(RECORD_COUNT_ATTR, String.valueOf(writeResult.get().getRecordCount()));
                putFlowFile = session.putAllAttributes(putFlowFile, attributes);

                // Send a provenance event and transfer to success
                final Path qualifiedPath = destFile.makeQualified(fileSystem.getUri(),
                        fileSystem.getWorkingDirectory());
                session.getProvenanceReporter().send(putFlowFile, qualifiedPath.toString());
                session.transfer(putFlowFile, REL_SUCCESS);

            } catch (IOException | FlowFileAccessException e) {
                deleteQuietly(fileSystem, tempDotCopyFile);
                getLogger().error("Failed to write due to {}", new Object[] { e });
                session.transfer(session.penalize(putFlowFile), REL_RETRY);
                context.yield();
            } catch (Throwable t) {
                deleteQuietly(fileSystem, tempDotCopyFile);
                getLogger().error("Failed to write due to {}", new Object[] { t });
                session.transfer(putFlowFile, REL_FAILURE);
            }

            return null;
        });
    }

    /**
     * This method will be called after successfully writing to the destination file and renaming the file to its final name
     * in order to give sub-classes a chance to take action before transferring to success.
     *
     * @param context the context
     * @param session the session
     * @param flowFile the flow file being processed
     * @param destFile the destination file written to
     * @return an updated FlowFile reference
     */
    protected FlowFile postProcess(final ProcessContext context, final ProcessSession session,
            final FlowFile flowFile, final Path destFile) {
        return flowFile;
    }

    /**
     * Attempts to rename srcFile to destFile up to 10 times, with a 200ms sleep in between each attempt.
     *
     * If the file has not been renamed after 10 attempts, a FailureException is thrown.
     *
     * @param fileSystem the file system where the files are located
     * @param srcFile the source file
     * @param destFile the destination file to rename the source to
     * @throws IOException if an IOException occurs while attempting the rename
     * @throws InterruptedException if renaming is interrupted
     * @throws FailureException if the file couldn't be renamed after 10 attempts
     */
    protected void rename(final FileSystem fileSystem, final Path srcFile, final Path destFile)
            throws IOException, InterruptedException, FailureException {
        boolean renamed = false;
        for (int i = 0; i < 10; i++) { // try to rename multiple times.
            if (fileSystem.rename(srcFile, destFile)) {
                renamed = true;
                break; // rename was successful
            }
            Thread.sleep(200L); // try waiting to let whatever might cause rename failure to resolve
        }
        if (!renamed) {
            fileSystem.delete(srcFile, false);
            throw new FailureException("Could not rename file " + srcFile + " to its final filename");
        }
    }

    /**
     * Deletes the given file from the given filesystem. Any exceptions that are encountered will be caught and logged, but not thrown.
     *
     * @param fileSystem the filesystem where the file exists
     * @param file the file to delete
     */
    protected void deleteQuietly(final FileSystem fileSystem, final Path file) {
        if (file != null) {
            try {
                fileSystem.delete(file, false);
            } catch (Exception e) {
                getLogger().error("Unable to remove file {} due to {}", new Object[] { file, e });
            }
        }
    }

    /**
     * Changes the ownership of the given file.
     *
     * @param fileSystem the filesystem where the file exists
     * @param path the file to change ownership on
     * @param remoteOwner the new owner for the file
     * @param remoteGroup the new group for the file
     */
    protected void changeOwner(final FileSystem fileSystem, final Path path, final String remoteOwner,
            final String remoteGroup) {
        try {
            // Change owner and group of file if configured to do so
            if (remoteOwner != null || remoteGroup != null) {
                fileSystem.setOwner(path, remoteOwner, remoteGroup);
            }
        } catch (Exception e) {
            getLogger().warn("Could not change owner or group of {} on due to {}", new Object[] { path, e });
        }
    }

    /**
     * Creates the given directory and changes the ownership to the specified owner/group.
     *
     * @param fileSystem the filesystem to create the directory on
     * @param directory the directory to create
     * @param remoteOwner the owner for changing ownership of the directory
     * @param remoteGroup the group for changing ownership of the directory
     * @throws IOException if an error occurs obtaining the file status or issuing the mkdir command
     * @throws FailureException if the directory could not be created
     */
    protected void createDirectory(final FileSystem fileSystem, final Path directory, final String remoteOwner,
            final String remoteGroup) throws IOException, FailureException {
        try {
            if (!fileSystem.getFileStatus(directory).isDirectory()) {
                throw new FailureException(directory.toString() + " already exists and is not a directory");
            }
        } catch (FileNotFoundException fe) {
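            // getFileStatus throws FileNotFoundException when the directory does not exist yet,
            // so create it here and apply the configured ownership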
            if (!fileSystem.mkdirs(directory)) {
                throw new FailureException(directory.toString() + " could not be created");
            }
            changeOwner(fileSystem, directory, remoteOwner, remoteGroup);
        }
    }

}
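
Example

To show how this base class is typically extended, here is a minimal, hypothetical subclass that writes each record as one line of plain text and offers no real compression support. The processor name, package, and output format below are illustrative only and are not part of the NiFi codebase; the HDFSRecordWriter interface is assumed to declare write(Record) in addition to the write(RecordSet) and close() methods that the listing above is seen to use.

package com.example.nifi; // hypothetical package

import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.nifi.components.AllowableValue;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processors.hadoop.AbstractPutHDFSRecord;
import org.apache.nifi.processors.hadoop.record.HDFSRecordWriter;
import org.apache.nifi.serialization.WriteResult;
import org.apache.nifi.serialization.record.Record;
import org.apache.nifi.serialization.record.RecordSchema;
import org.apache.nifi.serialization.record.RecordSet;

/**
 * Hypothetical example subclass that writes each record as one line of text.
 */
public class PutTextRecord extends AbstractPutHDFSRecord {

    private static final AllowableValue NONE = new AllowableValue("NONE", "None", "No compression");

    @Override
    public List<AllowableValue> getCompressionTypes(final ProcessorInitializationContext context) {
        // only a single "no compression" option is offered in this sketch
        return Collections.singletonList(NONE);
    }

    @Override
    public String getDefaultCompressionType(final ProcessorInitializationContext context) {
        return NONE.getValue();
    }

    @Override
    public HDFSRecordWriter createHDFSRecordWriter(final ProcessContext context, final FlowFile flowFile,
            final Configuration conf, final Path path, final RecordSchema schema) throws IOException {

        // open the temp dot-file the base class passes in; the base class performs the rename to the final name
        final FileSystem fileSystem = path.getFileSystem(conf);
        final OutputStream out = fileSystem.create(path);

        return new HDFSRecordWriter() {
            // assumed single-record method on HDFSRecordWriter; write(RecordSet) and close()
            // are the calls the base class makes in the listing above
            @Override
            public void write(final Record record) throws IOException {
                out.write(record.toString().getBytes(StandardCharsets.UTF_8));
                out.write('\n');
            }

            @Override
            public WriteResult write(final RecordSet recordSet) throws IOException {
                int count = 0;
                Record record;
                while ((record = recordSet.next()) != null) {
                    write(record);
                    count++;
                }
                return WriteResult.of(count, Collections.emptyMap());
            }

            @Override
            public void close() throws IOException {
                out.close();
            }
        };
    }
}

A production processor would honor the selected Compression Type and use a real serialization format, as NiFi's PutParquet does, but the sketch above is enough to see how the three abstract methods plug into onTrigger.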