List of usage examples for org.apache.hadoop.fs.FileSystem.getWorkingDirectory()
public abstract Path getWorkingDirectory();
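Before the project examples, a minimal self-contained sketch of the call itself. The class name and the relative "output" path are illustrative only; FileSystem.get(conf) resolves to whatever file system the default Configuration points at, falling back to the local file system.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WorkingDirectoryExample {
    public static void main(String[] args) throws Exception {
        // Loads core-site.xml/hdfs-site.xml from the classpath if present,
        // otherwise falls back to the local file system
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // The working directory is the base against which relative Paths are resolved
        Path workingDir = fs.getWorkingDirectory();
        System.out.println("Working directory: " + workingDir);

        // Relative paths resolve against the working directory,
        // as the Mahout examples below do for their job output
        Path output = new Path(workingDir, "output");
        System.out.println("Resolved output path: " + output.toUri());
    }
}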
From source file:org.apache.mahout.classifier.df.mapreduce.OversamplingBuilder.java
License:Apache License
protected Path getOutputPath(Configuration conf) throws IOException {
    // the output directory is accessed only by this class, so use the default
    // file system
    FileSystem fs = FileSystem.get(conf);
    return new Path(fs.getWorkingDirectory(), outputDirName);
}
From source file:org.apache.mahout.classifier.df.tools.Frequencies.java
License:Apache License
private void runTool(String data, String dataset) throws IOException, ClassNotFoundException, InterruptedException {
    FileSystem fs = FileSystem.get(getConf());
    Path workingDir = fs.getWorkingDirectory();

    Path dataPath = new Path(data);
    Path datasetPath = new Path(dataset);

    log.info("Computing the frequencies...");
    FrequenciesJob job = new FrequenciesJob(new Path(workingDir, "output"), dataPath, datasetPath);

    int[][] counts = job.run(getConf());

    // outputting the frequencies
    log.info("counts[partition][class]");
    for (int[] count : counts) {
        log.info(Arrays.toString(count));
    }
}
From source file:org.apache.mahout.df.mapred.Builder.java
License:Apache License
/**
 * Output Directory name
 *
 * @param conf
 * @return
 * @throws IOException
 */
public Path getOutputPath(Configuration conf) throws IOException {
    // the output directory is accessed only by this class, so use the default
    // file system
    FileSystem fs = FileSystem.get(conf);
    return new Path(fs.getWorkingDirectory(), outputDirName);
}
From source file:org.apache.mahout.df.tools.Frequencies.java
License:Apache License
private void runTool(String data, String dataset) throws IOException, ClassNotFoundException, InterruptedException {
    FileSystem fs = FileSystem.get(getConf());
    Path workingDir = fs.getWorkingDirectory();

    Path dataPath = new Path(data);
    Path datasetPath = new Path(dataset);

    log.info("Computing the frequencies...");
    FrequenciesJob job = new FrequenciesJob(new Path(workingDir, "output"), dataPath, datasetPath);

    int[][] counts = job.run(getConf());

    // compute the partitions' sizes
    int numPartitions = counts.length;
    // int[] sizes = new int[numPartitions]; // TODO this isn't used?
    // for (int p = 0; p < numPartitions; p++) {
    //   sizes[p] = DataUtils.sum(counts[p]);
    // }

    // outputting the frequencies
    log.info("counts[partition][class]");
    for (int p = 0; p < numPartitions; p++) {
        log.info(Arrays.toString(counts[p]));
    }
}
From source file:org.apache.mahout.ga.watchmaker.MahoutEvaluator.java
License:Apache License
/**
 * Creates the input directory and stores the population in it.
 *
 * @param fs
 *          <code>FileSystem</code> to use
 * @param population
 *          population to store
 * @return input <code>Path</code>
 */
private static Path prepareInput(FileSystem fs, List<?> population) throws IOException {
    Path inpath = new Path(fs.getWorkingDirectory(), "input");

    HadoopUtil.overwriteOutput(inpath);
    storePopulation(fs, new Path(inpath, "population"), population);

    return inpath;
}
From source file:org.apache.nifi.processors.hadoop.AbstractFetchHDFSRecord.java
License:Apache License
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    // do this before getting a flow file so that we always get a chance to attempt Kerberos relogin
    final FileSystem fileSystem = getFileSystem();
    final Configuration configuration = getConfiguration();
    final UserGroupInformation ugi = getUserGroupInformation();

    if (configuration == null || fileSystem == null || ugi == null) {
        getLogger().error(
                "Processor not configured properly because Configuration, FileSystem, or UserGroupInformation was null");
        context.yield();
        return;
    }

    final FlowFile originalFlowFile = session.get();
    if (originalFlowFile == null) {
        context.yield();
        return;
    }

    ugi.doAs((PrivilegedAction<Object>) () -> {
        FlowFile child = null;
        final String filenameValue = context.getProperty(FILENAME)
                .evaluateAttributeExpressions(originalFlowFile).getValue();
        try {
            final Path path = new Path(filenameValue);
            final AtomicReference<Throwable> exceptionHolder = new AtomicReference<>(null);
            final AtomicReference<WriteResult> writeResult = new AtomicReference<>();

            final RecordSetWriterFactory recordSetWriterFactory = context.getProperty(RECORD_WRITER)
                    .asControllerService(RecordSetWriterFactory.class);

            final StopWatch stopWatch = new StopWatch(true);

            // use a child FlowFile so that if any error occurs we can route the original untouched FlowFile to retry/failure
            child = session.create(originalFlowFile);

            final AtomicReference<String> mimeTypeRef = new AtomicReference<>();
            child = session.write(child, (final OutputStream rawOut) -> {
                try (final BufferedOutputStream out = new BufferedOutputStream(rawOut);
                        final HDFSRecordReader recordReader = createHDFSRecordReader(context, originalFlowFile,
                                configuration, path)) {

                    Record record = recordReader.nextRecord();
                    final RecordSchema schema = recordSetWriterFactory.getSchema(
                            originalFlowFile.getAttributes(), record == null ? null : record.getSchema());

                    try (final RecordSetWriter recordSetWriter = recordSetWriterFactory
                            .createWriter(getLogger(), schema, out)) {

                        recordSetWriter.beginRecordSet();
                        if (record != null) {
                            recordSetWriter.write(record);
                        }

                        while ((record = recordReader.nextRecord()) != null) {
                            recordSetWriter.write(record);
                        }

                        writeResult.set(recordSetWriter.finishRecordSet());
                        mimeTypeRef.set(recordSetWriter.getMimeType());
                    }
                } catch (Exception e) {
                    exceptionHolder.set(e);
                }
            });
            stopWatch.stop();

            // if any errors happened within the session.write then throw the exception so we jump
            // into one of the appropriate catch blocks below
            if (exceptionHolder.get() != null) {
                throw exceptionHolder.get();
            }

            FlowFile successFlowFile = postProcess(context, session, child, path);

            final Map<String, String> attributes = new HashMap<>(writeResult.get().getAttributes());
            attributes.put(RECORD_COUNT_ATTR, String.valueOf(writeResult.get().getRecordCount()));
            attributes.put(CoreAttributes.MIME_TYPE.key(), mimeTypeRef.get());
            successFlowFile = session.putAllAttributes(successFlowFile, attributes);

            final Path qualifiedPath = path.makeQualified(fileSystem.getUri(), fileSystem.getWorkingDirectory());
            getLogger().info("Successfully received content from {} for {} in {} milliseconds",
                    new Object[] { qualifiedPath, successFlowFile, stopWatch.getDuration() });
            session.getProvenanceReporter().fetch(successFlowFile, qualifiedPath.toString(),
                    stopWatch.getDuration(TimeUnit.MILLISECONDS));
            session.transfer(successFlowFile, REL_SUCCESS);
            session.remove(originalFlowFile);
            return null;

        } catch (final FileNotFoundException | AccessControlException e) {
            getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                    new Object[] { filenameValue, originalFlowFile, e });
            final FlowFile failureFlowFile = session.putAttribute(originalFlowFile, FETCH_FAILURE_REASON_ATTR,
                    e.getMessage() == null ? e.toString() : e.getMessage());
            session.transfer(failureFlowFile, REL_FAILURE);
        } catch (final IOException | FlowFileAccessException e) {
            getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to retry",
                    new Object[] { filenameValue, originalFlowFile, e });
            session.transfer(session.penalize(originalFlowFile), REL_RETRY);
            context.yield();
        } catch (final Throwable t) {
            getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                    new Object[] { filenameValue, originalFlowFile, t });
            final FlowFile failureFlowFile = session.putAttribute(originalFlowFile, FETCH_FAILURE_REASON_ATTR,
                    t.getMessage() == null ? t.toString() : t.getMessage());
            session.transfer(failureFlowFile, REL_FAILURE);
        }

        // if we got this far then we weren't successful so we need to clean up the child flow file if it got initialized
        if (child != null) {
            session.remove(child);
        }

        return null;
    });
}
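The NiFi processors in this list pair getWorkingDirectory() with Path.makeQualified(...) to turn a possibly relative path into a fully qualified URI before reporting provenance. A minimal sketch of that idiom, assuming an already configured FileSystem (the example path and class name are illustrative only):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class QualifyPathExample {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());

        // A relative path, e.g. taken from a property or attribute
        Path path = new Path("data/input.txt");

        // Fill in the scheme, authority, and working directory so the path is
        // absolute and unambiguous, e.g. hdfs://namenode:8020/user/nifi/data/input.txt
        Path qualifiedPath = path.makeQualified(fs.getUri(), fs.getWorkingDirectory());
        System.out.println(qualifiedPath);
    }
}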
From source file:org.apache.nifi.processors.hadoop.AbstractHadoopProcessor.java
License:Apache License
HdfsResources resetHDFSResources(String configResources, ProcessContext context) throws IOException {
    Configuration config = new ExtendedConfiguration(getLogger());
    config.setClassLoader(Thread.currentThread().getContextClassLoader());

    getConfigurationFromResources(config, configResources);

    // give sub-classes a chance to process configuration
    preProcessConfiguration(config, context);

    // first check for timeout on HDFS connection, because FileSystem has a hard coded 15 minute timeout
    checkHdfsUriForTimeout(config);

    // disable caching of Configuration and FileSystem objects, else we cannot reconfigure the processor without a complete
    // restart
    String disableCacheName = String.format("fs.%s.impl.disable.cache",
            FileSystem.getDefaultUri(config).getScheme());
    config.set(disableCacheName, "true");

    // If kerberos is enabled, create the file system as the kerberos principal
    // -- use RESOURCE_LOCK to guarantee UserGroupInformation is accessed by only a single thread at a time
    FileSystem fs;
    UserGroupInformation ugi;
    synchronized (RESOURCES_LOCK) {
        if (SecurityUtil.isSecurityEnabled(config)) {
            String principal = context.getProperty(kerberosProperties.getKerberosPrincipal())
                    .evaluateAttributeExpressions().getValue();
            String keyTab = context.getProperty(kerberosProperties.getKerberosKeytab())
                    .evaluateAttributeExpressions().getValue();
            ugi = SecurityUtil.loginKerberos(config, principal, keyTab);
            fs = getFileSystemAsUser(config, ugi);
        } else {
            config.set("ipc.client.fallback-to-simple-auth-allowed", "true");
            config.set("hadoop.security.authentication", "simple");
            ugi = SecurityUtil.loginSimple(config);
            fs = getFileSystemAsUser(config, ugi);
        }
    }
    getLogger().debug("resetHDFSResources UGI {}", new Object[] { ugi });

    final Path workingDir = fs.getWorkingDirectory();
    getLogger().info(
            "Initialized a new HDFS File System with working dir: {} default block size: {} default replication: {} config: {}",
            new Object[] { workingDir, fs.getDefaultBlockSize(workingDir), fs.getDefaultReplication(workingDir),
                    config.toString() });

    return new HdfsResources(config, fs, ugi);
}
From source file:org.apache.nifi.processors.hadoop.AbstractPutHDFSRecord.java
License:Apache License
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    // do this before getting a flow file so that we always get a chance to attempt Kerberos relogin
    final FileSystem fileSystem = getFileSystem();
    final Configuration configuration = getConfiguration();
    final UserGroupInformation ugi = getUserGroupInformation();

    if (configuration == null || fileSystem == null || ugi == null) {
        getLogger().error(
                "Processor not configured properly because Configuration, FileSystem, or UserGroupInformation was null");
        context.yield();
        return;
    }

    final FlowFile flowFile = session.get();
    if (flowFile == null) {
        context.yield();
        return;
    }

    ugi.doAs((PrivilegedAction<Object>) () -> {
        Path tempDotCopyFile = null;
        FlowFile putFlowFile = flowFile;
        try {
            final String filenameValue = putFlowFile.getAttribute(CoreAttributes.FILENAME.key()); // TODO codec extension
            final String directoryValue = context.getProperty(DIRECTORY)
                    .evaluateAttributeExpressions(putFlowFile).getValue();

            // create the directory if it doesn't exist
            final Path directoryPath = new Path(directoryValue);
            createDirectory(fileSystem, directoryPath, remoteOwner, remoteGroup);

            // write to tempFile first and on success rename to destFile
            final Path tempFile = new Path(directoryPath, "." + filenameValue);
            final Path destFile = new Path(directoryPath, filenameValue);

            final boolean destinationExists = fileSystem.exists(destFile) || fileSystem.exists(tempFile);
            final boolean shouldOverwrite = context.getProperty(OVERWRITE).asBoolean();

            // if the tempFile or destFile already exist, and overwrite is set to false, then transfer to failure
            if (destinationExists && !shouldOverwrite) {
                session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                getLogger().warn("penalizing {} and routing to failure because file with same name already exists",
                        new Object[] { putFlowFile });
                return null;
            }

            final AtomicReference<Throwable> exceptionHolder = new AtomicReference<>(null);
            final AtomicReference<WriteResult> writeResult = new AtomicReference<>();
            final RecordReaderFactory recordReaderFactory = context.getProperty(RECORD_READER)
                    .asControllerService(RecordReaderFactory.class);

            final FlowFile flowFileIn = putFlowFile;
            final StopWatch stopWatch = new StopWatch(true);

            // Read records from the incoming FlowFile and write them to the tempFile
            session.read(putFlowFile, (final InputStream rawIn) -> {
                RecordReader recordReader = null;
                HDFSRecordWriter recordWriter = null;

                try (final BufferedInputStream in = new BufferedInputStream(rawIn)) {

                    // if we fail to create the RecordReader then we want to route to failure, so we need to
                    // handle this separately from the other IOExceptions which normally route to retry
                    try {
                        recordReader = recordReaderFactory.createRecordReader(flowFileIn, in, getLogger());
                    } catch (Exception e) {
                        final RecordReaderFactoryException rrfe = new RecordReaderFactoryException(
                                "Unable to create RecordReader", e);
                        exceptionHolder.set(rrfe);
                        return;
                    }

                    final RecordSet recordSet = recordReader.createRecordSet();

                    recordWriter = createHDFSRecordWriter(context, flowFile, configuration, tempFile,
                            recordReader.getSchema());
                    writeResult.set(recordWriter.write(recordSet));
                } catch (Exception e) {
                    exceptionHolder.set(e);
                } finally {
                    IOUtils.closeQuietly(recordReader);
                    IOUtils.closeQuietly(recordWriter);
                }
            });
            stopWatch.stop();

            final String dataRate = stopWatch.calculateDataRate(putFlowFile.getSize());
            final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);
            tempDotCopyFile = tempFile;

            // if any errors happened within the session.read then throw the exception so we jump
            // into one of the appropriate catch blocks below
            if (exceptionHolder.get() != null) {
                throw exceptionHolder.get();
            }

            // Attempt to rename from the tempFile to destFile, and change owner if successfully renamed
            rename(fileSystem, tempFile, destFile);
            changeOwner(fileSystem, destFile, remoteOwner, remoteGroup);

            getLogger().info("Wrote {} to {} in {} milliseconds at a rate of {}",
                    new Object[] { putFlowFile, destFile, millis, dataRate });

            putFlowFile = postProcess(context, session, putFlowFile, destFile);

            final String newFilename = destFile.getName();
            final String hdfsPath = destFile.getParent().toString();

            // Update the filename and absolute path attributes
            final Map<String, String> attributes = new HashMap<>(writeResult.get().getAttributes());
            attributes.put(CoreAttributes.FILENAME.key(), newFilename);
            attributes.put(ABSOLUTE_HDFS_PATH_ATTRIBUTE, hdfsPath);
            attributes.put(RECORD_COUNT_ATTR, String.valueOf(writeResult.get().getRecordCount()));
            putFlowFile = session.putAllAttributes(putFlowFile, attributes);

            // Send a provenance event and transfer to success
            final Path qualifiedPath = destFile.makeQualified(fileSystem.getUri(), fileSystem.getWorkingDirectory());
            session.getProvenanceReporter().send(putFlowFile, qualifiedPath.toString());
            session.transfer(putFlowFile, REL_SUCCESS);

        } catch (IOException | FlowFileAccessException e) {
            deleteQuietly(fileSystem, tempDotCopyFile);
            getLogger().error("Failed to write due to {}", new Object[] { e });
            session.transfer(session.penalize(putFlowFile), REL_RETRY);
            context.yield();
        } catch (Throwable t) {
            deleteQuietly(fileSystem, tempDotCopyFile);
            getLogger().error("Failed to write due to {}", new Object[] { t });
            session.transfer(putFlowFile, REL_FAILURE);
        }

        return null;
    });
}
From source file:org.apache.nifi.processors.hadoop.DeleteHDFS.java
License:Apache License
@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    final FlowFile originalFlowFile = session.get();

    // If this processor has an incoming connection, then do not run unless a
    // FlowFile is actually sent through
    if (originalFlowFile == null && context.hasIncomingConnection()) {
        context.yield();
        return;
    }

    // We need a FlowFile to report provenance correctly.
    FlowFile flowFile = originalFlowFile != null ? originalFlowFile : session.create();

    final String fileOrDirectoryName = context.getProperty(FILE_OR_DIRECTORY)
            .evaluateAttributeExpressions(flowFile).getValue();

    final FileSystem fileSystem = getFileSystem();
    try {
        // Check if the user has supplied a file or directory pattern
        List<Path> pathList = Lists.newArrayList();
        if (GLOB_MATCHER.reset(fileOrDirectoryName).find()) {
            FileStatus[] fileStatuses = fileSystem.globStatus(new Path(fileOrDirectoryName));
            if (fileStatuses != null) {
                for (FileStatus fileStatus : fileStatuses) {
                    pathList.add(fileStatus.getPath());
                }
            }
        } else {
            pathList.add(new Path(fileOrDirectoryName));
        }

        int failedPath = 0;
        for (Path path : pathList) {
            if (fileSystem.exists(path)) {
                try {
                    Map<String, String> attributes = Maps.newHashMapWithExpectedSize(2);
                    attributes.put("hdfs.filename", path.getName());
                    attributes.put("hdfs.path", path.getParent().toString());
                    flowFile = session.putAllAttributes(flowFile, attributes);

                    fileSystem.delete(path, context.getProperty(RECURSIVE).asBoolean());
                    getLogger().debug("For flowfile {} Deleted file at path {} with name {}",
                            new Object[] { originalFlowFile, path.getParent().toString(), path.getName() });

                    final Path qualifiedPath = path.makeQualified(fileSystem.getUri(), fileSystem.getWorkingDirectory());
                    session.getProvenanceReporter().invokeRemoteProcess(flowFile, qualifiedPath.toString());
                } catch (IOException ioe) {
                    // One possible scenario is that the IOException is permissions based, however it would be impractical to check every possible
                    // external HDFS authorization tool (Ranger, Sentry, etc). Local ACLs could be checked but the operation would be expensive.
                    getLogger().warn("Failed to delete file or directory", ioe);

                    Map<String, String> attributes = Maps.newHashMapWithExpectedSize(1);
                    // The error message is helpful in understanding at a flowfile level what caused the IOException (which ACL is denying the operation, e.g.)
                    attributes.put("hdfs.error.message", ioe.getMessage());

                    session.transfer(session.putAllAttributes(session.clone(flowFile), attributes), REL_FAILURE);
                    failedPath++;
                }
            }
        }

        if (failedPath == 0) {
            session.transfer(flowFile, DeleteHDFS.REL_SUCCESS);
        } else {
            // If any path failed to be deleted, remove the FlowFile as it's been cloned and sent to failure.
            session.remove(flowFile);
        }
    } catch (IOException e) {
        getLogger().error("Error processing delete for flowfile {} due to {}",
                new Object[] { flowFile, e.getMessage() }, e);
        session.transfer(flowFile, DeleteHDFS.REL_FAILURE);
    }
}
From source file:org.apache.nifi.processors.hadoop.FetchHDFS.java
License:Apache License
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final FileSystem hdfs = getFileSystem();
    final UserGroupInformation ugi = getUserGroupInformation();
    final String filenameValue = context.getProperty(FILENAME).evaluateAttributeExpressions(flowFile)
            .getValue();

    final Path path;
    try {
        path = new Path(filenameValue);
    } catch (IllegalArgumentException e) {
        getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                new Object[] { filenameValue, flowFile, e });
        flowFile = session.putAttribute(flowFile, "hdfs.failure.reason", e.getMessage());
        flowFile = session.penalize(flowFile);
        session.transfer(flowFile, REL_FAILURE);
        return;
    }

    final StopWatch stopWatch = new StopWatch(true);
    final FlowFile finalFlowFile = flowFile;

    ugi.doAs(new PrivilegedAction<Object>() {
        @Override
        public Object run() {
            InputStream stream = null;
            CompressionCodec codec = null;
            Configuration conf = getConfiguration();
            final CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
            final CompressionType compressionType = CompressionType
                    .valueOf(context.getProperty(COMPRESSION_CODEC).toString());
            final boolean inferCompressionCodec = compressionType == CompressionType.AUTOMATIC;

            if (inferCompressionCodec) {
                codec = compressionCodecFactory.getCodec(path);
            } else if (compressionType != CompressionType.NONE) {
                codec = getCompressionCodec(context, getConfiguration());
            }

            FlowFile flowFile = finalFlowFile;
            final Path qualifiedPath = path.makeQualified(hdfs.getUri(), hdfs.getWorkingDirectory());
            try {
                final String outputFilename;
                final String originalFilename = path.getName();
                stream = hdfs.open(path, 16384);

                // Check if compression codec is defined (inferred or otherwise)
                if (codec != null) {
                    stream = codec.createInputStream(stream);
                    outputFilename = StringUtils.removeEnd(originalFilename, codec.getDefaultExtension());
                } else {
                    outputFilename = originalFilename;
                }

                flowFile = session.importFrom(stream, finalFlowFile);
                flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), outputFilename);

                stopWatch.stop();
                getLogger().info("Successfully received content from {} for {} in {}",
                        new Object[] { qualifiedPath, flowFile, stopWatch.getDuration() });
                session.getProvenanceReporter().fetch(flowFile, qualifiedPath.toString(),
                        stopWatch.getDuration(TimeUnit.MILLISECONDS));
                session.transfer(flowFile, REL_SUCCESS);
            } catch (final FileNotFoundException | AccessControlException e) {
                getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                        new Object[] { qualifiedPath, flowFile, e });
                flowFile = session.putAttribute(flowFile, "hdfs.failure.reason", e.getMessage());
                flowFile = session.penalize(flowFile);
                session.transfer(flowFile, REL_FAILURE);
            } catch (final IOException e) {
                getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to comms.failure",
                        new Object[] { qualifiedPath, flowFile, e });
                flowFile = session.penalize(flowFile);
                session.transfer(flowFile, REL_COMMS_FAILURE);
            } finally {
                IOUtils.closeQuietly(stream);
            }

            return null;
        }
    });
}