Example usage for org.apache.hadoop.fs FileSystem getWorkingDirectory

Introduction

On this page you can find usage examples of org.apache.hadoop.fs FileSystem getWorkingDirectory.

Prototype

public abstract Path getWorkingDirectory();

Document

Get the current working directory for the given FileSystem.
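
Before the project-specific examples below, here is a minimal, self-contained sketch of the call itself; the class name WorkingDirExample and the "output" directory name are illustrative only and do not come from any of the projects listed on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WorkingDirExample {
    public static void main(String[] args) throws Exception {
        // Obtain the default FileSystem for this configuration
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // The working directory is the base against which relative Paths are resolved
        Path workingDir = fs.getWorkingDirectory();
        System.out.println("Working directory: " + workingDir);

        // Common pattern from the examples below: build an output path
        // relative to the working directory
        Path outputPath = new Path(workingDir, "output");
        System.out.println("Output path: " + outputPath);
    }
}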

Usage

From source file:org.apache.mahout.classifier.df.mapreduce.OversamplingBuilder.java

License:Apache License

protected Path getOutputPath(Configuration conf) throws IOException {
    // the output directory is accessed only by this class, so use the default
    // file system
    FileSystem fs = FileSystem.get(conf);
    return new Path(fs.getWorkingDirectory(), outputDirName);
}

From source file:org.apache.mahout.classifier.df.tools.Frequencies.java

License:Apache License

private void runTool(String data, String dataset)
        throws IOException, ClassNotFoundException, InterruptedException {

    FileSystem fs = FileSystem.get(getConf());
    Path workingDir = fs.getWorkingDirectory();

    Path dataPath = new Path(data);
    Path datasetPath = new Path(dataset);

    log.info("Computing the frequencies...");
    FrequenciesJob job = new FrequenciesJob(new Path(workingDir, "output"), dataPath, datasetPath);

    int[][] counts = job.run(getConf());

    // outputting the frequencies
    log.info("counts[partition][class]");
    for (int[] count : counts) {
        log.info(Arrays.toString(count));
    }
}

From source file:org.apache.mahout.df.mapred.Builder.java

License:Apache License

/**
 * Returns the output directory path, resolved against the default
 * file system's working directory.
 * 
 * @param conf the configuration used to obtain the default FileSystem
 * @return the output directory Path
 * @throws IOException if the default FileSystem cannot be accessed
 */
public Path getOutputPath(Configuration conf) throws IOException {
    // the output directory is accessed only by this class, so use the default
    // file system
    FileSystem fs = FileSystem.get(conf);
    return new Path(fs.getWorkingDirectory(), outputDirName);
}

From source file:org.apache.mahout.df.tools.Frequencies.java

License:Apache License

private void runTool(String data, String dataset)
        throws IOException, ClassNotFoundException, InterruptedException {

    FileSystem fs = FileSystem.get(getConf());
    Path workingDir = fs.getWorkingDirectory();

    Path dataPath = new Path(data);
    Path datasetPath = new Path(dataset);

    log.info("Computing the frequencies...");
    FrequenciesJob job = new FrequenciesJob(new Path(workingDir, "output"), dataPath, datasetPath);

    int[][] counts = job.run(getConf());

    // compute the partitions' sizes
    int numPartitions = counts.length;
    // int[] sizes = new int[numPartitions]; // TODO this isn't used?
    // for (int p = 0; p < numPartitions; p++) {
    // sizes[p] = DataUtils.sum(counts[p]);
    // }

    // outputting the frequencies
    log.info("counts[partition][class]");
    for (int p = 0; p < numPartitions; p++) {
        log.info(Arrays.toString(counts[p]));
    }
}

From source file:org.apache.mahout.ga.watchmaker.MahoutEvaluator.java

License:Apache License

/**
 * Creates the input directory and stores the population in it.
 * 
 * @param fs
 *          <code>FileSystem</code> to use
 * @param population
 *          population to store
 * @return input <code>Path</code>
 */
private static Path prepareInput(FileSystem fs, List<?> population) throws IOException {
    Path inpath = new Path(fs.getWorkingDirectory(), "input");
    HadoopUtil.overwriteOutput(inpath);
    storePopulation(fs, new Path(inpath, "population"), population);
    return inpath;
}

From source file:org.apache.nifi.processors.hadoop.AbstractFetchHDFSRecord.java

License:Apache License

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    // do this before getting a flow file so that we always get a chance to attempt Kerberos relogin
    final FileSystem fileSystem = getFileSystem();
    final Configuration configuration = getConfiguration();
    final UserGroupInformation ugi = getUserGroupInformation();

    if (configuration == null || fileSystem == null || ugi == null) {
        getLogger().error(
                "Processor not configured properly because Configuration, FileSystem, or UserGroupInformation was null");
        context.yield();
        return;
    }

    final FlowFile originalFlowFile = session.get();
    if (originalFlowFile == null) {
        context.yield();
        return;
    }

    ugi.doAs((PrivilegedAction<Object>) () -> {
        FlowFile child = null;
        final String filenameValue = context.getProperty(FILENAME)
                .evaluateAttributeExpressions(originalFlowFile).getValue();
        try {
            final Path path = new Path(filenameValue);
            final AtomicReference<Throwable> exceptionHolder = new AtomicReference<>(null);
            final AtomicReference<WriteResult> writeResult = new AtomicReference<>();

            final RecordSetWriterFactory recordSetWriterFactory = context.getProperty(RECORD_WRITER)
                    .asControllerService(RecordSetWriterFactory.class);

            final StopWatch stopWatch = new StopWatch(true);

            // use a child FlowFile so that if any error occurs we can route the original untouched FlowFile to retry/failure
            child = session.create(originalFlowFile);

            final AtomicReference<String> mimeTypeRef = new AtomicReference<>();
            child = session.write(child, (final OutputStream rawOut) -> {
                try (final BufferedOutputStream out = new BufferedOutputStream(rawOut);
                        final HDFSRecordReader recordReader = createHDFSRecordReader(context, originalFlowFile,
                                configuration, path)) {

                    Record record = recordReader.nextRecord();
                    final RecordSchema schema = recordSetWriterFactory.getSchema(
                            originalFlowFile.getAttributes(), record == null ? null : record.getSchema());

                    try (final RecordSetWriter recordSetWriter = recordSetWriterFactory
                            .createWriter(getLogger(), schema, out)) {
                        recordSetWriter.beginRecordSet();
                        if (record != null) {
                            recordSetWriter.write(record);
                        }

                        while ((record = recordReader.nextRecord()) != null) {
                            recordSetWriter.write(record);
                        }

                        writeResult.set(recordSetWriter.finishRecordSet());
                        mimeTypeRef.set(recordSetWriter.getMimeType());
                    }
                } catch (Exception e) {
                    exceptionHolder.set(e);
                }
            });

            stopWatch.stop();

            // if any errors happened within the session.write then throw the exception so we jump
            // into one of the appropriate catch blocks below
            if (exceptionHolder.get() != null) {
                throw exceptionHolder.get();
            }

            FlowFile successFlowFile = postProcess(context, session, child, path);

            final Map<String, String> attributes = new HashMap<>(writeResult.get().getAttributes());
            attributes.put(RECORD_COUNT_ATTR, String.valueOf(writeResult.get().getRecordCount()));
            attributes.put(CoreAttributes.MIME_TYPE.key(), mimeTypeRef.get());
            successFlowFile = session.putAllAttributes(successFlowFile, attributes);

            final Path qualifiedPath = path.makeQualified(fileSystem.getUri(),
                    fileSystem.getWorkingDirectory());
            getLogger().info("Successfully received content from {} for {} in {} milliseconds",
                    new Object[] { qualifiedPath, successFlowFile, stopWatch.getDuration() });
            session.getProvenanceReporter().fetch(successFlowFile, qualifiedPath.toString(),
                    stopWatch.getDuration(TimeUnit.MILLISECONDS));
            session.transfer(successFlowFile, REL_SUCCESS);
            session.remove(originalFlowFile);
            return null;

        } catch (final FileNotFoundException | AccessControlException e) {
            getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                    new Object[] { filenameValue, originalFlowFile, e });
            final FlowFile failureFlowFile = session.putAttribute(originalFlowFile, FETCH_FAILURE_REASON_ATTR,
                    e.getMessage() == null ? e.toString() : e.getMessage());
            session.transfer(failureFlowFile, REL_FAILURE);
        } catch (final IOException | FlowFileAccessException e) {
            getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to retry",
                    new Object[] { filenameValue, originalFlowFile, e });
            session.transfer(session.penalize(originalFlowFile), REL_RETRY);
            context.yield();
        } catch (final Throwable t) {
            getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                    new Object[] { filenameValue, originalFlowFile, t });
            final FlowFile failureFlowFile = session.putAttribute(originalFlowFile, FETCH_FAILURE_REASON_ATTR,
                    t.getMessage() == null ? t.toString() : t.getMessage());
            session.transfer(failureFlowFile, REL_FAILURE);
        }

        // if we got this far then we weren't successful, so we need to clean up the child flow file if it was initialized
        if (child != null) {
            session.remove(child);
        }

        return null;
    });

}
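
Several of the NiFi examples on this page pass the working directory, together with the file system URI, to Path.makeQualified(URI, Path) so that a relative or scheme-less path becomes a fully qualified URI before it is reported for provenance. Below is a minimal sketch of that pattern; the relative path "data/input.txt" and the resolved URI shown in the comment are hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class QualifyPathExample {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());

        // Hypothetical relative path; it has no scheme and no leading slash
        Path relative = new Path("data/input.txt");

        // makeQualified resolves the path against the file system's URI and
        // working directory, e.g. hdfs://namenode:8020/user/<user>/data/input.txt
        Path qualified = relative.makeQualified(fs.getUri(), fs.getWorkingDirectory());
        System.out.println(qualified);
    }
}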

From source file:org.apache.nifi.processors.hadoop.AbstractHadoopProcessor.java

License:Apache License

HdfsResources resetHDFSResources(String configResources, ProcessContext context) throws IOException {
    Configuration config = new ExtendedConfiguration(getLogger());
    config.setClassLoader(Thread.currentThread().getContextClassLoader());

    getConfigurationFromResources(config, configResources);

    // give sub-classes a chance to process configuration
    preProcessConfiguration(config, context);

    // first check for timeout on HDFS connection, because FileSystem has a hard coded 15 minute timeout
    checkHdfsUriForTimeout(config);

    // disable caching of Configuration and FileSystem objects, else we cannot reconfigure the processor without a complete
    // restart
    String disableCacheName = String.format("fs.%s.impl.disable.cache",
            FileSystem.getDefaultUri(config).getScheme());
    config.set(disableCacheName, "true");

    // If kerberos is enabled, create the file system as the kerberos principal
    // -- use RESOURCES_LOCK to guarantee UserGroupInformation is accessed by only a single thread at a time
    FileSystem fs;
    UserGroupInformation ugi;
    synchronized (RESOURCES_LOCK) {
        if (SecurityUtil.isSecurityEnabled(config)) {
            String principal = context.getProperty(kerberosProperties.getKerberosPrincipal())
                    .evaluateAttributeExpressions().getValue();
            String keyTab = context.getProperty(kerberosProperties.getKerberosKeytab())
                    .evaluateAttributeExpressions().getValue();
            ugi = SecurityUtil.loginKerberos(config, principal, keyTab);
            fs = getFileSystemAsUser(config, ugi);
        } else {
            config.set("ipc.client.fallback-to-simple-auth-allowed", "true");
            config.set("hadoop.security.authentication", "simple");
            ugi = SecurityUtil.loginSimple(config);
            fs = getFileSystemAsUser(config, ugi);
        }
    }
    getLogger().debug("resetHDFSResources UGI {}", new Object[] { ugi });

    final Path workingDir = fs.getWorkingDirectory();
    getLogger().info(
            "Initialized a new HDFS File System with working dir: {} default block size: {} default replication: {} config: {}",
            new Object[] { workingDir, fs.getDefaultBlockSize(workingDir), fs.getDefaultReplication(workingDir),
                    config.toString() });

    return new HdfsResources(config, fs, ugi);
}

From source file:org.apache.nifi.processors.hadoop.AbstractPutHDFSRecord.java

License:Apache License

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    // do this before getting a flow file so that we always get a chance to attempt Kerberos relogin
    final FileSystem fileSystem = getFileSystem();
    final Configuration configuration = getConfiguration();
    final UserGroupInformation ugi = getUserGroupInformation();

    if (configuration == null || fileSystem == null || ugi == null) {
        getLogger().error(
                "Processor not configured properly because Configuration, FileSystem, or UserGroupInformation was null");
        context.yield();
        return;
    }

    final FlowFile flowFile = session.get();
    if (flowFile == null) {
        context.yield();
        return;
    }

    ugi.doAs((PrivilegedAction<Object>) () -> {
        Path tempDotCopyFile = null;
        FlowFile putFlowFile = flowFile;
        try {
            final String filenameValue = putFlowFile.getAttribute(CoreAttributes.FILENAME.key()); // TODO codec extension
            final String directoryValue = context.getProperty(DIRECTORY)
                    .evaluateAttributeExpressions(putFlowFile).getValue();

            // create the directory if it doesn't exist
            final Path directoryPath = new Path(directoryValue);
            createDirectory(fileSystem, directoryPath, remoteOwner, remoteGroup);

            // write to tempFile first and on success rename to destFile
            final Path tempFile = new Path(directoryPath, "." + filenameValue);
            final Path destFile = new Path(directoryPath, filenameValue);

            final boolean destinationExists = fileSystem.exists(destFile) || fileSystem.exists(tempFile);
            final boolean shouldOverwrite = context.getProperty(OVERWRITE).asBoolean();

            // if the tempFile or destFile already exist, and overwrite is set to false, then transfer to failure
            if (destinationExists && !shouldOverwrite) {
                session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                getLogger().warn(
                        "penalizing {} and routing to failure because file with same name already exists",
                        new Object[] { putFlowFile });
                return null;
            }

            final AtomicReference<Throwable> exceptionHolder = new AtomicReference<>(null);
            final AtomicReference<WriteResult> writeResult = new AtomicReference<>();
            final RecordReaderFactory recordReaderFactory = context.getProperty(RECORD_READER)
                    .asControllerService(RecordReaderFactory.class);

            final FlowFile flowFileIn = putFlowFile;
            final StopWatch stopWatch = new StopWatch(true);

            // Read records from the incoming FlowFile and write them to the tempFile
            session.read(putFlowFile, (final InputStream rawIn) -> {
                RecordReader recordReader = null;
                HDFSRecordWriter recordWriter = null;

                try (final BufferedInputStream in = new BufferedInputStream(rawIn)) {

                    // if we fail to create the RecordReader then we want to route to failure, so we need to
                    // handle this separately from the other IOExceptions which normally route to retry
                    try {
                        recordReader = recordReaderFactory.createRecordReader(flowFileIn, in, getLogger());
                    } catch (Exception e) {
                        final RecordReaderFactoryException rrfe = new RecordReaderFactoryException(
                                "Unable to create RecordReader", e);
                        exceptionHolder.set(rrfe);
                        return;
                    }

                    final RecordSet recordSet = recordReader.createRecordSet();

                    recordWriter = createHDFSRecordWriter(context, flowFile, configuration, tempFile,
                            recordReader.getSchema());
                    writeResult.set(recordWriter.write(recordSet));
                } catch (Exception e) {
                    exceptionHolder.set(e);
                } finally {
                    IOUtils.closeQuietly(recordReader);
                    IOUtils.closeQuietly(recordWriter);
                }
            });
            stopWatch.stop();

            final String dataRate = stopWatch.calculateDataRate(putFlowFile.getSize());
            final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);
            tempDotCopyFile = tempFile;

            // if any errors happened within the session.read then throw the exception so we jump
            // into one of the appropriate catch blocks below
            if (exceptionHolder.get() != null) {
                throw exceptionHolder.get();
            }

            // Attempt to rename from the tempFile to destFile, and change owner if successfully renamed
            rename(fileSystem, tempFile, destFile);
            changeOwner(fileSystem, destFile, remoteOwner, remoteGroup);

            getLogger().info("Wrote {} to {} in {} milliseconds at a rate of {}",
                    new Object[] { putFlowFile, destFile, millis, dataRate });

            putFlowFile = postProcess(context, session, putFlowFile, destFile);

            final String newFilename = destFile.getName();
            final String hdfsPath = destFile.getParent().toString();

            // Update the filename and absolute path attributes
            final Map<String, String> attributes = new HashMap<>(writeResult.get().getAttributes());
            attributes.put(CoreAttributes.FILENAME.key(), newFilename);
            attributes.put(ABSOLUTE_HDFS_PATH_ATTRIBUTE, hdfsPath);
            attributes.put(RECORD_COUNT_ATTR, String.valueOf(writeResult.get().getRecordCount()));
            putFlowFile = session.putAllAttributes(putFlowFile, attributes);

            // Send a provenance event and transfer to success
            final Path qualifiedPath = destFile.makeQualified(fileSystem.getUri(),
                    fileSystem.getWorkingDirectory());
            session.getProvenanceReporter().send(putFlowFile, qualifiedPath.toString());
            session.transfer(putFlowFile, REL_SUCCESS);

        } catch (IOException | FlowFileAccessException e) {
            deleteQuietly(fileSystem, tempDotCopyFile);
            getLogger().error("Failed to write due to {}", new Object[] { e });
            session.transfer(session.penalize(putFlowFile), REL_RETRY);
            context.yield();
        } catch (Throwable t) {
            deleteQuietly(fileSystem, tempDotCopyFile);
            getLogger().error("Failed to write due to {}", new Object[] { t });
            session.transfer(putFlowFile, REL_FAILURE);
        }

        return null;
    });
}

From source file:org.apache.nifi.processors.hadoop.DeleteHDFS.java

License:Apache License

@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    final FlowFile originalFlowFile = session.get();

    // If this processor has an incoming connection, then do not run unless a
    // FlowFile is actually sent through
    if (originalFlowFile == null && context.hasIncomingConnection()) {
        context.yield();
        return;
    }

    // We need a FlowFile to report provenance correctly.
    FlowFile flowFile = originalFlowFile != null ? originalFlowFile : session.create();

    final String fileOrDirectoryName = context.getProperty(FILE_OR_DIRECTORY)
            .evaluateAttributeExpressions(flowFile).getValue();

    final FileSystem fileSystem = getFileSystem();
    try {
        // Check if the user has supplied a file or directory pattern
        List<Path> pathList = Lists.newArrayList();
        if (GLOB_MATCHER.reset(fileOrDirectoryName).find()) {
            FileStatus[] fileStatuses = fileSystem.globStatus(new Path(fileOrDirectoryName));
            if (fileStatuses != null) {
                for (FileStatus fileStatus : fileStatuses) {
                    pathList.add(fileStatus.getPath());
                }
            }
        } else {
            pathList.add(new Path(fileOrDirectoryName));
        }

        int failedPath = 0;
        for (Path path : pathList) {
            if (fileSystem.exists(path)) {
                try {
                    Map<String, String> attributes = Maps.newHashMapWithExpectedSize(2);
                    attributes.put("hdfs.filename", path.getName());
                    attributes.put("hdfs.path", path.getParent().toString());
                    flowFile = session.putAllAttributes(flowFile, attributes);

                    fileSystem.delete(path, context.getProperty(RECURSIVE).asBoolean());
                    getLogger().debug("For flowfile {} Deleted file at path {} with name {}",
                            new Object[] { originalFlowFile, path.getParent().toString(), path.getName() });
                    final Path qualifiedPath = path.makeQualified(fileSystem.getUri(),
                            fileSystem.getWorkingDirectory());
                    session.getProvenanceReporter().invokeRemoteProcess(flowFile, qualifiedPath.toString());
                } catch (IOException ioe) {
                    // One possible scenario is that the IOException is permissions based, however it would be impractical to check every possible
                    // external HDFS authorization tool (Ranger, Sentry, etc). Local ACLs could be checked but the operation would be expensive.
                    getLogger().warn("Failed to delete file or directory", ioe);

                    Map<String, String> attributes = Maps.newHashMapWithExpectedSize(1);
                    // The error message is helpful in understanding, at a flowfile level, what caused the IOException (e.g. which ACL is denying the operation)
                    attributes.put("hdfs.error.message", ioe.getMessage());

                    session.transfer(session.putAllAttributes(session.clone(flowFile), attributes),
                            REL_FAILURE);
                    failedPath++;
                }
            }
        }

        if (failedPath == 0) {
            session.transfer(flowFile, DeleteHDFS.REL_SUCCESS);
        } else {
            // If any path failed to be deleted, remove the FlowFile, as it has been cloned and sent to failure.
            session.remove(flowFile);
        }
    } catch (IOException e) {
        getLogger().error("Error processing delete for flowfile {} due to {}",
                new Object[] { flowFile, e.getMessage() }, e);
        session.transfer(flowFile, DeleteHDFS.REL_FAILURE);
    }

}

From source file:org.apache.nifi.processors.hadoop.FetchHDFS.java

License:Apache License

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final FileSystem hdfs = getFileSystem();
    final UserGroupInformation ugi = getUserGroupInformation();
    final String filenameValue = context.getProperty(FILENAME).evaluateAttributeExpressions(flowFile)
            .getValue();

    final Path path;
    try {
        path = new Path(filenameValue);
    } catch (IllegalArgumentException e) {
        getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                new Object[] { filenameValue, flowFile, e });
        flowFile = session.putAttribute(flowFile, "hdfs.failure.reason", e.getMessage());
        flowFile = session.penalize(flowFile);
        session.transfer(flowFile, REL_FAILURE);
        return;
    }

    final StopWatch stopWatch = new StopWatch(true);
    final FlowFile finalFlowFile = flowFile;

    ugi.doAs(new PrivilegedAction<Object>() {
        @Override
        public Object run() {
            InputStream stream = null;
            CompressionCodec codec = null;
            Configuration conf = getConfiguration();
            final CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
            final CompressionType compressionType = CompressionType
                    .valueOf(context.getProperty(COMPRESSION_CODEC).toString());
            final boolean inferCompressionCodec = compressionType == CompressionType.AUTOMATIC;

            if (inferCompressionCodec) {
                codec = compressionCodecFactory.getCodec(path);
            } else if (compressionType != CompressionType.NONE) {
                codec = getCompressionCodec(context, getConfiguration());
            }

            FlowFile flowFile = finalFlowFile;
            final Path qualifiedPath = path.makeQualified(hdfs.getUri(), hdfs.getWorkingDirectory());
            try {
                final String outputFilename;
                final String originalFilename = path.getName();
                stream = hdfs.open(path, 16384);

                // Check if compression codec is defined (inferred or otherwise)
                if (codec != null) {
                    stream = codec.createInputStream(stream);
                    outputFilename = StringUtils.removeEnd(originalFilename, codec.getDefaultExtension());
                } else {
                    outputFilename = originalFilename;
                }

                flowFile = session.importFrom(stream, finalFlowFile);
                flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), outputFilename);

                stopWatch.stop();
                getLogger().info("Successfully received content from {} for {} in {}",
                        new Object[] { qualifiedPath, flowFile, stopWatch.getDuration() });
                session.getProvenanceReporter().fetch(flowFile, qualifiedPath.toString(),
                        stopWatch.getDuration(TimeUnit.MILLISECONDS));
                session.transfer(flowFile, REL_SUCCESS);
            } catch (final FileNotFoundException | AccessControlException e) {
                getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                        new Object[] { qualifiedPath, flowFile, e });
                flowFile = session.putAttribute(flowFile, "hdfs.failure.reason", e.getMessage());
                flowFile = session.penalize(flowFile);
                session.transfer(flowFile, REL_FAILURE);
            } catch (final IOException e) {
                getLogger().error(
                        "Failed to retrieve content from {} for {} due to {}; routing to comms.failure",
                        new Object[] { qualifiedPath, flowFile, e });
                flowFile = session.penalize(flowFile);
                session.transfer(flowFile, REL_COMMS_FAILURE);
            } finally {
                IOUtils.closeQuietly(stream);
            }

            return null;
        }
    });

}