List of usage examples for org.apache.hadoop.fs.FileSystem.getWorkingDirectory()
public abstract Path getWorkingDirectory();
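Before the project examples, a minimal self-contained sketch of the call itself. The class name and the relative "output" path are illustrative only; FileSystem.get(conf) resolves to whatever file system the default Configuration points at, falling back to the local file system.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WorkingDirectoryExample {
    public static void main(String[] args) throws Exception {
        // Loads core-site.xml/hdfs-site.xml from the classpath if present,
        // otherwise falls back to the local file system
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // The working directory is the base against which relative Paths are resolved
        Path workingDir = fs.getWorkingDirectory();
        System.out.println("Working directory: " + workingDir);

        // Relative paths resolve against the working directory,
        // as the Mahout examples below do for their job output
        Path output = new Path(workingDir, "output");
        System.out.println("Resolved output path: " + output.toUri());
    }
}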
From source file:org.apache.mahout.classifier.df.mapreduce.OversamplingBuilder.java
License:Apache License
protected Path getOutputPath(Configuration conf) throws IOException {
    // the output directory is accessed only by this class, so use the default
    // file system
    FileSystem fs = FileSystem.get(conf);
    return new Path(fs.getWorkingDirectory(), outputDirName);
}
From source file:org.apache.mahout.classifier.df.tools.Frequencies.java
License:Apache License
private void runTool(String data, String dataset) throws IOException, ClassNotFoundException, InterruptedException {
    FileSystem fs = FileSystem.get(getConf());
    Path workingDir = fs.getWorkingDirectory();

    Path dataPath = new Path(data);
    Path datasetPath = new Path(dataset);

    log.info("Computing the frequencies...");
    FrequenciesJob job = new FrequenciesJob(new Path(workingDir, "output"), dataPath, datasetPath);

    int[][] counts = job.run(getConf());

    // outputting the frequencies
    log.info("counts[partition][class]");
    for (int[] count : counts) {
        log.info(Arrays.toString(count));
    }
}
From source file:org.apache.mahout.df.mapred.Builder.java
License:Apache License
/**
 * Output Directory name
 *
 * @param conf
 * @return
 * @throws IOException
 */
public Path getOutputPath(Configuration conf) throws IOException {
    // the output directory is accessed only by this class, so use the default
    // file system
    FileSystem fs = FileSystem.get(conf);
    return new Path(fs.getWorkingDirectory(), outputDirName);
}
From source file:org.apache.mahout.df.tools.Frequencies.java
License:Apache License
private void runTool(String data, String dataset) throws IOException, ClassNotFoundException, InterruptedException {
    FileSystem fs = FileSystem.get(getConf());
    Path workingDir = fs.getWorkingDirectory();

    Path dataPath = new Path(data);
    Path datasetPath = new Path(dataset);

    log.info("Computing the frequencies...");
    FrequenciesJob job = new FrequenciesJob(new Path(workingDir, "output"), dataPath, datasetPath);

    int[][] counts = job.run(getConf());

    // compute the partitions' sizes
    int numPartitions = counts.length;
    // int[] sizes = new int[numPartitions]; // TODO this isn't used?
    // for (int p = 0; p < numPartitions; p++) {
    //   sizes[p] = DataUtils.sum(counts[p]);
    // }

    // outputting the frequencies
    log.info("counts[partition][class]");
    for (int p = 0; p < numPartitions; p++) {
        log.info(Arrays.toString(counts[p]));
    }
}
From source file:org.apache.mahout.ga.watchmaker.MahoutEvaluator.java
License:Apache License
/**
 * Creates the input directory and stores the population in it.
 *
 * @param fs
 *          <code>FileSystem</code> to use
 * @param population
 *          population to store
 * @return input <code>Path</code>
 */
private static Path prepareInput(FileSystem fs, List<?> population) throws IOException {
    Path inpath = new Path(fs.getWorkingDirectory(), "input");

    HadoopUtil.overwriteOutput(inpath);
    storePopulation(fs, new Path(inpath, "population"), population);

    return inpath;
}
From source file:org.apache.nifi.processors.hadoop.AbstractFetchHDFSRecord.java
License:Apache License
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    // do this before getting a flow file so that we always get a chance to attempt Kerberos relogin
    final FileSystem fileSystem = getFileSystem();
    final Configuration configuration = getConfiguration();
    final UserGroupInformation ugi = getUserGroupInformation();

    if (configuration == null || fileSystem == null || ugi == null) {
        getLogger().error(
                "Processor not configured properly because Configuration, FileSystem, or UserGroupInformation was null");
        context.yield();
        return;
    }

    final FlowFile originalFlowFile = session.get();
    if (originalFlowFile == null) {
        context.yield();
        return;
    }

    ugi.doAs((PrivilegedAction<Object>) () -> {
        FlowFile child = null;
        final String filenameValue = context.getProperty(FILENAME)
                .evaluateAttributeExpressions(originalFlowFile).getValue();
        try {
            final Path path = new Path(filenameValue);
            final AtomicReference<Throwable> exceptionHolder = new AtomicReference<>(null);
            final AtomicReference<WriteResult> writeResult = new AtomicReference<>();

            final RecordSetWriterFactory recordSetWriterFactory = context.getProperty(RECORD_WRITER)
                    .asControllerService(RecordSetWriterFactory.class);

            final StopWatch stopWatch = new StopWatch(true);

            // use a child FlowFile so that if any error occurs we can route the original untouched FlowFile to retry/failure
            child = session.create(originalFlowFile);

            final AtomicReference<String> mimeTypeRef = new AtomicReference<>();
            child = session.write(child, (final OutputStream rawOut) -> {
                try (final BufferedOutputStream out = new BufferedOutputStream(rawOut);
                        final HDFSRecordReader recordReader = createHDFSRecordReader(context, originalFlowFile,
                                configuration, path)) {

                    Record record = recordReader.nextRecord();
                    final RecordSchema schema = recordSetWriterFactory.getSchema(
                            originalFlowFile.getAttributes(), record == null ? null : record.getSchema());

                    try (final RecordSetWriter recordSetWriter = recordSetWriterFactory
                            .createWriter(getLogger(), schema, out)) {

                        recordSetWriter.beginRecordSet();
                        if (record != null) {
                            recordSetWriter.write(record);
                        }

                        while ((record = recordReader.nextRecord()) != null) {
                            recordSetWriter.write(record);
                        }

                        writeResult.set(recordSetWriter.finishRecordSet());
                        mimeTypeRef.set(recordSetWriter.getMimeType());
                    }
                } catch (Exception e) {
                    exceptionHolder.set(e);
                }
            });
            stopWatch.stop();

            // if any errors happened within the session.write then throw the exception so we jump
            // into one of the appropriate catch blocks below
            if (exceptionHolder.get() != null) {
                throw exceptionHolder.get();
            }

            FlowFile successFlowFile = postProcess(context, session, child, path);

            final Map<String, String> attributes = new HashMap<>(writeResult.get().getAttributes());
            attributes.put(RECORD_COUNT_ATTR, String.valueOf(writeResult.get().getRecordCount()));
            attributes.put(CoreAttributes.MIME_TYPE.key(), mimeTypeRef.get());
            successFlowFile = session.putAllAttributes(successFlowFile, attributes);

            final Path qualifiedPath = path.makeQualified(fileSystem.getUri(), fileSystem.getWorkingDirectory());
            getLogger().info("Successfully received content from {} for {} in {} milliseconds",
                    new Object[] { qualifiedPath, successFlowFile, stopWatch.getDuration() });
            session.getProvenanceReporter().fetch(successFlowFile, qualifiedPath.toString(),
                    stopWatch.getDuration(TimeUnit.MILLISECONDS));
            session.transfer(successFlowFile, REL_SUCCESS);
            session.remove(originalFlowFile);
            return null;

        } catch (final FileNotFoundException | AccessControlException e) {
            getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                    new Object[] { filenameValue, originalFlowFile, e });
            final FlowFile failureFlowFile = session.putAttribute(originalFlowFile, FETCH_FAILURE_REASON_ATTR,
                    e.getMessage() == null ? e.toString() : e.getMessage());
            session.transfer(failureFlowFile, REL_FAILURE);
        } catch (final IOException | FlowFileAccessException e) {
            getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to retry",
                    new Object[] { filenameValue, originalFlowFile, e });
            session.transfer(session.penalize(originalFlowFile), REL_RETRY);
            context.yield();
        } catch (final Throwable t) {
            getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                    new Object[] { filenameValue, originalFlowFile, t });
            final FlowFile failureFlowFile = session.putAttribute(originalFlowFile, FETCH_FAILURE_REASON_ATTR,
                    t.getMessage() == null ? t.toString() : t.getMessage());
            session.transfer(failureFlowFile, REL_FAILURE);
        }

        // if we got this far then we weren't successful so we need to clean up the child flow file if it got initialized
        if (child != null) {
            session.remove(child);
        }

        return null;
    });
}
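The NiFi processors in this list pair getWorkingDirectory() with Path.makeQualified(...) to turn a possibly relative path into a fully qualified URI before reporting provenance. A minimal sketch of that idiom, assuming an already configured FileSystem (the example path and class name are illustrative only):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class QualifyPathExample {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());

        // A relative path, e.g. taken from a property or attribute
        Path path = new Path("data/input.txt");

        // Fill in the scheme, authority, and working directory so the path is
        // absolute and unambiguous, e.g. hdfs://namenode:8020/user/nifi/data/input.txt
        Path qualifiedPath = path.makeQualified(fs.getUri(), fs.getWorkingDirectory());
        System.out.println(qualifiedPath);
    }
}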
From source file:org.apache.nifi.processors.hadoop.AbstractHadoopProcessor.java
License:Apache License
HdfsResources resetHDFSResources(String configResources, ProcessContext context) throws IOException {
    Configuration config = new ExtendedConfiguration(getLogger());
    config.setClassLoader(Thread.currentThread().getContextClassLoader());

    getConfigurationFromResources(config, configResources);

    // give sub-classes a chance to process configuration
    preProcessConfiguration(config, context);

    // first check for timeout on HDFS connection, because FileSystem has a hard coded 15 minute timeout
    checkHdfsUriForTimeout(config);

    // disable caching of Configuration and FileSystem objects, else we cannot reconfigure the processor without a complete
    // restart
    String disableCacheName = String.format("fs.%s.impl.disable.cache",
            FileSystem.getDefaultUri(config).getScheme());
    config.set(disableCacheName, "true");

    // If kerberos is enabled, create the file system as the kerberos principal
    // -- use RESOURCE_LOCK to guarantee UserGroupInformation is accessed by only a single thread at a time
    FileSystem fs;
    UserGroupInformation ugi;
    synchronized (RESOURCES_LOCK) {
        if (SecurityUtil.isSecurityEnabled(config)) {
            String principal = context.getProperty(kerberosProperties.getKerberosPrincipal())
                    .evaluateAttributeExpressions().getValue();
            String keyTab = context.getProperty(kerberosProperties.getKerberosKeytab())
                    .evaluateAttributeExpressions().getValue();
            ugi = SecurityUtil.loginKerberos(config, principal, keyTab);
            fs = getFileSystemAsUser(config, ugi);
        } else {
            config.set("ipc.client.fallback-to-simple-auth-allowed", "true");
            config.set("hadoop.security.authentication", "simple");
            ugi = SecurityUtil.loginSimple(config);
            fs = getFileSystemAsUser(config, ugi);
        }
    }
    getLogger().debug("resetHDFSResources UGI {}", new Object[] { ugi });

    final Path workingDir = fs.getWorkingDirectory();
    getLogger().info(
            "Initialized a new HDFS File System with working dir: {} default block size: {} default replication: {} config: {}",
            new Object[] { workingDir, fs.getDefaultBlockSize(workingDir), fs.getDefaultReplication(workingDir),
                    config.toString() });

    return new HdfsResources(config, fs, ugi);
}
From source file:org.apache.nifi.processors.hadoop.AbstractPutHDFSRecord.java
License:Apache License
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    // do this before getting a flow file so that we always get a chance to attempt Kerberos relogin
    final FileSystem fileSystem = getFileSystem();
    final Configuration configuration = getConfiguration();
    final UserGroupInformation ugi = getUserGroupInformation();

    if (configuration == null || fileSystem == null || ugi == null) {
        getLogger().error(
                "Processor not configured properly because Configuration, FileSystem, or UserGroupInformation was null");
        context.yield();
        return;
    }

    final FlowFile flowFile = session.get();
    if (flowFile == null) {
        context.yield();
        return;
    }

    ugi.doAs((PrivilegedAction<Object>) () -> {
        Path tempDotCopyFile = null;
        FlowFile putFlowFile = flowFile;
        try {
            final String filenameValue = putFlowFile.getAttribute(CoreAttributes.FILENAME.key()); // TODO codec extension
            final String directoryValue = context.getProperty(DIRECTORY)
                    .evaluateAttributeExpressions(putFlowFile).getValue();

            // create the directory if it doesn't exist
            final Path directoryPath = new Path(directoryValue);
            createDirectory(fileSystem, directoryPath, remoteOwner, remoteGroup);

            // write to tempFile first and on success rename to destFile
            final Path tempFile = new Path(directoryPath, "." + filenameValue);
            final Path destFile = new Path(directoryPath, filenameValue);

            final boolean destinationExists = fileSystem.exists(destFile) || fileSystem.exists(tempFile);
            final boolean shouldOverwrite = context.getProperty(OVERWRITE).asBoolean();

            // if the tempFile or destFile already exist, and overwrite is set to false, then transfer to failure
            if (destinationExists && !shouldOverwrite) {
                session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                getLogger().warn("penalizing {} and routing to failure because file with same name already exists",
                        new Object[] { putFlowFile });
                return null;
            }

            final AtomicReference<Throwable> exceptionHolder = new AtomicReference<>(null);
            final AtomicReference<WriteResult> writeResult = new AtomicReference<>();
            final RecordReaderFactory recordReaderFactory = context.getProperty(RECORD_READER)
                    .asControllerService(RecordReaderFactory.class);

            final FlowFile flowFileIn = putFlowFile;
            final StopWatch stopWatch = new StopWatch(true);

            // Read records from the incoming FlowFile and write them to the tempFile
            session.read(putFlowFile, (final InputStream rawIn) -> {
                RecordReader recordReader = null;
                HDFSRecordWriter recordWriter = null;

                try (final BufferedInputStream in = new BufferedInputStream(rawIn)) {

                    // if we fail to create the RecordReader then we want to route to failure, so we need to
                    // handle this separately from the other IOExceptions which normally route to retry
                    try {
                        recordReader = recordReaderFactory.createRecordReader(flowFileIn, in, getLogger());
                    } catch (Exception e) {
                        final RecordReaderFactoryException rrfe = new RecordReaderFactoryException(
                                "Unable to create RecordReader", e);
                        exceptionHolder.set(rrfe);
                        return;
                    }

                    final RecordSet recordSet = recordReader.createRecordSet();

                    recordWriter = createHDFSRecordWriter(context, flowFile, configuration, tempFile,
                            recordReader.getSchema());
                    writeResult.set(recordWriter.write(recordSet));
                } catch (Exception e) {
                    exceptionHolder.set(e);
                } finally {
                    IOUtils.closeQuietly(recordReader);
                    IOUtils.closeQuietly(recordWriter);
                }
            });
            stopWatch.stop();

            final String dataRate = stopWatch.calculateDataRate(putFlowFile.getSize());
            final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);
            tempDotCopyFile = tempFile;

            // if any errors happened within the session.read then throw the exception so we jump
            // into one of the appropriate catch blocks below
            if (exceptionHolder.get() != null) {
                throw exceptionHolder.get();
            }

            // Attempt to rename from the tempFile to destFile, and change owner if successfully renamed
            rename(fileSystem, tempFile, destFile);
            changeOwner(fileSystem, destFile, remoteOwner, remoteGroup);

            getLogger().info("Wrote {} to {} in {} milliseconds at a rate of {}",
                    new Object[] { putFlowFile, destFile, millis, dataRate });

            putFlowFile = postProcess(context, session, putFlowFile, destFile);

            final String newFilename = destFile.getName();
            final String hdfsPath = destFile.getParent().toString();

            // Update the filename and absolute path attributes
            final Map<String, String> attributes = new HashMap<>(writeResult.get().getAttributes());
            attributes.put(CoreAttributes.FILENAME.key(), newFilename);
            attributes.put(ABSOLUTE_HDFS_PATH_ATTRIBUTE, hdfsPath);
            attributes.put(RECORD_COUNT_ATTR, String.valueOf(writeResult.get().getRecordCount()));
            putFlowFile = session.putAllAttributes(putFlowFile, attributes);

            // Send a provenance event and transfer to success
            final Path qualifiedPath = destFile.makeQualified(fileSystem.getUri(), fileSystem.getWorkingDirectory());
            session.getProvenanceReporter().send(putFlowFile, qualifiedPath.toString());
            session.transfer(putFlowFile, REL_SUCCESS);

        } catch (IOException | FlowFileAccessException e) {
            deleteQuietly(fileSystem, tempDotCopyFile);
            getLogger().error("Failed to write due to {}", new Object[] { e });
            session.transfer(session.penalize(putFlowFile), REL_RETRY);
            context.yield();
        } catch (Throwable t) {
            deleteQuietly(fileSystem, tempDotCopyFile);
            getLogger().error("Failed to write due to {}", new Object[] { t });
            session.transfer(putFlowFile, REL_FAILURE);
        }

        return null;
    });
}
From source file:org.apache.nifi.processors.hadoop.DeleteHDFS.java
License:Apache License
@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    final FlowFile originalFlowFile = session.get();

    // If this processor has an incoming connection, then do not run unless a
    // FlowFile is actually sent through
    if (originalFlowFile == null && context.hasIncomingConnection()) {
        context.yield();
        return;
    }

    // We need a FlowFile to report provenance correctly.
    FlowFile flowFile = originalFlowFile != null ? originalFlowFile : session.create();

    final String fileOrDirectoryName = context.getProperty(FILE_OR_DIRECTORY)
            .evaluateAttributeExpressions(flowFile).getValue();

    final FileSystem fileSystem = getFileSystem();
    try {
        // Check if the user has supplied a file or directory pattern
        List<Path> pathList = Lists.newArrayList();
        if (GLOB_MATCHER.reset(fileOrDirectoryName).find()) {
            FileStatus[] fileStatuses = fileSystem.globStatus(new Path(fileOrDirectoryName));
            if (fileStatuses != null) {
                for (FileStatus fileStatus : fileStatuses) {
                    pathList.add(fileStatus.getPath());
                }
            }
        } else {
            pathList.add(new Path(fileOrDirectoryName));
        }

        int failedPath = 0;
        for (Path path : pathList) {
            if (fileSystem.exists(path)) {
                try {
                    Map<String, String> attributes = Maps.newHashMapWithExpectedSize(2);
                    attributes.put("hdfs.filename", path.getName());
                    attributes.put("hdfs.path", path.getParent().toString());
                    flowFile = session.putAllAttributes(flowFile, attributes);

                    fileSystem.delete(path, context.getProperty(RECURSIVE).asBoolean());
                    getLogger().debug("For flowfile {} Deleted file at path {} with name {}",
                            new Object[] { originalFlowFile, path.getParent().toString(), path.getName() });

                    final Path qualifiedPath = path.makeQualified(fileSystem.getUri(), fileSystem.getWorkingDirectory());
                    session.getProvenanceReporter().invokeRemoteProcess(flowFile, qualifiedPath.toString());
                } catch (IOException ioe) {
                    // One possible scenario is that the IOException is permissions based, however it would be impractical to check every possible
                    // external HDFS authorization tool (Ranger, Sentry, etc). Local ACLs could be checked but the operation would be expensive.
                    getLogger().warn("Failed to delete file or directory", ioe);

                    Map<String, String> attributes = Maps.newHashMapWithExpectedSize(1);
                    // The error message is helpful in understanding at a flowfile level what caused the IOException (which ACL is denying the operation, e.g.)
                    attributes.put("hdfs.error.message", ioe.getMessage());

                    session.transfer(session.putAllAttributes(session.clone(flowFile), attributes), REL_FAILURE);
                    failedPath++;
                }
            }
        }

        if (failedPath == 0) {
            session.transfer(flowFile, DeleteHDFS.REL_SUCCESS);
        } else {
            // If any path failed to be deleted, remove the FlowFile as it's been cloned and sent to failure.
            session.remove(flowFile);
        }
    } catch (IOException e) {
        getLogger().error("Error processing delete for flowfile {} due to {}",
                new Object[] { flowFile, e.getMessage() }, e);
        session.transfer(flowFile, DeleteHDFS.REL_FAILURE);
    }
}
From source file:org.apache.nifi.processors.hadoop.FetchHDFS.java
License:Apache License
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final FileSystem hdfs = getFileSystem();
    final UserGroupInformation ugi = getUserGroupInformation();
    final String filenameValue = context.getProperty(FILENAME).evaluateAttributeExpressions(flowFile)
            .getValue();

    final Path path;
    try {
        path = new Path(filenameValue);
    } catch (IllegalArgumentException e) {
        getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                new Object[] { filenameValue, flowFile, e });
        flowFile = session.putAttribute(flowFile, "hdfs.failure.reason", e.getMessage());
        flowFile = session.penalize(flowFile);
        session.transfer(flowFile, REL_FAILURE);
        return;
    }

    final StopWatch stopWatch = new StopWatch(true);
    final FlowFile finalFlowFile = flowFile;

    ugi.doAs(new PrivilegedAction<Object>() {
        @Override
        public Object run() {
            InputStream stream = null;
            CompressionCodec codec = null;
            Configuration conf = getConfiguration();
            final CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
            final CompressionType compressionType = CompressionType
                    .valueOf(context.getProperty(COMPRESSION_CODEC).toString());
            final boolean inferCompressionCodec = compressionType == CompressionType.AUTOMATIC;

            if (inferCompressionCodec) {
                codec = compressionCodecFactory.getCodec(path);
            } else if (compressionType != CompressionType.NONE) {
                codec = getCompressionCodec(context, getConfiguration());
            }

            FlowFile flowFile = finalFlowFile;
            final Path qualifiedPath = path.makeQualified(hdfs.getUri(), hdfs.getWorkingDirectory());
            try {
                final String outputFilename;
                final String originalFilename = path.getName();
                stream = hdfs.open(path, 16384);

                // Check if compression codec is defined (inferred or otherwise)
                if (codec != null) {
                    stream = codec.createInputStream(stream);
                    outputFilename = StringUtils.removeEnd(originalFilename, codec.getDefaultExtension());
                } else {
                    outputFilename = originalFilename;
                }

                flowFile = session.importFrom(stream, finalFlowFile);
                flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), outputFilename);

                stopWatch.stop();
                getLogger().info("Successfully received content from {} for {} in {}",
                        new Object[] { qualifiedPath, flowFile, stopWatch.getDuration() });
                session.getProvenanceReporter().fetch(flowFile, qualifiedPath.toString(),
                        stopWatch.getDuration(TimeUnit.MILLISECONDS));
                session.transfer(flowFile, REL_SUCCESS);
            } catch (final FileNotFoundException | AccessControlException e) {
                getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                        new Object[] { qualifiedPath, flowFile, e });
                flowFile = session.putAttribute(flowFile, "hdfs.failure.reason", e.getMessage());
                flowFile = session.penalize(flowFile);
                session.transfer(flowFile, REL_FAILURE);
            } catch (final IOException e) {
                getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to comms.failure",
                        new Object[] { qualifiedPath, flowFile, e });
                flowFile = session.penalize(flowFile);
                session.transfer(flowFile, REL_COMMS_FAILURE);
            } finally {
                IOUtils.closeQuietly(stream);
            }

            return null;
        }
    });
}