List of usage examples for org.apache.hadoop.fs.FileSystem.create
public FSDataOutputStream create(Path f) throws IOException
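Before the collected examples, a minimal sketch of the basic call pattern (the class name, path, and payload below are placeholders, not taken from any of the examples that follow): obtain a FileSystem from a Configuration, call create(Path) to get an FSDataOutputStream, write, and close.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CreateExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // FileSystem.get(conf) resolves the default file system (fs.defaultFS);
        // with no extra configuration this is the local file system.
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/example/hello.txt"); // placeholder path
        // create(Path) overwrites an existing file by default and creates
        // missing parent directories.
        try (FSDataOutputStream out = fs.create(path)) {
            out.write("hello".getBytes(StandardCharsets.UTF_8));
        }
    }
}

Each example below follows this same pattern; the differences lie in where the bytes written to the stream come from.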
From source file:cmd.freebase2hdfs.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    String input = null;
    String output = null;
    if (args.length == 1) {
        input = "http://download.freebase.com/datadumps/latest/freebase-datadump-quadruples.tsv.bz2";
        output = args[0];
    } else if (args.length == 2) {
        input = args[0];
        output = args[1];
    } else {
        System.err.printf(
                "Usage: %s [generic options] [<http://path/to/freebase/datadump>] <hdfs://path/to/destination>\n",
                getClass().getName());
        System.err.println(
                "[<http://path/to/freebase/datadump>] is optional, it defaults to http://download.freebase.com/datadumps/latest/freebase-datadump-quadruples.tsv.bz2\n");
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }
    Configuration configuration = getConf();
    FileSystem fs = FileSystem.get(configuration);
    Path outputPath = new Path(output);
    InputStream in = new URL(input).openStream();
    FSDataOutputStream out = fs.create(outputPath);
    IOUtils.copyBytes(in, out, BUFFER_SIZE, true);
    return 0;
}
From source file:cmd.tdbloader4.java
License:Apache License
private void createOffsetsFile(FileSystem fs, String input, String output) throws IOException {
    log.debug("Creating offsets file...");
    Map<Long, Long> offsets = new TreeMap<Long, Long>();
    FileStatus[] status = fs.listStatus(new Path(input));
    for (FileStatus fileStatus : status) {
        Path file = fileStatus.getPath();
        if (file.getName().startsWith("part-r-")) {
            log.debug("Processing: {}", file.getName());
            BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(file)));
            String line = in.readLine();
            String[] tokens = line.split("\\s");
            long partition = Long.valueOf(tokens[0]);
            long offset = Long.valueOf(tokens[1]);
            log.debug("Partition {} has offset {}", partition, offset);
            offsets.put(partition, offset);
        }
    }

    Path outputPath = new Path(output, Constants.OFFSETS_FILENAME);
    PrintWriter out = new PrintWriter(new OutputStreamWriter(fs.create(outputPath)));
    for (Long partition : offsets.keySet()) {
        out.println(partition + "\t" + offsets.get(partition));
    }
    out.close();
    log.debug("Offset file created.");
}
From source file:cn.lhfei.hadoop.ch04.FileDecompressor.java
License:Apache License
/**
 * use case: % hadoop FileDecompressor file.gz
 * @param args
 */
public static void main(String[] args) {
    FileSystem fs = null;
    String uri = args[0];
    Path inputPath = null;
    Configuration conf = new Configuration();
    CompressionCodecFactory factory = null;
    InputStream in = null;
    OutputStream out = null;

    try {
        fs = FileSystem.get(URI.create(uri), conf);
        inputPath = new Path(uri);
        factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(inputPath);
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            System.exit(1);
        }

        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
        in = codec.createInputStream(fs.open(inputPath));
        out = fs.create(new Path(outputUri));
        IOUtils.copyBytes(in, out, conf);
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}
From source file:cn.uway.util.apache.parquet.hadoop.ParquetFileWriter.java
License:Apache License
private static void writeMetadataFile(Path outputPath, ParquetMetadata metadataFooter, FileSystem fs,
        String parquetMetadataFile) throws IOException {
    Path metaDataPath = new Path(outputPath, parquetMetadataFile);
    FSDataOutputStream metadata = fs.create(metaDataPath);
    metadata.write(MAGIC);
    serializeFooter(metadataFooter, metadata);
    metadata.close();
}
From source file:co.cask.cdap.template.etl.batch.ETLMapReduceTest.java
License:Apache License
@Test
public void testFiletoTPFS() throws Exception {
    String filePath = "file:///tmp/test/text.txt";
    String testData = "String for testing purposes.";

    Path textFile = new Path(filePath);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    FSDataOutputStream writeData = fs.create(textFile);
    writeData.write(testData.getBytes());
    writeData.flush();
    writeData.close();

    ETLStage source = new ETLStage("File",
            ImmutableMap.<String, String>builder()
                    .put(Properties.File.FILESYSTEM, "Text")
                    .put(Properties.File.PATH, filePath)
                    .build());
    ETLStage sink = new ETLStage("TPFSAvro",
            ImmutableMap.of(Properties.TimePartitionedFileSetDataset.SCHEMA,
                    FileBatchSource.DEFAULT_SCHEMA.toString(),
                    Properties.TimePartitionedFileSetDataset.TPFS_NAME, "fileSink"));
    ETLBatchConfig etlConfig = new ETLBatchConfig("* * * * *", source, sink, Lists.<ETLStage>newArrayList());

    AdapterConfig adapterConfig = new AdapterConfig("", TEMPLATE_ID.getId(), GSON.toJsonTree(etlConfig));
    Id.Adapter adapterId = Id.Adapter.from(NAMESPACE, "testFileAdapter");
    AdapterManager manager = createAdapter(adapterId, adapterConfig);

    manager.start();
    manager.waitForOneRunToFinish(2, TimeUnit.MINUTES);
    manager.stop();

    DataSetManager<TimePartitionedFileSet> fileSetManager = getDataset("fileSink");
    TimePartitionedFileSet fileSet = fileSetManager.get();

    List<GenericRecord> records = readOutput(fileSet, FileBatchSource.DEFAULT_SCHEMA);
    Assert.assertEquals(1, records.size());
    Assert.assertEquals(testData, records.get(0).get("body").toString());
    fileSet.close();
}
From source file:co.cask.hydrator.action.ftp.FTPCopyAction.java
License:Apache License
@Override
public void run(ActionContext context) throws Exception {
    Path destination = new Path(config.getDestDirectory());

    FileSystem fileSystem = FileSystem.get(new Configuration());
    destination = fileSystem.makeQualified(destination);
    if (!fileSystem.exists(destination)) {
        fileSystem.mkdirs(destination);
    }

    FTPClient ftp;
    if ("ftp".equals(config.getProtocol().toLowerCase())) {
        ftp = new FTPClient();
    } else {
        ftp = new FTPSClient();
    }
    ftp.setControlKeepAliveTimeout(5);
    // UNIX type server
    FTPClientConfig ftpConfig = new FTPClientConfig();
    // Set additional parameters required for the ftp
    // for example config.setServerTimeZoneId("Pacific/Pitcairn")
    ftp.configure(ftpConfig);
    try {
        ftp.connect(config.getHost(), config.getPort());
        ftp.enterLocalPassiveMode();
        String replyString = ftp.getReplyString();
        LOG.info("Connected to server {} and port {} with reply from connect as {}.", config.getHost(),
                config.getPort(), replyString);

        // Check the reply code for actual success
        int replyCode = ftp.getReplyCode();

        if (!FTPReply.isPositiveCompletion(replyCode)) {
            ftp.disconnect();
            throw new RuntimeException(String.format("FTP server refused connection with code %s and reply %s.",
                    replyCode, replyString));
        }

        if (!ftp.login(config.getUserName(), config.getPassword())) {
            LOG.error("login command reply code {}, {}", ftp.getReplyCode(), ftp.getReplyString());
            ftp.logout();
            throw new RuntimeException(String.format(
                    "Login to the FTP server %s and port %s failed. " + "Please check user name and password.",
                    config.getHost(), config.getPort()));
        }

        FTPFile[] ftpFiles = ftp.listFiles(config.getSrcDirectory());
        LOG.info("listFiles command reply code: {}, {}.", ftp.getReplyCode(), ftp.getReplyString());
        // Check the reply code for listFiles call.
        // If its "522 Data connections must be encrypted" then it means data channel also need to be encrypted
        if (ftp.getReplyCode() == 522 && "sftp".equalsIgnoreCase(config.getProtocol())) {
            // encrypt data channel and listFiles again
            ((FTPSClient) ftp).execPROT("P");
            LOG.info("Attempting command listFiles on encrypted data channel.");
            ftpFiles = ftp.listFiles(config.getSrcDirectory());
        }

        for (FTPFile file : ftpFiles) {
            String source = config.getSrcDirectory() + "/" + file.getName();

            LOG.info("Current file {}, source {}", file.getName(), source);
            if (config.getExtractZipFiles() && file.getName().endsWith(".zip")) {
                copyZip(ftp, source, fileSystem, destination);
            } else {
                Path destinationPath = fileSystem.makeQualified(new Path(destination, file.getName()));
                LOG.debug("Downloading {} to {}", file.getName(), destinationPath.toString());
                try (OutputStream output = fileSystem.create(destinationPath)) {
                    InputStream is = ftp.retrieveFileStream(source);
                    ByteStreams.copy(is, output);
                }
            }

            if (!ftp.completePendingCommand()) {
                LOG.error("Error completing command.");
            }
        }
        ftp.logout();
    } finally {
        if (ftp.isConnected()) {
            try {
                ftp.disconnect();
            } catch (Throwable e) {
                LOG.error("Failure to disconnect the ftp connection.", e);
            }
        }
    }
}
From source file:co.cask.hydrator.action.ftp.FTPCopyAction.java
License:Apache License
private void copyZip(FTPClient ftp, String source, FileSystem fs, Path destination) throws IOException {
    InputStream is = ftp.retrieveFileStream(source);
    try (ZipInputStream zis = new ZipInputStream(new BufferedInputStream(is))) {
        ZipEntry entry;
        while ((entry = zis.getNextEntry()) != null) {
            LOG.debug("Extracting {}", entry);
            Path destinationPath = fs.makeQualified(new Path(destination, entry.getName()));
            try (OutputStream os = fs.create(destinationPath)) {
                LOG.debug("Downloading {} to {}", entry.getName(), destinationPath.toString());
                ByteStreams.copy(zis, os);
            }
        }
    }
}
From source file:co.cask.hydrator.plugin.batch.action.FileAction.java
License:Apache License
@SuppressWarnings("ConstantConditions") @Override/*ww w.j a v a2 s . c om*/ public void run(BatchActionContext context) throws Exception { if (!config.shouldRun(context)) { return; } config.substituteMacros(context); Job job = JobUtils.createInstance(); Configuration conf = job.getConfiguration(); FileSystem fileSystem = FileSystem.get(conf); Path[] paths; Path sourcePath = new Path(config.path); if (fileSystem.isDirectory(sourcePath)) { FileStatus[] status = fileSystem.listStatus(sourcePath); paths = FileUtil.stat2Paths(status); } else { paths = new Path[] { sourcePath }; } //get regex pattern for file name filtering. boolean patternSpecified = !Strings.isNullOrEmpty(config.pattern); if (patternSpecified) { regex = Pattern.compile(config.pattern); } switch (config.action.toLowerCase()) { case "delete": for (Path path : paths) { if (!patternSpecified || isFileNameMatch(path.getName())) { fileSystem.delete(path, true); } } break; case "move": for (Path path : paths) { if (!patternSpecified || isFileNameMatch(path.getName())) { Path targetFileMovePath = new Path(config.targetFolder, path.getName()); fileSystem.rename(path, targetFileMovePath); } } break; case "archive": for (Path path : paths) { if (!patternSpecified || isFileNameMatch(path.getName())) { try (FSDataOutputStream archivedStream = fileSystem .create(new Path(config.targetFolder, path.getName() + ".zip")); ZipOutputStream zipArchivedStream = new ZipOutputStream(archivedStream); FSDataInputStream fdDataInputStream = fileSystem.open(path)) { zipArchivedStream.putNextEntry(new ZipEntry(path.getName())); int length; byte[] buffer = new byte[1024]; while ((length = fdDataInputStream.read(buffer)) > 0) { zipArchivedStream.write(buffer, 0, length); } zipArchivedStream.closeEntry(); } fileSystem.delete(path, true); } } break; default: LOG.warn("No action required on the file."); break; } }
From source file:co.cask.hydrator.plugin.batch.ETLMapReduceTestRun.java
License:Apache License
@Test
public void testFiletoMultipleTPFS() throws Exception {
    String filePath = "file:///tmp/test/text.txt";
    String testData = "String for testing purposes.";

    Path textFile = new Path(filePath);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    FSDataOutputStream writeData = fs.create(textFile);
    writeData.write(testData.getBytes());
    writeData.flush();
    writeData.close();

    ETLStage source = new ETLStage("source",
            new ETLPlugin("File", BatchSource.PLUGIN_TYPE,
                    ImmutableMap.<String, String>builder()
                            .put(Constants.Reference.REFERENCE_NAME, "TestFile")
                            .put(Properties.File.FILESYSTEM, "Text")
                            .put(Properties.File.PATH, filePath)
                            .build(),
                    null));

    ETLStage sink1 = new ETLStage("sink1",
            new ETLPlugin("TPFSAvro", BatchSink.PLUGIN_TYPE,
                    ImmutableMap.of(Properties.TimePartitionedFileSetDataset.SCHEMA,
                            FileBatchSource.DEFAULT_SCHEMA.toString(),
                            Properties.TimePartitionedFileSetDataset.TPFS_NAME, "fileSink1"),
                    null));
    ETLStage sink2 = new ETLStage("sink2",
            new ETLPlugin("TPFSParquet", BatchSink.PLUGIN_TYPE,
                    ImmutableMap.of(Properties.TimePartitionedFileSetDataset.SCHEMA,
                            FileBatchSource.DEFAULT_SCHEMA.toString(),
                            Properties.TimePartitionedFileSetDataset.TPFS_NAME, "fileSink2"),
                    null));

    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
            .addStage(source)
            .addStage(sink1)
            .addStage(sink2)
            .addConnection(source.getName(), sink1.getName())
            .addConnection(source.getName(), sink2.getName())
            .build();

    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(ETLBATCH_ARTIFACT, etlConfig);
    Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "FileToTPFS");
    ApplicationManager appManager = deployApplication(appId, appRequest);

    MapReduceManager mrManager = appManager.getMapReduceManager(ETLMapReduce.NAME);
    mrManager.start();
    mrManager.waitForFinish(2, TimeUnit.MINUTES);

    for (String sinkName : new String[] { "fileSink1", "fileSink2" }) {
        DataSetManager<TimePartitionedFileSet> fileSetManager = getDataset(sinkName);
        try (TimePartitionedFileSet fileSet = fileSetManager.get()) {
            List<GenericRecord> records = readOutput(fileSet, FileBatchSource.DEFAULT_SCHEMA);
            Assert.assertEquals(1, records.size());
            Assert.assertEquals(testData, records.get(0).get("body").toString());
        }
    }
}
From source file:coldstorage.io.Writer.java
License:Apache License
public static void main(String[] args) throws IOException {
    Schema.Parser parser = new Schema.Parser();
    Schema schema = parser.parse("{" + "\"namespace\": \"example.avro\", " + "\"type\": \"record\", "
            + "\"name\": \"User\", " + "\"fields\": ["
            + " {\"name\": \"id\", \"type\": \"long\"},"
            + " {\"name\": \"data\", \"type\": \"string\"}"
            + " ]}");
    GenericDatumWriter<GenericRecord> gdw = new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> dfw = new DataFileWriter<GenericRecord>(gdw);

    // Path pathData = new Path("./out/data.avro");
    // Path pathIndex = new Path("./out/data.index");
    Path pathData = new Path("hdfs://localhost:9000/avro/out/data.avro");
    Path pathIndex = new Path("hdfs://localhost:9000/avro/out/data.index");

    Configuration configuration = new Configuration();
    FileSystem fileSystem = pathData.getFileSystem(configuration);

    FSDataOutputStream indexOutputStream = fileSystem.create(pathIndex);
    FSDataOutputStream outputStream = fileSystem.create(pathData);
    dfw.create(schema, outputStream);

    GenericRecordBuilder genericRecordBuilder = new GenericRecordBuilder(schema);
    Random random = new Random(1);
    final int syncPoint = 1000;
    int count = 0;
    for (int i = 0; i < 100000000; i++) {
        genericRecordBuilder.set("id", (long) i);
        genericRecordBuilder.set("data", Long.toString(random.nextLong()));
        Record record = genericRecordBuilder.build();
        dfw.append(record);
        if (count >= syncPoint) {
            long sync = dfw.sync();
            Object object = record.get("id");
            writeIndex(indexOutputStream, sync, object);
            count = 0;
        }
        count++;
    }
    indexOutputStream.close();
    dfw.close();
}