List of usage examples for org.apache.hadoop.fs FileSystem create
public FSDataOutputStream create(Path f) throws IOException
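Before the project examples below, here is a minimal usage sketch (not taken from any of the listed sources; the class name, helper method, and target path are hypothetical). create(Path) returns an FSDataOutputStream and, in this single-argument form, overwrites any existing file at that path.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemCreateSketch {

    // Write a single line to the given path, creating (or overwriting) the file.
    static void writeLine(Configuration conf, String pathStr, String line) throws IOException {
        FileSystem fs = FileSystem.get(conf);              // FileSystem backing fs.defaultFS in conf
        Path file = new Path(pathStr);
        try (FSDataOutputStream out = fs.create(file)) {   // single-arg create() overwrites by default
            out.writeBytes(line + "\n");
        }
    }

    public static void main(String[] args) throws IOException {
        // Hypothetical target path; with no cluster configuration this resolves against the local FS.
        writeLine(new Configuration(), "/tmp/filesystem-create-example.txt", "hello");
    }
}

Most of the examples below follow the same pattern but wrap the returned stream in a BufferedWriter, an Avro DataFileWriter, or a compression codec stream before writing.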
From source file:com.cloudera.sqoop.manager.NetezzaExportManualTest.java
License:Apache License
protected void createExportFile(ColumnGenerator... extraCols) throws IOException, SQLException {
    String ext = ".txt";
    Path tablePath = getTablePath();
    Path filePath = new Path(tablePath, "part0" + ext);
    Configuration conf = new Configuration();
    if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);
    fs.mkdirs(tablePath);
    OutputStream os = fs.create(filePath);
    BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));
    for (int i = 0; i < 3; i++) {
        String line = getRecordLine(i, extraCols);
        w.write(line);
        LOG.debug("Create Export file - Writing line : " + line);
    }
    w.close();
    os.close();
}
From source file:com.cloudera.sqoop.TestAvroExport.java
License:Apache License
/**
 * Create a data file that gets exported to the db.
 * @param fileNum the number of the file (for multi-file export)
 * @param numRecords how many records to write to the file.
 */
protected void createAvroFile(int fileNum, int numRecords, ColumnGenerator... extraCols) throws IOException {
    Path tablePath = getTablePath();
    Path filePath = new Path(tablePath, "part" + fileNum);
    Configuration conf = new Configuration();
    if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);
    fs.mkdirs(tablePath);
    OutputStream os = fs.create(filePath);
    Schema schema = buildAvroSchema(extraCols);
    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>();
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);
    dataFileWriter.create(schema, os);
    for (int i = 0; i < numRecords; i++) {
        GenericRecord record = new GenericData.Record(schema);
        record.put("id", i);
        record.put("msg", getMsgPrefix() + i);
        addExtraColumns(record, i, extraCols);
        dataFileWriter.append(record);
    }
    dataFileWriter.close();
    os.close();
}
From source file:com.cloudera.sqoop.TestExport.java
License:Apache License
/**
 * Create a data file that gets exported to the db.
 * @param fileNum the number of the file (for multi-file export)
 * @param numRecords how many records to write to the file.
 * @param gzip is true if the file should be gzipped.
 */
protected void createTextFile(int fileNum, int numRecords, boolean gzip, ColumnGenerator... extraCols)
        throws IOException {
    int startId = fileNum * numRecords;
    String ext = ".txt";
    if (gzip) {
        ext = ext + ".gz";
    }
    Path tablePath = getTablePath();
    Path filePath = new Path(tablePath, "part" + fileNum + ext);
    Configuration conf = new Configuration();
    if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);
    fs.mkdirs(tablePath);
    OutputStream os = fs.create(filePath);
    if (gzip) {
        CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
        CompressionCodec codec = ccf.getCodec(filePath);
        os = codec.createOutputStream(os);
    }
    BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));
    for (int i = 0; i < numRecords; i++) {
        w.write(getRecordLine(startId + i, extraCols));
    }
    w.close();
    os.close();
    if (gzip) {
        verifyCompressedFile(filePath, numRecords);
    }
}
From source file:com.cloudera.sqoop.TestExportUpdate.java
License:Apache License
/**
 * <p>Creates update files for multi-key update test. The total number of
 * update records will be the number of files times the number of aKeysPerFile
 * times 3. Column A value will start with the specified <tt>startAtValue</tt>
 * and for each value there will be three records corresponding to Column
 * B values [0-2].</p>
 * @param numFiles number of files to create
 * @param aKeysPerFile number of record sets with different column A values
 * @param startAtValue the starting value of column A
 * @param bKeyValues the list of values for the column B
 * @throws IOException
 */
private void createMultiKeyUpdateFiles(int numFiles, int aKeysPerFile, int startAtValue, int[] bKeyValues)
        throws IOException {
    Configuration conf = getConf();
    if (!isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);
    int aValue = startAtValue;
    for (int i = 0; i < numFiles; i++) {
        OutputStream os = fs.create(new Path(getTablePath(), "" + i + ".txt"));
        BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));
        for (int j = 0; j < aKeysPerFile; j++) {
            for (int k = 0; k < bKeyValues.length; k++) {
                w.write(getUpdateStringForMultiKeyRow(aValue, bKeyValues[k]));
            }
            aValue++;
        }
        w.close();
        os.close();
    }
}
From source file:com.cloudera.sqoop.TestExportUpdate.java
License:Apache License
/**
 * Create a set of files that will be used as the input to the update
 * process.
 * @param numFiles the number of files to generate
 * @param updatesPerFile the number of rows to create in each file
 * @param keyCol a value between 0 and 2 specifying whether 'a',
 * 'b', or 'c' ({@see populateDatabase()}) is the key column to keep
 * the same.
 * @param startOffsets is an optional list of row ids/values for a/c
 * which are the record ids at which the update files begin.
 * For instance, if numFiles=3, updatesPerFile=2, and keyCol=0 then
 * if startOffsets is {5, 10, 12}, files will be generated to update
 * rows with A=5,6; A=10,11; A=12,13.
 *
 * If startOffsets is empty or underspecified (given numFiles), then
 * subsequent files will start immediately after the previous file.
 */
private void createUpdateFiles(int numFiles, int updatesPerFile, int keyCol, int... startOffsets)
        throws IOException {
    Configuration conf = getConf();
    if (!isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);
    int rowId = 0;
    for (int i = 0; i < numFiles; i++) {
        OutputStream os = fs.create(new Path(getTablePath(), "" + i + ".txt"));
        BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));
        if (null != startOffsets && startOffsets.length > i) {
            // If a start offset has been specified for this file, go there.
            // Otherwise, just carry over from the previous file iteration.
            rowId = startOffsets[i];
        }
        for (int j = 0; j < updatesPerFile; j++) {
            w.write(getUpdateStringForRow(keyCol, rowId++));
        }
        w.close();
        os.close();
    }
}
From source file:com.collective.celos.ci.testing.fixtures.deploy.HdfsInputDeployer.java
License:Apache License
@Override
public void deploy(TestRun testRun) throws Exception {
    FileSystem fileSystem = testRun.getCiContext().getFileSystem();
    CollectFilesAndPathsProcessor pathToFile = new CollectFilesAndPathsProcessor();
    TreeObjectProcessor.process(fixObjectCreator.create(testRun), pathToFile);
    Path pathPrefixed = new Path(Util.augmentHdfsPath(testRun.getHdfsPrefix(), path.toString()));
    for (java.nio.file.Path childPath : pathToFile.pathToFiles.keySet()) {
        Path pathTo = new Path(pathPrefixed, childPath.toString());
        fileSystem.mkdirs(pathTo.getParent());
        FSDataOutputStream outputStream = fileSystem.create(pathTo);
        try {
            IOUtils.copy(pathToFile.pathToFiles.get(childPath).getContent(), outputStream);
        } finally {
            outputStream.flush();
            outputStream.close();
        }
    }
}
From source file:com.collective.celos.ci.testing.fixtures.deploy.hive.HiveTableDeployer.java
License:Apache License
private Path createTempHdfsFileForInsertion(FixTable fixTable, TestRun testRun) throws Exception {
    Path pathToParent = new Path(testRun.getHdfsPrefix(), ".hive");
    Path pathTo = new Path(pathToParent, UUID.randomUUID().toString());
    FileSystem fileSystem = testRun.getCiContext().getFileSystem();
    fileSystem.mkdirs(pathTo.getParent());
    FSDataOutputStream outputStream = fileSystem.create(pathTo);
    CSVWriter writer = new CSVWriter(new OutputStreamWriter(outputStream), '\t', CSVWriter.NO_QUOTE_CHARACTER);
    for (FixTable.FixRow fixRow : fixTable.getRows()) {
        List<String> rowData = Lists.newArrayList();
        for (String colName : fixTable.getColumnNames()) {
            rowData.add(fixRow.getCells().get(colName));
        }
        String[] dataArray = rowData.toArray(new String[rowData.size()]);
        writer.writeNext(dataArray);
    }
    writer.close();
    fileSystem.setPermission(pathToParent, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
    fileSystem.setPermission(pathTo, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
    return pathTo;
}
From source file:com.dasasian.chok.util.FileUtil.java
License:Apache License
public static void unzipInDfs(FileSystem fileSystem, final Path source, final Path target) {
    try {
        FSDataInputStream dfsInputStream = fileSystem.open(source);
        fileSystem.mkdirs(target);
        final ZipInputStream zipInputStream = new ZipInputStream(dfsInputStream);
        ZipEntry entry;
        while ((entry = zipInputStream.getNextEntry()) != null) {
            final String entryPath = entry.getName();
            final int indexOf = entryPath.indexOf("/");
            final String cleanUpPath = entryPath.substring(indexOf + 1, entryPath.length());
            Path path = target;
            if (!cleanUpPath.equals("")) {
                path = new Path(target, cleanUpPath);
            }
            LOG.info("Extracting: " + entry + " to " + path);
            if (entry.isDirectory()) {
                fileSystem.mkdirs(path);
            } else {
                int count;
                final byte[] data = new byte[4096];
                FSDataOutputStream fsDataOutputStream = fileSystem.create(path);
                while ((count = zipInputStream.read(data, 0, 4096)) != -1) {
                    fsDataOutputStream.write(data, 0, count);
                }
                fsDataOutputStream.flush();
                fsDataOutputStream.close();
            }
        }
        zipInputStream.close();
    } catch (final Exception e) {
        LOG.error("can not open zip file", e);
        throw new RuntimeException("unable to expand upgrade files", e);
    }
}
From source file:com.datasalt.pangool.examples.gameoflife.GameOfLifeJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        failArguments("Wrong number of arguments");
        return -1;
    }
    String output = args[0];
    String input = GameOfLifeJob.class.getName() + "-prepared-input";
    delete(output);
    delete(input);
    final int gridSize = Integer.parseInt(args[1]);

    // Write the input of the job as a set of (min, max) intervals
    // Each number between (min, max) represents a possible initial configuration for Game of Life
    int parallelism = Integer.parseInt(args[2]);
    int maxCombinations = (int) Math.pow(2, gridSize * gridSize);
    int splitSize = maxCombinations / parallelism;
    FileSystem fS = FileSystem.get(conf);
    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fS.create(new Path(input))));
    for (int i = 0; i < parallelism; i++) {
        writer.write(((i * splitSize) + 1) + "\t" + ((i + 1) * splitSize) + "\n");
    }
    writer.close();

    // Optional parameters: maxX, maxY, #iterations
    final int maxX = conf.getInt("gol.max_x", 32);
    final int maxY = conf.getInt("gol.max_y", 32);
    final int iterations = conf.getInt("gol.iterations", 1000);
    Log.info("using parameters: maxX grid: " + maxX + " maxY grid: " + maxY + " max #iterations: " + iterations);

    // Define the intermediate schema: a pair of ints
    final Schema schema = new Schema("minMax", Fields.parse("min:int, max:int"));
    TupleMRBuilder job = new TupleMRBuilder(conf);
    job.addIntermediateSchema(schema);
    job.setGroupByFields("min", "max");
    job.setCustomPartitionFields("min");

    // Define the input and its associated mapper
    // The mapper will just emit the (min, max) pairs to the reduce stage
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class),
            new TupleMapper<LongWritable, Text>() {

                Tuple tuple = new Tuple(schema);

                @Override
                public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
                        throws IOException, InterruptedException {
                    String[] fields = value.toString().split("\t");
                    tuple.set("min", Integer.parseInt(fields[0]));
                    tuple.set("max", Integer.parseInt(fields[1]));
                    collector.write(tuple);
                }
            });

    // Define the reducer
    // The reducer will run as many games of life as (max - min) for each interval it receives
    // It will emit the inputs of GOL that converged together with the number of iterations
    // Note that inputs that produce grid overflow are ignored (but may have longer iteration convergence)
    job.setTupleReducer(new TupleReducer<Text, NullWritable>() {

        public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
                throws IOException, InterruptedException, TupleMRException {
            int min = (Integer) group.get("min"), max = (Integer) group.get("max");
            for (int i = min; i < max; i++) {
                try {
                    GameOfLife gameOfLife = new GameOfLife(gridSize, GameOfLife.longToBytes((long) i), maxX,
                            maxY, iterations);
                    while (true) {
                        gameOfLife.nextCycle();
                    }
                } catch (GameOfLifeException e) {
                    context.getHadoopContext().progress();
                    context.getHadoopContext().getCounter("stats", e.getCauseMessage() + "").increment(1);
                    if (e.getCauseMessage().equals(CauseMessage.CONVERGENCE_REACHED)) {
                        collector.write(new Text(
                                Arrays.toString(GameOfLife.longToBytes((long) i)) + "\t" + e.getIterations()),
                                NullWritable.get());
                    }
                }
            }
        }
    });

    job.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
            NullWritable.class);
    try {
        job.createJob().waitForCompletion(true);
    } finally {
        job.cleanUpInstanceFiles();
    }
    delete(input);
    return 0;
}
From source file:com.datasalt.pangool.utils.InstancesDistributor.java
License:Apache License
/**
 * Utility method for serializing an object and saving it in a way that later can be recovered
 * anywhere in the cluster.
 * <p>
 * The file where it has been serialized will be saved into a Hadoop Configuration property so that you can call
 * {@link InstancesDistributor#loadInstance(Configuration, Class, String, boolean)} to re-instantiate the serialized instance.
 *
 * @param obj The obj instance to serialize using Java serialization.
 * @param fileName The file name where the instance will be serialized.
 * @param conf The Hadoop Configuration.
 * @throws FileNotFoundException
 * @throws IOException
 * @throws URISyntaxException
 */
public static void distribute(Object obj, String fileName, Configuration conf)
        throws FileNotFoundException, IOException, URISyntaxException {
    FileSystem fS = FileSystem.get(conf);
    // Set the temporary folder for Pangool instances to the temporary folder of the user that is running the Job.
    // This folder will be used across the cluster for locating the instances.
    // The default value can be changed by a user-provided one.
    String tmpHdfsFolder = conf.get(HDFS_TMP_FOLDER_CONF, DEFAULT_HDFS_TMP_FOLDER_CONF_VALUE);
    Path toHdfs = new Path(tmpHdfsFolder, fileName);
    if (fS.exists(toHdfs)) {
        // Delete any previous instance file with the same name.
        fS.delete(toHdfs, false);
    }
    ObjectOutput out = new ObjectOutputStream(fS.create(toHdfs));
    out.writeObject(obj);
    out.close();
    DistributedCache.addCacheFile(toHdfs.toUri(), conf);
}