Example usage for org.apache.hadoop.fs FileSystem create

List of usage examples for org.apache.hadoop.fs FileSystem create

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem.create.

Prototype

public FSDataOutputStream create(Path f) throws IOException 

Document

Create an FSDataOutputStream at the indicated Path.
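
Before the project examples below, here is a minimal, self-contained sketch of the call itself; the path and the written bytes are made up for illustration, and create(Path) overwrites an existing file by default.

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemCreateSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical target path; any existing file at this path is overwritten.
        Path file = new Path("/tmp/filesystem-create-example.txt");
        try (FSDataOutputStream out = fs.create(file)) {
            out.write("hello, hdfs\n".getBytes(StandardCharsets.UTF_8));
        }
    }
}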

Usage

From source file: com.cloudera.sqoop.manager.NetezzaExportManualTest.java

License: Apache License

protected void createExportFile(ColumnGenerator... extraCols) throws IOException, SQLException {
    String ext = ".txt";

    Path tablePath = getTablePath();
    Path filePath = new Path(tablePath, "part0" + ext);

    Configuration conf = new Configuration();
    if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);
    fs.mkdirs(tablePath);
    OutputStream os = fs.create(filePath);

    BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));
    for (int i = 0; i < 3; i++) {
        String line = getRecordLine(i, extraCols);
        w.write(line);
        LOG.debug("Create Export file - Writing line : " + line);
    }
    w.close();
    os.close();
}

From source file: com.cloudera.sqoop.TestAvroExport.java

License: Apache License

/**
 * Create a data file that gets exported to the db.
 * @param fileNum the number of the file (for multi-file export)
 * @param numRecords how many records to write to the file.
 */
protected void createAvroFile(int fileNum, int numRecords, ColumnGenerator... extraCols) throws IOException {

    Path tablePath = getTablePath();
    Path filePath = new Path(tablePath, "part" + fileNum);

    Configuration conf = new Configuration();
    if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);
    fs.mkdirs(tablePath);
    OutputStream os = fs.create(filePath);

    Schema schema = buildAvroSchema(extraCols);
    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>();
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);
    dataFileWriter.create(schema, os);

    for (int i = 0; i < numRecords; i++) {
        GenericRecord record = new GenericData.Record(schema);
        record.put("id", i);
        record.put("msg", getMsgPrefix() + i);
        addExtraColumns(record, i, extraCols);
        dataFileWriter.append(record);
    }

    dataFileWriter.close();
    os.close();
}

From source file: com.cloudera.sqoop.TestExport.java

License: Apache License

/**
 * Create a data file that gets exported to the db.
 * @param fileNum the number of the file (for multi-file export)
 * @param numRecords how many records to write to the file.
 * @param gzip is true if the file should be gzipped.
 */
protected void createTextFile(int fileNum, int numRecords, boolean gzip, ColumnGenerator... extraCols)
        throws IOException {
    int startId = fileNum * numRecords;

    String ext = ".txt";
    if (gzip) {
        ext = ext + ".gz";
    }
    Path tablePath = getTablePath();
    Path filePath = new Path(tablePath, "part" + fileNum + ext);

    Configuration conf = new Configuration();
    if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);
    fs.mkdirs(tablePath);
    OutputStream os = fs.create(filePath);
    if (gzip) {
        CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
        CompressionCodec codec = ccf.getCodec(filePath);
        os = codec.createOutputStream(os);
    }
    BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));
    for (int i = 0; i < numRecords; i++) {
        w.write(getRecordLine(startId + i, extraCols));
    }
    w.close();
    os.close();

    if (gzip) {
        verifyCompressedFile(filePath, numRecords);
    }
}
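
The create-then-wrap pattern above can also be written with try-with-resources so the writer and the underlying stream are always closed; a minimal sketch, assuming the target path carries a suffix (such as .gz) that CompressionCodecFactory can resolve.

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CompressedCreateSketch {
    public static void writeLines(Configuration conf, Path filePath, Iterable<String> lines)
            throws IOException {
        FileSystem fs = FileSystem.get(conf);
        // Resolve a codec from the file suffix; null means the file is written uncompressed.
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(filePath);
        OutputStream os = fs.create(filePath);
        if (codec != null) {
            os = codec.createOutputStream(os);
        }
        try (BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8))) {
            for (String line : lines) {
                w.write(line);
                w.newLine();
            }
        }
    }
}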

From source file: com.cloudera.sqoop.TestExportUpdate.java

License: Apache License

/**
 * <p>Creates update files for the multi-key update test. The total number of
 * update records will be the number of files times aKeysPerFile
 * times 3. Column A values start with the specified <tt>startAtValue</tt>,
 * and for each value there will be three records corresponding to Column
 * B values [0-2].</p>
 * @param numFiles number of files to create
 * @param aKeysPerFile number of records sets with different column A values
 * @param startAtValue the starting value of column A
 * @param bKeyValues the list of values for the column B
 * @throws IOException
 */
private void createMultiKeyUpdateFiles(int numFiles, int aKeysPerFile, int startAtValue, int[] bKeyValues)
        throws IOException {
    Configuration conf = getConf();
    if (!isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);

    int aValue = startAtValue;
    for (int i = 0; i < numFiles; i++) {
        OutputStream os = fs.create(new Path(getTablePath(), "" + i + ".txt"));
        BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));

        for (int j = 0; j < aKeysPerFile; j++) {
            for (int k = 0; k < bKeyValues.length; k++) {
                w.write(getUpdateStringForMultiKeyRow(aValue, bKeyValues[k]));
            }
            aValue++;
        }

        w.close();
        os.close();
    }
}

From source file: com.cloudera.sqoop.TestExportUpdate.java

License: Apache License

/**
 * Create a set of files that will be used as the input to the update
 * process.
 * @param numFiles the number of files to generate
 * @param updatesPerFile the number of rows to create in each file
 * @param keyCol a value between 0 and 2 specifying whether 'a',
 * 'b', or 'c' ({@see populateDatabase()}) is the key column to keep
 * the same.
 * @param startOffsets is an optional list of row ids/values for a/c
 * which are the record ids at which the update files begin.
 * For instance, if numFiles=3, updatesPerFile=2, and keyCol=0 then
 * if startOffsets is {5, 10, 12}, files will be generated to update
 * rows with A=5,6; A=10,11; A=12,13.
 *
 * If startOffsets is empty or underspecified (given numFiles), then
 * subsequent files will start immediately after the previous file.
 */
private void createUpdateFiles(int numFiles, int updatesPerFile, int keyCol, int... startOffsets)
        throws IOException {
    Configuration conf = getConf();
    if (!isOnPhysicalCluster()) {
        conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    FileSystem fs = FileSystem.get(conf);

    int rowId = 0;
    for (int i = 0; i < numFiles; i++) {
        OutputStream os = fs.create(new Path(getTablePath(), "" + i + ".txt"));
        BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));

        if (null != startOffsets && startOffsets.length > i) {
            // If a start offset has been specified for this file, go there.
            // Otherwise, just carry over from the previous file iteration.
            rowId = startOffsets[i];
        }

        for (int j = 0; j < updatesPerFile; j++) {
            w.write(getUpdateStringForRow(keyCol, rowId++));
        }

        w.close();
        os.close();
    }
}

From source file: com.collective.celos.ci.testing.fixtures.deploy.HdfsInputDeployer.java

License: Apache License

@Override
public void deploy(TestRun testRun) throws Exception {
    FileSystem fileSystem = testRun.getCiContext().getFileSystem();

    CollectFilesAndPathsProcessor pathToFile = new CollectFilesAndPathsProcessor();
    TreeObjectProcessor.process(fixObjectCreator.create(testRun), pathToFile);

    Path pathPrefixed = new Path(Util.augmentHdfsPath(testRun.getHdfsPrefix(), path.toString()));
    for (java.nio.file.Path childPath : pathToFile.pathToFiles.keySet()) {
        Path pathTo = new Path(pathPrefixed, childPath.toString());
        fileSystem.mkdirs(pathTo.getParent());

        FSDataOutputStream outputStream = fileSystem.create(pathTo);
        try {
            IOUtils.copy(pathToFile.pathToFiles.get(childPath).getContent(), outputStream);
        } finally {
            outputStream.flush();
            outputStream.close();
        }

    }
}
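
A related, commonly used variant is copying an existing stream into the created file with Hadoop's own org.apache.hadoop.io.IOUtils instead of a manual copy loop; a hedged sketch, assuming the source is a local file path supplied by the caller.

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class CopyIntoCreatedFileSketch {
    public static void copyLocalFile(Configuration conf, String localFile, Path target) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        fs.mkdirs(target.getParent());
        try (InputStream in = Files.newInputStream(Paths.get(localFile));
                FSDataOutputStream out = fs.create(target)) {
            // Copy in 4 KB chunks; closing is handled by try-with-resources.
            IOUtils.copyBytes(in, out, 4096);
        }
    }
}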

From source file: com.collective.celos.ci.testing.fixtures.deploy.hive.HiveTableDeployer.java

License: Apache License

private Path createTempHdfsFileForInsertion(FixTable fixTable, TestRun testRun) throws Exception {

    Path pathToParent = new Path(testRun.getHdfsPrefix(), ".hive");
    Path pathTo = new Path(pathToParent, UUID.randomUUID().toString());
    FileSystem fileSystem = testRun.getCiContext().getFileSystem();
    fileSystem.mkdirs(pathTo.getParent());
    FSDataOutputStream outputStream = fileSystem.create(pathTo);

    CSVWriter writer = new CSVWriter(new OutputStreamWriter(outputStream), '\t', CSVWriter.NO_QUOTE_CHARACTER);

    for (FixTable.FixRow fixRow : fixTable.getRows()) {
        List<String> rowData = Lists.newArrayList();
        for (String colName : fixTable.getColumnNames()) {
            rowData.add(fixRow.getCells().get(colName));
        }
        String[] dataArray = rowData.toArray(new String[rowData.size()]);
        writer.writeNext(dataArray);
    }

    writer.close();

    fileSystem.setPermission(pathToParent, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
    fileSystem.setPermission(pathTo, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
    return pathTo;
}

From source file: com.dasasian.chok.util.FileUtil.java

License: Apache License

public static void unzipInDfs(FileSystem fileSystem, final Path source, final Path target) {
    try {
        FSDataInputStream dfsInputStream = fileSystem.open(source);
        fileSystem.mkdirs(target);
        final ZipInputStream zipInputStream = new ZipInputStream(dfsInputStream);
        ZipEntry entry;

        while ((entry = zipInputStream.getNextEntry()) != null) {
            final String entryPath = entry.getName();
            final int indexOf = entryPath.indexOf("/");
            final String cleanUpPath = entryPath.substring(indexOf + 1, entryPath.length());
            Path path = target;
            if (!cleanUpPath.equals("")) {
                path = new Path(target, cleanUpPath);
            }
            LOG.info("Extracting: " + entry + " to " + path);
            if (entry.isDirectory()) {
                fileSystem.mkdirs(path);
            } else {
                int count;
                final byte data[] = new byte[4096];
                FSDataOutputStream fsDataOutputStream = fileSystem.create(path);
                while ((count = zipInputStream.read(data, 0, 4096)) != -1) {
                    fsDataOutputStream.write(data, 0, count);
                }
                fsDataOutputStream.flush();
                fsDataOutputStream.close();
            }
        }
        zipInputStream.close();
    } catch (final Exception e) {
        LOG.error("can not open zip file", e);
        throw new RuntimeException("unable to expand upgrade files", e);
    }

}

From source file: com.datasalt.pangool.examples.gameoflife.GameOfLifeJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        failArguments("Wrong number of arguments");
        return -1;
    }
    String output = args[0];
    String input = GameOfLifeJob.class.getName() + "-prepared-input";
    delete(output);
    delete(input);

    final int gridSize = Integer.parseInt(args[1]);
    // Write the input of the job as a set of (min, max) intervals
    // Each number between (min, max) represents a possible initial configuration for Game of Life
    int parallelism = Integer.parseInt(args[2]);
    int maxCombinations = (int) Math.pow(2, gridSize * gridSize);
    int splitSize = maxCombinations / parallelism;
    FileSystem fS = FileSystem.get(conf);
    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fS.create(new Path(input))));
    for (int i = 0; i < parallelism; i++) {
        writer.write(((i * splitSize) + 1) + "\t" + ((i + 1) * splitSize) + "\n");
    }
    writer.close();

    // Optional parameters: maxX, maxY, #iterations
    final int maxX = conf.getInt("gol.max_x", 32);
    final int maxY = conf.getInt("gol.max_y", 32);
    final int iterations = conf.getInt("gol.iterations", 1000);
    Log.info(
            "using parameters: maxX grid: " + maxX + " maxY grid: " + maxY + " max #iterations: " + iterations);

    // Define the intermediate schema: a pair of ints
    final Schema schema = new Schema("minMax", Fields.parse("min:int, max:int"));

    TupleMRBuilder job = new TupleMRBuilder(conf);
    job.addIntermediateSchema(schema);
    job.setGroupByFields("min", "max");
    job.setCustomPartitionFields("min");
    // Define the input and its associated mapper
    // The mapper will just emit the (min, max) pairs to the reduce stage
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class),
            new TupleMapper<LongWritable, Text>() {

                Tuple tuple = new Tuple(schema);

                @Override
                public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
                        throws IOException, InterruptedException {
                    String[] fields = value.toString().split("\t");
                    tuple.set("min", Integer.parseInt(fields[0]));
                    tuple.set("max", Integer.parseInt(fields[1]));
                    collector.write(tuple);
                }
            });

    // Define the reducer
    // The reducer will run as many games of life as (max - min) for each interval it receives
    // It will emit the inputs of GOL that converged together with the number of iterations
    // Note that inputs that produce grid overflow are ignored (but may have longer iteration convergence)
    job.setTupleReducer(new TupleReducer<Text, NullWritable>() {

        public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
                throws IOException, InterruptedException, TupleMRException {

            int min = (Integer) group.get("min"), max = (Integer) group.get("max");
            for (int i = min; i < max; i++) {
                try {
                    GameOfLife gameOfLife = new GameOfLife(gridSize, GameOfLife.longToBytes((long) i), maxX,
                            maxY, iterations);
                    while (true) {
                        gameOfLife.nextCycle();
                    }
                } catch (GameOfLifeException e) {
                    context.getHadoopContext().progress();
                    context.getHadoopContext().getCounter("stats", e.getCauseMessage() + "").increment(1);
                    if (e.getCauseMessage().equals(CauseMessage.CONVERGENCE_REACHED)) {
                        collector.write(new Text(
                                Arrays.toString(GameOfLife.longToBytes((long) i)) + "\t" + e.getIterations()),
                                NullWritable.get());
                    }
                }
            }
        };
    });

    job.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
            NullWritable.class);
    try {
        job.createJob().waitForCompletion(true);
    } finally {
        job.cleanUpInstanceFiles();
    }
    delete(input);
    return 0;
}

From source file: com.datasalt.pangool.utils.InstancesDistributor.java

License: Apache License

/**
 * Utility method for serializing an object and saving it in a way that it can later be recovered
 * anywhere in the cluster.
 * <p>
 * The file where it has been serialized will be saved into a Hadoop Configuration property so that you can call
 * {@link InstancesDistributor#loadInstance(Configuration, Class, String, boolean)} to re-instantiate the serialized instance.
 * 
 * @param obj The obj instance to serialize using Java serialization.
 * @param fileName The file name where the instance will be serialized.
 * @param conf The Hadoop Configuration.
 * @throws FileNotFoundException
 * @throws IOException
 * @throws URISyntaxException
 */
public static void distribute(Object obj, String fileName, Configuration conf)
        throws FileNotFoundException, IOException, URISyntaxException {

    FileSystem fS = FileSystem.get(conf);
    // Set the temporary folder for Pangool instances to the temporary folder of the user running the job.
    // This folder will be used across the cluster for locating the instances.
    // The default value can be changed by a user-provided one.
    String tmpHdfsFolder = conf.get(HDFS_TMP_FOLDER_CONF, DEFAULT_HDFS_TMP_FOLDER_CONF_VALUE);
    Path toHdfs = new Path(tmpHdfsFolder, fileName);
    if (fS.exists(toHdfs)) { // Delete any previous copy before writing
        fS.delete(toHdfs, false);
    }

    ObjectOutput out = new ObjectOutputStream(fS.create(toHdfs));
    out.writeObject(obj);
    out.close();

    DistributedCache.addCacheFile(toHdfs.toUri(), conf);
}