Example usage for org.apache.hadoop.fs FileSystem create

Introduction

On this page you can find example usage for org.apache.hadoop.fs.FileSystem#create.

Prototype

public FSDataOutputStream create(Path f) throws IOException 

Document

Create an FSDataOutputStream at the indicated Path.
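For orientation, a minimal self-contained sketch of the call is shown below; the path, the payload, and the default Configuration are illustrative assumptions, not taken from the examples that follow. Note that this single-argument overload overwrites an existing file by default.

    // Minimal sketch (hypothetical path and payload); fs.create(p) overwrites by default
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path p = new Path("/tmp/example.txt");
    try (FSDataOutputStream out = fs.create(p)) {
        out.writeBytes("hello");
    }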

Usage

From source file:com.ibm.bi.dml.runtime.io.WriterMatrixMarket.java

License:Open Source License

@Override
public void writeEmptyMatrixToHDFS(String fname, long rlen, long clen, int brlen, int bclen)
        throws IOException, DMLRuntimeException {
    Path path = new Path(fname);
    FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());

    FSDataOutputStream writer = fs.create(path);
    writer.writeBytes("1 1 0");
    writer.close();
}
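The snippet above leaves the stream open if writeBytes throws. A hedged variant using try-with-resources, identical on the happy path:

    try (FSDataOutputStream writer = fs.create(path)) {
        writer.writeBytes("1 1 0");
    }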

From source file:com.ibm.bi.dml.runtime.matrix.DataGenMR.java

License:Open Source License

/**
 * <p>Starts a Rand MapReduce job which will produce one or more random objects.</p>
 *
 * @param inst MR job instruction
 * @param dataGenInstructions data generation instruction for each random object
 * @param instructionsInMapper instructions to execute in the mapper
 * @param aggInstructionsInReducer aggregation instructions executed in the combiner and reducer
 * @param otherInstructionsInReducer instructions executed in the reducer after aggregation
 * @param numReducers number of reducers
 * @param replication file replication factor
 * @param resultIndexes result indexes for each random object
 * @param dimsUnknownFilePrefix file prefix for outputs whose dimensions are unknown
 * @param outputs output file for each random object
 * @param outputInfos output information for each random object
 * @return matrix characteristics for each random object
 * @throws Exception if an error occurred in the MapReduce phase
 */

public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper,
        String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication,
        byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos)
        throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");

    //use block representation (true) rather than cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);

    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];

    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;

    for (int i = 0; i < dataGenInstructions.length; i++) {
        dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];

        MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
        MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
        DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;

        rlens[i] = genInst.getRows();
        clens[i] = genInst.getCols();
        brlens[i] = genInst.getRowsInBlock();
        bclens[i] = genInst.getColsInBlock();

        maxbrlen = Math.max(maxbrlen, brlens[i]);
        maxbclen = Math.max(maxbclen, bclens[i]);

        if (mrtype == MRINSTRUCTION_TYPE.Rand) {
            RandInstruction randInst = (RandInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + "tmp" + _seqRandInput.getNextID() + ".randinput";
            maxsparsity = Math.max(maxsparsity, randInst.getSparsity());

            FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
            PrintWriter pw = new PrintWriter(fsOut);

            //for obj reuse and preventing repeated buffer re-allocations
            StringBuilder sb = new StringBuilder();

            //seed generation
            Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
            long[] nnz = LibMatrixDatagen.computeNNZperBlock(rlens[i], clens[i], brlens[i], bclens[i],
                    randInst.getSparsity());
            int nnzIx = 0;
            for (long r = 0; r < rlens[i]; r += brlens[i]) {
                long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                for (long c = 0; c < clens[i]; c += bclens[i]) {
                    long curBlockColSize = Math.min(bclens[i], (clens[i] - c));

                    sb.append((r / brlens[i]) + 1);
                    sb.append(',');
                    sb.append((c / bclens[i]) + 1);
                    sb.append(',');
                    sb.append(curBlockRowSize);
                    sb.append(',');
                    sb.append(curBlockColSize);
                    sb.append(',');
                    sb.append(nnz[nnzIx++]);
                    sb.append(',');
                    sb.append(bigrand.nextLong());
                    pw.println(sb.toString());
                    sb.setLength(0);
                    numblocks++;
                }
            }
            pw.close();
            fsOut.close();
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
            SeqInstruction seqInst = (SeqInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
            maxsparsity = 1.0; //always dense

            double from = seqInst.fromValue;
            double to = seqInst.toValue;
            double incr = seqInst.incrValue;

            // Correctness checks on (from, to, incr)
            boolean neg = (from > to);
            if (incr == 0)
                throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");

            if (neg != (incr < 0))
                throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");

            // Compute the number of rows in the sequence
            long numrows = 1 + (long) Math.floor((to - from) / incr);
            if (rlens[i] > 0) {
                if (numrows != rlens[i])
                    throw new DMLRuntimeException(
                            "Unexpected error while processing sequence instruction. Expected number of rows does not match given number: "
                                    + rlens[i] + " != " + numrows);
            } else {
                rlens[i] = numrows;
            }

            if (clens[i] > 0 && clens[i] != 1)
                throw new DMLRuntimeException(
                        "Unexpected error while processing sequence instruction. Number of columns (" + clens[i]
                                + ") must be equal to 1.");
            else
                clens[i] = 1;

            FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
            PrintWriter pw = new PrintWriter(fsOut);
            StringBuilder sb = new StringBuilder();

            double temp = from;
            double block_from, block_to;
            for (long r = 0; r < rlens[i]; r += brlens[i]) {
                long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));

                // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval) 
                long bid_i = ((r / brlens[i]) + 1);
                long bid_j = 1;
                block_from = temp;
                block_to = temp + (curBlockRowSize - 1) * incr;
                temp = block_to + incr; // next block starts from here

                sb.append(bid_i);
                sb.append(',');
                sb.append(bid_j);
                sb.append(',');
                /*
                // Need not include block size while generating seq()
                sb.append(curBlockRowSize);
                sb.append(',');
                sb.append(1);
                sb.append(',');*/
                sb.append(block_from);
                sb.append(',');
                sb.append(block_to);
                sb.append(',');
                sb.append(incr);

                pw.println(sb.toString());
                //System.out.println("MapTask " + r + ": " + sb.toString());
                sb.setLength(0);
                numblocks++;
            }

            pw.close();
            fsOut.close();
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else {
            throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
        }
    }
    dataGenInsStr = dataGenInsStr.substring(1);//remove the first ","
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

        //set up the input files and their format information
        MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
                ConvertTarget.BLOCK);

        //set up the dimensions of input matrices
        MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
        MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

        //set up the rand Instructions
        MRJobConfiguration.setRandInstructions(job, dataGenInsStr);

        //set up unary instructions that will perform in the mapper
        MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

        //set up the aggregate instructions that will happen in the combiner and reducer
        MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

        //set up the instructions that will happen in the reducer, after the aggregation instructions
        MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

        //set up the replication factor for the results
        job.setInt("dfs.replication", replication);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //determine degree of parallelism (nmappers: 1<=n<=capacity)
        //TODO use maxsparsity whenever we have a way of generating sparse rand data
        int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
        long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
        //correct the max number of mappers on YARN clusters
        if (InfrastructureAnalyzer.isYarnEnabled())
            capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
        int nmappers = Math
                .max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
        job.setNumMapTasks(nmappers);

        //set up what matrices are needed to pass from the mapper to reducer
        HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
                dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer,
                resultIndexes);

        MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
                dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null,
                otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
        stats = ret.stats;

        //set up the number of reducers
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

        // print the complete MRJob instruction
        if (LOG.isTraceEnabled())
            inst.printCompleteMRJobInstruction(stats);

        // Update resultDimsUnknown based on computed "stats"
        byte[] resultDimsUnknown = new byte[resultIndexes.length];
        for (int i = 0; i < resultIndexes.length; i++) {
            if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
                resultDimsUnknown[i] = (byte) 1;
            } else {
                resultDimsUnknown[i] = (byte) 0;
            }
        }

        boolean mayContainCtable = instructionsInMapper.contains("ctabletransform")
                || instructionsInMapper.contains("groupedagg");

        //set up the multiple output files, and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos,
                true, mayContainCtable);

        // configure mapper and the mapper output key value pairs
        job.setMapperClass(DataGenMapper.class);
        if (numReducers == 0) {
            job.setMapOutputKeyClass(Writable.class);
            job.setMapOutputValueClass(Writable.class);
        } else {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        }

        //set up combiner
        if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
            job.setCombinerClass(GMRCombiner.class);

        //configure reducer
        job.setReducerClass(GMRReducer.class);
        //job.setReducerClass(PassThroughReducer.class);

        // By default, the job executes in "cluster" mode.
        // Determine if we can optimize and run it in "local" mode.
        MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
        }

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        runjob = JobClient.runJob(job);

        /* Process different counters */

        Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
        for (int i = 0; i < resultIndexes.length; i++) {
            // number of non-zeros
            stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        }

        String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
        stats = MapReduceTool.processDimsFiles(dir, stats);
        MapReduceTool.deleteFileIfExistOnHDFS(dir);

    } finally {
        for (String input : inputs)
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
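For a sense of scale of the mapper heuristic above (8 * maxbrlen * maxbclen * numblocks / dfsblocksize, clamped to [1, capacity]), a back-of-the-envelope with illustrative numbers: maxbrlen = maxbclen = 1000, numblocks = 64, and a 128 MB HDFS block size give 8 * 1000 * 1000 * 64 / (128 * 1024 * 1024) ≈ 3.8, which truncates to 3 map tasks.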

From source file:com.ibm.bi.dml.test.utils.TestUtils.java

License:Open Source License

/**
 * <p>
 * Generates a test matrix with the specified parameters and writes it to a
 * file using the text format.
 * </p>
 * <p>
 * Set seed to -1 to use the current time as seed.
 * </p>
 * 
 * @param file
 *            output file
 * @param rows
 *            number of rows
 * @param cols
 *            number of columns
 * @param min
 *            minimum value
 * @param max
 *            maximum value
 * @param sparsity
 *            sparsity
 * @param seed
 *            seed
 */
public static void generateTestMatrixToFile(String file, int rows, int cols, double min, double max,
        double sparsity, long seed) {
    try {
        FileSystem fs = FileSystem.get(conf);
        Path outFile = new Path(file);
        DataOutputStream out = fs.create(outFile);
        PrintWriter pw = new PrintWriter(out);
        Random random;
        if (seed == -1)
            random = TestUtils.random;
        else
            random = new Random(seed);

        for (int i = 1; i <= rows; i++) {
            for (int j = 1; j <= cols; j++) {
                if (random.nextDouble() > sparsity)
                    continue;
                double value = (random.nextDouble() * (max - min) + min);
                if (value != 0)
                    pw.println(i + " " + j + " " + value);
            }
        }
        pw.close();
        out.close();
    } catch (IOException e) {
        fail("unable to write test matrix: " + e.getMessage());
    }
}
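A hypothetical call, with file name and parameters chosen only for illustration:

    // Writes roughly half of the 1000 cells of a 100x10 matrix in "i j value"
    // text-cell format, skipping zeros; seed 7 makes the output reproducible.
    generateTestMatrixToFile("/tmp/A.textcell", 100, 10, 0.0, 1.0, 0.5, 7);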

From source file:com.ibm.bi.dml.test.utils.TestUtils.java

License:Open Source License

/**
 * <p>
 * Creates an empty file.
 * </p>
 * 
 * @param filename
 *            filename
 */
public static void createFile(String filename) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    fs.create(new Path(filename)).close();
}
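Note the close() call: without it the stream returned by fs.create stays open and, depending on the file system, the empty file may not be finalized. An alternative sketch, assuming overwrite semantics are not wanted, is fs.createNewFile(new Path(filename)), which returns false instead of truncating when the file already exists.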

From source file:com.ibm.crail.hdfs.tools.HdfsIOBenchmark.java

License:Apache License

public void writeSequentialHeap() throws Exception {
    System.out.println("writing sequential file in heap mode " + path);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    FSDataOutputStream outstream = fs.create(path);
    byte[] buf = new byte[size];
    double sumbytes = 0;
    double ops = 0;
    System.out.println("write size " + size);
    System.out.println("operations " + loop);

    long start = System.currentTimeMillis();
    while (ops < loop) {
        outstream.write(buf, 0, buf.length);
        sumbytes = sumbytes + buf.length;
        ops = ops + 1.0;
    }
    outstream.flush();
    long end = System.currentTimeMillis();
    double executionTime = ((double) (end - start)) / 1000.0;
    double throughput = 0.0;
    double latency = 0.0;
    double sumbits = sumbytes * 8.0;
    if (executionTime > 0) {
        throughput = sumbits / executionTime / 1024.0 / 1024.0;
        latency = 1000000.0 * executionTime / ops;
    }

    System.out.println("execution time " + executionTime);
    System.out.println("ops " + ops);
    System.out.println("sumbytes " + sumbytes);
    System.out.println("throughput " + throughput);
    System.out.println("latency " + latency);
    System.out.println("closing stream");
    outstream.close();
    fs.close();
}
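One caveat on the timing above: flush() on an HDFS-backed FSDataOutputStream does not guarantee the bytes have reached the datanodes. A hedged sketch of the stronger calls, both available on FSDataOutputStream via the Syncable interface:

    outstream.hflush(); // push buffered data into the datanode pipeline, visible to new readers
    outstream.hsync();  // additionally ask the datanodes to sync to disk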

From source file:com.ibm.crail.hdfs.tools.HdfsIOBenchmark.java

License:Apache License

void createFile() throws Exception {
    System.out.println("create file async hdfs, path " + path + ", size " + size + ", loop " + loop);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    int repfactor = 4;
    for (int k = 0; k < repfactor; k++) {
        LinkedBlockingQueue<Path> pathQueue = new LinkedBlockingQueue<Path>();
        fs.mkdirs(path);
        for (int i = 0; i < loop * size; i++) {
            String name = "" + i;
            Path f = new Path(path, name);
            pathQueue.add(f);
        }

        long start = System.currentTimeMillis();
        for (int i = 0; i < size; i++) {
            //single operation == loop
            for (int j = 0; j < loop; j++) {
                Path path = pathQueue.poll();
                fs.create(path).close();
            }
        }
        long end = System.currentTimeMillis();
        double executionTime = ((double) (end - start));
        double latency = executionTime * 1000.0 / ((double) size);
        System.out.println("execution time [ms] " + executionTime);
        System.out.println("latency [us] " + latency);

        if (k < repfactor - 1) {
            fs.delete(path, true);
            Thread.sleep(2000);
        }
    }
    fs.close();
}

From source file:com.ibm.crail.hdfs.tools.HdfsIOBenchmark.java

License:Apache License

void keyGet() throws Exception {
    System.out.println("key get, path " + path + ", size " + size + ", loop " + loop);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    Path[] paths = new Path[loop];
    for (int i = 0; i < loop; i++) {
        String child = "" + i;
        paths[i] = new Path(path, child);
        System.out.println("path " + paths[i]);
    }

    byte[] outBuf = new byte[size];
    for (Path p : paths) {
        FSDataOutputStream outputStream = fs.create(p);
        outputStream.write(outBuf);
        outputStream.close();
    }

    long start = System.currentTimeMillis();
    ByteBuffer inBuf = ByteBuffer.allocateDirect(size);
    for (int i = 0; i < loop; i++) {
        Path p = paths[i];
        FSDataInputStream inputStream = fs.open(p);
        inBuf.clear();
        while (inBuf.remaining() > 0) {
            inputStream.read(inBuf);
        }
        inputStream.close();
    }
    long end = System.currentTimeMillis();
    double executionTime = ((double) (end - start));
    double latency = executionTime * 1000.0 / ((double) loop);
    System.out.println("execution time [ms] " + executionTime);
    System.out.println("latency [us] " + latency);

    fs.close();
}
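Note that FSDataInputStream.read(ByteBuffer) only works when the underlying stream implements ByteBufferReadable; other file systems throw UnsupportedOperationException. A fallback sketch using a plain byte[] (variable names are illustrative):

    byte[] tmp = new byte[size];
    FSDataInputStream in = fs.open(p);
    org.apache.hadoop.io.IOUtils.readFully(in, tmp, 0, tmp.length);
    in.close();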

From source file:com.ibm.stocator.fs.swift2d.systemtests.StreamingSwiftTest.java

License:Open Source License

@Test
public void accessObjectWithSpaceTest() throws Exception {
    FileSystem fs = new ObjectStoreFileSystem();
    Configuration conf = new Configuration();
    String uriString = conf.get("fs.swift2d.test.uri");
    Assume.assumeNotNull(uriString);
    // append a numeric suffix to the object name on each iteration
    String scheme = "swift2d";
    String objectName = "/a/testObject.txt";
    URI publicContainerURI = new URI(uriString + objectName);
    // initialize file system
    fs.initialize(publicContainerURI, conf);
    FileStatus objectFS = null;
    Path f = null;
    try {
        FSDataOutputStream fsDataOutputStream = null;
        String currObjName = null;
        for (int i = 0; i < 5; i++) {
            currObjName = objectName + String.valueOf(i);
            // create timer
            createObjectTimer(90000.0, currObjName);
            publicContainerURI = new URI(scheme + "://" + getHost(URI.create(uriString)) + "/" + currObjName);
            f = new Path(publicContainerURI.toString());
            fsDataOutputStream = fs.create(f);
            String line = null;
            while (!objectExpired) {
                // generates input
                byte[] bytes = new byte[0];
                line = "\"2017-7-15 3:6:43\"," + String.valueOf(Math.random()) + ",6,18" + "\n";
                ByteBuffer linesBB = ByteBuffer.wrap(line.getBytes());
                bytes = new byte[linesBB.limit()];
                linesBB.get(bytes);

                // writes to output
                fsDataOutputStream.write(bytes);

                // simulate delays in input
                Thread.sleep(50);
            }
            fsDataOutputStream.close();
            objectExpired = false;
        }
    } catch (Exception e) {
        e.printStackTrace();
        Assert.assertNotNull("Unable to access public object.", objectFS);
    } finally {
        fs.delete(f, true);
    }
}

From source file:com.idvp.platform.hdfs.HDFSDataStream.java

License:Apache License

protected void doOpen(Configuration conf, Path dstPath, FileSystem hdfs) throws IOException {

    boolean appending = false;
    if (conf.getBoolean("hdfs.append.support", false) && hdfs.isFile(dstPath)) {
        outStream = hdfs.append(dstPath);
        appending = true;
    } else {
        outStream = hdfs.create(dstPath);
    }

    serializer = new BodyTextEventSerializer.Builder().build(outStream);
    if (appending && !serializer.supportsReopen()) {
        outStream.close();
        serializer = null;
        throw new IOException("serializer (" + "TEXT" + ") does not support append");
    }

    // must call superclass to check for replication issues
    registerCurrentStream(outStream, hdfs, dstPath);

    if (appending) {
        serializer.afterReopen();
    } else {
        serializer.afterCreate();
    }
}
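The append branch above keys on the "hdfs.append.support" flag; the key name is taken from the code, while setting it as below is only an assumption about the surrounding deployment:

    Configuration conf = new Configuration();
    conf.setBoolean("hdfs.append.support", true); // route doOpen into hdfs.append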

From source file:com.inmobi.conduit.CompressedFileReaderTest.java

License:Apache License

private void uncompress(String fileName) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs;
    fs = FileSystem.getLocal(conf);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(new Path(fileName));
    if (codec == null) {
        System.out.println("cant find codec");
        System.exit(1);// w w  w .  java 2  s  .  c o m
    }
    LOG.info("Using compression codec [" + codec.toString() + "]");
    CompressionInputStream is = codec.createInputStream(fs.open(new Path(fileName)));
    OutputStream out = null;
    try {
        String outputURI = CompressionCodecFactory.removeSuffix(fileName, codec.getDefaultExtension());
        out = fs.create(new Path(outputURI + "-uncompressed"));
        org.apache.hadoop.io.IOUtils.copyBytes(is, out, conf);
    } finally {
        org.apache.hadoop.io.IOUtils.closeStream(out);
        IOUtils.closeStream(is);

    }
}
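The reverse direction, compressing rather than decompressing, uses the same codec API; a hedged sketch in which the paths and the choice of Gzip are illustrative assumptions:

    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodecByClassName("org.apache.hadoop.io.compress.GzipCodec");
    try (InputStream in = fs.open(new Path("/tmp/in.txt"));
         OutputStream out = codec.createOutputStream(fs.create(new Path("/tmp/in.txt.gz")))) {
        // 4 KB copy buffer; 'false' leaves closing to try-with-resources
        org.apache.hadoop.io.IOUtils.copyBytes(in, out, 4096, false);
    }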