Example usage for org.apache.hadoop.conf.Configuration.set

List of usage examples for org.apache.hadoop.conf.Configuration.set

Introduction

On this page you can find example usage of org.apache.hadoop.conf.Configuration.set.

Prototype

public void set(String name, String value) 

Document

Set the value of the name property.
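
As a quick illustration (not taken from the source files below), a value stored with Configuration.set can be read back with Configuration.get; the property name and value here are made up for the example:

import org.apache.hadoop.conf.Configuration;

public class ConfigurationSetExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // store a value under a made-up property name
        conf.set("example.owner", "analytics-team");

        // read it back; the second argument is the default used when the property is absent
        System.out.println(conf.get("example.owner", "unknown")); // prints analytics-team
    }
}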

Usage

From source file:cascading.tap.hadoop.Hfs.java

License:Open Source License

/**
 * Based on the configuration, handles and sets {@link CombineFileInputFormat} as the input
 * format.
 */
private void handleCombineFileInputFormat(Configuration conf) {
    // if combining files, override the configuration to use CombineFileInputFormat
    if (!getUseCombinedInput(conf))
        return;

    // get the prescribed individual input format from the underlying scheme so it can be used by CombinedInputFormat
    String individualInputFormat = conf.get("mapred.input.format.class");

    if (individualInputFormat == null)
        throw new TapException("input format is missing from the underlying scheme");

    if (individualInputFormat.equals(CombinedInputFormat.class.getName())
            && conf.get(CombineFileRecordReaderWrapper.INDIVIDUAL_INPUT_FORMAT) == null)
        throw new TapException(
                "the input format class is already the combined input format but the underlying input format is missing");

    // if safe mode is on (default) throw an exception if the InputFormat is not a FileInputFormat, otherwise log a
    // warning and don't use the CombineFileInputFormat
    boolean safeMode = getCombinedInputSafeMode(conf);

    if (!FileInputFormat.class.isAssignableFrom(conf.getClass("mapred.input.format.class", null))) {
        if (safeMode)
            throw new TapException(
                    "input format must be of type org.apache.hadoop.mapred.FileInputFormat, got: "
                            + individualInputFormat);
        else
            LOG.warn(
                    "not combining input splits with CombineFileInputFormat, {} is not of type org.apache.hadoop.mapred.FileInputFormat.",
                    individualInputFormat);
    } else {
        // set the underlying individual input format
        conf.set(CombineFileRecordReaderWrapper.INDIVIDUAL_INPUT_FORMAT, individualInputFormat);

        // override the input format class
        conf.setClass("mapred.input.format.class", CombinedInputFormat.class, InputFormat.class);
    }
}

From source file:cascading.tap.hadoop.io.StreamedFileSystem.java

License:Open Source License

public static void setMD5SumFor(Configuration conf, String path, String md5Hex) {
    if (md5Hex == null || md5Hex.length() == 0)
        return;

    conf.set(path + ".md5", md5Hex);
}
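
Because setMD5SumFor stores the checksum under the key path + ".md5", it can be read back with Configuration.get. A minimal companion sketch (this getter is illustrative and not quoted from the source file):

public static String getMD5SumFor(Configuration conf, String path) {
    // returns null if no checksum was recorded for this path
    return conf.get(path + ".md5");
}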

From source file:cascading.tap.hadoop.util.Hadoop18TapUtil.java

License:Open Source License

/**
 * should only be called if not in a Flow
 *
 * @param conf
 * @throws IOException
 */
public static void setupJob(Configuration conf) throws IOException {
    Path outputPath = FileOutputFormat.getOutputPath(asJobConfInstance(conf));

    if (outputPath == null)
        return;

    if (getFSSafe(conf, outputPath) == null)
        return;

    String taskID = conf.get("mapred.task.id", conf.get("mapreduce.task.id"));

    if (taskID == null) // need to stuff a fake id
    {
        String mapper = conf.getBoolean("mapred.task.is.map", conf.getBoolean("mapreduce.task.is.map", true))
                ? "m"
                : "r";
        String value = String.format("attempt_%012d_0000_%s_000000_0",
                (int) Math.rint(System.currentTimeMillis()), mapper);
        conf.set("mapred.task.id", value);
        conf.set("mapreduce.task.id", value);
    }

    makeTempPath(conf);

    if (writeDirectlyToWorkingPath(conf, outputPath)) {
        LOG.info("writing directly to output path: {}", outputPath);
        setWorkOutputPath(conf, outputPath);
        return;
    }

    // "mapred.work.output.dir"
    Path taskOutputPath = getTaskOutputPath(conf);
    setWorkOutputPath(conf, taskOutputPath);
}

From source file:cascading.tap.hadoop.util.Hadoop18TapUtil.java

License:Open Source License

static void setWorkOutputPath(Configuration conf, Path outputDir) {
    outputDir = new Path(asJobConfInstance(conf).getWorkingDirectory(), outputDir);
    conf.set("mapred.work.output.dir", outputDir.toString());
}

From source file:cascading.tuple.hadoop.HadoopSerializationPlatformTest.java

License:Open Source License

@Test
public void testInputOutputSerialization() throws IOException {
    long time = System.currentTimeMillis();

    Configuration jobConf = new Configuration();

    jobConf.set("io.serializations",
            TestSerialization.class.getName() + "," + WritableSerialization.class.getName()); // disable/replace WritableSerialization class
    jobConf.set("cascading.serialization.tokens",
            "1000=" + BooleanWritable.class.getName() + ",10001=" + Text.class.getName()); // not using Text, just testing parsing

    TupleSerialization tupleSerialization = new TupleSerialization(jobConf);

    File file = new File(getOutputPath("serialization"));

    file.mkdirs();
    file = new File(file, "/test.bytes");

    TupleOutputStream output = new HadoopTupleOutputStream(new FileOutputStream(file, false),
            tupleSerialization.getElementWriter());

    for (int i = 0; i < 501; i++) // 501 is arbitrary
    {
        String aString = "string number " + i;
        double random = Math.random();

        output.writeTuple(new Tuple(i, aString, random, new TestText(aString),
                new Tuple("inner tuple", new BytesWritable("some string".getBytes())),
                new BytesWritable(Integer.toString(i).getBytes("UTF-8")), new BooleanWritable(false)));
    }

    output.close();

    assertEquals("wrong size", 89967L, file.length()); // just makes sure the file size doesnt change from expected

    TupleInputStream input = new HadoopTupleInputStream(new FileInputStream(file),
            tupleSerialization.getElementReader());

    int k = -1;
    for (int i = 0; i < 501; i++) {
        Tuple tuple = input.readTuple();
        int value = tuple.getInteger(0);
        assertTrue("wrong diff", value - k == 1);
        assertTrue("wrong type", tuple.getObject(3) instanceof TestText);
        assertTrue("wrong type", tuple.getObject(4) instanceof Tuple);
        assertTrue("wrong type", tuple.getObject(5) instanceof BytesWritable);

        byte[] bytes = ((BytesWritable) tuple.getObject(5)).getBytes();
        String string = new String(bytes, 0, bytes.length > 1 ? bytes.length - 1 : bytes.length, "UTF-8");
        assertEquals("wrong value", Integer.parseInt(string), i);
        assertTrue("wrong type", tuple.getObject(6) instanceof BooleanWritable);
        k = value;
    }

    input.close();

    System.out.println("time = " + (System.currentTimeMillis() - time));
}

From source file:cascading.tuple.hadoop.SpillableTupleHadoopTest.java

License:Open Source License

private void performListTest(int size, int threshold, CompressionCodec codec, int spills) {
    Configuration jobConf = new Configuration();

    jobConf.set("io.serializations",
            TestSerialization.class.getName() + "," + WritableSerialization.class.getName()); // disable/replace WritableSerialization class
    jobConf.set("cascading.serialization.tokens",
            "1000=" + BooleanWritable.class.getName() + ",10001=" + Text.class.getName()); // not using Text, just testing parsing

    HadoopSpillableTupleList list = new HadoopSpillableTupleList(threshold, codec, jobConf);

    for (int i = 0; i < size; i++) {
        String aString = "string number " + i;
        double random = Math.random();

        list.add(new Tuple(i, aString, random, new Text(aString), new TestText(aString),
                new Tuple("inner tuple", new BytesWritable(aString.getBytes()))));
    }

    assertEquals("not equal: list.size();", size, list.size());

    assertEquals("not equal: list.getNumFiles()", spills, list.spillCount());

    int i = -1;
    int count = 0;
    for (Tuple tuple : list) {
        int value = tuple.getInteger(0);
        assertTrue("wrong diff", value - i == 1);
        assertEquals("wrong value", "string number " + count, tuple.getObject(3).toString());
        assertEquals("wrong value", "string number " + count, tuple.getObject(4).toString());
        assertTrue("wrong type", tuple.getObject(5) instanceof Tuple);

        BytesWritable bytesWritable = (BytesWritable) ((Tuple) tuple.getObject(5)).getObject(1);
        byte[] bytes = bytesWritable.getBytes();
        String actual = new String(bytes, 0, bytesWritable.getLength());

        assertEquals("wrong value", "string number " + count, actual);

        i = value;
        count++;
    }

    assertEquals("not equal: list.size();", size, count);

    Iterator<Tuple> iterator = list.iterator();

    assertEquals("not equal: iterator.next().get(1)", "string number 0", iterator.next().getObject(1));
    assertEquals("not equal: iterator.next().get(1)", "string number 1", iterator.next().getObject(1));
}

From source file:cascading.tuple.hadoop.SpillableTupleHadoopTest.java

License:Open Source License

@Test
public void testSpillMapCompressed() {
    long time = System.currentTimeMillis();

    Configuration jobConf = new Configuration();

    jobConf.set(SpillableProps.SPILL_CODECS, "org.apache.hadoop.io.compress.GzipCodec");

    performMapTest(5, 5, 100, 20, jobConf);
    performMapTest(5, 50, 100, 20, jobConf);
    performMapTest(50, 5, 200, 20, jobConf);
    performMapTest(500, 50, 7000, 20, jobConf);

    System.out.println("time = " + (System.currentTimeMillis() - time));
}

From source file:cascading.tuple.hadoop.SpillableTupleHadoopTest.java

License:Open Source License

private void performMapTest(int numKeys, int listSize, int mapThreshold, int listThreshold,
        Configuration jobConf) {
    jobConf.set("io.serializations",
            TestSerialization.class.getName() + "," + WritableSerialization.class.getName()); // disable/replace WritableSerialization class
    jobConf.set("cascading.serialization.tokens",
            "1000=" + BooleanWritable.class.getName() + ",10001=" + Text.class.getName()); // not using Text, just testing parsing

    HadoopFlowProcess flowProcess = new HadoopFlowProcess(jobConf);
    HadoopSpillableTupleMap map = new HadoopSpillableTupleMap(SpillableProps.defaultMapInitialCapacity,
            SpillableProps.defaultMapLoadFactor, mapThreshold, listThreshold, flowProcess);

    Set<Integer> keySet = new HashSet<Integer>();
    Random gen = new Random(1);

    for (int i = 0; i < listSize * numKeys; i++) {
        String aString = "string number " + i;
        double random = Math.random();

        double keys = numKeys / 3.0;
        int key = (int) (gen.nextDouble() * keys + gen.nextDouble() * keys + gen.nextDouble() * keys);

        Tuple tuple = new Tuple(i, aString, random, new Text(aString), new TestText(aString),
                new Tuple("inner tuple", new BytesWritable(aString.getBytes())));

        map.get(new Tuple(key)).add(tuple);

        keySet.add(key);
    }

    // the list test above verifies the contents are being serialized, the Map is just a container of lists.
    assertEquals("not equal: map.size();", keySet.size(), map.size());
}

From source file:cgl.hadoop.apps.runner.DataAnalysis.java

License:Open Source License

/**
 * Launches the MapReduce computation.
 * This method first removes any previous working directories and creates a new one,
 * then copies the data (file names) to the new directory and launches the
 * MapReduce (map-only, though) computation.
 * @param numReduceTasks - Number of reduce tasks (0 for this map-only job).
 * @param programDir - The directory where the Cap3 program is.
 * @param execName - Name of the executable.
 * @param workingDir - The working directory for the computation.
 * @param databaseArchive - The database archive to distribute.
 * @param databaseName - Name of the database.
 * @param dataDir - Directory where the data is located.
 * @param outputDir - Output directory to place the output.
 * @param cmdArgs - These are the command line arguments to the Cap3 program.
 * @throws Exception - Throws any exception that occurs in this program.
 */
void launch(int numReduceTasks, String programDir, String execName, String workingDir, String databaseArchive,
        String databaseName, String dataDir, String outputDir, String cmdArgs) throws Exception {

    Configuration conf = new Configuration();
    Job job = new Job(conf, execName);

    // First get the file system handler, delete any previous files, add the
    // files and write the data to it, then pass its name as a parameter to
    // job
    Path hdMainDir = new Path(outputDir);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(hdMainDir, true);

    Path hdOutDir = new Path(hdMainDir, "out");

    // Starting the data analysis.
    Configuration jc = job.getConfiguration();

    jc.set(WORKING_DIR, workingDir);
    jc.set(EXECUTABLE, execName);
    jc.set(PROGRAM_DIR, programDir); // this the name of the executable archive
    jc.set(DB_ARCHIVE, databaseArchive);
    jc.set(DB_NAME, databaseName);
    jc.set(PARAMETERS, cmdArgs);
    jc.set(OUTPUT_DIR, outputDir);

    // using distributed cache
    // flush it
    //DistributedCache.releaseCache(new URI(programDir), jc);
    //DistributedCache.releaseCache(new URI(databaseArchive), jc);
    //DistributedCache.purgeCache(jc);
    // reput the data into cache
    long startTime = System.currentTimeMillis();
    //DistributedCache.addCacheArchive(new URI(databaseArchive), jc);
    DistributedCache.addCacheArchive(new URI(programDir), jc);
    System.out.println(
            "Add Distributed Cache in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    FileInputFormat.setInputPaths(job, dataDir);
    FileOutputFormat.setOutputPath(job, hdOutDir);

    job.setJarByClass(DataAnalysis.class);
    job.setMapperClass(RunnerMap.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(DataFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReduceTasks);

    startTime = System.currentTimeMillis();

    int exitStatus = job.waitForCompletion(true) ? 0 : 1;
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    //clean the cache

    System.exit(exitStatus);
}

From source file:chapter7.src.InputDriver.java

License:Apache License

public static void runJob(Path input, Path output, String vectorClassName, Configuration config)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = config;
    conf.set("vector.implementation.class.name", vectorClassName);
    Job job = new Job(conf, "Input Driver running over input: " + input);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(InputMapper.class);
    job.setNumReduceTasks(0);
    job.setJarByClass(InputDriver.class);

    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.waitForCompletion(true);
}