Example usage for org.apache.hadoop.conf Configuration setStrings

List of usage examples for org.apache.hadoop.conf Configuration setStrings

Introduction

On this page you can find example usage for org.apache.hadoop.conf Configuration setStrings.

Prototype

public void setStrings(String name, String... values) 

Source Link

Document

Set the array of string values for the name property as comma delimited values.
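
Before the project-specific examples below, here is a minimal, self-contained sketch of the call paired with its read-side counterpart, Configuration.getStrings. The property name example.hosts and its values are made up purely for illustration.

import org.apache.hadoop.conf.Configuration;

public class SetStringsExample {

    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store several values under one property name; setStrings joins
        // them into a single comma-delimited value internally.
        // "example.hosts" is a hypothetical property name for this sketch.
        conf.setStrings("example.hosts", "node1.example.com", "node2.example.com", "node3.example.com");

        // get() returns the raw comma-delimited string...
        System.out.println(conf.get("example.hosts"));

        // ...while getStrings() splits it back into an array.
        for (String host : conf.getStrings("example.hosts")) {
            System.out.println(host);
        }
    }
}

Because the values are stored as a single comma-delimited string, individual values that themselves contain commas will not round-trip cleanly through getStrings.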

Usage

From source file: root.benchmark.LyrlBenchmarkJob.java

License: Apache License

/**
 * {@inheritDoc}
 */
@Override
public int run(String[] args) throws Exception {

    constructParameterList();

    if (parseArguments(args) == null) {
        return -1;
    }

    initializeConfigurationParameters();

    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(vectorDirectory);

    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path vectorDirectoryPath = new Path(vectorDirectory);
    if (!inputFS.exists(vectorDirectoryPath)) {
        throw new Exception("Vector directory not found.");
    }
    Path wordDictDirectoryPath = new Path(wordDictDirectory);
    if (!inputFS.exists(wordDictDirectoryPath)) {
        throw new Exception("Word Dictionary directory not found.");
    }
    Path workingDirectoryPath = new Path(workingDirectory);
    if (workingFS.exists(workingDirectoryPath)) {
        throw new Exception("Working Directory already exists.");
    }
    if (!workingFS.mkdirs(workingDirectoryPath)) {
        throw new Exception("Failed to create Working Directory.");
    }

    String[] vectorizationArgs = { "-i", vectorDirectory, "-wd", wordDictDirectory, "-w", workingDirectory,
            "-l", numLevels, "-dm", "org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure", "-smd",
            diagScale };
    System.out.println();
    ToolRunner.run(conf, new VectorizationJob(), vectorizationArgs);

    long starttime, stoptime, deltatime;
    starttime = System.currentTimeMillis();

    String[] hapArgs = { "-i", workingDirectory + apInputDirectory, "-o", workingDirectory + apOutputDirectory,
            "-l", numLevels, "-w", workingDirectory, "-iter", numIterations, "-lambda", lambda };
    ToolRunner.run(conf, new HierarchicalAffinityPropagationJob(), hapArgs);

    stoptime = System.currentTimeMillis();
    deltatime = stoptime - starttime;

    conf.setStrings(CONF_PREFIX + "Dataset", "Lyrl2004");
    conf.setStrings(CONF_PREFIX + "iterations", numIterations);
    conf.setStrings(CONF_PREFIX + "levels", numLevels);
    conf.setStrings(CONF_PREFIX + "lambda", lambda);
    conf.setStrings(CONF_PREFIX + "SMatSeed", diagScale);
    writeTimestamp(conf, workingDirectory + timeStamp, deltatime);

    String[] logJobArgs = { "-s", vectorDirectory, "-c", workingDirectory + apOutputDirectory, "-t",
            workingDirectory + timeStamp, "-f",
            workingDirectory + dir_dataVectors + dir_dataMetaData + "/vectorName2docId", };
    ToolRunner.run(conf, new LogJob(), logJobArgs);

    //      workingFS = FileSystem.get(workingURI, conf);
    //      workingFS.mkdirs(new Path(workingDirectory+dir_hiveDataRoot));
    //
    //      String [] hiveArgs={
    //            "-sqldb"    , url_mysql,
    //            "-sqlusr"   , usr_mysql,
    //            "-sqlpsw"   , psw_mysql,
    //            "-dataroot" , workingDirectory+dir_hiveDataRoot,
    //            "-ap-out"   , workingDirectory+apOutputDirectory,
    //            "-word-dict", workingDirectory+wordDictionaryDirectory,
    //            "-metadata" , workingDirectory+dir_dataVectors+dir_dataMetaData,
    //            "-tf-vec"   , workingDirectory+dir_dataVectors+dir_dataTFVectorFile,
    //            "-nwords"   , topNWords
    //      };
    //      ToolRunner.run(conf,new OutputJob(), hiveArgs);

    return 0;

}

From source file: root.benchmark.ReutersBenchmarkJob.java

License: Apache License

/**
 * {@inheritDoc}
 */
@Override
public int run(String[] args) throws Exception {

    constructParameterList();

    if (parseArguments(args) == null) {
        return -1;
    }

    initializeConfigurationParameters();

    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);

    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path inputDirectoryPath = new Path(inputDirectory);
    if (!inputFS.exists(inputDirectoryPath)) {
        throw new Exception("Input directory not found.");
    }
    Path workingDirectoryPath = new Path(workingDirectory);
    if (workingFS.exists(workingDirectoryPath)) {
        throw new Exception("Working Directory already exists.");
    }
    if (!workingFS.mkdirs(workingDirectoryPath)) {
        throw new Exception("Failed to create Working Directory.");
    }

    String[] vectorizationArgs = { "-i", inputDirectory, "-o", workingDirectory + vectorizationOutputDirectory,
            "-x", exclusionThreshold, "-mdf", minimumDocumentFrequency, };
    System.out.println();
    ToolRunner.run(conf, new ReutersVectorizationJob(), vectorizationArgs);

    long starttime, stoptime, deltatime;
    starttime = System.currentTimeMillis();

    String[] canopyArgs = { "-i", workingDirectory + vectorizationOutputDirectory + "/vectorFiles/tf-vectors",
            "-o", workingDirectory + canopyOutputDirectory, "-dm", distanceMetric, "-t1", threshold1, "-t2",
            threshold2 };
    ToolRunner.run(conf, new CanopyJob(), canopyArgs);

    String[] kmeansArgs = { "-i", workingDirectory + vectorizationOutputDirectory + "/vectorFiles/tf-vectors",
            "-o", workingDirectory + kmeansOutputDirectory, "-c",
            workingDirectory + canopyOutputDirectory + "/clusters-0-final", "-dm", distanceMetric, "-cd",
            convergenceDelta, "-mIter", numIterations, "-k", "3" };
    ToolRunner.run(conf, new KMeansJob(), kmeansArgs);

    Path input = new Path(workingDirectory + kmeansOutputDirectory);
    Path output = new Path(workingDirectory + outputDirectory);
    boolean sequential = false;

    ClusterOutputPostProcessorDriver.run(input, output, sequential);

    stoptime = System.currentTimeMillis();
    deltatime = stoptime - starttime;

    conf.setStrings(CONF_PREFIX + "Dataset", "Image");
    conf.setStrings(CONF_PREFIX + "Iterations", numIterations);
    conf.setStrings(CONF_PREFIX + "Levels", convergenceDelta);
    writeTimestamp(conf, workingDirectory + timeStamp, deltatime);

    return 0;

}

From source file: root.input.images.ImagesJob.java

License: Apache License

/**
 * {@inheritDoc}
 */
@Override
public int run(String[] args) throws Exception {

    constructParameterList();

    if (parseArguments(args) == null) {
        return -1;
    }

    initializeConfigurationParameters();

    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);

    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path vectorDirectoryPath = new Path(inputDirectory);
    if (!inputFS.exists(vectorDirectoryPath)) {
        throw new Exception("Points directory not found.");
    }
    Path workingDirectoryPath = new Path(workingDirectory);
    if (workingFS.exists(workingDirectoryPath)) {
        throw new Exception("Working Directory already exists.");
    }
    if (!workingFS.mkdirs(workingDirectoryPath)) {
        throw new Exception("Failed to create Working Directory.");
    }

    String[] vectorizationArgs = { "-i", inputDirectory, "-w", workingDirectory, "-smd", diagScale, "-l",
            numLevels, "-dm", "org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure" };
    System.out.println();
    ToolRunner.run(conf, new VectorizationJob(), vectorizationArgs);

    long starttime, stoptime, deltatime;
    starttime = System.currentTimeMillis();

    String[] hapArgs = { "-i", workingDirectory + apInputDirectory, "-o", workingDirectory + apOutputDirectory,
            "-l", numLevels, "-w", workingDirectory, "-iter", numIterations, "-lambda", lambda, "-n", N };
    ToolRunner.run(conf, new HierarchicalAffinityPropagationJob(), hapArgs);

    stoptime = System.currentTimeMillis();
    deltatime = stoptime - starttime;

    conf.setStrings(CONF_PREFIX + "Dataset", "Image");
    conf.setStrings(CONF_PREFIX + "Iterations", numIterations);
    conf.setStrings(CONF_PREFIX + "Levels", numLevels);
    conf.setStrings(CONF_PREFIX + "Lambda", lambda);
    conf.setStrings(CONF_PREFIX + "SMatSeed", diagScale);
    writeTimestamp(conf, workingDirectory + timeStamp, deltatime);

    String[] logJobArgs = { "-s", inputDirectory, "-c", workingDirectory + apOutputDirectory, "-t",
            workingDirectory + timeStamp, "-f",
            workingDirectory + dir_dataVectors + dir_dataMetaData + "/vectorName2docId", };
    ToolRunner.run(conf, new LogJob(), logJobArgs);

    //      workingFS = FileSystem.get(workingURI, conf);
    //      workingFS.mkdirs(new Path(workingDirectory+dir_hiveDataRoot));
    //
    //      String [] hiveArgs={
    //            "-sqldb"    , url_mysql,
    //            "-sqlusr"   , usr_mysql,
    //            "-sqlpsw"   , psw_mysql,
    //            "-dataroot" , workingDirectory+dir_hiveDataRoot,
    //            "-ap-out"   , workingDirectory+apOutputDirectory,
    //            "-word-dict", workingDirectory+wordDictionaryDirectory,
    //            "-metadata" , workingDirectory+dir_dataVectors+dir_dataMetaData,
    //            "-tf-vec"   , workingDirectory+dir_dataVectors+dir_dataTFVectorFile,
    //            "-nwords"   , topNWords
    //      };
    //      ToolRunner.run(conf,new OutputJob(), hiveArgs);

    return 0;

}

From source file: root.input.lyrl2004.LyrlJob.java

License: Apache License

/**
 * {@inheritDoc}
 */
@Override
public int run(String[] args) throws Exception {

    constructParameterList();

    if (parseArguments(args) == null) {
        return -1;
    }

    initializeConfigurationParameters();

    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(vectorDirectory);

    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path vectorDirectoryPath = new Path(vectorDirectory);
    if (!inputFS.exists(vectorDirectoryPath)) {
        throw new Exception("Vector directory not found.");
    }
    Path wordDictDirectoryPath = new Path(wordDictDirectory);
    if (!inputFS.exists(wordDictDirectoryPath)) {
        throw new Exception("Word Dictionary directory not found.");
    }
    Path workingDirectoryPath = new Path(workingDirectory);
    if (workingFS.exists(workingDirectoryPath)) {
        throw new Exception("Working Directory already exists.");
    }
    if (!workingFS.mkdirs(workingDirectoryPath)) {
        throw new Exception("Failed to create Working Directory.");
    }

    String[] vectorizationArgs = { "-i", vectorDirectory, "-wd", wordDictDirectory, "-w", workingDirectory,
            "-l", numLevels, "-dm", "org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure", "-smd",
            diagScale };
    System.out.println();
    ToolRunner.run(conf, new VectorizationJob(), vectorizationArgs);

    long starttime, stoptime, deltatime;
    starttime = System.currentTimeMillis();

    String[] hapArgs = { "-i", workingDirectory + apInputDirectory, "-o", workingDirectory + apOutputDirectory,
            "-l", numLevels, "-w", workingDirectory, "-iter", numIterations, "-lambda", lambda, "-n", N };
    ToolRunner.run(conf, new HierarchicalAffinityPropagationJob(), hapArgs);

    stoptime = System.currentTimeMillis();
    deltatime = stoptime - starttime;

    conf.setStrings(CONF_PREFIX + "Dataset", "Lyrl2004");
    conf.setStrings(CONF_PREFIX + "iterations", numIterations);
    conf.setStrings(CONF_PREFIX + "levels", numLevels);
    conf.setStrings(CONF_PREFIX + "lambda", lambda);
    conf.setStrings(CONF_PREFIX + "SMatSeed", diagScale);
    writeTimestamp(conf, workingDirectory + timeStamp, deltatime);

    String[] logJobArgs = { "-s", vectorDirectory, "-c", workingDirectory + apOutputDirectory, "-t",
            workingDirectory + timeStamp, "-f",
            workingDirectory + dir_dataVectors + dir_dataMetaData + "/vectorName2docId", };
    ToolRunner.run(conf, new LogJob(), logJobArgs);

    //      workingFS = FileSystem.get(workingURI, conf);
    //      workingFS.mkdirs(new Path(workingDirectory+dir_hiveDataRoot));
    //
    //      String [] hiveArgs={
    //            "-sqldb"    , url_mysql,
    //            "-sqlusr"   , usr_mysql,
    //            "-sqlpsw"   , psw_mysql,
    //            "-dataroot" , workingDirectory+dir_hiveDataRoot,
    //            "-ap-out"   , workingDirectory+apOutputDirectory,
    //            "-word-dict", workingDirectory+wordDictionaryDirectory,
    //            "-metadata" , workingDirectory+dir_dataVectors+dir_dataMetaData,
    //            "-tf-vec"   , workingDirectory+dir_dataVectors+dir_dataTFVectorFile,
    //            "-nwords"   , topNWords
    //      };
    //      ToolRunner.run(conf,new OutputJob(), hiveArgs);

    return 0;

}

From source file: root.input.reuters21578.ReutersJob.java

License: Apache License

/**
 * {@inheritDoc}
 */
@Override
public int run(String[] args) throws Exception {

    constructParameterList();

    if (parseArguments(args) == null) {
        return -1;
    }

    initializeConfigurationParameters();

    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);

    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path inputDirectoryPath = new Path(inputDirectory);
    if (!inputFS.exists(inputDirectoryPath)) {
        throw new Exception("Input directory not found.");
    }
    Path workingDirectoryPath = new Path(workingDirectory);
    if (workingFS.exists(workingDirectoryPath)) {
        throw new Exception("Working Directory already exists.");
    }
    if (!workingFS.mkdirs(workingDirectoryPath)) {
        throw new Exception("Failed to create Working Directory.");
    }

    //      workingFS.mkdirs( new Path( workingDirectory + vectorizationOutputDirectory ) );
    //      workingFS.mkdirs( new Path( workingDirectory + vectorizationOutputDirectory + apInputDirectory ));
    //
    //      String[] similarityMatrixConverterArgs = { 
    //            "-i",   inputDirectory + "/Ssym.csv",
    //            "-o",   workingDirectory + vectorizationOutputDirectory + apInputDirectory,
    //            "-l",    numLevels
    //      };
    //      ToolRunner.run( conf, new SimilarityMatrixConverter(), similarityMatrixConverterArgs );

    String[] vectorizationArgs = { "-i", inputDirectory, "-o", workingDirectory + vectorizationOutputDirectory,
            "-x", exclusionThreshold, "-l", numLevels, "-mdf", minimumDocumentFrequency, "-dm", distanceMetric,
            "-tf_tfidf", tf_tfidf, "-smd", diagScale };
    System.out.println();
    ToolRunner.run(conf, new VectorizationJob(), vectorizationArgs);

    long starttime, stoptime, deltatime;
    starttime = System.currentTimeMillis();

    String[] hapArgs = { "-i", workingDirectory + vectorizationOutputDirectory + apInputDirectory, "-o",
            workingDirectory + apOutputDirectory, "-l", numLevels, "-w", workingDirectory, "-iter",
            numIterations, "-lambda", lambda, "-n", N };
    ToolRunner.run(conf, new HierarchicalAffinityPropagationJob(), hapArgs);

    stoptime = System.currentTimeMillis();
    deltatime = stoptime - starttime;

    conf.setStrings(CONF_PREFIX + "Dataset", "Reuters");
    conf.setStrings(CONF_PREFIX + "iterations", numIterations);
    conf.setStrings(CONF_PREFIX + "levels", numLevels);
    conf.setStrings(CONF_PREFIX + "lambda", lambda);
    conf.setStrings(CONF_PREFIX + "SMatSeed", diagScale);
    writeTimestamp(conf, workingDirectory + timeStamp, deltatime);

    String[] logJobArgs = { "-s", inputDirectory, "-c", workingDirectory + apOutputDirectory, "-t",
            workingDirectory + timeStamp, "-f",
            workingDirectory + vectorizationOutputDirectory + dir_dataMetaData + "/fileName2docId" };
    ToolRunner.run(conf, new LogJob(), logJobArgs);
    //
    //      // Move files into position for HiveJob
    //      workingFS = FileSystem.get(workingURI, conf);
    //      workingFS.mkdirs(new Path(workingDirectory + dir_hiveDataRoot));
    //      Path currWordDict = new Path(workingDirectory
    //            + vectorizationOutputDirectory
    //            + "/vectorFiles/dictionary.file-0");
    //      Path nextWordDict = new Path(workingDirectory
    //            + vectorizationOutputDirectory + dir_dataWordDict
    //            + "/dictionary.file-0");
    //      workingFS.mkdirs(new Path(workingDirectory
    //            + vectorizationOutputDirectory + dir_dataWordDict));
    //      workingFS.rename(currWordDict, nextWordDict);
    //
    //      String[] hiveArgs = {
    //            "--sqldb", url_mysql,
    //            "--sqlusr", usr_mysql,
    //            "--sqlpsw", psw_mysql,
    //            "--dataroot", workingDirectory + dir_hiveDataRoot,
    //            "--ap-out", workingDirectory + apOutputDirectory,
    //            "--word-dict", workingDirectory + vectorizationOutputDirectory
    //            + dir_dataWordDict,
    //            "--metadata", workingDirectory + vectorizationOutputDirectory
    //            + dir_dataMetaData,
    //            "--tf-vec", workingDirectory + vectorizationOutputDirectory
    //            + dir_dataTFVectors + tf_tfidf + "-vectors",
    //            "--nwords", topNWords
    //      };
    //      ToolRunner.run(conf, new OutputJob(), hiveArgs);

    return 0;

}

From source file: simsql.runtime.JoinOp.java

License: Apache License

public void setConfigurations(Configuration conf, RuntimeParameter params) {

    // first, send out the type of join
    conf.setStrings("simsql.joinType", new String[] { joinType.toString().toLowerCase() });

    // set the self-join value
    conf.setBoolean("simsql.isSelfJoin", isSelfJoin);

    // see if we have a Cartesian product
    conf.setBoolean("simsql.joinCartesian", isCartesian);

    // see if we have a pure, map-only merge join
    conf.setBoolean("simsql.isMergeJoin", mergeJoin);

    // if we are able to avoid a sort of the left or of the right, then we need some extra configs that will allow the merge
    if (mergeJoin || sortOnlyRight) {
        conf.setInt("simsql.sortedFileTypeCode", getDB().getTypeCode(getDB().getTableName(leftFile)));
        conf.set("simsql.sortedFileName", leftFile);
        conf.setInt("simsql.sortedFileNumAtts", getDB().getNumAtts(getDB().getTableName(leftFile)));
    } else if (sortOnlyLeft) {
        conf.setInt("simsql.sortedFileTypeCode", getDB().getTypeCode(getDB().getTableName(rightFile)));
        conf.set("simsql.sortedFileName", rightFile);
        conf.setInt("simsql.sortedFileNumAtts", getDB().getNumAtts(getDB().getTableName(rightFile)));
    }

    // find out which relation is the largest.
    long leftSize = getPathsActualSize(getValue("leftInput.inFiles").getStringList().toArray(new String[0]));
    long rightSize = getPathsActualSize(getValue("rightInput.inFiles").getStringList().toArray(new String[0]));
    long smallerSize = 0;
    long largerSize = 0;
    int smallerTypeCode = -1;
    int largerTypeCode = -1;

    if (leftSize < rightSize) {
        smallerSize = leftSize;
        largerSize = rightSize;
        smallerTypeCode = leftTypeCode;
        largerTypeCode = rightTypeCode;
    } else {
        smallerSize = rightSize;
        largerSize = leftSize;
        smallerTypeCode = rightTypeCode;
        largerTypeCode = leftTypeCode;
    }

    // and pass the typecode and size of those relations.
    conf.setInt("simsql.smallerRelation.typeCode", smallerTypeCode);
    conf.setInt("simsql.largerRelation.typeCode", largerTypeCode);
    conf.setLong("simsql.smallerRelation.size", smallerSize);
    conf.setLong("simsql.largerRelation.size", largerSize);
}

From source file: simsql.runtime.MRLoader.java

License: Apache License

public long run(String inputPath, String outputPath, short typeCode, Relation r, int sortAtt) {

    // make a directory for the relation
    Configuration conf = new Configuration();
    FileSystem dfs = null;

    try {
        dfs = FileSystem.get(conf);
    } catch (Exception e) {
        throw new RuntimeException("Cannot access HDFS!", e);
    }

    try {
        // if it exists, destroy it.
        Path path = new Path(outputPath);
        if (dfs.exists(path)) {
            dfs.delete(path, true);
        }
    } catch (Exception e) {
        throw new RuntimeException("Could not create the file to bulk load to!", e);
    }

    // find a file name 
    String tempPath = null;
    if (inputPath.startsWith("hdfs:")) {
        tempPath = inputPath.replace("hdfs:", "");
    } else {
        tempPath = "/tempDataFile_" + r.getName();
        try {
            dfs.delete(new Path(tempPath), true);
        } catch (Exception e) {
            // ignore this.
        }

        // upload the text file
        try {
            dfs.copyFromLocalFile(false, true, new Path(inputPath), new Path(tempPath));
            dfs.deleteOnExit(new Path(tempPath));
        } catch (Exception e) {
            throw new RuntimeException("Failed to upload text file " + inputPath + " to HDFS!", e);
        }
    }

    // set up the new job's parameters.
    conf.setBoolean("mapred.compress.map.output", true);
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());

    conf.set("io.serializations",
            "simsql.runtime.RecordSerialization,simsql.runtime.RecordKeySerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt("simsql.loader.numAtts", r.getAttributes().size());
    conf.setInt("simsql.loader.typeCode", (int) typeCode);
    conf.setInt("simsql.loader.sortAtt", sortAtt);

    String[] myStrings = new String[r.getAttributes().size()];
    int j = 0;
    for (simsql.compiler.Attribute a : r.getAttributes()) {
        myStrings[j++] = a.getPhysicalRealization().getClass().getName();
    }

    conf.setStrings("simsql.loader.types", myStrings);

    // create a job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create bulk loading job!", e);
    }

    // set the split size (number of mappers)
    long fSize = 0;
    if (inputPath.startsWith("hdfs")) {
        fSize = RelOp.getPathsTotalSize(new String[] { tempPath });
    } else {
        fSize = new File(inputPath).length();
    }

    FileInputFormat.setMinInputSplitSize(job, fSize / (long) numTasks);
    FileInputFormat.setMaxInputSplitSize(job, fSize / (long) numTasks);

    // and the number of reducers
    job.setNumReduceTasks(numTasks);

    // the mapper/reducer/jar
    job.setMapperClass(MRLoaderMapper.class);
    job.setReducerClass(MRLoaderReducer.class);
    job.setJarByClass(MRLoader.class);

    // I/O settings.
    job.setOutputFormatClass(RecordOutputFormat.class);

    job.setMapOutputKeyClass(RecordKey.class);
    job.setMapOutputValueClass(RecordWrapper.class);
    job.setOutputKeyClass(Nothing.class);
    job.setOutputValueClass(Record.class);
    try {
        FileInputFormat.setInputPaths(job, new Path(tempPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
    } catch (Exception e) {
        throw new RuntimeException("Could not set job inputs/outputs", e);
    }
    job.setGroupingComparatorClass(RecordKeyGroupingComparator.class);
    job.setPartitionerClass(RecordPartitioner.class);
    job.setSortComparatorClass(RecordKeySortComparator.class);

    job.setJobName("MRLoader: " + inputPath + " ==> " + outputPath);

    // run it
    Counters counters;
    try {
        job.waitForCompletion(true);
        counters = job.getCounters();
    } catch (Exception e) {
        throw new RuntimeException("Could not set up bulk loader job!", e);
    }

    // now, delete all the empty part files
    try {

        // get a filesystem
        FileSystem ddfs = FileSystem.get(conf);
        Path outPath = new Path(outputPath);
        if (ddfs.exists(outPath) && ddfs.isDirectory(outPath)) {
            FileStatus fstatus[] = ddfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (ddfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    ddfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) { // this isn't disastrous 
    }

    // get the counter for the output of the mapper.
    Counter bytesCounter = counters.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);
    return bytesCounter.getValue();
}

From source file: simsql.runtime.RelOp.java

License: Apache License

public boolean run(RuntimeParameter params, boolean verbose) {

    ExampleRuntimeParameter pp = (ExampleRuntimeParameter) params;

    // build the jar.
    String jarFile = buildJarFile(params);

    // Get the default configuration object
    Configuration conf = new Configuration();

    // set quiet mode on/off
    conf.setQuietMode(!verbose);

    /***
    conf.setBoolean("mapred.task.profile", true);
    conf.set("mapred.task.profile.params", "-agentlib:hprof=cpu=samples," +
        "heap=sites,depth=8,force=n,thread=y,verbose=n,file=%s");
    ***/

    // tell it how to serialize and deserialize records and recordkeys
    conf.set("io.serializations", getSerializations());
    conf.setBoolean("mapred.compress.map.output", true);

    int ioSortMB = conf.getInt("io.sort.mb", 256);
    conf.set("mapred.map.child.java.opts", "-Xmx" + (getMemPerMapper(params) + ioSortMB) + "m -Xms"
            + (getMemPerMapper(params))
            + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.set("mapred.reduce.child.java.opts", "-Xmx" + (getMemPerReducer(params) + ioSortMB) + "m -Xms"
            + (getMemPerMapper(params))
            + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.setInt("simsql.input.numSplits", pp.getNumCPUs());
    conf.setInt("mapred.job.reuse.jvm.num.tasks", 1);
    // conf.setBoolean ("mapred.map.tasks.speculative.execution", false);
    // conf.setBoolean ("mapred.reduce.tasks.speculative.execution", false);

    // tell it to use the jar that we just created
    conf.set("mapred.jar", jarFile);

    // conf.set("tmpjars", "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-core.jar");

    conf.setBoolean("mapred.output.compress", true);
    conf.setStrings("mapred.output.compression.type", new String[] { "RECORD" });

    // use snappy for the intermediate stuff
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());

    // do some additional operator-specific configurations
    setConfigurations(conf, params);

    // collect statistics for final relations always
    conf.setBoolean("simsql.collectStats", isFinal || collectStats);

    // figure out what file to map
    String[] inDirs = myInputNetwork.getPipelinedInputFiles();
    inDirs = excludeAnyWhoWillNotBeMapped(inDirs);
    String inSingleString = inDirs[0];
    conf.set("simsql.fileToMap", inSingleString);
    for (int i = 1; i < inDirs.length; i++) {
        inSingleString += "," + inDirs[i];
    }

    // create and name the job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create a new job!", e);
    }

    job.setJobName(getJobName());

    // set the map-reduce input and output types
    job.setMapOutputKeyClass(getMapOutputKeyClass());
    job.setMapOutputValueClass(getMapOutputValueClass());
    job.setOutputKeyClass(getOutputKeyClass());
    job.setOutputValueClass(getOutputValueClass());

    int numReducers = getNumReducers(params);

    job.setMapperClass(getMapperClass());
    job.setReducerClass(getReducerClass());

    // set the number of reducers
    job.setNumReduceTasks(numReducers);

    // set the input and the output formats... these extend FileInputFormat and FileOutputFormat
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(getOutputFormatClass());

    // set the input and output paths
    try {
        System.out.println("input file: " + inSingleString);
        FileInputFormat.setInputPaths(job, inSingleString);
        FileInputFormat.setInputPathFilter(job, TableFileFilter.class);
        FileOutputFormat.setOutputPath(job, new Path(getOutput()));
    } catch (Exception e) {
        throw new RuntimeException("Unable to set up the input/output path for the job.", e);
    }

    // set the split size
    FileInputFormat.setMinInputSplitSize(job, getSplitSize(params));
    FileInputFormat.setMaxInputSplitSize(job, getSplitSize(params));

    // set the various sorting/grouping/mapping classes
    job.setGroupingComparatorClass(getGroupingComparatorClass());
    job.setPartitionerClass(getPartitionerClass());
    job.setSortComparatorClass(getSortComparatorClass());

    // and now, submit the job and wait for things to finish
    int exitCode;
    try {
        exitCode = job.waitForCompletion(verbose) ? 0 : 1;

        // get the output bytes counter.
        Counters c = job.getCounters();
        Counter mx = c.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);

        // and use them to set the size of the output relation.
        if (myDB != null) {
            myDB.setTableSize(myDB.getTableName(getOutput()), mx.getValue());
            myDB.setNumAtts(myDB.getTableName(getOutput()), getOutputAttNames().length);
        }

    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Unable to run the job", e);
    }

    // now, delete all the empty part files
    try {

        // get a filesystem
        FileSystem dfs = FileSystem.get(conf);
        Path outPath = new Path(getOutput());
        if (dfs.exists(outPath) && dfs.isDirectory(outPath)) {
            FileStatus fstatus[] = dfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (dfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    dfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) { // this isn't disastrous 
    }
    return (exitCode == 0);
}

From source file: simsql.runtime.VGWrapperOp.java

License: Apache License

public void setConfigurations(Configuration conf, RuntimeParameter params) {

    ExampleRuntimeParameter p = (ExampleRuntimeParameter) params;

    /**
    conf.setBoolean("mapred.task.profile", true);
    conf.set("mapred.task.profile.params", "-agentlib:hprof=cpu=samples," +
        "heap=sites,depth=10,force=n,thread=y,verbose=n,file=%s");
    **/
    // set the number of iterations
    conf.setInt("simsql.numIterations", p.getNumIterations());

    // set the file name of the VG function.
    conf.setStrings("simsql.functionFile", new String[] { "/simsql/functions/" + getVGFunctions()[0] + ".so" });

    // set the buffer size for data exchange -- 2GB is the maximum because it is a long.
    int bSize = 0;
    if (((p.getMemoryPerCPUInMB() / 2) * 1024L * 1024L) > (long) Integer.MAX_VALUE) {
        bSize = Integer.MAX_VALUE;
    } else {
        bSize = (p.getMemoryPerCPUInMB() / 2) * 1024 * 1024;
    }

    conf.setInt("simsql.dataBufferSize", bSize);

    // set the cross product relations.
    if (crossFiles.size() > 0) {
        conf.setStrings("simsql.crossFiles", crossFiles.toArray(new String[0]));
        conf.setStrings("simsql.crossTypeCodes", crossTypeCodes.toArray(new String[0]));
        conf.setStrings("simsql.crossAttCounts", crossAttCounts.toArray(new String[0]));
    }
    // set the sorted input relations.
    if (sortedInnerFiles.size() > 0) {
        conf.setStrings("simsql.sortedFiles", sortedInnerFiles.toArray(new String[0]));
        conf.setStrings("simsql.sortedTypeCodes", sortedTypeCodes.toArray(new String[0]));
        conf.setStrings("simsql.sortedAttCounts", sortedAttCounts.toArray(new String[0]));
    }

    conf.setBoolean("simsql.runVGWrapperReducer", runVGWrapperReducer);
}

From source file: sourcefiles.BuildPersonalizedPageRankRecords.java

License: Apache License

/**
 * Runs this tool.
 */
@Override
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("sources").create(SOURCES));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int n = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    String sources = cmdline.getOptionValue(SOURCES);

    LOG.info("Tool name: " + BuildPersonalizedPageRankRecords.class.getSimpleName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - numNodes: " + n);
    LOG.info(" - sources: " + sources);

    Configuration conf = getConf();
    conf.setInt(NODE_CNT_FIELD, n);
    conf.setStrings(NODE_SRC_FIELD, sources);
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    Job job = Job.getInstance(conf);
    job.setJobName(BuildPersonalizedPageRankRecords.class.getSimpleName() + ":" + inputPath);
    job.setJarByClass(BuildPersonalizedPageRankRecords.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    // job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PageRankNodeEnhanced.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PageRankNodeEnhanced.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}