Usage examples for the org.apache.commons.cli2.builder.DefaultOptionBuilder no-argument constructor:
public DefaultOptionBuilder()
From source file:org.apache.mahout.benchmark.VectorBenchmarks.java
public static void main(String[] args) throws IOException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option vectorSizeOpt = obuilder.withLongName("vectorSize").withRequired(false) .withArgument(abuilder.withName("vs").withDefault(1000000).create()) .withDescription("Cardinality of the vector. Default: 1000000").withShortName("vs").create(); Option numNonZeroOpt = obuilder.withLongName("numNonZero").withRequired(false) .withArgument(abuilder.withName("nz").withDefault(1000).create()) .withDescription("Size of the vector. Default: 1000").withShortName("nz").create(); Option numVectorsOpt = obuilder.withLongName("numVectors").withRequired(false) .withArgument(abuilder.withName("nv").withDefault(25).create()) .withDescription("Number of Vectors to create. Default: 25").withShortName("nv").create(); Option numClustersOpt = obuilder.withLongName("numClusters").withRequired(false) .withArgument(abuilder.withName("nc").withDefault(0).create()) .withDescription(//from w ww . java 2 s . c om "Number of clusters to create. Set to non zero to run cluster benchmark. Default: 0") .withShortName("nc").create(); Option numOpsOpt = obuilder.withLongName("numOps").withRequired(false) .withArgument(abuilder.withName("numOps").withDefault(10).create()) .withDescription("Number of operations to do per timer. " + "E.g In distance measure, the distance is calculated numOps times" + " and the total time is measured. 
Default: 10") .withShortName("no").create(); Option helpOpt = DefaultOptionCreator.helpOption(); Group group = gbuilder.withName("Options").withOption(vectorSizeOpt).withOption(numNonZeroOpt) .withOption(numVectorsOpt).withOption(numOpsOpt).withOption(numClustersOpt).withOption(helpOpt) .create(); try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelpWithGenericOptions(group); return; } int cardinality = 1000000; if (cmdLine.hasOption(vectorSizeOpt)) { cardinality = Integer.parseInt((String) cmdLine.getValue(vectorSizeOpt)); } int numClusters = 0; if (cmdLine.hasOption(numClustersOpt)) { numClusters = Integer.parseInt((String) cmdLine.getValue(numClustersOpt)); } int numNonZero = 1000; if (cmdLine.hasOption(numNonZeroOpt)) { numNonZero = Integer.parseInt((String) cmdLine.getValue(numNonZeroOpt)); } int numVectors = 25; if (cmdLine.hasOption(numVectorsOpt)) { numVectors = Integer.parseInt((String) cmdLine.getValue(numVectorsOpt)); } int numOps = 10; if (cmdLine.hasOption(numOpsOpt)) { numOps = Integer.parseInt((String) cmdLine.getValue(numOpsOpt)); } VectorBenchmarks mark = new VectorBenchmarks(cardinality, numNonZero, numVectors, numClusters, numOps); runBenchmark(mark); // log.info("\n{}", mark); log.info("\n{}", mark.asCsvString()); } catch (OptionException e) { CommandLineUtil.printHelp(group); } }
From source file:org.apache.mahout.cf.taste.example.TasteOptionParser.java
/** * Parse the given command line arguments. * @param args the arguments as given to the application. * @return the input file if a file was given on the command line, null otherwise. *//* w ww. jav a 2 s . co m*/ public static File getRatings(String[] args) throws OptionException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputOpt = obuilder.withLongName("input").withRequired(false).withShortName("i") .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create()) .withDescription("The Path for input data directory.").create(); Option helpOpt = DefaultOptionCreator.helpOption(); Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(helpOpt).create(); Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return null; } return cmdLine.hasOption(inputOpt) ? new File(cmdLine.getValue(inputOpt).toString()) : null; }
From source file:org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups.java
public static void main(String[] args) throws Exception { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option helpOpt = DefaultOptionCreator.helpOption(); Option parentOpt = obuilder.withLongName("parent").withRequired(true) .withArgument(abuilder.withName("parent").withMinimum(1).withMaximum(1).create()) .withDescription("Parent dir containing the newsgroups").withShortName("p").create(); Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true) .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create()) .withDescription("The output directory").withShortName("o").create(); Option analyzerNameOpt = obuilder.withLongName("analyzerName").withRequired(true) .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create()) .withDescription("The class name of the analyzer").withShortName("a").create(); Option charsetOpt = obuilder.withLongName("charset").withRequired(true) .withArgument(abuilder.withName("charset").withMinimum(1).withMaximum(1).create()) .withDescription("The name of the character encoding of the input files").withShortName("c") .create();/* w w w . j a v a 2s . 
c o m*/ Group group = gbuilder.withName("Options").withOption(analyzerNameOpt).withOption(charsetOpt) .withOption(outputDirOpt).withOption(parentOpt).withOption(helpOpt).create(); try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } File parentDir = new File((String) cmdLine.getValue(parentOpt)); File outputDir = new File((String) cmdLine.getValue(outputDirOpt)); String analyzerName = (String) cmdLine.getValue(analyzerNameOpt); Charset charset = Charset.forName((String) cmdLine.getValue(charsetOpt)); Analyzer analyzer = ClassUtils.instantiateAs(analyzerName, Analyzer.class); // parent dir contains dir by category if (!parentDir.exists()) { throw new FileNotFoundException("Can't find input directory " + parentDir); } File[] categoryDirs = parentDir.listFiles(); for (File dir : categoryDirs) { if (dir.isDirectory()) { if (!outputDir.exists() && !outputDir.mkdirs()) { throw new IllegalStateException("Can't create output directory"); } File outputFile = new File(outputDir, dir.getName() + ".txt"); BayesFileFormatter.collapse(dir.getName(), analyzer, dir, charset, outputFile); } } } catch (OptionException e) { CommandLineUtil.printHelp(group); } }
From source file:org.apache.mahout.classifier.bayes.TestClassifier.java
public static void main(String[] args) throws IOException, InvalidDatastoreException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option pathOpt = obuilder.withLongName("model").withRequired(true) .withArgument(abuilder.withName("model").withMinimum(1).withMaximum(1).create()) .withDescription("The path on HDFS as defined by the -source parameter").withShortName("m") .create();//from w w w . j a v a2 s .co m Option dirOpt = obuilder.withLongName("testDir").withRequired(true) .withArgument(abuilder.withName("testDir").withMinimum(1).withMaximum(1).create()) .withDescription("The directory where test documents resides in").withShortName("d").create(); Option helpOpt = DefaultOptionCreator.helpOption(); Option encodingOpt = obuilder.withLongName("encoding") .withArgument(abuilder.withName("encoding").withMinimum(1).withMaximum(1).create()) .withDescription("The file encoding. Defaults to UTF-8").withShortName("e").create(); Option defaultCatOpt = obuilder.withLongName("defaultCat") .withArgument(abuilder.withName("defaultCat").withMinimum(1).withMaximum(1).create()) .withDescription("The default category Default Value: unknown").withShortName("default").create(); Option gramSizeOpt = obuilder.withLongName("gramSize").withRequired(false) .withArgument(abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create()) .withDescription("Size of the n-gram. 
Default Value: 1").withShortName("ng").create(); Option alphaOpt = obuilder.withLongName("alpha").withRequired(false) .withArgument(abuilder.withName("a").withMinimum(1).withMaximum(1).create()) .withDescription("Smoothing parameter Default Value: 1.0").withShortName("a").create(); Option verboseOutputOpt = obuilder.withLongName("verbose").withRequired(false) .withDescription("Output which values were correctly and incorrectly classified").withShortName("v") .create(); Option typeOpt = obuilder.withLongName("classifierType").withRequired(false) .withArgument(abuilder.withName("classifierType").withMinimum(1).withMaximum(1).create()) .withDescription("Type of classifier: bayes|cbayes. Default Value: bayes").withShortName("type") .create(); Option dataSourceOpt = obuilder.withLongName("dataSource").withRequired(false) .withArgument(abuilder.withName("dataSource").withMinimum(1).withMaximum(1).create()) .withDescription("Location of model: hdfs").withShortName("source").create(); Option methodOpt = obuilder.withLongName("method").withRequired(false) .withArgument(abuilder.withName("method").withMinimum(1).withMaximum(1).create()) .withDescription("Method of Classification: sequential|mapreduce. 
Default Value: mapreduce") .withShortName("method").create(); Option confusionMatrixOpt = obuilder.withLongName("confusionMatrix").withRequired(false) .withArgument(abuilder.withName("confusionMatrix").withMinimum(1).withMaximum(1).create()) .withDescription("Export ConfusionMatrix as SequenceFile").withShortName("cm").create(); Group group = gbuilder.withName("Options").withOption(defaultCatOpt).withOption(dirOpt) .withOption(encodingOpt).withOption(gramSizeOpt).withOption(pathOpt).withOption(typeOpt) .withOption(dataSourceOpt).withOption(helpOpt).withOption(methodOpt).withOption(verboseOutputOpt) .withOption(alphaOpt).withOption(confusionMatrixOpt).create(); try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } BayesParameters params = new BayesParameters(); // Setting all default values int gramSize = 1; String modelBasePath = (String) cmdLine.getValue(pathOpt); if (cmdLine.hasOption(gramSizeOpt)) { gramSize = Integer.parseInt((String) cmdLine.getValue(gramSizeOpt)); } String classifierType = "bayes"; if (cmdLine.hasOption(typeOpt)) { classifierType = (String) cmdLine.getValue(typeOpt); } String dataSource = "hdfs"; if (cmdLine.hasOption(dataSourceOpt)) { dataSource = (String) cmdLine.getValue(dataSourceOpt); } String defaultCat = "unknown"; if (cmdLine.hasOption(defaultCatOpt)) { defaultCat = (String) cmdLine.getValue(defaultCatOpt); } String encoding = "UTF-8"; if (cmdLine.hasOption(encodingOpt)) { encoding = (String) cmdLine.getValue(encodingOpt); } String alphaI = "1.0"; if (cmdLine.hasOption(alphaOpt)) { alphaI = (String) cmdLine.getValue(alphaOpt); } boolean verbose = cmdLine.hasOption(verboseOutputOpt); String testDirPath = (String) cmdLine.getValue(dirOpt); String classificationMethod = "mapreduce"; if (cmdLine.hasOption(methodOpt)) { classificationMethod = (String) cmdLine.getValue(methodOpt); } String confusionMatrixFile = null; 
if (cmdLine.hasOption(confusionMatrixOpt)) { confusionMatrixFile = (String) cmdLine.getValue(confusionMatrixOpt); } params.setGramSize(gramSize); params.set("verbose", Boolean.toString(verbose)); params.setBasePath(modelBasePath); params.set("classifierType", classifierType); params.set("dataSource", dataSource); params.set("defaultCat", defaultCat); params.set("encoding", encoding); params.set("alpha_i", alphaI); params.set("testDirPath", testDirPath); params.set("confusionMatrix", confusionMatrixFile); if ("sequential".equalsIgnoreCase(classificationMethod)) { classifySequential(params); } else if ("mapreduce".equalsIgnoreCase(classificationMethod)) { classifyParallel(params); } } catch (OptionException e) { CommandLineUtil.printHelp(group); } }
From source file:org.apache.mahout.classifier.bayes.TrainClassifier.java
public static void main(String[] args) throws Exception { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option helpOpt = DefaultOptionCreator.helpOption(); Option inputDirOpt = DefaultOptionCreator.inputOption().create(); Option outputOpt = DefaultOptionCreator.outputOption().create(); Option gramSizeOpt = obuilder.withLongName("gramSize").withRequired(false) .withArgument(abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create()) .withDescription("Size of the n-gram. Default Value: 1 ").withShortName("ng").create(); Option minDfOpt = obuilder.withLongName("minDf").withRequired(false) .withArgument(abuilder.withName("minDf").withMinimum(1).withMaximum(1).create()) .withDescription("Minimum Term Document Frequency: 1 ").withShortName("mf").create(); Option minSupportOpt = obuilder.withLongName("minSupport").withRequired(false) .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()) .withDescription("Minimum Support (Term Frequency): 1 ").withShortName("ms").create(); Option alphaOpt = obuilder.withLongName("alpha").withRequired(false) .withArgument(abuilder.withName("a").withMinimum(1).withMaximum(1).create()) .withDescription("Smoothing parameter Default Value: 1.0").withShortName("a").create(); Option typeOpt = obuilder.withLongName("classifierType").withRequired(false) .withArgument(abuilder.withName("classifierType").withMinimum(1).withMaximum(1).create()) .withDescription("Type of classifier: bayes|cbayes. Default: bayes").withShortName("type").create(); Option dataSourceOpt = obuilder.withLongName("dataSource").withRequired(false) .withArgument(abuilder.withName("dataSource").withMinimum(1).withMaximum(1).create()) .withDescription("Location of model: hdfs. 
Default Value: hdfs").withShortName("source").create(); Option skipCleanupOpt = obuilder.withLongName("skipCleanup").withRequired(false) .withDescription("Skip cleanup of feature extraction output").withShortName("sc").create(); Option compressOpt = obuilder.withLongName("compress").withRequired(false) .withArgument(abuilder.withName("compress").withDefault("0").withMinimum(0).withMaximum(1).create()) .withDescription("True if the output should be compressed. Default is false").withShortName("comp") .create();/*from ww w . ja v a2 s . co m*/ Option compressCodecOpt = obuilder.withLongName("codec").withRequired(false) .withArgument(abuilder.withName("codec").withDefault("org.apache.hadoop.io.compress.DefaultCodec") .withMinimum(0).withMaximum(1).create()) .withDescription("Compress codec Default Value: org.apache.hadoop.io.compress.DefaultCodec") .withShortName("co").create(); Group group = gbuilder.withName("Options").withOption(gramSizeOpt).withOption(helpOpt) .withOption(inputDirOpt).withOption(outputOpt).withOption(typeOpt).withOption(dataSourceOpt) .withOption(alphaOpt).withOption(minDfOpt).withOption(minSupportOpt).withOption(skipCleanupOpt) .withOption(compressOpt).withOption(compressCodecOpt).create(); try { Parser parser = new Parser(); parser.setGroup(group); parser.setHelpOption(helpOpt); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } String classifierType = (String) cmdLine.getValue(typeOpt); String dataSourceType = (String) cmdLine.getValue(dataSourceOpt); BayesParameters params = new BayesParameters(); // Setting all the default parameter values params.setGramSize(1); params.setMinDF(1); params.set("alpha_i", "1.0"); params.set("dataSource", "hdfs"); if (cmdLine.hasOption(gramSizeOpt)) { params.setGramSize(Integer.parseInt((String) cmdLine.getValue(gramSizeOpt))); } if (cmdLine.hasOption(minDfOpt)) { params.setMinDF(Integer.parseInt((String) cmdLine.getValue(minDfOpt))); } if 
(cmdLine.hasOption(minSupportOpt)) { params.setMinSupport(Integer.parseInt((String) cmdLine.getValue(minSupportOpt))); } if (cmdLine.hasOption(skipCleanupOpt)) { params.setSkipCleanup(true); } if (cmdLine.hasOption(alphaOpt)) { params.set("alpha_i", (String) cmdLine.getValue(alphaOpt)); } if (cmdLine.hasOption(dataSourceOpt)) { params.set("dataSource", dataSourceType); } if (cmdLine.hasOption(compressOpt) && cmdLine.getValue(compressOpt).toString().equals("1")) { params.set("compress", "true"); } else { params.set("compress", "false"); } if (cmdLine.hasOption(compressCodecOpt)) { params.set("codec", (String) cmdLine.getValue(compressCodecOpt)); } Path inputPath = new Path((String) cmdLine.getValue(inputDirOpt)); Path outputPath = new Path((String) cmdLine.getValue(outputOpt)); if ("cbayes".equalsIgnoreCase(classifierType)) { log.info("Training Complementary Bayes Classifier"); trainCNaiveBayes(inputPath, outputPath, params); } else { log.info("Training Bayes Classifier"); // setup the HDFS and copy the files there, then run the trainer trainNaiveBayes(inputPath, outputPath, params); } } catch (OptionException e) { log.error("Error while parsing options", e); CommandLineUtil.printHelp(group); } }
From source file:org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorDriver.java
/** * Takes in two arguments:/*from www .j av a 2s.c om*/ * <ol> * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li> * <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a * {@link org.apache.hadoop.io.SequenceFile}</li> * </ol> */ public static void main(String[] args) throws IOException, InterruptedException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option dirInputPathOpt = DefaultOptionCreator.inputOption().create(); Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create(); Option categoriesOpt = obuilder.withLongName("categories").withRequired(true) .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create()) .withDescription("Location of the categories file. One entry per line. " + "Will be used to make a string match in Wikipedia Category field") .withShortName("c").create(); Option exactMatchOpt = obuilder.withLongName("exactMatch") .withDescription("If set, then the category name must exactly match the " + "entry in the categories file. 
Default is false") .withShortName("e").create(); Option analyzerOpt = obuilder.withLongName("analyzer").withRequired(false) .withArgument(abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create()) .withDescription("The analyzer to use, must have a no argument constructor").withShortName("a") .create(); Option helpOpt = DefaultOptionCreator.helpOption(); Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt) .withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(analyzerOpt).withOption(helpOpt) .create(); Parser parser = new Parser(); parser.setGroup(group); try { CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } String inputPath = (String) cmdLine.getValue(dirInputPathOpt); String outputPath = (String) cmdLine.getValue(dirOutputPathOpt); String catFile = (String) cmdLine.getValue(categoriesOpt); Class<? extends Analyzer> analyzerClass = WikipediaAnalyzer.class; if (cmdLine.hasOption(analyzerOpt)) { String className = cmdLine.getValue(analyzerOpt).toString(); analyzerClass = Class.forName(className).asSubclass(Analyzer.class); // try instantiating it, b/c there isn't any point in setting it if // you can't instantiate it ClassUtils.instantiateAs(analyzerClass, Analyzer.class); } runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), analyzerClass); } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } catch (ClassNotFoundException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } }
From source file:org.apache.mahout.classifier.bayes.WikipediaXmlSplitter.java
public static void main(String[] args) throws IOException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true) .withArgument(abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create()) .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d") .create();//from w w w .j a va 2s .c o m Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true) .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create()) .withDescription("The output directory to place the splits in:\n" + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n" + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n" + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n" + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n") .withShortName("o").create(); Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false) .withArgument(abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create()) .withDescription("Amazon S3 ID key").withShortName("i").create(); Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false) .withArgument(abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create()) .withDescription("Amazon S3 secret key").withShortName("s").create(); Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true) .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()) .withDescription("The Size of the chunk, in megabytes").withShortName("c").create(); Option numChunksOpt = obuilder.withLongName("numChunks").withRequired(false) .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create()) .withDescription( "The maximum number of chunks to create. 
If specified, program will only create a subset of the chunks") .withShortName("n").create(); Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt) .withOption(chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt) .create(); Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine; try { cmdLine = parser.parse(args); } catch (OptionException e) { log.error("Error while parsing options", e); CommandLineUtil.printHelp(group); return; } Configuration conf = new Configuration(); String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt); String outputDirPath = (String) cmdLine.getValue(outputDirOpt); if (cmdLine.hasOption(s3IdOpt)) { String id = (String) cmdLine.getValue(s3IdOpt); conf.set("fs.s3n.awsAccessKeyId", id); conf.set("fs.s3.awsAccessKeyId", id); } if (cmdLine.hasOption(s3SecretOpt)) { String secret = (String) cmdLine.getValue(s3SecretOpt); conf.set("fs.s3n.awsSecretAccessKey", secret); conf.set("fs.s3.awsSecretAccessKey", secret); } // do not compute crc file when using local FS conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem"); FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf); int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt)); int numChunks = Integer.MAX_VALUE; if (cmdLine.hasOption(numChunksOpt)) { numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt)); } String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" " + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" " + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ " + "http://www.mediawiki.org/xml/export-0.3.xsd\" " + "version=\"0.3\" " + "xml:lang=\"en\">\n" + " <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n" + " <base>http://en.wikipedia.org/wiki/Main_Page</base>\n" + " <generator>MediaWiki 1.13alpha</generator>\n" + " <case>first-letter</case>\n" + " <namespaces>\n" + " <namespace 
key=\"-2\">Media</namespace>\n" + " <namespace key=\"-1\">Special</namespace>\n" + " <namespace key=\"0\" />\n" + " <namespace key=\"1\">Talk</namespace>\n" + " <namespace key=\"2\">User</namespace>\n" + " <namespace key=\"3\">User talk</namespace>\n" + " <namespace key=\"4\">Wikipedia</namespace>\n" + " <namespace key=\"5\">Wikipedia talk</namespace>\n" + " <namespace key=\"6\">Image</namespace>\n" + " <namespace key=\"7\">Image talk</namespace>\n" + " <namespace key=\"8\">MediaWiki</namespace>\n" + " <namespace key=\"9\">MediaWiki talk</namespace>\n" + " <namespace key=\"10\">Template</namespace>\n" + " <namespace key=\"11\">Template talk</namespace>\n" + " <namespace key=\"12\">Help</namespace>\n" + " <namespace key=\"13\">Help talk</namespace>\n" + " <namespace key=\"14\">Category</namespace>\n" + " <namespace key=\"15\">Category talk</namespace>\n" + " <namespace key=\"100\">Portal</namespace>\n" + " <namespace key=\"101\">Portal talk</namespace>\n" + " </namespaces>\n" + " </siteinfo>\n"; StringBuilder content = new StringBuilder(); content.append(header); NumberFormat decimalFormatter = new DecimalFormat("0000"); File dumpFile = new File(dumpFilePath); FileLineIterator it; if (dumpFilePath.endsWith(".bz2")) { // default compression format from http://download.wikimedia.org CompressionCodec codec = new BZip2Codec(); it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile))); } else { // assume the user has previously de-compressed the dump file it = new FileLineIterator(dumpFile); } int filenumber = 0; while (it.hasNext()) { String thisLine = it.next(); if (thisLine.trim().startsWith("<page>")) { boolean end = false; while (!thisLine.trim().startsWith("</page>")) { content.append(thisLine).append('\n'); if (it.hasNext()) { thisLine = it.next(); } else { end = true; break; } } content.append(thisLine).append('\n'); if (content.length() > chunkSize || end) { content.append("</mediawiki>"); filenumber++; String filename = outputDirPath + 
"/chunk-" + decimalFormatter.format(filenumber) + ".xml"; BufferedWriter chunkWriter = new BufferedWriter( new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8")); try { chunkWriter.write(content.toString(), 0, content.length()); } finally { Closeables.closeQuietly(chunkWriter); } if (filenumber >= numChunks) { break; } content = new StringBuilder(); content.append(header); } } } }
From source file:org.apache.mahout.classifier.BayesFileFormatter.java
/** * Run the FileFormatter//from w ww . java2s .c o m * * @param args * The input args. Run with -h to see the help * @throws ClassNotFoundException * if the Analyzer can't be found * @throws IllegalAccessException * if the Analyzer can't be constructed * @throws InstantiationException * if the Analyzer can't be constructed * @throws IOException * if the files can't be dealt with properly */ public static void main(String[] args) throws Exception { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputOpt = DefaultOptionCreator.inputOption().create(); Option outputOpt = DefaultOptionCreator.outputOption().create(); Option labelOpt = obuilder.withLongName("label").withRequired(true) .withArgument(abuilder.withName("label").withMinimum(1).withMaximum(1).create()) .withDescription("The label of the file").withShortName("l").create(); Option analyzerOpt = obuilder.withLongName("analyzer") .withArgument(abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create()) .withDescription("The fully qualified class name of the analyzer to use. " + "Must have a no-arg constructor. 
Default is the StandardAnalyzer") .withShortName("a").create(); Option charsetOpt = obuilder.withLongName("charset") .withArgument(abuilder.withName("charset").withMinimum(1).withMaximum(1).create()) .withDescription("The character encoding of the input file").withShortName("c").create(); Option collapseOpt = obuilder.withLongName("collapse").withRequired(true) .withArgument(abuilder.withName("collapse").withMinimum(1).withMaximum(1).create()) .withDescription("Collapse a whole directory to a single file, one doc per line").withShortName("p") .create(); Option helpOpt = DefaultOptionCreator.helpOption(); Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(labelOpt) .withOption(analyzerOpt).withOption(charsetOpt).withOption(collapseOpt).withOption(helpOpt) .create(); try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { return; } File input = new File((String) cmdLine.getValue(inputOpt)); File output = new File((String) cmdLine.getValue(outputOpt)); String label = (String) cmdLine.getValue(labelOpt); Analyzer analyzer; if (cmdLine.hasOption(analyzerOpt)) { analyzer = ClassUtils.instantiateAs((String) cmdLine.getValue(analyzerOpt), Analyzer.class); } else { analyzer = new StandardAnalyzer(Version.LUCENE_31); } Charset charset = Charsets.UTF_8; if (cmdLine.hasOption(charsetOpt)) { charset = Charset.forName((String) cmdLine.getValue(charsetOpt)); } boolean collapse = cmdLine.hasOption(collapseOpt); if (collapse) { collapse(label, analyzer, input, charset, output); } else { format(label, analyzer, input, charset, output); } } catch (OptionException e) { log.error("Exception", e); } }
From source file:org.apache.mahout.classifier.chi_rwcs.mapreduce.BuildModel.java
/**
 * Parses the command line for the Chi-RWCS model builder, resolves the fuzzy-rule
 * parameters (t-norm, rule weight, reasoning method) into their internal constants,
 * and then delegates to {@code buildModel()}.
 *
 * @param args command-line arguments
 * @return 0 on success, -1 when help was requested or parsing failed
 */
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dataOpt = obuilder.withLongName("data").withShortName("d").withRequired(true)
            .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
            .withDescription("Data path").create();
    Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true)
            .withArgument(abuilder.withName("dataset").withMinimum(1).withMaximum(1).create())
            .withDescription("The path of the file descriptor of the dataset").create();
    Option timeOpt = obuilder.withLongName("time").withShortName("tm").withRequired(false)
            .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
            .withDescription("Time path").create();
    Option outputOpt = obuilder.withLongName("output").withShortName("o").withRequired(true)
            .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
            .withDescription("Output path, will contain the Decision Forest").create();
    Option labelsOpt = obuilder.withLongName("labels").withShortName("l").withRequired(true)
            .withArgument(abuilder.withName("labels").withMinimum(1).withMaximum(1).create())
            .withDescription("Number of Labels").create();
    Option combinationTypeOpt = obuilder.withLongName("combinationType").withShortName("t").withRequired(true)
            .withArgument(abuilder.withName("combinationType").withMinimum(1).withMaximum(1).create())
            .withDescription("T-norm for the computation of the compatibility degree").create();
    Option ruleWeightOpt = obuilder.withLongName("rule_weight").withShortName("r").withRequired(true)
            .withArgument(abuilder.withName("rule_weight").withMinimum(1).withMaximum(1).create())
            .withDescription("Rule Weight").create();
    Option fuzzyRmOpt = obuilder.withLongName("fuzzy_r_m").withShortName("f").withRequired(true)
            .withArgument(abuilder.withName("fuzzy_r_m").withMinimum(1).withMaximum(1).create())
            .withDescription("Fuzzy Reasoning Method").create();
    Option helpOpt = obuilder.withLongName("help").withShortName("h").withDescription("Print out help")
            .create();

    Group group = gbuilder.withName("Options").withOption(dataOpt).withOption(datasetOpt).withOption(timeOpt)
            .withOption(outputOpt).withOption(labelsOpt).withOption(combinationTypeOpt)
            .withOption(ruleWeightOpt).withOption(fuzzyRmOpt).withOption(helpOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        // Fix: look up the typed option instead of the bare "help" string,
        // consistent with every other option lookup in this method.
        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        dataName = cmdLine.getValue(dataOpt).toString();
        String datasetName = cmdLine.getValue(datasetOpt).toString();
        String outputName = cmdLine.getValue(outputOpt).toString();
        nLabels = Integer.parseInt(cmdLine.getValue(labelsOpt).toString());
        String combinationTypeRaw = cmdLine.getValue(combinationTypeOpt).toString();
        String ruleWeightRaw = cmdLine.getValue(ruleWeightOpt).toString();
        String inferenceTypeRaw = cmdLine.getValue(fuzzyRmOpt).toString();

        // Build time is only recorded when an explicit --time path is given.
        if (cmdLine.hasOption(timeOpt)) {
            buildTimeIsStored = true;
            timeName = cmdLine.getValue(timeOpt).toString();
        }

        if (log.isDebugEnabled()) {
            log.debug("data : {}", dataName);
            log.debug("dataset : {}", datasetName);
            log.debug("output : {}", outputName);
            log.debug("labels : {}", nLabels);
            log.debug("t_norm : {}", combinationTypeRaw);
            log.debug("rule_weight : {}", ruleWeightRaw);
            log.debug("fuzzy_r_m : {}", inferenceTypeRaw);
            log.debug("time : {}", timeName);
        }

        dataPath = new Path(dataName);
        datasetPath = new Path(datasetName);
        outputPath = new Path(outputName);
        if (buildTimeIsStored) {
            timePath = new Path(timeName);
        }

        // Map the textual t-norm to its internal constant; anything other
        // than "minimum" falls back to the PRODUCT default.
        combinationType = PRODUCT;
        if ("minimum".equalsIgnoreCase(combinationTypeRaw)) {
            combinationType = MINIMUM;
        }

        // Rule-weight heuristic: default PCF_IV unless one of the named
        // alternatives is requested.
        ruleWeight = PCF_IV;
        if ("Certainty_Factor".equalsIgnoreCase(ruleWeightRaw)) {
            ruleWeight = CF;
        } else if ("Average_Penalized_Certainty_Factor".equalsIgnoreCase(ruleWeightRaw)) {
            ruleWeight = PCF_II;
        } else if ("No_Weights".equalsIgnoreCase(ruleWeightRaw)) {
            ruleWeight = NO_RW;
        }

        // Fuzzy reasoning method: winning rule unless additive combination is asked for.
        inferenceType = WINNING_RULE;
        if ("Additive_Combination".equalsIgnoreCase(inferenceTypeRaw)) {
            inferenceType = ADDITIVE_COMBINATION;
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
        return -1;
    }

    buildModel();

    return 0;
}
From source file:org.apache.mahout.classifier.chi_rwcs.mapreduce.TestModel.java
@Override public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException { // TODO Auto-generated method stub DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputOpt = DefaultOptionCreator.inputOption().create(); Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true) .withArgument(abuilder.withName("dataset").withMinimum(1).withMaximum(1).create()) .withDescription("Dataset path").create(); Option modelOpt = obuilder.withLongName("model").withShortName("m").withRequired(true) .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create()) .withDescription("Path to the Model").create(); Option outputOpt = DefaultOptionCreator.outputOption().create(); Option helpOpt = DefaultOptionCreator.helpOption(); Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(datasetOpt).withOption(modelOpt) .withOption(outputOpt).withOption(helpOpt).create(); try {/*from w w w . j a va2 s .c o m*/ Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption("help")) { CommandLineUtil.printHelp(group); return -1; } dataName = cmdLine.getValue(inputOpt).toString(); String datasetName = cmdLine.getValue(datasetOpt).toString(); String modelName = cmdLine.getValue(modelOpt).toString(); String outputName = cmdLine.hasOption(outputOpt) ? 
cmdLine.getValue(outputOpt).toString() : null; if (log.isDebugEnabled()) { log.debug("inout : {}", dataName); log.debug("dataset : {}", datasetName); log.debug("model : {}", modelName); log.debug("output : {}", outputName); } dataPath = new Path(dataName); datasetPath = new Path(datasetName); modelPath = new Path(modelName); if (outputName != null) { outputPath = new Path(outputName); } } catch (OptionException e) { log.warn(e.toString(), e); CommandLineUtil.printHelp(group); return -1; } time = System.currentTimeMillis(); testModel(); time = System.currentTimeMillis() - time; writeToFileClassifyTime(Chi_RWCSUtils.elapsedTime(time)); return 0; }