List of usage examples for org.apache.hadoop.mapred.FileInputFormat#setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
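Before the full examples below, a minimal sketch of the call itself may help. It is not taken from any of the source files; the class name and the input paths are hypothetical placeholders.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class SetInputPathsSketch {
    /** Builds a JobConf that reads from two hypothetical HDFS directories. */
    public static JobConf configure() {
        JobConf conf = new JobConf(SetInputPathsSketch.class);
        conf.setInputFormat(TextInputFormat.class);
        // The varargs overload replaces any previously configured input paths;
        // a comma-separated String overload and addInputPath(JobConf, Path) also exist.
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));
        return conf;
    }
}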
From source file:org.apache.hyracks.hdfs.dataflow.DataflowTest.java
License:Apache License
/**
 * Test a job with only HDFS reads and writes.
 *
 * @throws Exception
 */
public void testHDFSReadWriteOperators() throws Exception {
    FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
    FileOutputFormat.setOutputPath(conf, new Path(HDFS_OUTPUT_PATH));
    conf.setInputFormat(TextInputFormat.class);

    Scheduler scheduler = new Scheduler(HyracksUtils.CC_HOST, HyracksUtils.TEST_HYRACKS_CC_CLIENT_PORT);

    InputSplit[] splits = conf.getInputFormat().getSplits(conf, numberOfNC * 4);
    String[] readSchedule = scheduler.getLocationConstraints(splits);

    JobSpecification jobSpec = new JobSpecification();
    RecordDescriptor recordDesc = new RecordDescriptor(
            new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer() });

    String[] locations = new String[] { HyracksUtils.NC1_ID, HyracksUtils.NC1_ID, HyracksUtils.NC2_ID,
            HyracksUtils.NC2_ID };
    HDFSReadOperatorDescriptor readOperator = new HDFSReadOperatorDescriptor(jobSpec, recordDesc, conf, splits,
            readSchedule, new TextKeyValueParserFactory());
    PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, readOperator, locations);

    ExternalSortOperatorDescriptor sortOperator = new ExternalSortOperatorDescriptor(jobSpec, 10,
            new int[] { 0 }, new IBinaryComparatorFactory[] { RawBinaryComparatorFactory.INSTANCE }, recordDesc);
    PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, sortOperator, locations);

    HDFSWriteOperatorDescriptor writeOperator = new HDFSWriteOperatorDescriptor(jobSpec, conf,
            new TextTupleWriterFactory());
    PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, writeOperator, HyracksUtils.NC1_ID);

    jobSpec.connect(new OneToOneConnectorDescriptor(jobSpec), readOperator, 0, sortOperator, 0);
    jobSpec.connect(new MToNPartitioningMergingConnectorDescriptor(jobSpec,
            new FieldHashPartitionComputerFactory(new int[] { 0 },
                    new IBinaryHashFunctionFactory[] { RawBinaryHashFunctionFactory.INSTANCE }),
            new int[] { 0 }, new IBinaryComparatorFactory[] { RawBinaryComparatorFactory.INSTANCE }, null),
            sortOperator, 0, writeOperator, 0);

    jobSpec.addRoot(writeOperator);

    IHyracksClientConnection client = new HyracksConnection(HyracksUtils.CC_HOST,
            HyracksUtils.TEST_HYRACKS_CC_CLIENT_PORT);
    JobId jobId = client.startJob(jobSpec);
    client.waitForCompletion(jobId);

    Assert.assertEquals(true, checkResults());
}
From source file:org.apache.hyracks.imru.dataflow.Hdtest.java
License:Apache License
public static JobSpecification createJob() throws Exception {
    JobSpecification spec = new JobSpecification();
    spec.setFrameSize(4096);

    String PATH_TO_HADOOP_CONF = "/home/wangrui/a/imru/hadoop-0.20.2/conf";
    String HDFS_INPUT_PATH = "/customer/customer.tbl,/customer_result/part-0";
    JobConf conf = new JobConf();
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/core-site.xml"));
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/mapred-site.xml"));
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/hdfs-site.xml"));
    FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
    conf.setInputFormat(TextInputFormat.class);

    RecordDescriptor recordDesc = new RecordDescriptor(
            new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE });
    InputSplit[] splits = conf.getInputFormat().getSplits(conf, 1);

    HDFSReadOperatorDescriptor readOperator = new HDFSReadOperatorDescriptor(spec, recordDesc, conf, splits,
            new String[] { "NC0", "NC1" }, new IKeyValueParserFactory<LongWritable, Text>() {
                @Override
                public IKeyValueParser<LongWritable, Text> createKeyValueParser(final IHyracksTaskContext ctx) {
                    return new IKeyValueParser<LongWritable, Text>() {
                        TupleWriter tupleWriter;

                        @Override
                        public void open(IFrameWriter writer) throws HyracksDataException {
                            tupleWriter = new TupleWriter(ctx, writer, 1);
                        }

                        @Override
                        public void parse(LongWritable key, Text value, IFrameWriter writer, String fileString)
                                throws HyracksDataException {
                            try {
                                tupleWriter.write(value.getBytes(), 0, value.getLength());
                                tupleWriter.finishField();
                                tupleWriter.finishTuple();
                            } catch (IOException e) {
                                throw new HyracksDataException(e);
                            }
                        }

                        @Override
                        public void close(IFrameWriter writer) throws HyracksDataException {
                            tupleWriter.close();
                        }
                    };
                }
            });
    // createPartitionConstraint(spec, readOperator, new String[] {"NC0"});
    PartitionConstraintHelper.addAbsoluteLocationConstraint(spec, readOperator, new String[] { "NC0", "NC1" });

    IOperatorDescriptor writer = new HDFSOD(spec, null, null, null);
    // createPartitionConstraint(spec, writer, outSplits);

    spec.connect(new OneToOneConnectorDescriptor(spec), readOperator, 0, writer, 0);
    spec.addRoot(writer);
    return spec;
}
From source file:org.apache.ignite.internal.processors.hadoop.examples.GridHadoopWordCount1.java
License:Apache License
/**
 * Gets fully configured JobConf instance.
 *
 * @param input input file name.
 * @param output output directory name.
 * @return Job configuration
 */
public static JobConf getJob(String input, String output) {
    JobConf conf = new JobConf(GridHadoopWordCount1.class);
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    setTasksClasses(conf, true, true, true);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    return conf;
}
From source file:org.apache.ignite.internal.processors.hadoop.examples.HadoopWordCount1.java
License:Apache License
/**
 * Gets fully configured JobConf instance.
 *
 * @param input input file name.
 * @param output output directory name.
 * @return Job configuration
 */
public static JobConf getJob(String input, String output) {
    JobConf conf = new JobConf(HadoopWordCount1.class);
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    setTasksClasses(conf, true, true, true);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    return conf;
}
From source file:org.apache.mahout.avro.text.mapred.WikipediaToAvroDocuments.java
License:Apache License
/**
 * Run the job.
 *
 * @param input
 *          the input pathname String
 * @param output
 *          the output pathname String
 * @param catFile
 *          the file containing the Wikipedia categories
 * @param exactMatchOnly
 *          if true, then the Wikipedia category must match exactly instead of
 *          simply containing the category string
 * @param all
 *          if true select all categories
 */
public static int runJob(String input, String output, String catFile, boolean exactMatchOnly, boolean all)
        throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(WikipediaToAvroDocuments.class);
    if (log.isInfoEnabled()) {
        log.info("Input: " + input + " Out: " + output + " Categories: " + catFile + " All Files: " + all);
    }

    Path inPath = new Path(input);
    Path outPath = new Path(output);

    FileInputFormat.setInputPaths(conf, inPath);
    FileOutputFormat.setOutputPath(conf, outPath);
    //AvroOutputFormat.setClass(conf, AvroDocument.class);
    //AvroOutputFormat.setSchema(conf, AvroDocument._SCHEMA);

    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(AvroDocument.class);
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.setBoolean("all.files", all);
    conf.setMapperClass(WikipediaAvroDocumentMapper.class);
    conf.setInputFormat(XmlInputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setOutputFormat(AvroOutputFormat.class);

    AvroOutputFormat.setAvroOutputClass(conf, AvroDocument.class);

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Set<String> categories = new HashSet<String>();
    if (catFile.equals("") == false) {
        for (String line : new FileLineIterable(new File(catFile))) {
            categories.add(line.trim().toLowerCase());
        }
    }

    DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf,
            GenericsUtil.getClass(categories));

    String categoriesStr = setStringifier.toString(categories);
    conf.set("wikipedia.categories", categoriesStr);

    client.setConf(conf);
    RunningJob job = JobClient.runJob(conf);
    job.waitForCompletion();
    return job.isSuccessful() ? 1 : 0;
}
From source file:org.apache.mahout.classifier.bayes.common.BayesFeatureDriver.java
License:Apache License
/**
 * Run the job.
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output, int gramSize) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(BayesFeatureDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    //conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesFeatureMapper.class);

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setCombinerClass(BayesFeatureReducer.class);
    conf.setReducerClass(BayesFeatureReducer.class);
    conf.setOutputFormat(BayesFeatureOutputFormat.class);
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: these Hadoop conf parameters can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    DefaultStringifier<Integer> intStringifier = new DefaultStringifier<Integer>(conf, Integer.class);
    String gramSizeString = intStringifier.toString(gramSize);

    log.info("{}", intStringifier.fromString(gramSizeString));
    conf.set("bayes.gramSize", gramSizeString);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierDriver.java
License:Apache License
/**
 * Run the job.
 *
 * @param params
 *          The Job parameters containing the gramSize, input output folders, defaultCat, encoding
 */
public static void runJob(Parameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesClassifierDriver.class);
    conf.setJobName("Bayes Classifier Driver running over input: " + params.get("testDirPath"));

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(params.get("testDirPath")));
    Path outPath = new Path(params.get("testDirPath") + "-output");
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setMapperClass(BayesClassifierMapper.class);
    conf.setCombinerClass(BayesClassifierReducer.class);
    conf.setReducerClass(BayesClassifierReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,"
                    + "org.apache.hadoop.io.serializer.WritableSerialization");

    HadoopUtil.overwriteOutput(outPath);
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);
    JobClient.runJob(conf);

    Path outputFiles = new Path(outPath, "part*");
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    ConfusionMatrix matrix = readResult(dfs, outputFiles, conf, params);
    log.info("{}", matrix.summarize());
}
From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureDriver.java
License:Apache License
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesFeatureDriver.class);
    conf.setJobName("Bayes Feature Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    conf.setPartitionerClass(FeaturePartitioner.class);
    conf.setOutputKeyComparatorClass(FeatureLabelComparator.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);

    conf.setMapperClass(BayesFeatureMapper.class);
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setCombinerClass(BayesFeatureCombiner.class);
    conf.setReducerClass(BayesFeatureReducer.class);
    conf.setOutputFormat(BayesFeatureOutputFormat.class);
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // This conf parameter needs to be set to enable serialisation of conf values.

    HadoopUtil.overwriteOutput(output);
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.clustering.canopy.ClusterDriver.java
License:Apache License
/**
 * Run the job.
 *
 * @param points the input points directory pathname String
 * @param canopies the input canopies directory pathname String
 * @param output the output directory pathname String
 * @param measureClassName the DistanceMeasure class name
 * @param t1 the T1 distance threshold
 * @param t2 the T2 distance threshold
 * @param vectorClass the {@link Class} of Vector to use for the Output Value Class. Must be concrete.
 */
public static void runJob(String points, String canopies, String output, String measureClassName, double t1,
        double t2, Class<? extends Vector> vectorClass) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(org.apache.mahout.clustering.canopy.ClusterDriver.class);

    conf.set(Canopy.DISTANCE_MEASURE_KEY, measureClassName);
    conf.set(Canopy.T1_KEY, String.valueOf(t1));
    conf.set(Canopy.T2_KEY, String.valueOf(t2));
    conf.set(Canopy.CANOPY_PATH_KEY, canopies);

    conf.setInputFormat(SequenceFileInputFormat.class);

    /*conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(SparseVector.class);*/

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(vectorClass);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(points));
    Path outPath = new Path(output + DEFAULT_CLUSTER_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setMapperClass(ClusterMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    client.setConf(conf);
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.clustering.syntheticcontrol.meanshift.OutputDriver.java
License:Apache License
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(org.apache.mahout.clustering.syntheticcontrol.meanshift.OutputDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setInputFormat(SequenceFileInputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    conf.setMapperClass(OutputMapper.class);
    conf.setReducerClass(Reducer.class);
    conf.setNumReduceTasks(0);

    client.setConf(conf);
    JobClient.runJob(conf);
}