List of usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
public static void setInputPaths(JobConf conf, String commaSeparatedPaths)  // String overload, used by some examples below
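Before the project examples below, here is a minimal, self-contained sketch of the call itself. The driver class name and the paths /tmp/in1, /tmp/in2 and /tmp/out are placeholders, not taken from any project on this page. Note that setInputPaths replaces whatever input paths are already configured on the JobConf, whereas FileInputFormat.addInputPath appends to them.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

// Hypothetical driver illustrating the two overloads of setInputPaths.
public class SetInputPathsSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputPathsSketch.class);
        conf.setJobName("setinputpaths-sketch");

        // Varargs overload: replaces any previously configured input paths.
        FileInputFormat.setInputPaths(conf, new Path("/tmp/in1"), new Path("/tmp/in2"));

        // Equivalent String overload: a comma-separated list of paths.
        // FileInputFormat.setInputPaths(conf, "/tmp/in1,/tmp/in2");

        // addInputPath appends instead of replacing:
        // FileInputFormat.addInputPath(conf, new Path("/tmp/in3"));

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);
        conf.setNumReduceTasks(0); // map-only identity pass-through

        FileOutputFormat.setOutputPath(conf, new Path("/tmp/out"));
        JobClient.runJob(conf);
    }
}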
From source file:edu.yale.cs.hadoopdb.benchmark.SelectionTaskHDFS.java
License:Apache License
@Override
protected JobConf configureJob(String... args) throws IOException {
    JobConf conf = new JobConf(getConf(), this.getClass());
    conf.setJobName("selection_hdfs");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setMapperClass(Map.class);
    conf.setNumReduceTasks(0);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    if (args.length < 3) {
        throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
    }
    conf.set(PAGE_RANK_VALUE_PARAM, args[0]);
    FileInputFormat.setInputPaths(conf, new Path(args[1]));

    // OUTPUT properties
    Path outputPath = new Path(args[2]);
    HDFSUtil.deletePath(outputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    return conf;
}
From source file:edu.yale.cs.hadoopdb.benchmark.UDFAggTaskHDFS.java
License:Apache License
@Override
protected JobConf configureJob(String... args) throws IOException {
    JobConf conf = new JobConf(this.getClass());
    conf.setJobName("udf_agg_hdfs");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(LongSumReducer.class);
    conf.setReducerClass(LongSumReducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    if (args.length < 2) {
        throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
    }
    FileInputFormat.setInputPaths(conf, new Path(args[0]));

    // OUTPUT properties
    Path outputPath = new Path(args[1]);
    HDFSUtil.deletePath(outputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    return conf;
}
From source file:edu.yale.cs.hadoopdb.dataloader.GlobalHasher.java
License:Apache License
@Override
protected JobConf configureJob(String... args) throws Exception {
    JobConf conf = new JobConf(getConf(), this.getClass());
    conf.setJobName("GlobalHasher");
    conf.setMapOutputKeyClass(UnsortableInt.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(GlobalHasher.Map.class);
    conf.setReducerClass(GlobalHasher.Reduce.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    if (args.length < 5) {
        throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
    }
    FileInputFormat.setInputPaths(conf, new Path(args[0]));

    // OUTPUT properties
    Path outputPath = new Path(args[1]);
    HDFSUtil.deletePath(outputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    int partNo = Integer.parseInt(args[2]);
    conf.setNumReduceTasks(partNo);

    conf.set(DELIMITER_PARAM, args[3]);

    int hashFieldPos = Integer.parseInt(args[4]);
    conf.setInt(HASH_FIELD_POS_PARAM, hashFieldPos);

    return conf;
}
From source file:eu.larkc.iris.storage.FactsTap.java
License:Apache License
@Override
public void sourceInit(JobConf jobConf) throws IOException {
    // a hack for MultiInputFormat to see that there is a child format
    FileInputFormat.setInputPaths(jobConf, getPath());

    jobConf.set(IFactsConfiguration.FACTS_CONFIGURATION_CLASS, factsConfigurationClass);
    if (isSource()) {
        StringBuilder sb = new StringBuilder();
        if (atom != null) {
            sb.append(atom.getPredicate().getPredicateSymbol());
        }
        if (predicates != null && predicates.length > 0) {
            for (IPredicate predicate : predicates) {
                if (sb.length() > 0) {
                    sb.append(",");
                }
                sb.append(predicate.getPredicateSymbol());
            }
        }
        jobConf.set(IFactsConfiguration.PREDICATE_FILTER, sb.toString());
    }

    IFactsConfiguration factsConfiguration = FactsConfigurationFactory.getFactsConfiguration(jobConf);
    factsConfiguration.setSourceStorageId(storageId);
    //RdfFactsConfiguration.configure(conf, rdf2GoImpl, serverURL, repositoryID);
    super.sourceInit(jobConf);
}
From source file:eu.scape_project.tb.chutney.ChutneyDriver.java
License:Apache License
/**
 * This method sets up and runs the job on Hadoop.
 * @param args the passed-through command line arguments
 */
public int run(String[] args) {
    CommandLineParser parser = new PosixParser();
    Options options = new Options();
    options.addOption("n", "jobname", true, "name to assign to the hadoop job");
    options.addOption("i", "inputlist", true,
            "text file containing list of input files (ensure no trailing carriage returns)");
    options.addOption("t", "jobtype", true,
            "type of job; CLJ (command line job), TSJ (Taverna Server job), TCL (Taverna command line job), "
                    + "XML (XML defined command line job), XWR (XML workflow report)");
    options.addOption("x", "xmlcode", true, "xml definition of job to run for XML jobs");
    options.addOption("h", "help", false, "help text");

    JobConf conf = new JobConf(ChutneyDriver.class);

    String input = null;
    String xmlcode = null;

    CommandLine com;
    try {
        com = parser.parse(options, args);
        if (com.hasOption("help")) {
            throw new ParseException("");
        }

        String jobName = Settings.JOB_NAME + "default";
        if (com.hasOption("jobname")) {
            // set the job name to something better than the default
            jobName = Settings.JOB_NAME + com.getOptionValue("jobname");
        }
        conf.setJobName(jobName);

        JobType jobType = JobType.CommandLineJob;
        if (com.hasOption("jobtype")) {
            String value = com.getOptionValue("jobtype").toUpperCase();
            if (value.equals(CommandLineJob.getShortJobType())) {
                jobType = CommandLineJob.getJobType();
            } else if (value.equals(TavernaCommandLineJob.getShortJobType())) {
                jobType = TavernaCommandLineJob.getJobType();
            } else if (value.equals(TavernaServerJob.getShortJobType())) {
                jobType = TavernaServerJob.getJobType();
            } else if (value.equals(XMLCommandLineJob.getShortJobType())) {
                jobType = XMLCommandLineJob.getJobType();
            } else if (value.equals(XMLWorkflowReport.getShortJobType())) {
                jobType = XMLWorkflowReport.getJobType();
            }
        }
        System.out.println("JobType: " + jobType.toString());
        conf.set(Settings.JOBTYPE_CONF_SETTING, jobType.toString());

        if (com.hasOption("xmlcode")) {
            // jobType == JobType.XMLCommandLineJob
            xmlcode = com.getOptionValue("xmlcode");
            // if it is a local file get the full path
            if (new File(xmlcode).exists())
                xmlcode = new File(xmlcode).getAbsolutePath();
            conf.set(Settings.XMLCODE_CONF_SETTING, xmlcode);
        }
        if ((jobType == JobType.XMLCommandLineJob) && (xmlcode == null)) {
            // i.e. no code specified
            System.out.println("No XML code specified on the command line");
            return -1;
        }

        if (com.hasOption("inputlist")) {
            input = com.getOptionValue("inputlist");
        }
        if (input == null) {
            System.out.println("no input given");
            return -2;
        }
    } catch (ParseException e) {
        HelpFormatter help = new HelpFormatter();
        help.printHelp("hadoop jar TavernaHadoopWrapper.jar", options);
        return -1;
    }

    // using matchbox it may take a while to process the jobs,
    // so set a longer timeout than the default (10 mins);
    // six hours should be more than enough :/ MMM*SS*MS
    // QAJob testing for 9 tests on ANJO files can take ~4.5hrs+
    conf.set("mapred.task.timeout", Integer.toString(360 * 60 * 1000));

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(conf.getJobName()));

    // set the mapper to this class' mapper
    conf.setMapperClass(Chutney.class);
    // we don't want to reduce
    //conf.setReducerClass(Reducer.class);

    // this input format should split the input by one line per map by default
    conf.setInputFormat(NLineInputFormat.class);
    conf.setInt("mapred.line.input.format.linespermap", 1);

    // sets how the output is written, cf. OutputFormat;
    // we can use NullOutputFormat if we are writing our own output
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    // this sets maximum jvm reuse
    conf.set("mapred.job.reuse.jvm.num.tasks", "-1");
    // we only want one reduce task
    conf.setNumReduceTasks(1);

    try {
        JobClient.runJob(conf);
    } catch (IOException ioe) {
        ioe.printStackTrace();
        return -1;
    }

    return 0;
}
From source file:example.ColorCount.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: ColorCount <input path> <output path>");
        return -1;
    }

    JobConf conf = new JobConf(getConf(), ColorCount.class);
    conf.setJobName("colorcount");

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    AvroJob.setMapperClass(conf, ColorCountMapper.class);
    AvroJob.setReducerClass(conf, ColorCountReducer.class);

    // Note that AvroJob.setInputSchema and AvroJob.setOutputSchema set
    // relevant config options such as input/output format, map output
    // classes, and output key class.
    AvroJob.setInputSchema(conf, User.SCHEMA$);
    AvroJob.setOutputSchema(conf, Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)));

    JobClient.runJob(conf);
    return 0;
}
From source file:findstableweatherstate.FindStableWeatherState.java
public String call() throws Exception {
    Path firstOutputPath = new Path("input/firstOutput");
    Path secondOutputPath = new Path("input/secondOutput");
    long startTime, stopTime, elapsedTime;

    JobConf job = new JobConf();
    job.setJarByClass(getClass());
    job.setJobName("invertedindex");
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setReducerClass(JoinReducer.class);
    MultipleInputs.addInputPath(job, new Path(getInputPathStation()), TextInputFormat.class,
            StationMapper.class);
    MultipleInputs.addInputPath(job, new Path(getInputPathReadings()), TextInputFormat.class,
            ReadingsMapper.class);
    FileOutputFormat.setOutputPath(job, firstOutputPath);

    JobConf job2 = new JobConf();
    job2.setJarByClass(getClass());
    job2.setJobName("secondJob");
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);
    //job2.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    FileInputFormat.setInputPaths(job2, firstOutputPath);
    job2.setMapperClass(CalculateMinMaxTemperatureMapper.class);
    job2.setReducerClass(CalculateMaxMinTemperatureReducer.class);
    if (getOutputPath() != null) {
        FileOutputFormat.setOutputPath(job2, secondOutputPath);
    }

    JobConf job3 = new JobConf();
    job3.setJarByClass(getClass());
    job3.setJobName("thirdJob");
    job3.setOutputKeyClass(Text.class);
    job3.setOutputValueClass(Text.class);
    job3.setMapOutputKeyClass(DoubleWritable.class);
    job3.setMapOutputValueClass(Text.class);
    //job2.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    FileInputFormat.setInputPaths(job3, secondOutputPath);
    job3.setMapperClass(SortStateMapper.class);
    job3.setReducerClass(SortStateReducer.class);
    if (getOutputPath() != null) {
        FileOutputFormat.setOutputPath(job3, new Path(getOutputPath()));
    }

    startTime = System.currentTimeMillis();
    JobClient.runJob(job);
    stopTime = System.currentTimeMillis();
    elapsedTime = stopTime - startTime;
    System.out.println("******************** First Job : " + elapsedTime / 1000);

    startTime = System.currentTimeMillis();
    JobClient.runJob(job2);
    stopTime = System.currentTimeMillis();
    elapsedTime = stopTime - startTime;
    System.out.println("******************** Second Job : " + elapsedTime / 1000);

    startTime = System.currentTimeMillis();
    JobClient.runJob(job3);
    stopTime = System.currentTimeMillis();
    elapsedTime = stopTime - startTime;
    System.out.println("******************** Third Job : " + elapsedTime / 1000);

    return "";
}
From source file:FormatStorage1.MergeFileUtil.java
License:Open Source License
public static void run(String inputdir, String outputdir, Configuration conf) throws IOException {
    JobConf job = new JobConf(conf);
    job.setJobName("MergeFileUtil");
    job.setJarByClass(MergeFileUtil.class);

    FileSystem fs = FileSystem.get(job);
    if (fs.exists(new Path(outputdir))) {
        throw new IOException("outputdir: " + outputdir + " exist!!!");
    }

    FileStatus[] fss = fs.listStatus(new Path(inputdir));
    if (fss == null || fss.length <= 0) {
        throw new IOException("no input files");
    }

    IFormatDataFile ifdf = new IFormatDataFile(job);
    ifdf.open(fss[0].getPath().toString());
    job.set("ifdf.head.info", ifdf.fileInfo().head().toStr());
    ifdf.close();

    long wholesize = 0;
    for (FileStatus status : fss) {
        wholesize += status.getLen();
    }

    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputdir);
    FileOutputFormat.setOutputPath(job, new Path(outputdir));

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(IRecord.class);

    job.setMapperClass(MergeMap.class);

    job.setInputFormat(CombineFormatStorageFileInputFormat.class);
    job.setOutputFormat(MergeIFormatOutputFormat.class);

    JobClient jc = new JobClient(job);
    RunningJob rjob = jc.submitJob(job);
    try {
        String lastReport = "";
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss,SSS");
        long reportTime = System.currentTimeMillis();
        long maxReportInterval = 3 * 1000;

        while (!rjob.isComplete()) {
            Thread.sleep(1000);
            int mapProgress = Math.round(rjob.mapProgress() * 100);
            int reduceProgress = Math.round(rjob.reduceProgress() * 100);
            String report = " map = " + mapProgress + "%, reduce = " + reduceProgress + "%";
            if (!report.equals(lastReport) || System.currentTimeMillis() >= reportTime + maxReportInterval) {
                String output = dateFormat.format(Calendar.getInstance().getTime()) + report;
                System.err.println(output);
                lastReport = report;
                reportTime = System.currentTimeMillis();
            }
        }
        LOG.info(rjob.getJobState());
    } catch (IOException e1) {
        e1.printStackTrace();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
From source file:FormatStorage1.MergeFileUtil.java
License:Open Source License
public static void runold(String inputdir, String outputdir, Configuration conf) throws IOException {
    JobConf job = new JobConf(conf);
    job.setJobName("MergeFileUtil");
    job.setJarByClass(MergeFileUtil.class);

    FileSystem fs = FileSystem.get(job);
    if (fs.exists(new Path(outputdir))) {
        throw new IOException("outputdir: " + outputdir + " exist!!!");
    }

    FileStatus[] fss = fs.listStatus(new Path(inputdir));
    if (fss == null || fss.length <= 0) {
        throw new IOException("no input files");
    }

    for (FileStatus status : fss) {
        if (status.isDir()) {
            throw new IOException("!!!input dir contains directory:\t" + status.getPath().toString());
        }
    }

    IFormatDataFile ifdf = new IFormatDataFile(job);
    ifdf.open(fss[0].getPath().toString());
    job.set("ifdf.head.info", ifdf.fileInfo().head().toStr());
    ifdf.close();

    long wholesize = 0;
    for (FileStatus status : fss) {
        wholesize += status.getLen();
    }

    long fl = 512 * 1024 * 1024;
    int reduces = (int) (wholesize / fl + 1);
    job.setNumReduceTasks(reduces);

    FileInputFormat.setInputPaths(job, inputdir);
    FileOutputFormat.setOutputPath(job, new Path(outputdir));

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(IRecord.class);

    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);

    job.setInputFormat(MergeIFormatInputFormat.class);
    job.setOutputFormat(MergeIFormatOutputFormat.class);

    JobClient jc = new JobClient(job);
    RunningJob rjob = jc.submitJob(job);
    try {
        String lastReport = "";
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss,SSS");
        long reportTime = System.currentTimeMillis();
        long maxReportInterval = 3 * 1000;

        while (!rjob.isComplete()) {
            Thread.sleep(1000);
            int mapProgress = Math.round(rjob.mapProgress() * 100);
            int reduceProgress = Math.round(rjob.reduceProgress() * 100);
            String report = " map = " + mapProgress + "%, reduce = " + reduceProgress + "%";
            if (!report.equals(lastReport) || System.currentTimeMillis() >= reportTime + maxReportInterval) {
                String output = dateFormat.format(Calendar.getInstance().getTime()) + report;
                System.err.println(output);
                lastReport = report;
                reportTime = System.currentTimeMillis();
            }
        }
        LOG.info(rjob.getJobState());
    } catch (IOException e1) {
        e1.printStackTrace();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
From source file:FormatStorage1.MergeFileUtil1.java
License:Open Source License
public static void run(String inputdir, String outputdir, Configuration conf) throws IOException {
    JobConf job = new JobConf(conf);
    job.setJobName("MergeFileUtil1");
    job.setJarByClass(MergeFileUtil1.class);

    FileSystem fs = FileSystem.get(job);
    if (fs.exists(new Path(outputdir))) {
        throw new IOException("outputdir: " + outputdir + " exist!!!");
    }

    FileStatus[] fss = fs.listStatus(new Path(inputdir));
    if (fss == null || fss.length <= 0) {
        throw new IOException("no input files");
    }

    IFormatDataFile ifdf = new IFormatDataFile(job);
    ifdf.open(fss[0].getPath().toString());
    job.set("ifdf.head.info", ifdf.fileInfo().head().toStr());
    ifdf.close();

    long wholesize = 0;
    for (FileStatus status : fss) {
        wholesize += status.getLen();
    }

    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputdir);
    FileOutputFormat.setOutputPath(job, new Path(outputdir));

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(IRecord.class);

    job.setMapperClass(MergeMap.class);

    job.setInputFormat(CombineFormatStorageFileInputFormat.class);
    job.setOutputFormat(MergeIFormatOutputFormat1.class);

    JobClient jc = new JobClient(job);
    RunningJob rjob = jc.submitJob(job);
    try {
        String lastReport = "";
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss,SSS");
        long reportTime = System.currentTimeMillis();
        long maxReportInterval = 3 * 1000;

        while (!rjob.isComplete()) {
            Thread.sleep(1000);
            int mapProgress = Math.round(rjob.mapProgress() * 100);
            int reduceProgress = Math.round(rjob.reduceProgress() * 100);
            String report = " map = " + mapProgress + "%, reduce = " + reduceProgress + "%";
            if (!report.equals(lastReport) || System.currentTimeMillis() >= reportTime + maxReportInterval) {
                String output = dateFormat.format(Calendar.getInstance().getTime()) + report;
                System.err.println(output);
                lastReport = report;
                reportTime = System.currentTimeMillis();
            }
        }
        LOG.info(rjob.getJobState());
    } catch (IOException e1) {
        e1.printStackTrace();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}