hdfs.hdfsadapter.XMLJob.java Source code

Introduction

Here is the source code for hdfs.hdfsadapter.XMLJob.java, a Hadoop MapReduce driver that reads XML records from HDFS, splitting the input on a configurable start/end tag.

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package hdfs.hdfsadapter;

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;

/**
 *
 * @author efi
 */
public class XMLJob extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {

        long startTime = System.currentTimeMillis();

        // Paths of input and output directory
        Path input = new Path(args[0]); //input path
        Path output = new Path(args[1]); //output path
        Path temp = new Path("buffer.txt");

        // Create configuration
        Configuration conf = super.getConf();
        //conf.set("mapred.map.tasks", );
        conf.set("fs.default.name", "hdfs://localhost:9000");
        String tag = args[2];
        conf.set("start_tag", "<" + tag + ">");
        conf.set("end_tag", "</" + tag + ">");
        // Get a handle to the HDFS file system
        FileSystem hdfs = FileSystem.get(conf);

        // Delete output if it exists to avoid error
        if (hdfs.exists(output)) {
            hdfs.delete(output, true);
        }
        if (hdfs.exists(temp)) {
            hdfs.delete(temp, true);
        }
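        // Create an empty buffer file in HDFS and register it in the
        // distributed cache, presumably so the map tasks can locate it.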
        hdfs.createNewFile(temp);
        DistributedCache.addCacheFile(new URI("buffer.txt"), conf);

        Job read = new Job(conf, "Read from HDFS");
        // Map-only job: no reduce phase
        read.setNumReduceTasks(0);
        // Assign Map and Reduce class
        read.setJarByClass(XmlReadMapper.class);
        read.setMapperClass(XmlReadMapper.class);
        // Define the data type of key and value
        read.setMapOutputKeyClass(Text.class); //key from map
        read.setMapOutputValueClass(Text.class);//value from map
        // Set input path 
        FileInputFormat.addInputPath(read, input);

        //How to read each block
        //1.Whole Block
        //read.setInputFormatClass(XmlInputFormatBlockSolution.class);

        //2.One Buffer 
        //read.setInputFormatClass(XmlInputFormatOneBufferSolution.class);

        //3.Two buffers
        read.setInputFormatClass(XmlInputFormatTwoBufferSolution.class);

        // Set output path
        FileOutputFormat.setOutputPath(read, output);
        read.setOutputFormatClass(TextOutputFormat.class);

        //Execute job 
        int code = read.waitForCompletion(true) ? 0 : 1;

        URI[] filenames = DistributedCache.getCacheFiles(conf); // cache files registered above (currently unused)

        long endTime = System.currentTimeMillis();
        long totalTime = endTime - startTime;
        System.out.println("Total execution time (ms): " + totalTime);

        return code;
    }

}
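
To launch this driver, an entry point along the following lines could be used. This is a minimal sketch that is not part of the original listing: the launcher class name XMLJobDriver is an assumption, and ToolRunner simply passes the Configuration to the Tool (so getConf() in run() is populated) and invokes run() with the remaining arguments.

package hdfs.hdfsadapter;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class XMLJobDriver {

    public static void main(String[] args) throws Exception {
        // Expected arguments: <input path> <output path> <xml tag name>
        int exitCode = ToolRunner.run(new Configuration(), new XMLJob(), args);
        System.exit(exitCode);
    }
}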