Java tutorial
/*******************************************************************************
 * Copyright (c) 2012 Nikos Papailiou.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl.html
 *
 * Contributors:
 *     Nikos Papailiou - initial API and implementation
 ******************************************************************************/
package sampler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Random;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.Tool;

import com.hp.hpl.jena.graph.Node;
import com.hp.hpl.jena.graph.Triple;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.sparql.util.NodeFactory;

import byte_import.Combiner;
import bytes.ByteValues;
import bytes.NotSupportedDatatypeException;

public class TotalOrderPrep implements Tool {

    // Fraction of input triples each mapper keeps when sampling is enabled.
    private static final float samplingRate = 0.01f;
    // Sampled triples per bucket; 3000000 for 1% sampling, 32MB regions.
    private static final int bucketSampledTriples = 650000;
    public static long regions;
    private static final String ARG_INPUTFORMAT = "my.sample";
    private Configuration conf;

    public Job createSubmittableJob(String[] args) throws IOException, ClassNotFoundException {
        Job sample_job = new Job();

        // Remember the real input format so the sampling input format can use
        // it under the hood.
        sample_job.getConfiguration().setBoolean(ARG_INPUTFORMAT, true);
        sample_job.setInputFormatClass(TextInputFormat.class);
        //sample_job.getConfiguration().set("mapred.fairscheduler.pool", "pool9");

        // Base the sample size on the number of reduce tasks that will be used
        // by the real job, but only use 1 reducer for this job (maps output very little).
        sample_job.setNumReduceTasks(1);

        // Make this job's output a temporary file: the input file for the real job's
        // TotalOrderPartitioner.
        Path partition = new Path("partitions/");
        //partition.getFileSystem(job.getConfiguration()).deleteOnExit(partition);
        conf = new Configuration();
        FileSystem fs;
        try {
            fs = FileSystem.get(conf);
            if (fs.exists(partition)) {
                fs.delete(partition, true);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        FileOutputFormat.setOutputPath(sample_job, partition);
        FileInputFormat.setInputPaths(sample_job, new Path(args[0]));
        //TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path(partition, "part-r-00000"));
        //job.setPartitionerClass(TotalOrderPartitioner.class);

        // If there's a combiner, turn it into an identity reducer to prevent
        // destruction of keys.
        sample_job.setCombinerClass(Combiner.class);

        sample_job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        sample_job.setMapOutputValueClass(ImmutableBytesWritable.class);
        sample_job.setOutputKeyClass(ImmutableBytesWritable.class);
        sample_job.setOutputValueClass(NullWritable.class);
        sample_job.setPartitionerClass(HashPartitioner.class);
        sample_job.setOutputFormatClass(SequenceFileOutputFormat.class);
        sample_job.setJarByClass(TotalOrderPrep.class);
        sample_job.setMapperClass(Map.class);
        sample_job.setReducerClass(PartitioningReducer.class);
        sample_job.setJobName("(Sampler)");
        sample_job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
        sample_job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

        return sample_job;

        // Run the job. If it fails, then it's probably because of the main job.
        /*try {
            sample_job.waitForCompletion(false);
            if (!sample_job.isSuccessful())
                throw new RuntimeException("Partition sampler job failed.");
        } catch (Exception e) {
            throw new RuntimeException("Failed to start Partition sampler.", e);
        }*/
    }

    public int run(String[] args) throws Exception {
        Job job = createSubmittableJob(args);
        job.waitForCompletion(true);
        Counters counters = job.getCounters();
        // One HBase region per partition boundary emitted by the reducer, plus one.
        regions = counters.getGroup("org.apache.hadoop.mapred.Task$Counter")
                .findCounter("REDUCE_OUTPUT_RECORDS").getValue() + 1;
        return 0;
    }

    public static class Map extends Mapper<LongWritable, Text, ImmutableBytesWritable, ImmutableBytesWritable> {
        private byte[] subject;
        private byte[] predicate;
        private byte[] object;
        private byte[] non;
        private ImmutableBytesWritable new_key = new ImmutableBytesWritable();
        private Random r = new Random();
        private static final int totsize = ByteValues.totalBytes;
        private static Boolean sampling;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            sampling = context.getConfiguration().getBoolean(ARG_INPUTFORMAT, false);
        }

        public void map(LongWritable key, Text value, Context context) throws IOException {
            if (sampling) {
                // Keep roughly samplingRate of the input lines.
                int rand = r.nextInt(1000000);
                if (rand > 1000000 * samplingRate)
                    return;
            }
            non = Bytes.toBytes("");
            String line = value.toString();
            String s, p, o;
            StringTokenizer tokenizer = new StringTokenizer(line);

            // Re-attach quoted literals that contain spaces to the current token.
            s = tokenizer.nextToken(" ");
            if (s.contains("\"")) {
                if (!s.endsWith("\"") && !s.endsWith(">"))
                    s += tokenizer.nextToken("\"") + "\"";
            }
            subject = Bytes.toBytes(s);
            //System.out.println(subject);

            p = tokenizer.nextToken(" ");
            if (p.contains("\"")) {
                if (!p.endsWith("\"") && !p.endsWith(">"))
                    p += tokenizer.nextToken("\"") + "\"";
            }
            predicate = Bytes.toBytes(p);
            //System.out.println(predicate);

            o = tokenizer.nextToken(" ");
            if (o.contains("\"")) {
                if (!o.endsWith("\"") && !o.endsWith(">"))
                    o += tokenizer.nextToken("\"") + "\"";
            }
            object = Bytes.toBytes(o);
            //System.out.println(object);
            //tokenizer.nextToken();
            //if (tokenizer.hasMoreTokens()) {
            //    return;
            //}
            /*try {
                //Reader r = new StringReader(value.toString());
                Node s1 = NodeFactory.parseNode(s);
                Node p1 = NodeFactory.parseNode(p);
                Node o1 = NodeFactory.parseNode(o);
                //Triple t = new Triple(s1, p1, o1);
                //System.out.println(s1.toString(false));
                //System.out.println(p1.toString(false));
                //System.out.println(o1.toString(false));
                return;
            } catch (Exception e) {
                System.out.println(value.toString());
                e.printStackTrace();
                return;
            }*/
            try {
                byte[] si = ByteValues.getFullValue(s);
                byte[] pi = ByteValues.getFullValue(p);
                byte[] oi = ByteValues.getFullValue(o);
                // Create the index-table entries: emit hashvalue-name pairs with byte[0] = 1.
                byte[] k;
                if (si[0] == ByteValues.TYPE_STRING) {
                    k = new byte[subject.length + totsize + 1];
                    k[0] = (byte) 1;
                    for (int i = 0; i < totsize; i++) {
                        k[i + 1] = si[i];
                    }
                    for (int i = 0; i < subject.length; i++) {
                        k[i + totsize + 1] = subject[i];
                    }
                    new_key.set(k, 0, k.length);
                    context.write(new_key, new ImmutableBytesWritable(non, 0, 0));
                }
                if (pi[0] == ByteValues.TYPE_STRING) {
                    k = new byte[predicate.length + totsize + 1];
                    k[0] = (byte) 1;
                    for (int i = 0; i < totsize; i++) {
                        k[i + 1] = pi[i];
                    }
                    for (int i = 0; i < predicate.length; i++) {
                        k[i + totsize + 1] = predicate[i];
                    }
                    new_key.set(k, 0, k.length);
                    context.write(new_key, new ImmutableBytesWritable(non, 0, 0));
                }
                if (oi[0] == ByteValues.TYPE_STRING) {
                    k = new byte[object.length + totsize + 1];
                    k[0] = (byte) 1;
                    for (int i = 0; i < totsize; i++) {
                        k[i + 1] = oi[i];
                    }
                    for (int i = 0; i < object.length; i++) {
                        k[i + totsize + 1] = object[i];
                    }
                    new_key.set(k, 0, k.length);
                    context.write(new_key, new ImmutableBytesWritable(non, 0, 0));
                }

                // Create the spo key, byte[0] = 4: key = si,pi  value = oi.
                k = new byte[totsize + totsize + totsize + 1];
                k[0] = (byte) 4;
                for (int i = 0; i < totsize; i++) {
                    k[i + 1] = si[i];
                }
                for (int i = 0; i < totsize; i++) {
                    k[i + totsize + 1] = pi[i];
                }
                for (int i = 0; i < totsize; i++) {
                    k[i + totsize + totsize + 1] = oi[i];
                }
                new_key.set(k, 0, k.length);
                context.write(new_key, new ImmutableBytesWritable(non, 0, 0));

                // Create the osp key, byte[0] = 2: key = oi,si  value = pi.
                k = new byte[totsize + totsize + totsize + 1];
                k[0] = (byte) 2;
                for (int i = 0; i < totsize; i++) {
                    k[i + 1] = oi[i];
                }
                for (int i = 0; i < totsize; i++) {
                    k[i + totsize + 1] = si[i];
                }
                for (int i = 0; i < totsize; i++) {
                    k[i + totsize + totsize + 1] = pi[i];
                }
                new_key.set(k, 0, k.length);
                context.write(new_key, new ImmutableBytesWritable(non, 0, 0));

                // Create the pos key, byte[0] = 3: key = pi,oi  value = si.
                k = new byte[totsize + totsize + totsize + 1];
                k[0] = (byte) 3;
                for (int i = 0; i < totsize; i++) {
                    k[i + 1] = pi[i];
                }
                for (int i = 0; i < totsize; i++) {
                    k[i + totsize + 1] = oi[i];
                }
                for (int i = 0; i < totsize; i++) {
                    k[i + totsize + totsize + 1] = si[i];
                }
                new_key.set(k, 0, k.length);
                context.write(new_key, new ImmutableBytesWritable(non, 0, 0));

                // Create the sop key, byte[0] = 5: key = si,oi  value = pi.
                k = new byte[totsize + totsize + totsize + 1];
                k[0] = (byte) 5;
                for (int i = 0; i < totsize; i++) {
                    k[i + 1] = si[i];
                }
                for (int i = 0; i < totsize; i++) {
                    k[i + totsize + 1] = oi[i];
                }
                for (int i = 0; i < totsize; i++) {
                    k[i + totsize + totsize + 1] = pi[i];
                }
                new_key.set(k, 0, k.length);
                context.write(new_key, new ImmutableBytesWritable(non, 0, 0));
            } catch (InterruptedException e) {
                e.printStackTrace();
            } catch (NotSupportedDatatypeException e) {
                e.printStackTrace();
            }
        }
    }

    public static class MapNew extends Mapper<LongWritable, Text, ImmutableBytesWritable, ImmutableBytesWritable> {
        private byte[] subject;
        private byte[] predicate;
        private byte[] object;
        private byte[] non;
        private ImmutableBytesWritable new_key = new ImmutableBytesWritable();
        private Random r = new Random();
        private static final int totsize = ByteValues.totalBytes;
        private static Boolean sampling;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            sampling = context.getConfiguration().getBoolean(ARG_INPUTFORMAT, false);
        }

        public void map(LongWritable key, Text value, Context context) throws IOException {
            if (sampling) {
                // Keep roughly samplingRate of the input lines.
                int rand = r.nextInt(1000000);
                if (rand > 1000000 * samplingRate)
                    return;
            }
            non = Bytes.toBytes("");
            String line = value.toString();
            String s, p, o;
            StringTokenizer tokenizer = new StringTokenizer(line);

            s = tokenizer.nextToken(" ");
            if (s.contains("\"")) {
                if (!s.endsWith("\"") && !s.endsWith(">"))
                    s += tokenizer.nextToken("\"") + "\"";
            }
            subject = Bytes.toBytes(s);
            //System.out.println(subject);

            p = tokenizer.nextToken(" ");
            if (p.contains("\"")) {
                if (!p.endsWith("\"") && !p.endsWith(">"))
                    p += tokenizer.nextToken("\"") + "\"";
            }
            predicate = Bytes.toBytes(p);
            //System.out.println(predicate);

            o = tokenizer.nextToken(" ");
            if (o.contains("\"")) {
                if (!o.endsWith("\"") && !o.endsWith(">"))
                    o += tokenizer.nextToken("\"") + "\"";
            }
            object = Bytes.toBytes(o);
            //System.out.println(object);
            //tokenizer.nextToken();
            //if (tokenizer.hasMoreTokens()) {
            //    return;
            //}
            /*try {
                //Reader r = new StringReader(value.toString());
                Node s1 = NodeFactory.parseNode(s);
                Node p1 = NodeFactory.parseNode(p);
                Node o1 = NodeFactory.parseNode(o);
                //Triple t = new Triple(s1, p1, o1);
                //System.out.println(s1.toString(false));
                //System.out.println(p1.toString(false));
                //System.out.println(o1.toString(false));
                return;
            } catch (Exception e) {
                System.out.println(value.toString());
                e.printStackTrace();
                return;
            }*/
            try {
                byte[] si = ByteValues.getFullValue(s);
                byte[] pi = ByteValues.getFullValue(p);
                byte[] oi = ByteValues.getFullValue(o);
                byte[] tel = null, type = null;
                try {
                    tel = ByteValues.getFullValue("<http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#telephone>");
                    type = ByteValues.getFullValue("<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>");
                } catch (NotSupportedDatatypeException e1) {
                    e1.printStackTrace();
                }
                // Skip telephone and rdf:type triples.
                if (Bytes.equals(pi, tel) || Bytes.equals(pi, type))
                    return;

                byte[] k;

                // Key layout: subject hash, object hash, index byte 2, predicate hash.
                k = new byte[totsize + totsize + totsize + 1];
                k[totsize + totsize] = (byte) 2;
                for (int i = 0; i < totsize; i++) {
                    k[i] = si[i];
                }
                for (int i = 0; i < totsize; i++) {
                    k[i + totsize] = oi[i];
                }
                for (int i = 0; i < totsize; i++) {
                    k[i + totsize + totsize + 1] = pi[i];
                }
                new_key.set(k, 0, k.length);
                context.write(new_key, new ImmutableBytesWritable(non, 0, 0));

                // Key layout: object hash, subject hash, index byte 3, predicate hash.
                k = new byte[totsize + totsize + totsize + 1];
                k[totsize + totsize] = (byte) 3;
                for (int i = 0; i < totsize; i++) {
                    k[i] = oi[i];
                }
                for (int i = 0; i < totsize; i++) {
                    k[i + totsize] = si[i];
                }
                for (int i = 0; i < totsize; i++) {
                    k[i + totsize + totsize + 1] = pi[i];
                }
                new_key.set(k, 0, k.length);
                context.write(new_key, new ImmutableBytesWritable(non, 0, 0));
            } catch (InterruptedException e) {
                e.printStackTrace();
            } catch (NotSupportedDatatypeException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * This reducer only emits enough keys to fill the partition file.
     */
    public static class PartitioningReducer
            extends Reducer<ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable, NullWritable> {

        @Override
        public void run(Context context) throws IOException, InterruptedException {
            int collected = 0, chunks = 0;
            float chunkSize = bucketSampledTriples * samplingRate;
            System.out.println("chunkSize: " + chunkSize);
            while (context.nextKey()) {
                if (collected > chunkSize) {
                    // Emit one key per chunk of sampled triples as a partition boundary.
                    context.write(context.getCurrentKey(), NullWritable.get());
                    collected = 0;
                    chunks++;
                } else {
                    collected++;
                }
            }
            System.out.println("chunks: " + chunks);
        }
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }
}
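
TotalOrderPrep implements Hadoop's Tool interface but the listing above has no main method. A minimal driver sketch is shown next; the class name SamplerDriver is hypothetical, and it assumes the first program argument is the input path that createSubmittableJob expects.

package sampler;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical driver: launches the sampling job through ToolRunner and
// prints the region count computed from the reducer's output records.
public class SamplerDriver {
    public static void main(String[] args) throws Exception {
        // args[0] is expected to be the input path of the triples file.
        int exitCode = ToolRunner.run(new Configuration(), new TotalOrderPrep(), args);
        System.out.println("regions: " + TotalOrderPrep.regions);
        System.exit(exitCode);
    }
}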