gr.ntua.h2rdf.sampler.TotalOrderPrep.java Source code

Introduction

Here is the source code for gr.ntua.h2rdf.sampler.TotalOrderPrep.java, a Hadoop MapReduce job from the H2RDF project that samples an N-Triples dataset in order to compute partition boundary keys (HBase region splits) for the subsequent bulk-import job.

Source

/*******************************************************************************
 * Copyright (c) 2012 Nikos Papailiou. 
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl.html
 * 
 * Contributors:
 *     Nikos Papailiou - initial API and implementation
 ******************************************************************************/
package gr.ntua.h2rdf.sampler;

import gr.ntua.h2rdf.bytes.ByteValues;
import gr.ntua.h2rdf.bytes.H2RDFNode;
import gr.ntua.h2rdf.bytes.NotSupportedDatatypeException;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.Tool;

import com.hp.hpl.jena.graph.Triple;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import com.hp.hpl.jena.sparql.sse.SSEParseException;

import gr.ntua.h2rdf.byteImport.Combiner;

public class TotalOrderPrep implements Tool {

    private static final float samplingRate = 0.001f;
    private static final int bucketSampledTriples = 3000000; // for 1% sampling; 650000 for 10% sampling; targets ~32MB regions
    public static long regions;
    private static final String ARG_INPUTFORMAT = "my.sample";
    private Configuration conf;

    public Job createSubmittableJob(String[] args) throws IOException, ClassNotFoundException {

        Job sample_job = new Job();

        // Enable sampling: this flag tells the map tasks to keep only a small
        // random fraction of the input triples.

        sample_job.getConfiguration().setBoolean(ARG_INPUTFORMAT, true);
        sample_job.setInputFormatClass(TextInputFormat.class);

        //sample_job.getConfiguration().set("mapred.fairscheduler.pool", "pool9");
        // A single reducer is enough here: after sampling, the maps emit very
        // little data, and one reducer must see all keys to pick boundaries.
        sample_job.setNumReduceTasks(1);

        // Make this job's output the partition file that the real job's
        // TotalOrderPartitioner will read.
        Path partition = new Path("partitions/");
        //partition.getFileSystem(job.getConfiguration()).deleteOnExit(partition);

        conf = new Configuration();
        // Remove any stale partition file; let the declared IOException
        // propagate instead of swallowing it and continuing with bad state.
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(partition)) {
            fs.delete(partition, true);
        }
        FileOutputFormat.setOutputPath(sample_job, partition);
        FileInputFormat.setInputPaths(sample_job, new Path(args[0]));
        //TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path(partition, "part-r-00000"));
        //job.setPartitionerClass(TotalOrderPartitioner.class);

        // Run the bulk-import Combiner on map output so less data reaches
        // the single reducer.
        sample_job.setCombinerClass(Combiner.class);

        sample_job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        sample_job.setMapOutputValueClass(ImmutableBytesWritable.class);
        sample_job.setOutputKeyClass(ImmutableBytesWritable.class);
        sample_job.setOutputValueClass(NullWritable.class);
        sample_job.setPartitionerClass(HashPartitioner.class);
        sample_job.setOutputFormatClass(SequenceFileOutputFormat.class);
        sample_job.setJarByClass(TotalOrderPrep.class);
        sample_job.setMapperClass(Map.class);
        sample_job.setReducerClass(PartitioningReducer.class);
        sample_job.setJobName("(Sampler)");
        sample_job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
        sample_job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
        return sample_job;

    }

    public int run(String[] args) throws Exception {

        Job job = createSubmittableJob(args);
        job.waitForCompletion(true);
        Counters counters = job.getCounters();
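        // Each reduce output record is one partition boundary key, so the
        // implied number of regions is the boundary count plus one.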
        regions = counters.getGroup("org.apache.hadoop.mapred.Task$Counter").findCounter("REDUCE_OUTPUT_RECORDS")
                .getValue() + 1;

        return 0;
    }

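    /*
     * Keys emitted by the mapper (a one-byte tag followed by fixed-size hashes):
     *   tag 1: [1][hash(node)][node string]   -- reverse-hash index entry
     *   tag 4: [4][hash(s)][hash(p)][hash(o)] -- SPO permutation
     *   tag 2: [2][hash(o)][hash(s)][hash(p)] -- OSP permutation
     *   tag 3: [3][hash(p)][hash(o)][hash(s)] -- POS permutation
     * Values are empty; only the key distribution matters for partitioning.
     */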
    public static class Map extends Mapper<LongWritable, Text, ImmutableBytesWritable, ImmutableBytesWritable> {
        private byte[] subject;
        private byte[] predicate;
        private byte[] object;
        private byte[] non;
        private ImmutableBytesWritable new_key = new ImmutableBytesWritable();
        private Random r = new Random();
        private static final int totsize = ByteValues.totalBytes;
        private boolean sampling;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            sampling = context.getConfiguration().getBoolean(ARG_INPUTFORMAT, false);
        }

        public void map(LongWritable key, Text value, Context context) throws IOException {
            if (sampling) {
                int rand = r.nextInt(1000000);
                if (rand > 1000000 * samplingRate) {
                    value.clear();
                    return;
                }
            }
            non = Bytes.toBytes("");

            Model model = ModelFactory.createDefaultModel();
            InputStream is = new ByteArrayInputStream(value.toString().getBytes("UTF-8"));
            model.read(is, null, "N-TRIPLE");
            StmtIterator it = model.listStatements();
            while (it.hasNext()) {
                Triple tr = it.nextStatement().asTriple();
                try {
                    H2RDFNode nodeS = new H2RDFNode(tr.getSubject());
                    H2RDFNode nodeP = new H2RDFNode(tr.getPredicate());
                    H2RDFNode nodeO = new H2RDFNode(tr.getObject());
                    byte[] si = nodeS.getHashValue();
                    byte[] pi = nodeP.getHashValue();
                    byte[] oi = nodeO.getHashValue();

                    //reverse hash values
                    subject = nodeS.getStringValue();
                    predicate = nodeP.getStringValue();
                    object = nodeO.getStringValue();

                    // Index entries: emit (tag 1 | hash | node string) so that
                    // node names can later be recovered from their hashes.
                    byte[] k;
                    if (subject != null) {
                        k = new byte[subject.length + totsize + 1];
                        k[0] = (byte) 1;
                        System.arraycopy(si, 0, k, 1, totsize);
                        System.arraycopy(subject, 0, k, totsize + 1, subject.length);
                        new_key.set(k, 0, k.length);
                        context.write(new_key, new ImmutableBytesWritable(non, 0, 0));
                    }
                    if (predicate != null) {
                        k = new byte[predicate.length + totsize + 1];
                        k[0] = (byte) 1;
                        System.arraycopy(pi, 0, k, 1, totsize);
                        System.arraycopy(predicate, 0, k, totsize + 1, predicate.length);
                        new_key.set(k, 0, k.length);
                        context.write(new_key, new ImmutableBytesWritable(non, 0, 0));
                    }
                    if (object != null) {
                        k = new byte[object.length + totsize + 1];
                        k[0] = (byte) 1;
                        System.arraycopy(oi, 0, k, 1, totsize);
                        System.arraycopy(object, 0, k, totsize + 1, object.length);
                        new_key.set(k, 0, k.length);
                        context.write(new_key, new ImmutableBytesWritable(non, 0, 0));
                    }

                    // SPO permutation: tag 4, key = (hash(s), hash(p), hash(o))
                    k = new byte[3 * totsize + 1];
                    k[0] = (byte) 4;
                    System.arraycopy(si, 0, k, 1, totsize);
                    System.arraycopy(pi, 0, k, totsize + 1, totsize);
                    System.arraycopy(oi, 0, k, 2 * totsize + 1, totsize);
                    new_key.set(k, 0, k.length);
                    context.write(new_key, new ImmutableBytesWritable(non, 0, 0));

                    // OSP permutation: tag 2, key = (hash(o), hash(s), hash(p))
                    k = new byte[3 * totsize + 1];
                    k[0] = (byte) 2;
                    System.arraycopy(oi, 0, k, 1, totsize);
                    System.arraycopy(si, 0, k, totsize + 1, totsize);
                    System.arraycopy(pi, 0, k, 2 * totsize + 1, totsize);
                    new_key.set(k, 0, k.length);
                    context.write(new_key, new ImmutableBytesWritable(non, 0, 0));

                    // POS permutation: tag 3, key = (hash(p), hash(o), hash(s))
                    k = new byte[3 * totsize + 1];
                    k[0] = (byte) 3;
                    System.arraycopy(pi, 0, k, 1, totsize);
                    System.arraycopy(oi, 0, k, totsize + 1, totsize);
                    System.arraycopy(si, 0, k, 2 * totsize + 1, totsize);
                    new_key.set(k, 0, k.length);
                    context.write(new_key, new ImmutableBytesWritable(non, 0, 0));

                } catch (SSEParseException | InterruptedException
                        | NotSupportedDatatypeException | StringIndexOutOfBoundsException e) {
                    e.printStackTrace();
                }
            }

            value.clear();
        }

    }

    /**
     * Emits one boundary key per chunk of sampled keys; the emitted keys
     * become the region split points written to the partition file.
     */
    public static class PartitioningReducer
            extends Reducer<ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable, NullWritable> {

        @Override
        public void run(Context context) throws IOException, InterruptedException {

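            // With samplingRate = 0.001 and bucketSampledTriples = 3000000,
            // chunkSize is 3000: one boundary key is emitted per ~3000 sampled
            // keys, approximating one region per ~3000000 keys of the full data.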
            int collected = 0, chunks = 0;
            float chunkSize = bucketSampledTriples * samplingRate;
            System.out.println("chunkSize: " + chunkSize);
            while (context.nextKey()) {
                if (collected > chunkSize) {
                    context.write(context.getCurrentKey(), NullWritable.get());
                    collected = 0;
                    chunks++;
                } else {
                    collected++;
                }
            }
            System.out.println("chunks: " + chunks);
        }

    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }
}
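
Usage

TotalOrderPrep implements Hadoop's Tool interface, so it can be driven through ToolRunner. A minimal launcher sketch, assuming the first argument is the HDFS path of the N-Triples input (the SamplerDriver class name below is hypothetical):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

import gr.ntua.h2rdf.sampler.TotalOrderPrep;

public class SamplerDriver {
    public static void main(String[] args) throws Exception {
        // args[0]: HDFS directory holding the N-Triples input files
        int exitCode = ToolRunner.run(new Configuration(), new TotalOrderPrep(), args);
        // After the job completes, the boundary keys sit in the "partitions/"
        // output directory and TotalOrderPrep.regions holds the implied
        // number of HBase regions.
        System.out.println("regions: " + TotalOrderPrep.regions);
        System.exit(exitCode);
    }
}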