com.yahoo.glimmer.indexing.preprocessor.PrepTool.java Source code

Introduction

Here is the source code for com.yahoo.glimmer.indexing.preprocessor.PrepTool.java
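PrepTool is the Hadoop driver for Glimmer's RDF tuples pre-processor. It runs a MapReduce job that maps each input tuple to the resources it mentions (optionally including contexts), can add the class IRIs of an OWL ontology as extra resources, and, when run with a single reducer, assigns 'global' ids and writes the results through a custom output format.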

Source

package com.yahoo.glimmer.indexing.preprocessor;

/*
 * Copyright (c) 2012 Yahoo! Inc. All rights reserved.
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 *  Unless required by applicable law or agreed to in writing, software distributed under the License is 
 *  distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and limitations under the License.
 *  See accompanying LICENSE file.
 */

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.semanticweb.owlapi.model.OWLClass;
import org.semanticweb.owlapi.model.OWLOntology;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import com.yahoo.glimmer.indexing.OntologyLoader;

public class PrepTool extends Configured implements Tool {
    private static final int DEFAULT_REDUCER_COUNT = 1;
    public static final String NO_CONTEXTS_ARG = "excludeContexts";
    private static final String ONTOLOGY_ARG = "ontology";
    private static final String REDUCER_COUNT_ARG = "reducers";
    private static final String OUTPUT_ARG = "output";
    private static final String INPUT_ARG = "input";

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new PrepTool(), args);
        System.exit(ret);
    }

    @Override
    public int run(String[] args) throws Exception {

        SimpleJSAP jsap = new SimpleJSAP(PrepTool.class.getName(), "RDF tuples pre-processor for Glimmer",
                new Parameter[] {
                        new Switch(NO_CONTEXTS_ARG, 'C', NO_CONTEXTS_ARG,
                                "Don't process the contexts for each tuple."),
                        new FlaggedOption(ONTOLOGY_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'O',
                                ONTOLOGY_ARG),
                        new FlaggedOption(REDUCER_COUNT_ARG, JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT,
                                JSAP.NOT_REQUIRED, 'r', REDUCER_COUNT_ARG),
                        new UnflaggedOption(INPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                                "HDFS location for the input data."),
                        new UnflaggedOption(OUTPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                                "HDFS location for the out data."), });

        JSAPResult jsapResult = jsap.parse(args);
        if (!jsapResult.success()) {
            System.err.print(jsap.getUsage());
            System.exit(1);
        }

        Configuration config = getConf();

        boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false);
        config.setBoolean(TuplesToResourcesMapper.INCLUDE_CONTEXTS_KEY, withContexts);

        // The ontology if any...
        String ontologyFilename = jsapResult.getString(ONTOLOGY_ARG);
        if (ontologyFilename != null) {
            // Load the ontology, closing the file stream when done.
            OWLOntology ontology;
            try (InputStream ontologyInputStream = new FileInputStream(ontologyFilename)) {
                ontology = OntologyLoader.load(ontologyInputStream);
            }
            System.out.println(
                    "Loaded ontology from " + ontologyFilename + " with " + ontology.getAxiomCount() + " axioms.");

            ArrayList<String> ontologyClasses = new ArrayList<String>();
            for (OWLClass owlClass : ontology.getClassesInSignature()) {
                ontologyClasses.add(owlClass.getIRI().toString());
            }
            System.out.println("Adding " + ontologyClasses.size() + " classes from ontology.");
            config.setStrings(TuplesToResourcesMapper.EXTRA_RESOURCES, ontologyClasses.toArray(new String[0]));
        } else {
            System.out.println("No ontology filename set in conf.  No ontology has been loaded.");
        }

        Job job = Job.getInstance(config);
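        // Note: Job.getInstance copies the Configuration, so any further changes to 'config' won't reach the job.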
        job.setJarByClass(PrepTool.class);

        job.setJobName(PrepTool.class.getName() + "-part1-" + System.currentTimeMillis());
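        // TextInputFormat feeds the mapper one line of text at a time (presumably one tuple per line).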
        job.setInputFormatClass(TextInputFormat.class);

        job.setMapperClass(TuplesToResourcesMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        int reducerCount = jsapResult.getInt(REDUCER_COUNT_ARG, DEFAULT_REDUCER_COUNT);
        job.setNumReduceTasks(reducerCount);
        if (reducerCount == 1) {
            // We assign 'global' ids in the reducer, which only works when there
            // is exactly one reducer. The drawback is that a single reducer can run out of
            // local disk space during the pre-reduce merge with big data sets like WCC.

            job.setReducerClass(ResourcesReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Object.class);
            job.setOutputFormatClass(ResourceRecordWriter.OutputFormat.class);
        } else {
            /*
             * TODO: Take the functionality of the reducer and move it to run on
             * the gateway. We then use n identity reducers, the output of which
             * will be read and merged as streams on the gateway.
             */
        }

        FileInputFormat.setInputPaths(job, new Path(jsapResult.getString(INPUT_ARG)));

        Path outputDir = new Path(jsapResult.getString(OUTPUT_ARG));
        FileOutputFormat.setOutputPath(job, outputDir);

        if (!job.waitForCompletion(true)) {
            System.err.println("Failed to process tuples from " + jsapResult.getString(INPUT_ARG));
            return 1;
        }

        // If there was only one reducer we now have:
        // - One file per reducer containing lists of URLs (resources) for
        //   subjects, predicates, objects and contexts.
        // - One file per reducer that contains all resources: subjects +
        //   predicates + objects + contexts.
        // - One file per reducer that contains the subjects + all <predicate>
        //   <object>|"Literal" <context> for that subject.

        // If there was more than one reducer we now have N files that need to be merged on the gateway. TODO.

        return 0;
    }
}
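
For context, here is a minimal sketch of how the tool might be invoked programmatically. Only the flag letters and the argument order come from the JSAP definitions above; the paths and the reducer count are hypothetical examples.

import org.apache.hadoop.util.ToolRunner;

import com.yahoo.glimmer.indexing.preprocessor.PrepTool;

public class PrepToolExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical paths; substitute your own ontology file and HDFS locations.
        int exitCode = ToolRunner.run(new PrepTool(), new String[] {
                "-C",                                // exclude contexts ('C' switch, NO_CONTEXTS_ARG)
                "-O", "/local/path/to/ontology.owl", // local ontology file, read with FileInputStream
                "-r", "1",                           // reducer count; must be 1 for 'global' id assignment
                "hdfs://namenode/data/tuples",       // required unflagged input path
                "hdfs://namenode/data/prep-out",     // required unflagged output path
        });
        System.exit(exitCode);
    }
}

In practice the same arguments would be passed to PrepTool's own main method, typically via hadoop jar, which runs it through ToolRunner in exactly this way.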