ivory.ptc.SortedPseudoTestCollection.java Source code

Java tutorial

Introduction

Here is the source code for ivory.ptc.SortedPseudoTestCollection.java

Source

/**
 * Ivory: A Hadoop toolkit for Web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.ptc;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.util.PowerTool;

import ivory.ptc.data.AnchorTextTarget;
import ivory.ptc.data.PseudoJudgments;
import ivory.ptc.data.PseudoQuery;
import ivory.ptc.judgments.extractor.PseudoJudgmentExtractor;
import ivory.ptc.sampling.Criterion;
import ivory.ptc.scorer.PseudoQueryScorer;

/**
 * Map-Reduce job to extract a Pseudo Test Collection according to
 * a sampling criterion.
 *
 * @author Nima Asadi
 */
@SuppressWarnings("deprecation")
public class SortedPseudoTestCollection extends PowerTool {
    /**
     * Delimiter used to split the parameter strings read from the job
     * configuration into individual parameters.
     */
    public static final String PARAMETERS_SEPARATOR = ",";

    /**
     * Misspelled original name of {@link #PARAMETERS_SEPARATOR}, retained so that
     * existing references keep compiling.
     *
     * @deprecated use {@link #PARAMETERS_SEPARATOR} instead.
     */
    @Deprecated
    public static final String PARAMETERS_SPERATOR = PARAMETERS_SEPARATOR;

    private static final Logger LOG = Logger.getLogger(SortedPseudoTestCollection.class);
    static {
        // Set this class's logger to INFO, the level used by the LOG.info calls in runTool().
        LOG.setLevel(Level.INFO);
    }

    private static class MyMapper extends MapReduceBase
            implements Mapper<Text, ArrayListWritable<AnchorTextTarget>, PseudoQuery, PseudoJudgments> {
        private static final PseudoQuery pseudoQuery = new PseudoQuery();
        private static PseudoJudgmentExtractor pseudoJudgmentExtractor;
        private static String queryText;
        private static PseudoJudgments pseudoJudgments;
        private static PseudoQueryScorer queryScorer;

        public void configure(JobConf job) {
            try {
                pseudoJudgmentExtractor = (PseudoJudgmentExtractor) Class
                        .forName(job.get("Ivory.JudgmentExtractor")).newInstance();
                String[] params = job.get("Ivory.JudgmentExtractorParameters").split(PARAMETERS_SPERATOR);
                pseudoJudgmentExtractor.setParameters(params);
            } catch (Exception e) {
                throw new RuntimeException(
                        "Mapper failed to initialize the judgment extractor: " + job.get("Ivory.QueryExtractor")
                                + " with parameters: " + job.get("Ivory.QueryExtractorParameters"));
            }

            try {
                queryScorer = (PseudoQueryScorer) Class.forName(job.get("Ivory.QueryScorer")).newInstance();
            } catch (Exception e) {
                throw new RuntimeException("Mapper failed to initialize the scorer");
            }
        }

        public void map(Text key, ArrayListWritable<AnchorTextTarget> anchorTextTargets,
                OutputCollector<PseudoQuery, PseudoJudgments> output, Reporter reporter) throws IOException {
            queryText = key.toString();
            pseudoJudgments = pseudoJudgmentExtractor.getPseudoJudgments(anchorTextTargets);

            if (pseudoJudgments.size() > 0) {
                pseudoQuery.set(queryText, queryScorer.getScore(queryText, pseudoJudgments));
                output.collect(pseudoQuery, pseudoJudgments);
            }
        }
    }

    private static class MyReducer extends MapReduceBase
            implements Reducer<PseudoQuery, PseudoJudgments, PseudoQuery, PseudoJudgments> {
        private static Criterion criterion;
        private static PseudoJudgments nextJudgments;

        public void configure(JobConf job) {
            try {
                criterion = (Criterion) Class.forName(job.get("Ivory.SamplingCriterion")).newInstance();
                String[] params = job.get("Ivory.SamplingCriterionParameters").split(PARAMETERS_SPERATOR);
                criterion.initialize(FileSystem.get(job), params);
            } catch (Exception e) {
                throw new RuntimeException(
                        "Mapper failed to initialize the sampling criterion: " + job.get("Ivory.SamplingCriterion")
                                + " with parameters: " + job.get("Ivory.SamplingCriterionParameters"));
            }
        }

        public void reduce(PseudoQuery query, Iterator<PseudoJudgments> judgments,
                OutputCollector<PseudoQuery, PseudoJudgments> output, Reporter reporter) throws IOException {
            while (judgments.hasNext()) {
                nextJudgments = judgments.next();

                if (criterion.meets(query, nextJudgments)) {
                    output.collect(query, nextJudgments);
                }
            }
        }
    }

    /**
     * Configuration property names returned by {@link #getRequiredParameters()}.
     */
    public static final String[] RequiredParameters = {
            "Ivory.JudgmentExtractor",
            "Ivory.JudgmentExtractorParameters",
            "Ivory.QueryScorer",
            "Ivory.SamplingCriterion",
            "Ivory.SamplingCriterionParameters",
            "Ivory.InputPath",
            "Ivory.OutputPath"
    };

    /**
     * Returns the configuration property names this tool declares as required.
     *
     * @return the shared {@link #RequiredParameters} array itself (not a copy)
     */
    @Override
    public String[] getRequiredParameters() {
        return RequiredParameters;
    }

    /**
     * Creates the tool with the given configuration, which is passed unchanged
     * to the {@link PowerTool} superclass constructor.
     *
     * @param conf configuration handed to the superclass
     */
    public SortedPseudoTestCollection(Configuration conf) {
        super(conf);
    }

    /**
     * Configures and runs the "SortedPTC" job: sequence-file input from
     * {@code Ivory.InputPath} is processed by {@code MyMapper} and
     * {@code MyReducer}, and the result is written as sequence files to
     * {@code Ivory.OutputPath}.
     *
     * <p>Any existing content at the output path is deleted recursively before
     * the job is submitted. The job is set up with one map task and one reduce
     * task, and the child JVM options are fixed to {@code -Xmx4096m}.
     *
     * @return {@code 0} once {@code JobClient.runJob} has returned
     * @throws Exception if the file system cannot be obtained, the output path
     *         cannot be deleted, or running the job throws
     */
    public int runTool() throws Exception {
        JobConf conf = new JobConf(getConf(), SortedPseudoTestCollection.class);
        FileSystem fs = FileSystem.get(conf);
        String inPath = conf.get("Ivory.InputPath");
        String outPath = conf.get("Ivory.OutputPath");
        Path inputPath = new Path(inPath);
        Path outputPath = new Path(outPath);
        int mapTasks = 1;
        // NOTE(review): a single reducer presumably keeps the output in one
        // globally sorted file - confirm before raising this.
        int reduceTasks = 1;

        LOG.info("SortedPseudoTestCollection");
        LOG.info(" - Input path: " + inPath);
        LOG.info(" - Output path: " + outPath);
        LOG.info(" - JudgmentExtractor: " + conf.get("Ivory.JudgmentExtractor"));
        LOG.info(" - JudgmentExtractorParameters: " + conf.get("Ivory.JudgmentExtractorParameters"));
        LOG.info(" - SamplingCriterion: " + conf.get("Ivory.SamplingCriterion"));
        LOG.info(" - SamplingCriterionParameters: " + conf.get("Ivory.SamplingCriterionParameters"));
        LOG.info(" - QueryScorer: " + conf.get("Ivory.QueryScorer"));

        conf.setJobName("SortedPTC");
        conf.setNumMapTasks(mapTasks);
        conf.setNumReduceTasks(reduceTasks);
        conf.set("mapred.child.java.opts", "-Xmx4096m");

        FileInputFormat.setInputPaths(conf, inputPath);
        FileOutputFormat.setOutputPath(conf, outputPath);

        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setMapOutputKeyClass(PseudoQuery.class);
        conf.setMapOutputValueClass(PseudoJudgments.class);
        conf.setOutputKeyClass(PseudoQuery.class);
        conf.setOutputValueClass(PseudoJudgments.class);
        conf.setMapperClass(MyMapper.class);
        conf.setReducerClass(MyReducer.class);

        // Clear any previous output; the recursive flag is spelled out instead of
        // relying on the deprecated single-argument delete(Path).
        fs.delete(outputPath, true);
        JobClient.runJob(conf);
        return 0;
    }
}