ivory.ptc.AnchorTextInvertedIndex.java Source code

Java tutorial

Introduction

Here is the source code for ivory.ptc.AnchorTextInvertedIndex.java

Source

/**
 * Ivory: A Hadoop toolkit for Web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.ptc;

import ivory.ptc.data.AnchorTextTarget;
import ivory.ptc.judgments.weighting.WeightingScheme;

import java.io.IOException;
import java.net.URI;
import java.util.Collections;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.data.AnchorText;
import edu.umd.cloud9.io.array.ArrayListOfIntsWritable;
import edu.umd.cloud9.io.array.ArrayListWritable;

/**
 * Map-Reduce job that constructs an anchor text inverted index.
 * The inverted index contains, for each unique anchor text,
 * a list of documents that are pointed to by that anchor text.
 *
 * <p>The job is configured through the properties listed in
 * {@link #RequiredParameters}. The entries of
 * {@code Ivory.WeightingSchemeParameters} (separated by
 * {@link #PARAMETER_SEPARATER}) are shipped to the tasks through the
 * distributed cache and handed to the configured
 * {@link WeightingScheme} in the mapper.
 *
 * @author Nima Asadi
 */
@SuppressWarnings("deprecation")
public class AnchorTextInvertedIndex extends PowerTool {
    private static final Logger LOG = Logger.getLogger(AnchorTextInvertedIndex.class);

    /** Separator between the individual entries of {@code Ivory.WeightingSchemeParameters}. */
    public static final String PARAMETER_SEPARATER = ",";

    static {
        LOG.setLevel(Level.INFO);
    }

    /**
     * Emits one (anchor text, target) pair for every retained anchor of a
     * target document. The input key is the target document id; the input
     * value is the list of anchors attached to that document.
     */
    private static class MyMapper extends MapReduceBase
            implements Mapper<IntWritable, ArrayListWritable<AnchorText>, Text, AnchorTextTarget> {
        // Reusable output objects. Kept per mapper instance (not static) so that
        // several mapper instances living in one JVM never share mutable state.
        private final AnchorTextTarget anchorTextTarget = new AnchorTextTarget();
        private final Text keyOut = new Text();
        // Weighting scheme used to rank target documents
        private WeightingScheme weightingScheme;

        /**
         * Instantiates the weighting scheme named by {@code Ivory.WeightingScheme}
         * and initializes it with the local paths of the distributed-cache files.
         *
         * @param job the job configuration
         * @throws RuntimeException if the cache files cannot be resolved or the
         *         weighting scheme cannot be created or initialized; the original
         *         exception is attached as the cause
         */
        @Override
        public void configure(JobConf job) {
            Path[] localFiles;
            try {
                localFiles = DistributedCache.getLocalCacheFiles(job);
            } catch (IOException e) {
                throw new RuntimeException("Local cache files not read properly.", e);
            }

            // No cache files registered: treat as an empty parameter list
            // instead of failing with a NullPointerException below.
            if (localFiles == null) {
                localFiles = new Path[0];
            }

            String[] params = new String[localFiles.length];
            for (int i = 0; i < params.length; i++) {
                params[i] = localFiles[i].toString();
            }

            try {
                weightingScheme = (WeightingScheme) Class.forName(job.get("Ivory.WeightingScheme")).newInstance();
                weightingScheme.initialize(FileSystem.getLocal(job), params);
            } catch (Exception e) {
                throw new RuntimeException(
                        "Mapper failed to initialize the weighting scheme: " + job.get("Ivory.WeightingScheme")
                                + " with parameters: " + job.get("Ivory.WeightingSchemeParameters"),
                        e);
            }
        }

        /**
         * Emits, for each external in-link anchor of the target document, the
         * anchor text as key and the target (id, source documents, weight) as value.
         *
         * @param key id of the target document
         * @param anchors anchors attached to the target document
         * @param output collector receiving the (anchor text, target) pairs
         * @param reporter progress reporter (unused)
         * @throws IOException if the collector fails
         */
        @Override
        public void map(IntWritable key, ArrayListWritable<AnchorText> anchors,
                OutputCollector<Text, AnchorTextTarget> output, Reporter reporter) throws IOException {
            anchorTextTarget.setTarget(key.get());
            for (AnchorText anchor : anchors) {
                // Internal links provide navigational information which are not useful for our
                // purposes, so only external in-links are kept.
                if (!anchor.isExternalInLink()) {
                    continue;
                }

                keyOut.set(anchor.getText());
                anchorTextTarget.setSources(new ArrayListOfIntsWritable(anchor.getDocuments()));
                anchorTextTarget.setWeight(weightingScheme.getWeight(key.get(), anchor));
                output.collect(keyOut, anchorTextTarget);
            }
        }
    }

    /**
     * Gathers all targets emitted for one anchor text into a single sorted list.
     */
    private static class MyReducer extends MapReduceBase
            implements Reducer<Text, AnchorTextTarget, Text, ArrayListWritable<AnchorTextTarget>> {
        // Reusable output list, cleared at the start of every reduce call.
        private final ArrayListWritable<AnchorTextTarget> outList = new ArrayListWritable<AnchorTextTarget>();

        /**
         * Copies every incoming target, sorts the copies by the natural ordering
         * of {@link AnchorTextTarget} and emits them under the anchor text.
         *
         * @param anchorText the anchor text shared by all values
         * @param values targets pointed to by this anchor text
         * @param output collector receiving the (anchor text, target list) pair
         * @param reporter progress reporter (unused)
         * @throws IOException if the collector fails
         */
        @Override
        public void reduce(Text anchorText, Iterator<AnchorTextTarget> values,
                OutputCollector<Text, ArrayListWritable<AnchorTextTarget>> output, Reporter reporter)
                throws IOException {
            outList.clear();
            while (values.hasNext()) {
                // Store a copy of each value rather than the object handed out by the iterator.
                outList.add(new AnchorTextTarget(values.next()));
            }

            Collections.sort(outList);
            output.collect(anchorText, outList);
        }
    }

    /** Configuration properties that must be set before the tool is run. */
    public static final String[] RequiredParameters = { "Ivory.NumMapTasks", "Ivory.NumReduceTasks",
            "Ivory.InputPath", "Ivory.OutputPath", "Ivory.WeightingScheme", "Ivory.WeightingSchemeParameters", };

    /**
     * Returns the names of the configuration properties this tool requires.
     *
     * @return {@link #RequiredParameters}
     */
    @Override
    public String[] getRequiredParameters() {
        return RequiredParameters;
    }

    /**
     * Creates the tool on top of the given configuration.
     *
     * @param conf configuration holding the required parameters
     */
    public AnchorTextInvertedIndex(Configuration conf) {
        super(conf);
    }

    /**
     * Configures and runs the index-building job. Any existing output
     * directory is deleted before the job is submitted.
     *
     * @return 0 after {@code JobClient.runJob} has returned
     * @throws Exception if a weighting scheme parameter is not a valid URI or
     *         the job cannot be set up or run
     */
    @Override
    public int runTool() throws Exception {
        JobConf conf = new JobConf(getConf(), AnchorTextInvertedIndex.class);
        FileSystem fs = FileSystem.get(conf);
        String inPath = conf.get("Ivory.InputPath");
        String outPath = conf.get("Ivory.OutputPath");
        Path inputPath = new Path(inPath);
        Path outputPath = new Path(outPath);
        int mapTasks = conf.getInt("Ivory.NumMapTasks", 1);
        int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 100);
        String weightingSchemeParameters = conf.get("Ivory.WeightingSchemeParameters");

        LOG.info("BuildAnchorTextInvertedIndex");
        LOG.info(" - input path: " + inPath);
        LOG.info(" - output path: " + outPath);
        LOG.info(" - number of reducers: " + reduceTasks);
        LOG.info(" - weighting scheme: " + conf.get("Ivory.WeightingScheme"));
        LOG.info(" - weighting scheme parameters: " + weightingSchemeParameters);

        // Ship every weighting scheme parameter file to the tasks. Blank entries
        // (e.g. from an empty parameter string or a trailing separator) are skipped.
        if (weightingSchemeParameters != null) {
            for (String param : weightingSchemeParameters.split(PARAMETER_SEPARATER)) {
                String trimmed = param.trim();
                if (trimmed.length() == 0) {
                    continue;
                }
                DistributedCache.addCacheFile(new URI(trimmed), conf);
            }
        }

        conf.setJobName("BuildAnchorTextInvertedIndex");
        conf.setNumMapTasks(mapTasks);
        conf.setNumReduceTasks(reduceTasks);
        conf.set("mapred.child.java.opts", "-Xmx4096m");
        conf.setInt("mapred.task.timeout", 60000000);

        FileInputFormat.setInputPaths(conf, inputPath);
        FileOutputFormat.setOutputPath(conf, outputPath);

        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(AnchorTextTarget.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(ArrayListWritable.class);
        conf.setMapperClass(MyMapper.class);
        conf.setReducerClass(MyReducer.class);

        // Remove any previous output recursively so the job can write to the same location.
        fs.delete(outputPath, true);
        JobClient.runJob(conf);
        return 0;
    }
}