nl.utwente.mirex.QueryTermCount.java Source code


Introduction

Here is the source code for nl.utwente.mirex.QueryTermCount.java

Source

/*
 * Copyright Notice:
 * -----------------
 *
 * The contents of this file are subject to the PfTijah Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://dbappl.cs.utwente.nl/Legal/PfTijah-1.1.html
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
 * License for the specific language governing rights and limitations
 * under the License.
 * 
 * The Original Code is the Mirex system.
 * 
 * The Initial Developer of the Original Code is the "University of Twente".
 * Portions created by the "University of Twente" are
 * Copyright (C) 2010 "University of Twente".
 * All Rights Reserved.
 */

package nl.utwente.mirex;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.security.InvalidParameterException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Scanner;

import nl.utwente.mirex.util.WarcTextConverterInputFormat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;

/**
 * <b>Runs MapReduce job:</b> Gets global statistics for query file.
 * Very similar to the famous "word count" job.
 *  Inputs: (argument 1) Document representation (WARC-TREC-ID, text), 
 *         tab separated
 *         (argument 2) TREC ClueWeb queries (TREC-QUERY-ID, Query terms), 
 *         separated by a colon (":")
 *  Output: (argument 3) TREC ClueWeb queries (TREC-QUERY-ID, Query terms),
 *         separated by a colon (":"), augmented with global statistics
 * 
 * @author Djoerd Hiemstra
 * @since 0.2
 * @see AnchorExtract
 */
public class QueryTermCount {
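    // Illustrative record formats handled by this job (the identifiers and terms
    // below are made-up examples, not taken from an actual run):
    //   KEYVAL input line : clueweb09-en0000-00-00000 <TAB> the quick brown fox ...
    //   topic file line   : wt10-1: quick brown fox
    //   output topic line : wt10-1:quick=1=123=456 brown=1=234=567 fox=1=34=56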

    private static final String SysName = "MIREX";
    private static final String CollectionLength = SysName + "-LENGTH";
    private static final String NumberOfDocs = SysName + "-NDOCS";
    private static final String DF = SysName + "-DF-";
    private static final String CF = SysName + "-CF-";
    private static final String tempName = SysName + "-tmp";
    private static final String TOKENIZER = "[^0-9A-Za-z]+";

    /**
    * -- Mapper: Collects local statistics for one document. 
    */
    public static class Map extends Mapper<Text, Text, Text, LongWritable> {

        private static final LongWritable one = new LongWritable(1);

        private java.util.Map<String, String[]> trecQueries = new HashMap<String, String[]>();
        private java.util.Map<String, Integer> queryTerms = new HashMap<String, Integer>();

        public void setup(Context context) {
            Path[] queryFiles;
            try {
                queryFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
                parseQueryFile(queryFiles[0]);
            } catch (IOException ioe) {
                System.err.println(StringUtils.stringifyException(ioe));
                System.exit(1);
            }
        }

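        /**
         * Parses the topic file from the local cache. Each line is expected in the
         * form "queryId: term1 term2 ...", matching the query file format described
         * in main(); terms are lowercased and split on non-alphanumeric characters.
         */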
        private void parseQueryFile(Path queryFile) {
            try {
                BufferedReader fis = new BufferedReader(new FileReader(queryFile.toString()));
                String query = null;

                while ((query = fis.readLine()) != null) {
                    String[] fields = query.toLowerCase().split(":");
                    String[] terms = fields[1].split(TOKENIZER);
                    trecQueries.put(fields[0], terms);
                    for (int i = 0; i < terms.length; i++) {
                        queryTerms.put(terms[i], 1);
                    }
                }
            } catch (IOException ioe) {
                System.err.println(StringUtils.stringifyException(ioe));
                System.exit(1);
            }
        }

        /**
         * @param key TREC-ID
         * @param value document text
         * @param output (Query-term <i>or</i> intermediate statistic, Count)
         */
        public void map(Text key, Text value, Context context) throws IOException, InterruptedException {

            // Store the document's term frequencies, but only for terms that occur in one of the queries
            java.util.Map<String, Integer> docTF = new HashMap<String, Integer>();
            Long doclength = 0L;
            Scanner scan = new Scanner(value.toString().toLowerCase()).useDelimiter(TOKENIZER);
            while (scan.hasNext()) {
                doclength++;
                String term = scan.next();
                if (queryTerms.containsKey(term)) {
                    Integer freq = (Integer) docTF.get(term);
                    if (freq != null)
                        docTF.put(term, freq + 1);
                    else
                        docTF.put(term, 1);
                }
            }
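            // Emit the document length and a document count of one, plus, for each
            // query term found in this document, its within-document frequency
            // (CF-prefixed key) and a document frequency of one (DF-prefixed key);
            // the combiner/reducer sums these counts per key.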
            context.write(new Text(CollectionLength), new LongWritable(doclength));
            context.write(new Text(NumberOfDocs), one);
            Iterator<String> iterator = docTF.keySet().iterator();
            while (iterator.hasNext()) {
                String term = (String) iterator.next();
                Integer count = (Integer) docTF.get(term);
                context.write(new Text(CF + term), new LongWritable(count));
                context.write(new Text(DF + term), one);
            }
        }
    }

    /**
    * -- Reducer: Sums all statistics. 
    */
    public static class Reduce extends Reducer<Text, LongWritable, Text, LongWritable> {

        /**
         * @param key Query-term <i>or</i> intermediate statistic
         * @param values Counts
         * @param output (Query-term <i>or</i> intermediate statistic, Summed count)
         */
        public void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws InterruptedException, IOException {

            Long sum = 0L;
            for (LongWritable val : values) {
                sum += val.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    /**
    * Configure the Hadoop job
    * @throws IOException 
    */
    public static Job configureJob(String jobName, String format, Path inputFile, Path tempOut, Path topicFile)
            throws IOException, InvalidParameterException {
        // Set job configuration
        Job job = new Job();
        job.setJobName(jobName);
        job.setJarByClass(QueryTermCount.class);

        // Set intermediate output (override defaults)
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Set output (override defaults)
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Set map-reduce classes
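        // Reduce also serves as the combiner below: summing LongWritable counts is
        // associative and commutative, so partial map-side aggregation is safe.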
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);

        // Set input-output format
        if (format.equals("KEYVAL")) {
            job.setInputFormatClass(KeyValueTextInputFormat.class);
        } else if (format.equals("WARC")) {
            job.setInputFormatClass(WarcTextConverterInputFormat.class);
        } else {
            throw new InvalidParameterException("inputFormat must be either WARC or KEYVAL");
        }
        job.setOutputFormatClass(TextOutputFormat.class);
        // also works without this setting:
        // job.getConfiguration().setBoolean("mapred.output.compress", false);
        job.setNumReduceTasks(1);

        // Set input-output paths
        FileInputFormat.setInputPaths(job, inputFile);
        FileOutputFormat.setOutputPath(job, tempOut);

        // Set job specific distributed cache file (query file)
        DistributedCache.addCacheFile(topicFile.toUri(), job.getConfiguration());

        return job;
    }

    /**
     * Runs the MapReduce job that gets global statistics
     * @param args (optional) input format, either WARC or KEYVAL (default WARC); path to parsed document collection (use AnchorExtract); TREC query file; MIREX query file with global statistics
     * @usage
     * <code> % hadoop jar mirex-0.2.jar nl.utwente.mirex.QueryTermCount WARC warc wt2010-topics.queries-only wt2010-topics.stats </code> 
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 3 && args.length != 4) {
            System.out.printf("Usage: %s [inputFormat] inputFiles topicFile outputFile\n",
                    QueryTermCount.class.getSimpleName());
            System.out.println("          inputFormat: either WARC or KEYVAL; default WARC");
            System.out.println("          inputFiles: path to data");
            System.out.println("          outputFile: topic file with statistics");
            System.out.println("          topicFile: topic file in format queryId: term1 term2...");
            System.exit(1);
        }

        Path tempOut = new Path(tempName);
        int argc = 0;
        String inputFormat = "WARC";
        if (args.length > 3) {
            inputFormat = args[argc++];
        }
        Path inputFile = new Path(args[argc++]);
        Path topicFile = new Path(args[argc++]);
        Path outputFile = new Path(args[argc++]);

        java.util.Map<String, Long> queryCounts = new HashMap<String, Long>();

        // Stop if the output file already exists
        FileSystem hdfs = FileSystem.get(new Configuration());
        if (hdfs.exists(outputFile)) {
            System.err.println("Output file " + outputFile + " already exists.");
            System.exit(1);
        }
        hdfs.delete(tempOut, true);

        // Run the job
        Job job = configureJob("QueryTermCount", inputFormat, inputFile, tempOut, topicFile);
        job.waitForCompletion(true);

        // Read the global statistics from all files in tempOut whose names start with "part"
        try {
            String tempLine;
            FileStatus[] status = hdfs.listStatus(tempOut);
            for (int i = 0; i < status.length; i++) {
                String fileName = status[i].getPath().getName();
                if (!fileName.startsWith("part"))
                    continue;
                FSDataInputStream dis = hdfs.open(status[i].getPath());
                BufferedReader in = new BufferedReader(new InputStreamReader(dis));
                while ((tempLine = in.readLine()) != null) {
                    String[] fields = tempLine.split("\t");
                    queryCounts.put(fields[0], Long.parseLong(fields[1]));
                }
                dis.close();
            }
        } catch (IOException ioe) {
            System.err.println(StringUtils.stringifyException(ioe));
            System.exit(1);
        }

        // Write new topic file with global statistics
        try {
            String tempLine;
            FSDataOutputStream dos = hdfs.create(outputFile);
            dos.writeBytes(
                    "#MIREX-COMMENT: query term weight, document frequency, collection frequency (for each term)\n");
            dos.writeBytes("#MIREX-COLLECTION:" + inputFile + "\n");
            dos.writeBytes("#" + CollectionLength + ":" + queryCounts.get(CollectionLength) + "\n");
            dos.writeBytes("#" + NumberOfDocs + ":" + queryCounts.get(NumberOfDocs) + "\n");

            FSDataInputStream dis = hdfs.open(topicFile);
            BufferedReader in = new BufferedReader(new InputStreamReader(dis));
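            // Each output line has the form "queryId:term=1=df=cf term=1=df=cf ...",
            // where 1 is the (uniform) query term weight, df the document frequency,
            // and cf the collection frequency of the term in the collection.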
            while ((tempLine = in.readLine()) != null) {
                String[] fields = tempLine.toLowerCase().split(":");
                dos.writeBytes(fields[0] + ":");
                String[] terms = fields[1].replaceAll("=", " ").split(TOKENIZER);
                for (int i = 0; i < terms.length; i++) {
                    Long df, cf;
                    if (queryCounts.containsKey(DF + terms[i])) {
                        df = queryCounts.get(DF + terms[i]);
                        cf = queryCounts.get(CF + terms[i]);
                    } else {
                        df = 0L;
                        cf = 0L;
                    }
                    dos.writeBytes(terms[i] + "=1=" + df.toString() + "=" + cf.toString());
                    if (i < terms.length - 1)
                        dos.writeBytes(" ");
                }
                dos.writeBytes("\n");
            }
            dis.close();
            dos.close();
        } catch (IOException ioe) {
            System.err.println(StringUtils.stringifyException(ioe));
            System.exit(1);
        }
        hdfs.close();
    }

}
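
Example

A minimal sketch of how the job might be invoked and what the first lines of the resulting statistics file could look like; the collection path and the two numbers are illustrative assumptions, not output from an actual run:

% hadoop jar mirex-0.2.jar nl.utwente.mirex.QueryTermCount WARC warc wt2010-topics.queries-only wt2010-topics.stats
% hadoop fs -cat wt2010-topics.stats | head -4
#MIREX-COMMENT: query term weight, document frequency, collection frequency (for each term)
#MIREX-COLLECTION:warc
#MIREX-LENGTH:21563728312
#MIREX-NDOCS:50220423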