ivory.core.preprocess.BuildTargetLangWeightedIntDocVectors.java Source code

Introduction

Here is the source code for ivory.core.preprocess.BuildTargetLangWeightedIntDocVectors.java
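
The class is a Cloud9 PowerTool: it is constructed with a Hadoop Configuration and run via runTool(). Below is a minimal driver sketch showing one way to invoke it; the class name, paths, and parameter values in the sketch are placeholders, not part of the original source.

import org.apache.hadoop.conf.Configuration;

import ivory.core.preprocess.BuildTargetLangWeightedIntDocVectors;

public class RunBuildTargetLangVectors {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder values; point these at a real Ivory index and vocabulary.
        conf.set("Ivory.IndexPath", "/user/me/index-en");
        conf.set("Ivory.CollectionName", "sample-collection");
        conf.set("Ivory.FinalVocab", "/user/me/vocab/vocab.en");
        conf.setInt("Ivory.NumMapTasks", 10);
        conf.setBoolean("Ivory.Normalize", true);

        int numDocs = new BuildTargetLangWeightedIntDocVectors(conf).runTool();
        System.out.println("Docs processed: " + numDocs);
    }
}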

Source

/*
 * Ivory: A Hadoop toolkit for Web-scale information retrieval
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0 
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.core.preprocess;

import ivory.core.RetrievalEnvironment;
import ivory.core.data.document.WeightedIntDocVector;
import ivory.lsh.driver.PwsimEnvironment;

import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import edu.umd.hooka.Vocab;
import edu.umd.hooka.alignment.HadoopAlign;
import edu.umd.cloud9.io.map.HMapIFW;
import edu.umd.cloud9.io.map.HMapSFW;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.util.map.MapKF;

/**
 * Maps term doc vectors into int doc vectors, using the term-to-id mapping.
 * This task is identical in the cross-lingual and mono-lingual cases: it works
 * both when doc vectors have been translated into English and when they are
 * originally in English. Optionally (when Ivory.Normalize is set), the weights
 * in each doc vector are length-normalized.
 *
 * @author ferhanture
 */
@SuppressWarnings("deprecation")
public class BuildTargetLangWeightedIntDocVectors extends PowerTool {
    private static final Logger sLogger = Logger.getLogger(BuildTargetLangWeightedIntDocVectors.class);

    static {
        sLogger.setLevel(Level.INFO);
    }

    protected static enum Docs {
        Total
    }

    protected static enum Terms {
        OOV, NEG
    }

    private static class MyMapper extends MapReduceBase
            implements Mapper<IntWritable, HMapSFW, IntWritable, WeightedIntDocVector> {

        static IntWritable mDocno = new IntWritable();
        private boolean normalize = false;
        private Vocab engVocabH;

        public void configure(JobConf conf) {

            normalize = conf.getBoolean("Ivory.Normalize", false);

            Path[] localFiles;
            try {
                localFiles = DistributedCache.getLocalCacheFiles(conf);
            } catch (IOException e2) {
                throw new RuntimeException("Local cache files not read properly.", e2);
            }

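            // runTool() adds exactly one file (Ivory.FinalVocab) to the
            // DistributedCache, so localFiles[0] is the English vocabulary.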
            try {
                engVocabH = HadoopAlign.loadVocab(localFiles[0], FileSystem.getLocal(conf));
            } catch (Exception e) {
                throw new RuntimeException("Error initializing term-to-id map!", e);
            }
        }

        WeightedIntDocVector weightedVectorOut = new WeightedIntDocVector();
        HMapIFW weightedVector = new HMapIFW();

        float sum2;

        public void map(IntWritable docno, HMapSFW doc, OutputCollector<IntWritable, WeightedIntDocVector> output,
                Reporter reporter) throws IOException {
            mDocno.set(docno.get());
            weightedVector.clear();

            sLogger.debug("===================================BEGIN READ DOC");
            sum2 = 0;

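            // Map each term to its integer id in the aligner's English vocabulary;
            // terms missing from the vocabulary (id < 0) are skipped and drop out
            // of the resulting int doc vector.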
            for (MapKF.Entry<String> entry : doc.entrySet()) {
                String eTerm = entry.getKey();
                int e = engVocabH.get(eTerm);
                if (e < 0) {
                    sLogger.debug(eTerm + " term in doc not found in aligner vocab");
                    reporter.incrCounter(Terms.OOV, 1);
                    continue;
                }
                float score = entry.getValue();
                if (normalize) {
                    sum2 += score * score;
                }
                weightedVector.put(e, score);
            }
            sLogger.debug("===================================END READ DOC");

            weightedVectorOut.setWeightedTerms(weightedVector);
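            // When Ivory.Normalize is set, rescale to unit length under the L2 norm.
            // Illustrative example: weights {3.0, 4.0} give sum2 = 9 + 16 = 25,
            // sqrt(sum2) = 5, and normalizeWith(5) produces {0.6, 0.8}.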
            if (normalize) {
                /*length-normalize doc vectors*/
                sum2 = (float) Math.sqrt(sum2);
                weightedVectorOut.normalizeWith(sum2);
            }
            output.collect(mDocno, weightedVectorOut);
            reporter.incrCounter(Docs.Total, 1);
        }
    }

    public static final String[] RequiredParameters = { "Ivory.NumMapTasks", "Ivory.IndexPath",
            "Ivory.Normalize", };

    public String[] getRequiredParameters() {
        return RequiredParameters;
    }

    public BuildTargetLangWeightedIntDocVectors(Configuration conf) {
        super(conf);
    }

    public int runTool() throws Exception {

        sLogger.info("PowerTool: GetTargetLangWeightedIntDocVectors");

        JobConf conf = new JobConf(BuildTargetLangWeightedIntDocVectors.class);
        FileSystem fs = FileSystem.get(conf);

        String indexPath = getConf().get("Ivory.IndexPath");

        RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

        String outputPath = env.getWeightedIntDocVectorsDirectory();
        int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0);
        int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0);
        String collectionName = getConf().get("Ivory.CollectionName");

        sLogger.info("Characteristics of the collection:");
        sLogger.info(" - CollectionName: " + collectionName);
        sLogger.info("Characteristics of the job:");
        sLogger.info(" - NumMapTasks: " + mapTasks);
        sLogger.info(" - MinSplitSize: " + minSplitSize);

        String vocabFile = getConf().get("Ivory.FinalVocab");
        DistributedCache.addCacheFile(new URI(vocabFile), conf);
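        // The cached vocabulary is read back as localFiles[0] in MyMapper.configure().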

        Path inputPath = new Path(PwsimEnvironment.getFileNameWithPars(indexPath, "TermDocs"));
        Path weightedVectorsPath = new Path(outputPath);

        if (fs.exists(weightedVectorsPath)) {
            sLogger.info("Output path already exists!");
            return -1;
        }
        conf.setJobName("GetWeightedIntDocVectors:" + collectionName);
        conf.setNumMapTasks(mapTasks);
        conf.setNumReduceTasks(0);
        conf.setInt("mapred.min.split.size", minSplitSize);
        conf.set("mapred.child.java.opts", "-Xmx2048m");
        conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false));
        FileInputFormat.setInputPaths(conf, inputPath);
        FileOutputFormat.setOutputPath(conf, weightedVectorsPath);

        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setMapOutputKeyClass(IntWritable.class);
        conf.setMapOutputValueClass(WeightedIntDocVector.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(WeightedIntDocVector.class);

        conf.setMapperClass(MyMapper.class);

        long startTime = System.currentTimeMillis();

        RunningJob rj = JobClient.runJob(conf);
        sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        Counters counters = rj.getCounters();

        long numOfDocs = counters.findCounter(Docs.Total).getCounter();

        return (int) numOfDocs;
    }
}
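
The job writes a SequenceFile of IntWritable docnos mapped to WeightedIntDocVector values under the directory returned by env.getWeightedIntDocVectorsDirectory(), and runTool() returns the Docs.Total counter, i.e. the number of documents processed. The following reader sketch can be used to inspect that output; the class name and part-file path are hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;

import ivory.core.data.document.WeightedIntDocVector;

public class DumpWeightedIntDocVectors {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Placeholder path; substitute the job's actual output part file.
        Path part = new Path("/user/me/index-en/wt-int-doc-vectors/part-00000");
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, part, conf);
        IntWritable docno = new IntWritable();
        WeightedIntDocVector vector = new WeightedIntDocVector();
        while (reader.next(docno, vector)) {
            System.out.println(docno.get() + "\t" + vector);
        }
        reader.close();
    }
}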