net.sf.katta.indexing.IndexerJob.java Source code

Introduction

Here is the source code for net.sf.katta.indexing.IndexerJob.java. It illustrates how to implement an indexer as a Hadoop map-reduce job: each map task builds a Lucene index locally and then copies it to a shared destination, where it can be deployed as a Katta shard.

Source

/**
 * Copyright 2008 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.sf.katta.indexing;

import java.io.File;
import java.io.IOException;
import java.util.Random;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Illustrates how to implement an indexer as a Hadoop map-reduce job.
 */
public class IndexerJob {

    public static void main(String[] args) throws IOException {

        if (args.length != 3) {
            String usage = "IndexerJob <in text file/dir> <out katta index dir> <numOfShards>";
            System.out.println(usage);
            System.exit(1);
        }

        IndexerJob indexerJob = new IndexerJob();
        String input = args[0];
        String output = args[1];
        int numOfShards = Integer.parseInt(args[2]);
        indexerJob.startIndexer(input, output, numOfShards);

    }

    public void startIndexer(String path, String finalDestination, int numOfShards) throws IOException {
        // create job conf with class pointing into job jar.
        JobConf jobConf = new JobConf(IndexerJob.class);
        jobConf.setJobName("indexer");
        jobConf.setMapRunnerClass(Indexer.class);
        // alternatively, use a text file and a TextInputFormat (example below)
        jobConf.setInputFormat(SequenceFileInputFormat.class);
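        // e.g. for plain-text input one could instead use:
        // jobConf.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);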

        Path input = new Path(path);
        FileInputFormat.setInputPaths(jobConf, input);
        // we just set the output path to make hadoop happy.
        FileOutputFormat.setOutputPath(jobConf, new Path(finalDestination));
        // setting the folder where lucene indexes will be copied when finished.
        jobConf.set("finalDestination", finalDestination);
        // Important: switch speculative execution off.
        // We don't want any output to be duplicated.
        jobConf.setSpeculativeExecution(false);

        // The number of map tasks is equal to the number of input splits.
        // By default the number of input splits equals the number of HDFS
        // blocks of the input file(s). To get the right number of shards we
        // need to calculate the best input split size.

        FileSystem fs = FileSystem.get(input.toUri(), jobConf);
        FileStatus[] status = fs.globStatus(input);
        long size = 0;
        for (FileStatus fileStatus : status) {
            size += fileStatus.getLen();
        }
        long optimalSplitSize = size / numOfShards;
        jobConf.set("mapred.min.split.size", "" + optimalSplitSize);
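        // Illustrative example (assumed numbers, not from the original source):
        // with 2 GB of input and 4 shards the minimum split size becomes ~512 MB,
        // so Hadoop creates at most ~4 splits and therefore ~4 map tasks/shards.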

        // Give more memory to the Lucene indexing tasks.
        jobConf.set("mapred.child.java.opts", "-Xmx2G");
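        // Note: in the old mapred API setNumMapTasks() is only a hint; the
        // actual number of map tasks is derived from the input splits above.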
        jobConf.setNumMapTasks(1);
        jobConf.setNumReduceTasks(0);
        JobClient.runJob(jobConf);
    }

    public static class Indexer implements MapRunnable<LongWritable, Text, Text, Text> {

        private JobConf _conf;

        public void configure(JobConf conf) {
            _conf = conf;
        }

        @SuppressWarnings("deprecation")
        public void run(RecordReader<LongWritable, Text> reader, OutputCollector<Text, Text> output,
                final Reporter report) throws IOException {
            LongWritable key = reader.createKey();
            Text value = reader.createValue();

            String tmp = _conf.get("hadoop.tmp.dir");
            long millis = System.currentTimeMillis();
            String shardName = "" + millis + "-" + new Random().nextInt();
            File file = new File(tmp, shardName);
            report.progress();
            // TODO sg this should be configurable
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
            IndexWriter indexWriter = new IndexWriter(FSDirectory.open(file), analyzer, MaxFieldLength.UNLIMITED);
            indexWriter.setMergeFactor(100000);
            report.setStatus("Adding documents...");
            while (reader.next(key, value)) {
                report.progress();
                Document doc = new Document();
                String text = value.toString();
                Field contentField = new Field("content", text, Store.YES, Index.ANALYZED);
                doc.add(contentField);
                indexWriter.addDocument(doc);
            }

            report.setStatus("Done adding documents.");
            Thread t = new Thread() {
                public boolean stop = false;

                @Override
                public void run() {
                    while (!stop) {
                        // Make sure Hadoop does not kill the task in case the
                        // optimization takes longer than the task timeout.
                        report.progress();
                        try {
                            sleep(10000);
                        } catch (InterruptedException e) {
                            // Interrupted by the main thread; stop the heartbeat.
                            stop = true;
                        }
                    }
                }
            };
            t.start();
            report.setStatus("Optimizing index...");
            indexWriter.optimize();
            report.setStatus("Done optimizing!");
            report.setStatus("Closing index...");
            indexWriter.close();
            report.setStatus("Closing done!");
            FileSystem fileSystem = FileSystem.get(_conf);

            report.setStatus("Starting copy to final destination...");
            Path destination = new Path(_conf.get("finalDestination"));
            fileSystem.copyFromLocalFile(new Path(file.getAbsolutePath()), destination);
            report.setStatus("Copy to final destination done!");
            report.setStatus("Deleting tmp files...");
            FileUtil.fullyDelete(file);
            report.setStatus("Deleting tmp files done!");
            t.interrupt();
        }
    }
}
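
Usage example

A minimal sketch of how the job might be launched programmatically; it is not part of the original source. The class name, HDFS paths, and shard count are illustrative assumptions, and the input is assumed to already exist as a SequenceFile with LongWritable keys and Text values, matching what the Indexer above reads.

package net.sf.katta.indexing;

import java.io.IOException;

public class IndexerJobExample {

    public static void main(String[] args) throws IOException {
        // Hypothetical HDFS locations; adjust to your own layout.
        String input = "/user/hadoop/pages.seq";        // SequenceFile<LongWritable, Text>
        String output = "/user/hadoop/katta-shards";    // where finished Lucene shards are copied
        int numOfShards = 4;                            // target number of index shards

        new IndexerJob().startIndexer(input, output, numOfShards);
    }
}

The same run can be triggered from the command line by passing the three arguments shown in the usage string of main: the input text file or directory, the output katta index directory, and the number of shards.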