com.nearinfinity.blur.mapreduce.BlurReducer.java Source code

Introduction

Here is the source code for com.nearinfinity.blur.mapreduce.BlurReducer.java
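
BlurReducer is the Hadoop Reducer stage of Blur's bulk-indexing job. It consumes BlurMutate values grouped by row id, converts each record into a Lucene Document, and either rebuilds a shard from scratch (REBUILD) or merges the changes into an existing index (UPDATE). During cleanup of a rebuild it copies the locally built index into HDFS, optionally running an optimize pass along the way.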

Source

/*
 * Copyright (C) 2011 Near Infinity Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.nearinfinity.blur.mapreduce;

import static com.nearinfinity.blur.lucene.LuceneConstant.LUCENE_VERSION;
import static com.nearinfinity.blur.utils.BlurConstants.RECORD_ID;
import static com.nearinfinity.blur.utils.BlurConstants.ROW_ID;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.NoLockFactory;
import org.apache.lucene.util.IOUtils;
import org.apache.zookeeper.ZooKeeper;

import com.nearinfinity.blur.analysis.BlurAnalyzer;
import com.nearinfinity.blur.log.Log;
import com.nearinfinity.blur.log.LogFactory;
import com.nearinfinity.blur.lucene.search.FairSimilarity;
import com.nearinfinity.blur.mapreduce.BlurMutate.MUTATE_TYPE;
import com.nearinfinity.blur.mapreduce.BlurTask.INDEXING_TYPE;
import com.nearinfinity.blur.store.compressed.CompressedFieldDataDirectory;
import com.nearinfinity.blur.store.hdfs.HdfsDirectory;
import com.nearinfinity.blur.thrift.generated.Column;
import com.nearinfinity.blur.thrift.generated.TableDescriptor;
import com.nearinfinity.blur.utils.BlurConstants;
import com.nearinfinity.blur.utils.BlurUtil;
import com.nearinfinity.blur.utils.Converter;
import com.nearinfinity.blur.utils.IterableConverter;
import com.nearinfinity.blur.utils.RowIndexWriter;

public class BlurReducer extends Reducer<BytesWritable, BlurMutate, BytesWritable, BlurMutate> {

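    // Orders index files largest-first (ties broken by name); used by
    // getFilesOrderedBySize to drive the copy phase in cleanupFromRebuild.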
    static class LuceneFileComparator implements Comparator<String> {

        private Directory _directory;

        public LuceneFileComparator(Directory directory) {
            _directory = directory;
        }

        @Override
        public int compare(String o1, String o2) {
            try {
                long fileLength1 = _directory.fileLength(o1);
                long fileLength2 = _directory.fileLength(o2);
                if (fileLength1 == fileLength2) {
                    return o1.compareTo(o2);
                }
                // Sort descending by length without casting the difference to
                // int, which can overflow for large files.
                return fileLength2 < fileLength1 ? -1 : 1;
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }

    protected static final Log LOG = LogFactory.getLog(BlurReducer.class);
    protected static final long REPORT_PERIOD = TimeUnit.SECONDS.toMillis(10);
    protected static final double MB = 1024 * 1024;
    protected IndexWriter _writer;
    protected Directory _directory;
    protected BlurAnalyzer _analyzer;
    protected BlurTask _blurTask;

    protected Counter _recordCounter;
    protected Counter _rowCounter;
    protected Counter _fieldCounter;
    protected Counter _rowBreak;
    protected Counter _rowFailures;
    protected StringBuilder _builder = new StringBuilder();
    protected byte[] _copyBuf;
    protected Configuration _configuration;
    protected long _start;
    protected long _previousRow;
    protected long _previousRecord;
    protected long _prev;
    protected IndexReader _reader;
    protected Map<String, Document> _newDocs = new HashMap<String, Document>();
    protected Set<String> _recordIdsToDelete = new HashSet<String>();
    protected Term _rowIdTerm = new Term(BlurConstants.ROW_ID);
    protected ZooKeeper _zookeeper;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        _blurTask = BlurTask.read(context.getConfiguration());
        _configuration = context.getConfiguration();
        setupCounters(context);
        setupAnalyzer(context);
        setupDirectory(context);
        setupWriter(context);
        if (_blurTask.getIndexingType() == INDEXING_TYPE.UPDATE) {
            _reader = IndexReader.open(_directory);
        }
    }

    protected void setupCounters(Context context) {
        _rowCounter = context.getCounter(BlurTask.getCounterGroupName(), BlurTask.getRowCounterName());
        _recordCounter = context.getCounter(BlurTask.getCounterGroupName(), BlurTask.getRecordCounterName());
        _fieldCounter = context.getCounter(BlurTask.getCounterGroupName(), BlurTask.getFieldCounterName());
        _rowBreak = context.getCounter(BlurTask.getCounterGroupName(), BlurTask.getRowBreakCounterName());
        _rowFailures = context.getCounter(BlurTask.getCounterGroupName(), BlurTask.getRowFailureCounterName());
        _start = System.currentTimeMillis();
        _prev = System.currentTimeMillis();
    }

    @Override
    protected void reduce(BytesWritable key, Iterable<BlurMutate> values, Context context)
            throws IOException, InterruptedException {
        if (!index(key, values, context)) {
            _rowFailures.increment(1);
        }
    }

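    /**
     * Indexes one row's worth of mutations. Returns false, which the caller
     * counts as a row failure, when the row exceeds the configured maximum
     * number of records per row.
     */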
    protected boolean index(BytesWritable key, Iterable<BlurMutate> values, Context context) throws IOException {
        int recordCount = 0;
        _newDocs.clear();
        _recordIdsToDelete.clear();
        boolean rowIdSet = false;

        for (BlurMutate mutate : values) {
            BlurRecord record = mutate.getRecord();
            if (!rowIdSet) {
                String rowId = record.getRowId();
                _rowIdTerm = _rowIdTerm.createTerm(rowId);
                rowIdSet = true;
            }
            if (mutate.getMutateType() == MUTATE_TYPE.DELETE) {
                _recordIdsToDelete.add(record.getRecordId());
                continue;
            }
            Document document = toDocument(record, _builder);
            _newDocs.put(record.getRecordId(), document);

            context.progress();
            recordCount++;
            if (recordCount >= _blurTask.getMaxRecordsPerRow()) {
                return false;
            }
            if (_blurTask.getIndexingType() == INDEXING_TYPE.UPDATE) {
                fetchOldRecords();
            }
        }

        List<Document> docs = documentsToIndex(new ArrayList<Document>(_newDocs.values()));
        if (docs.size() > 0) {
            docs.get(0).add(BlurConstants.PRIME_DOC_FIELD);
        }

        switch (_blurTask.getIndexingType()) {
        case REBUILD:
            _writer.addDocuments(docs);
            break;
        case UPDATE:
            _writer.updateDocuments(_rowIdTerm, docs);
            break;
        default:
            break;
        }

        _recordCounter.increment(recordCount);
        _rowCounter.increment(1);
        if (_prev + REPORT_PERIOD < System.currentTimeMillis()) {
            long records = _recordCounter.getValue();
            long rows = _rowCounter.getValue();

            long now = System.currentTimeMillis();

            double overAllSeconds = (now - _start) / 1000.0;
            double overAllRecordRate = records / overAllSeconds;
            double overAllRowsRate = rows / overAllSeconds;

            double seconds = (now - _prev) / 1000.0;
            double recordRate = (records - _previousRecord) / seconds;
            double rowsRate = (rows - _previousRow) / seconds;

            String status = String.format(
                    "Totals [%d Row, %d Records], Avg Rates [%.1f Row/s, %.1f Records/s] Rates [%.1f Row/s, %.1f Records/s]",
                    rows, records, overAllRowsRate, overAllRecordRate, rowsRate, recordRate);

            LOG.info(status);
            context.setStatus(status);

            _previousRecord = records;
            _previousRow = rows;
            _prev = now;
        }
        return true;
    }

    protected List<Document> documentsToIndex(List<Document> list) {
        return list;
    }

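    /**
     * For UPDATE indexing: pulls the row's existing records from the old
     * index so records absent from this batch survive the rewrite, then drops
     * any records marked for deletion.
     */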
    protected void fetchOldRecords() throws IOException {
        TermDocs termDocs = _reader.termDocs(_rowIdTerm);
        try {
            // find all records for the row that are not deleted.
            while (termDocs.next()) {
                int doc = termDocs.doc();
                if (!_reader.isDeleted(doc)) {
                    Document document = _reader.document(doc);
                    String recordId = document.get(RECORD_ID);
                    // add them to the new records if the new records do not contain them.
                    if (!_newDocs.containsKey(recordId)) {
                        _newDocs.put(recordId, document);
                    }
                }
            }
        } finally {
            // TermDocs holds index resources and must be closed.
            termDocs.close();
        }

        // delete all records that should be removed.
        for (String recordId : _recordIdsToDelete) {
            _newDocs.remove(recordId);
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        switch (_blurTask.getIndexingType()) {
        case UPDATE:
            cleanupFromUpdate(context);
            return;
        case REBUILD:
            cleanupFromRebuild(context);
            return;
        default:
            break;
        }
    }

    protected void cleanupFromUpdate(Context context) throws IOException {
        _writer.commit();
        _writer.close();
        if (_reader != null) {
            // close the reader opened in setup() for UPDATE indexing.
            _reader.close();
        }
    }

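    /**
     * After a rebuild, commits and closes the local index, then moves it into
     * the final HDFS directory: either through an optimizing
     * IndexWriter.addIndexes pass or by a straight file-by-file copy.
     */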
    protected void cleanupFromRebuild(Context context) throws IOException, InterruptedException {
        _writer.commit();
        _writer.close();

        IndexReader reader = IndexReader.open(_directory);

        TableDescriptor descriptor = _blurTask.getTableDescriptor();

        Path directoryPath = _blurTask.getDirectoryPath(context);
        remove(directoryPath);

        NoLockFactory lockFactory = NoLockFactory.getNoLockFactory();

        Directory destDirectory = getDestDirectory(descriptor, directoryPath);
        destDirectory.setLockFactory(lockFactory);

        boolean optimize = _blurTask.getOptimize();

        if (optimize) {
            context.setStatus("Starting Copy-Optimize Phase");
            IndexWriterConfig conf = new IndexWriterConfig(LUCENE_VERSION, _analyzer);
            TieredMergePolicy policy = (TieredMergePolicy) conf.getMergePolicy();
            policy.setUseCompoundFile(false);
            long s = System.currentTimeMillis();
            IndexWriter writer = new IndexWriter(getBiggerBuffers(destDirectory), conf);
            writer.addIndexes(reader);
            writer.close();
            long e = System.currentTimeMillis();
            context.setStatus("Copying phase took [" + (e - s) + " ms]");
            LOG.info("Copying phase took [" + (e - s) + " ms]");
        } else {
            context.setStatus("Starting Copy-Optimize Phase");
            long s = System.currentTimeMillis();
            List<String> files = getFilesOrderedBySize(_directory);
            long totalBytesToCopy = getTotalBytes(_directory);
            long totalBytesCopied = 0;
            long startTime = System.currentTimeMillis();
            for (String file : files) {
                totalBytesCopied += copy(_directory, destDirectory, file, file, context, totalBytesCopied,
                        totalBytesToCopy, startTime);
            }
            long e = System.currentTimeMillis();
            context.setStatus("Copying phase took [" + (e - s) + " ms]");
            LOG.info("Copying phase took [" + (e - s) + " ms]");
        }
        reader.close();
    }

    protected Directory getBiggerBuffers(Directory destDirectory) {
        return new BufferedDirectory(destDirectory, 32768);
    }

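    /**
     * Wraps the destination HDFS directory in a CompressedFieldDataDirectory,
     * defaulting to Hadoop's DefaultCodec when the table descriptor does not
     * name a compression codec.
     */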
    protected Directory getDestDirectory(TableDescriptor descriptor, Path directoryPath) throws IOException {
        String compressionClass = descriptor.compressionClass;
        if (compressionClass == null) {
            compressionClass = "org.apache.hadoop.io.compress.DefaultCodec";
        }
        // The descriptor's compression block size is currently ignored; a
        // fixed 32K block size is always used.
        int compressionBlockSize = 32768;
        HdfsDirectory dir = new HdfsDirectory(directoryPath);
        return new CompressedFieldDataDirectory(dir, getInstance(compressionClass), compressionBlockSize);
    }

    protected CompressionCodec getInstance(String compressionClass) throws IOException {
        try {
            CompressionCodec codec = (CompressionCodec) Class.forName(compressionClass).newInstance();
            if (codec instanceof Configurable) {
                Configurable configurable = (Configurable) codec;
                configurable.setConf(_configuration);
            }
            return codec;
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    protected void remove(Path directoryPath) throws IOException {
        FileSystem fileSystem = FileSystem.get(directoryPath.toUri(), _configuration);
        fileSystem.delete(directoryPath, true);
    }

    protected long getTotalBytes(Directory directory) throws IOException {
        long total = 0;
        for (String file : directory.listAll()) {
            total += directory.fileLength(file);
        }
        return total;
    }

    protected long copy(Directory from, Directory to, String src, String dest, Context context,
            long totalBytesCopied, long totalBytesToCopy, long startTime) throws IOException {
        IndexOutput os = to.createOutput(dest);
        IndexInput is = from.openInput(src);
        IOException priorException = null;
        try {
            return copyBytes(is, os, is.length(), context, totalBytesCopied, totalBytesToCopy, startTime, src);
        } catch (IOException ioe) {
            priorException = ioe;
        } finally {
            IOUtils.closeWhileHandlingException(priorException, os, is);
        }
        return 0; // unreachable: closeWhileHandlingException rethrows priorException
    }

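    /**
     * Streams numBytes from in to out in buffer-sized chunks, calling
     * report() whenever REPORT_PERIOD elapses and ticking task progress on
     * every chunk so the task is not killed for inactivity.
     */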
    protected long copyBytes(IndexInput in, IndexOutput out, long numBytes, Context context, long totalBytesCopied,
            long totalBytesToCopy, long startTime, String src) throws IOException {
        if (_copyBuf == null) {
            _copyBuf = new byte[BufferedIndexInput.BUFFER_SIZE];
        }
        long start = System.currentTimeMillis();
        long copied = 0;
        while (numBytes > 0) {
            if (start + REPORT_PERIOD < System.currentTimeMillis()) {
                report(context, totalBytesCopied + copied, totalBytesToCopy, startTime, src);
                start = System.currentTimeMillis();
            }
            final int toCopy = (int) (numBytes > _copyBuf.length ? _copyBuf.length : numBytes);
            in.readBytes(_copyBuf, 0, toCopy);
            out.writeBytes(_copyBuf, 0, toCopy);
            numBytes -= toCopy;
            copied += toCopy;
            context.progress();
        }
        return copied;
    }

    protected List<String> getFilesOrderedBySize(final Directory directory) throws IOException {
        List<String> files = new ArrayList<String>(Arrays.asList(directory.listAll()));
        Collections.sort(files, new LuceneFileComparator(directory));
        return files;
    }

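    /**
     * UPDATE indexing writes directly to the table's HDFS directory; REBUILD
     * indexing builds against a local temp directory that is copied to HDFS
     * in cleanupFromRebuild.
     */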
    protected void setupDirectory(Context context) throws IOException {
        TableDescriptor descriptor = _blurTask.getTableDescriptor();
        switch (_blurTask.getIndexingType()) {
        case UPDATE:
            Path directoryPath = _blurTask.getDirectoryPath(context);
            _directory = getDestDirectory(descriptor, directoryPath);

            NoLockFactory lockFactory = NoLockFactory.getNoLockFactory();
            _directory.setLockFactory(lockFactory);
            return;
        case REBUILD:
            File dir = new File(System.getProperty("java.io.tmpdir"));
            File path = new File(dir, "index");
            rm(path);
            LOG.info("Using local path [" + path + "] for indexing.");
            String compressionClass = descriptor.compressionClass;
            if (compressionClass == null) {
                compressionClass = "org.apache.hadoop.io.compress.DefaultCodec";
            }

            Directory localDirectory = FSDirectory.open(path);
            // As in getDestDirectory, the descriptor's compression block size
            // is currently ignored in favor of a fixed 32K block size.
            int compressionBlockSize = 32768;
            CompressedFieldDataDirectory compressedFieldDataDirectory = new CompressedFieldDataDirectory(
                    localDirectory, getInstance(compressionClass), compressionBlockSize);
            _directory = new ProgressableDirectory(compressedFieldDataDirectory, context);
            return;
        default:
            break;
        }
    }

    protected String getNodeName(Context context) {
        return context.getTaskAttemptID().toString();
    }

    protected void rm(File path) {
        if (!path.exists()) {
            return;
        }
        if (path.isDirectory()) {
            for (File f : path.listFiles()) {
                rm(f);
            }
        }
        path.delete();
    }

    protected <T> T nullCheck(T o) {
        if (o == null) {
            throw new NullPointerException();
        }
        return o;
    }

    protected void setupWriter(Context context) throws IOException {
        nullCheck(_directory);
        nullCheck(_analyzer);
        IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, _analyzer);
        config.setSimilarity(new FairSimilarity());
        config.setRAMBufferSizeMB(_blurTask.getRamBufferSizeMB());
        TieredMergePolicy mergePolicy = (TieredMergePolicy) config.getMergePolicy();
        mergePolicy.setUseCompoundFile(false);
        _writer = new IndexWriter(_directory, config);
    }

    protected void setupAnalyzer(Context context) {
        _analyzer = new BlurAnalyzer(_blurTask.getTableDescriptor().getAnalyzerDefinition());
    }

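    /**
     * Converts a BlurRecord into a Lucene Document: row id and record id are
     * stored un-analyzed, and the remaining columns are added through the
     * BlurAnalyzer, incrementing the field counter once per column.
     */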
    protected Document toDocument(BlurRecord record, StringBuilder builder) {
        Document document = new Document();
        document.add(new Field(ROW_ID, record.getRowId(), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        document.add(new Field(RECORD_ID, record.getRecordId(), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        String columnFamily = record.getFamily();
        RowIndexWriter.addColumns(document, _analyzer, builder, columnFamily,
                new IterableConverter<BlurColumn, Column>(record.getColumns(), new Converter<BlurColumn, Column>() {
                    @Override
                    public Column convert(BlurColumn from) throws Exception {
                        _fieldCounter.increment(1);
                        return new Column(from.getName(), from.getValue());
                    }
                }));
        return document;
    }

    protected static void report(Context context, long totalBytesCopied, long totalBytesToCopy, long startTime,
            String src) {
        long now = System.currentTimeMillis();
        double seconds = (now - startTime) / 1000.0;
        double rate = totalBytesCopied / seconds;
        String time = estimateTimeToComplete(rate, totalBytesCopied, totalBytesToCopy);

        String status = String.format(
                "%.1f Complete - Time Remaining [%s s], Copy rate [%.1f MB/s], Total Copied [%.1f MB], Total To Copy [%.1f MB]",
                getPerComplete(totalBytesCopied, totalBytesToCopy), time, getMb(rate), getMb(totalBytesCopied),
                getMb(totalBytesToCopy));
        LOG.info(status);
        context.setStatus(status);
    }

    protected static double getPerComplete(long totalBytesCopied, long totalBytesToCopy) {
        return ((double) totalBytesCopied / (double) totalBytesToCopy) * 100.0;
    }

    protected static double getMb(double b) {
        return b / MB;
    }

    protected static String estimateTimeToComplete(double rate, long totalBytesCopied, long totalBytesToCopy) {
        long whatsLeft = totalBytesToCopy - totalBytesCopied;
        long secondsLeft = (long) (whatsLeft / rate);
        return BlurUtil.humanizeTime(secondsLeft, TimeUnit.SECONDS);
    }
}
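
Usage

Below is a minimal sketch of a Hadoop job driver that wires in this reducer. It is illustrative only: the mapper, input format, and the serialization of a configured BlurTask into the job configuration are assumptions outside this listing; only the reducer class and its key/value types are taken from the source above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapreduce.Job;

import com.nearinfinity.blur.mapreduce.BlurMutate;
import com.nearinfinity.blur.mapreduce.BlurReducer;

public class BlurIndexJobSketch {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // A real driver would first configure a BlurTask (table descriptor,
        // indexing type, counters) and write it into this configuration,
        // since BlurReducer.setup() calls BlurTask.read(configuration).
        Job job = new Job(configuration, "blur-index-sketch");
        job.setJarByClass(BlurIndexJobSketch.class);
        job.setReducerClass(BlurReducer.class);

        // Mutations must arrive grouped by row id, matching the reducer's
        // Reducer<BytesWritable, BlurMutate, BytesWritable, BlurMutate> types.
        job.setMapOutputKeyClass(BytesWritable.class);
        job.setMapOutputValueClass(BlurMutate.class);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(BlurMutate.class);

        // Mapper, input format, and paths are omitted; they depend on the
        // source data and are outside the scope of this listing.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}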