com.yahoo.glimmer.indexing.generator.TermReduce.java Source code

Introduction

Here is the source code for com.yahoo.glimmer.indexing.generator.TermReduce.java, a Hadoop MapReduce Reducer from Yahoo!'s Glimmer indexing pipeline. For each term key it emits either alignment-index postings, per-document sizes, or a term record followed by per-document postings with occurrence positions.

Source

package com.yahoo.glimmer.indexing.generator;

/*
 * Copyright (c) 2012 Yahoo! Inc. All rights reserved.
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 *  Unless required by applicable law or agreed to in writing, software distributed under the License is 
 *  distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and limitations under the License.
 *  See accompanying LICENSE file.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

import com.yahoo.glimmer.indexing.generator.TermValue.Type;

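/**
 * Reducer that turns grouped (TermKey, TermValue) pairs into inverted-index records. Depending on
 * the key it writes alignment-index postings (one per distinct predicate id), per-document sizes
 * for the .sizes file, or a term record followed by per-document postings with occurrence positions.
 */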
public class TermReduce extends Reducer<TermKey, TermValue, IntWritable, IndexRecordWriterValue> {
    private static final Log LOG = LogFactory.getLog(TermReduce.class);
    public static final String MAX_INVERTEDLIST_SIZE_PARAMETER = "maxInvertiedListSize";
    public static final String MAX_POSITIONLIST_SIZE_PARAMETER = "maxPositionListSize";

    private IntWritable writerKey;
    private IndexRecordWriterTermValue writerTermValue;
    private IndexRecordWriterDocValue writerDocValue;
    private IndexRecordWriterSizeValue writerSizeValue;
    private ArrayList<Long> predicatedIds;
    private long termKeysProcessed;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        writerKey = new IntWritable();
        writerTermValue = new IndexRecordWriterTermValue();
        writerDocValue = new IndexRecordWriterDocValue();
        writerSizeValue = new IndexRecordWriterSizeValue();
        predicatedIds = new ArrayList<Long>();
    }

    @Override
    public void reduce(TermKey key, Iterable<TermValue> values, Context context)
            throws IOException, InterruptedException {
        // Skip keys with a null or empty term.
        if (key == null || key.getTerm() == null || key.getTerm().isEmpty()) {
            return;
        }

        if (termKeysProcessed % 10000 == 0) {
            String statusString = "Reducing " + key.toString();
            context.setStatus(statusString);
            LOG.info(statusString);
        }

        writerKey.set(key.getIndex());

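        // Alignment index: values carry predicate (index) ids; collapse consecutive duplicates
        // and write one posting per predicate id.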
        if (key.getIndex() == DocumentMapper.ALIGNMENT_INDEX) {
            long lastPredicateId = Long.MIN_VALUE;
            for (TermValue value : values) {
                if (value.getType() != Type.INDEX_ID) {
                    throw new IllegalStateException(
                            "Got a " + value.getType() + " value when expecting only " + Type.INDEX_ID);
                }
                if (lastPredicateId != value.getV1()) {
                    lastPredicateId = value.getV1();
                    predicatedIds.add(lastPredicateId);
                }
            }

            writerTermValue.setTerm(key.getTerm());
            writerTermValue.setOccurrenceCount(0);
            writerTermValue.setTermFrequency(predicatedIds.size());
            writerTermValue.setSumOfMaxTermPositions(0);

            context.write(writerKey, writerTermValue);

            for (Long predicateId : predicatedIds) {
                writerDocValue.setDocument(predicateId);
                context.write(writerKey, writerDocValue);
            }
            predicatedIds.clear();
        } else if (TermKey.DOC_SIZE_TERM.equals(key.getTerm())) {
            // Write the per-document sizes (the .sizes file): one (document id, size) pair per value.
            for (TermValue value : values) {
                if (Type.DOC_SIZE != value.getType()) {
                    throw new IllegalStateException(
                            "Got a " + value.getType() + " value when expecting only " + Type.DOC_SIZE);
                }

                writerSizeValue.setDocument(value.getV1());
                writerSizeValue.setSize(value.getV2());
                context.write(writerKey, writerSizeValue);
            }
        } else {
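            // Regular term: accumulate the leading TERM_STATS values, write the term record,
            // then build one doc record per document from the OCCURRENCE values that follow.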
            int termFrequency = 0;
            int termCount = 0;
            int sumOfMaxTermPositions = 0;
            TermValue value = null;

            Iterator<TermValue> valuesIt = values.iterator();
            while (valuesIt.hasNext()) {
                value = valuesIt.next();

                if (Type.TERM_STATS != value.getType()) {
                    break;
                }
                termFrequency++;
                termCount += value.getV1();
                sumOfMaxTermPositions += value.getV2();
            }

            if (value == null || Type.OCCURRENCE != value.getType()) {
                throw new IllegalStateException("Got " + (value == null ? "no" : "a " + value.getType())
                        + " value when expecting only " + Type.OCCURRENCE);
            }

            writerTermValue.setTerm(key.getTerm());
            writerTermValue.setOccurrenceCount(termCount);
            writerTermValue.setTermFrequency(termFrequency);
            writerTermValue.setSumOfMaxTermPositions(sumOfMaxTermPositions);

            context.write(writerKey, writerTermValue);

            TermValue prevValue = new TermValue();
            prevValue.set(value);

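            // Group consecutive OCCURRENCE values by document id; emit a doc record each time the
            // id changes, and once more for the final document.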
            while (value != null && value.getType() == Type.OCCURRENCE) {
                long docId = value.getV1();
                if (docId < 0) {
                    throw new IllegalStateException("Negative DocID. Key:" + key + "\nValue:" + value);
                }
                if (docId != prevValue.getV1()) {
                    // New document, write out previous postings
                    writerDocValue.setDocument(prevValue.getV1());

                    context.write(writerKey, writerDocValue);

                    // Start collecting occurrences for the new docId.
                    writerDocValue.clearOccerrences();
                    writerDocValue.addOccurrence(value.getV2());
                } else {
                    writerDocValue.addOccurrence(value.getV2());
                }

                prevValue.set(value);

                boolean last = false;
                if (valuesIt.hasNext()) {
                    value = valuesIt.next();
                    // LOG.warn("Value:" + value.toString());
                    // Duplicate occurrences should never appear; treat them as a data error.
                    if (value.equals(prevValue)) {
                        throw new IllegalStateException("For indexId " + key.getIndex() + " and term "
                                + key.getTerm() + " got a duplicate occurrence " + value.toString());
                    }
                    // Because the check above throws, the duplicate-skipping loop below never executes.
                    while (value.equals(prevValue) && valuesIt.hasNext()) {
                        value = valuesIt.next();
                    }
                    if (value.equals(prevValue) && !valuesIt.hasNext()) {
                        last = true;
                    }
                } else {
                    last = true;
                }
                if (last) {
                    // This is the last occurrence: write out the remaining
                    // positions
                    writerDocValue.setDocument(prevValue.getV1());
                    if (writerDocValue.getDocument() < 0) {
                        throw new IllegalStateException("Negative DocID. Key:" + key + "\nprevValue:" + prevValue
                                + "\nValue:" + value + "\nwriterDocValue:" + writerDocValue);
                    }
                    context.write(writerKey, writerDocValue);

                    writerDocValue.clearOccerrences();
                    value = null;
                }
            }
        }
        termKeysProcessed++;
    }
}
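
For orientation, the fragment below is a minimal, hypothetical driver sketch showing how a reducer like TermReduce can be wired into a Hadoop job. The class name TermReduceDriverSketch, the job name, and the omitted mapper, input format, and output format are assumptions for illustration, not part of the Glimmer code.

package com.yahoo.glimmer.indexing.generator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;

// Hypothetical driver sketch; names and wiring are assumptions, not Glimmer code.
public class TermReduceDriverSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "term-index-generation");
        job.setJarByClass(TermReduce.class);

        // The map side (not shown) must emit TermKey keys and TermValue values.
        job.setMapOutputKeyClass(TermKey.class);
        job.setMapOutputValueClass(TermValue.class);

        job.setReducerClass(TermReduce.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IndexRecordWriterValue.class);

        // Input paths, the InputFormat, and the custom OutputFormat that consumes
        // IndexRecordWriterValue records are omitted from this sketch.

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}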