org.apache.accumulo.examples.wikisearch.ingest.WikipediaPartitionedMapper.java Source code

Introduction

Here is the source code for org.apache.accumulo.examples.wikisearch.ingest.WikipediaPartitionedMapper.java. This class is the Hadoop Mapper used by the Accumulo wikisearch example to ingest already-partitioned Wikipedia articles: it emits document and in-partition index mutations keyed by table name through the MapReduce context, and writes global index, reverse index, and metadata entries directly to Accumulo via a MultiTableBatchWriter. An illustrative driver sketch follows the listing.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * 
 */
package org.apache.accumulo.examples.wikisearch.ingest;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.MultiTableBatchWriter;
import org.apache.accumulo.core.client.MutationsRejectedException;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.ColumnVisibility;
import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor.Article;
import org.apache.accumulo.examples.wikisearch.iterator.GlobalIndexUidCombiner;
import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer;
import org.apache.accumulo.examples.wikisearch.protobuf.Uid;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

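/**
 * Mapper stage of the partitioned wikisearch ingest. Input pairs are (language, Article) records
 * that have already been parsed and partitioned; for each article the mapper emits document and
 * in-partition field index mutations keyed by table name via the MapReduce context, and buffers
 * global index, reverse index, and metadata entries in LRU combiners that write directly to
 * Accumulo through a MultiTableBatchWriter.
 */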
public class WikipediaPartitionedMapper extends Mapper<Text, Article, Text, Mutation> {

    // private static final Logger log = Logger.getLogger(WikipediaPartitionedMapper.class);

    public final static Charset UTF8 = Charset.forName("UTF-8");
    public static final String DOCUMENT_COLUMN_FAMILY = "d";
    public static final String METADATA_EVENT_COLUMN_FAMILY = "e";
    public static final String METADATA_INDEX_COLUMN_FAMILY = "i";
    public static final String TOKENS_FIELD_NAME = "TEXT";

    private static final Value NULL_VALUE = new Value(new byte[0]);
    private static final String cvPrefix = "all|";

    private int numPartitions = 0;

    private Text tablename = null;
    private Text indexTableName = null;
    private Text reverseIndexTableName = null;
    private Text metadataTableName = null;

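    /**
     * Immutable description of where a buffered entry will land (row, column family, column
     * qualifier, visibility, timestamp); used as the cache key for the LRU output combiners.
     */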
    private static class MutationInfo {
        final String row;
        final String colfam;
        final String colqual;
        final ColumnVisibility cv;
        final long timestamp;

        public MutationInfo(String row, String colfam, String colqual, ColumnVisibility cv, long timestamp) {
            super();
            this.row = row;
            this.colfam = colfam;
            this.colqual = colqual;
            this.cv = cv;
            this.timestamp = timestamp;
        }

        @Override
        public boolean equals(Object obj) {
            if (!(obj instanceof MutationInfo))
                return false;
            MutationInfo other = (MutationInfo) obj;
            return (row == other.row || row.equals(other.row))
                    && (colfam == other.colfam || colfam.equals(other.colfam)) && colqual.equals(other.colqual)
                    && (cv == other.cv || cv.equals(other.cv)) && timestamp == other.timestamp;
        }

        @Override
        public int hashCode() {
            return row.hashCode() ^ colfam.hashCode() ^ colqual.hashCode() ^ cv.hashCode() ^ (int) timestamp;
        }
    }

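    // In-memory LRU caches that fold repeated entries for the same MutationInfo key together and
    // emit one combined mutation per key when an entry is evicted or flush() is called in cleanup().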
    private LRUOutputCombiner<MutationInfo, CountAndSet> wikiIndexOutput;
    private LRUOutputCombiner<MutationInfo, CountAndSet> wikiReverseIndexOutput;
    private LRUOutputCombiner<MutationInfo, Value> wikiMetadataOutput;

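    /**
     * Running count of UIDs plus the set of article ids seen for one index key. The set is nulled
     * out once it grows past GlobalIndexUidCombiner.MAX, which marks the emitted Uid.List as IGNORE.
     */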
    private static class CountAndSet {
        public int count;
        public HashSet<String> set;

        public CountAndSet(String entry) {
            set = new HashSet<String>();
            set.add(entry);
            count = 1;
        }
    }

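    // Shared writer used by the combiner outputs to send index, reverse index, and metadata
    // mutations straight to Accumulo rather than through the MapReduce output.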
    MultiTableBatchWriter mtbw;

    @Override
    public void setup(final Context context) {
        Configuration conf = context.getConfiguration();
        tablename = new Text(WikipediaConfiguration.getTableName(conf));
        indexTableName = new Text(tablename + "Index");
        reverseIndexTableName = new Text(tablename + "ReverseIndex");
        metadataTableName = new Text(tablename + "Metadata");

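        // Batch writer shared across the three side tables: roughly 10 MB of buffered mutations,
        // 1 second maximum latency, 10 write threads.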
        try {
            mtbw = WikipediaConfiguration.getConnector(conf).createMultiTableBatchWriter(10000000, 1000, 10);
        } catch (AccumuloException e) {
            throw new RuntimeException(e);
        } catch (AccumuloSecurityException e) {
            throw new RuntimeException(e);
        }

        final Text metadataTableNameFinal = metadataTableName;
        final Text indexTableNameFinal = indexTableName;
        final Text reverseIndexTableNameFinal = reverseIndexTableName;

        numPartitions = WikipediaConfiguration.getNumPartitions(conf);

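        // Fold shared by the global index and reverse index caches: add the counts and union the
        // UID sets, dropping the set entirely once it passes GlobalIndexUidCombiner.MAX.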
        LRUOutputCombiner.Fold<CountAndSet> indexFold = new LRUOutputCombiner.Fold<CountAndSet>() {
            @Override
            public CountAndSet fold(CountAndSet oldValue, CountAndSet newValue) {
                oldValue.count += newValue.count;
                if (oldValue.set == null || newValue.set == null) {
                    oldValue.set = null;
                    return oldValue;
                }
                oldValue.set.addAll(newValue.set);
                if (oldValue.set.size() > GlobalIndexUidCombiner.MAX)
                    oldValue.set = null;
                return oldValue;
            }
        };
        LRUOutputCombiner.Output<MutationInfo, CountAndSet> indexOutput = new LRUOutputCombiner.Output<WikipediaPartitionedMapper.MutationInfo, CountAndSet>() {

            @Override
            public void output(MutationInfo key, CountAndSet value) {
                Uid.List.Builder builder = Uid.List.newBuilder();
                builder.setCOUNT(value.count);
                if (value.set == null) {
                    builder.setIGNORE(true);
                    builder.clearUID();
                } else {
                    builder.setIGNORE(false);
                    builder.addAllUID(value.set);
                }
                Uid.List list = builder.build();
                Value val = new Value(list.toByteArray());
                Mutation m = new Mutation(key.row);
                m.put(key.colfam, key.colqual, key.cv, key.timestamp, val);
                try {
                    mtbw.getBatchWriter(indexTableNameFinal.toString()).addMutation(m);
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }
        };
        LRUOutputCombiner.Output<MutationInfo, CountAndSet> reverseIndexOutput = new LRUOutputCombiner.Output<WikipediaPartitionedMapper.MutationInfo, CountAndSet>() {

            @Override
            public void output(MutationInfo key, CountAndSet value) {
                Uid.List.Builder builder = Uid.List.newBuilder();
                builder.setCOUNT(value.count);
                if (value.set == null) {
                    builder.setIGNORE(true);
                    builder.clearUID();
                } else {
                    builder.setIGNORE(false);
                    builder.addAllUID(value.set);
                }
                Uid.List list = builder.build();
                Value val = new Value(list.toByteArray());
                Mutation m = new Mutation(key.row);
                m.put(key.colfam, key.colqual, key.cv, key.timestamp, val);
                try {
                    mtbw.getBatchWriter(reverseIndexTableNameFinal.toString()).addMutation(m);
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }
        };

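        // All three caches hold up to 10000 keys. The metadata cache's fold simply keeps the old
        // value, since every metadata value is NULL_VALUE and only deduplication is needed.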
        wikiIndexOutput = new LRUOutputCombiner<WikipediaPartitionedMapper.MutationInfo, CountAndSet>(10000,
                indexFold, indexOutput);
        wikiReverseIndexOutput = new LRUOutputCombiner<WikipediaPartitionedMapper.MutationInfo, CountAndSet>(10000,
                indexFold, reverseIndexOutput);
        wikiMetadataOutput = new LRUOutputCombiner<WikipediaPartitionedMapper.MutationInfo, Value>(10000,
                new LRUOutputCombiner.Fold<Value>() {
                    @Override
                    public Value fold(Value oldValue, Value newValue) {
                        return oldValue;
                    }
                }, new LRUOutputCombiner.Output<MutationInfo, Value>() {
                    @Override
                    public void output(MutationInfo key, Value value) {
                        Mutation m = new Mutation(key.row);
                        m.put(key.colfam, key.colqual, key.cv, key.timestamp, value);
                        try {
                            mtbw.getBatchWriter(metadataTableNameFinal.toString()).addMutation(m);
                        } catch (Exception e) {
                            throw new RuntimeException(e);
                        }
                    }
                });
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        wikiIndexOutput.flush();
        wikiMetadataOutput.flush();
        wikiReverseIndexOutput.flush();
        try {
            mtbw.close();
        } catch (MutationsRejectedException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    protected void map(Text language, Article article, Context context) throws IOException, InterruptedException {
        String NULL_BYTE = "\u0000";
        String colfPrefix = language.toString() + NULL_BYTE;
        String indexPrefix = "fi" + NULL_BYTE;
        ColumnVisibility cv = new ColumnVisibility(cvPrefix + language);

        if (article != null) {
            Text partitionId = new Text(Integer.toString(WikipediaMapper.getPartitionId(article, numPartitions)));

            // Create the mutations for the document.
            // Row is partition id, colf is language\0articleid, colq is fieldName\0fieldValue
            Mutation m = new Mutation(partitionId);
            for (Entry<String, Object> entry : article.getFieldValues().entrySet()) {
                m.put(colfPrefix + article.getId(), entry.getKey() + NULL_BYTE + entry.getValue().toString(), cv,
                        article.getTimestamp(), NULL_VALUE);
                // Create mutations for the metadata table.
                MutationInfo mm = new MutationInfo(entry.getKey(), METADATA_EVENT_COLUMN_FAMILY,
                        language.toString(), cv, article.getTimestamp());
                wikiMetadataOutput.put(mm, NULL_VALUE);
            }

            // Tokenize the content
            Set<String> tokens = WikipediaMapper.getTokens(article);

            // We are going to put the fields to be indexed into a multimap. This allows us to iterate
            // over the entire set once.
            Multimap<String, String> indexFields = HashMultimap.create();
            // Add the normalized field values
            LcNoDiacriticsNormalizer normalizer = new LcNoDiacriticsNormalizer();
            for (Entry<String, String> index : article.getNormalizedFieldValues().entrySet())
                indexFields.put(index.getKey(), index.getValue());
            // Add the tokens
            for (String token : tokens)
                indexFields.put(TOKENS_FIELD_NAME, normalizer.normalizeFieldValue("", token));

            for (Entry<String, String> index : indexFields.entries()) {
                // Create mutations for the in partition index
                // Row is partition id, colf is 'fi'\0fieldName, colq is fieldValue\0language\0article id
                m.put(indexPrefix + index.getKey(), index.getValue() + NULL_BYTE + colfPrefix + article.getId(), cv,
                        article.getTimestamp(), NULL_VALUE);

                // Create mutations for the global index
                // Row is field value, colf is field name, colq is partitionid\0language, value is Uid.List object
                MutationInfo gm = new MutationInfo(index.getValue(), index.getKey(),
                        partitionId + NULL_BYTE + language, cv, article.getTimestamp());
                wikiIndexOutput.put(gm, new CountAndSet(Integer.toString(article.getId())));

                // Create mutations for the global reverse index
                MutationInfo grm = new MutationInfo(StringUtils.reverse(index.getValue()), index.getKey(),
                        partitionId + NULL_BYTE + language, cv, article.getTimestamp());
                wikiReverseIndexOutput.put(grm, new CountAndSet(Integer.toString(article.getId())));

                // Create mutations for the metadata table.
                MutationInfo mm = new MutationInfo(index.getKey(), METADATA_INDEX_COLUMN_FAMILY,
                        language + NULL_BYTE + LcNoDiacriticsNormalizer.class.getName(), cv,
                        article.getTimestamp());
                wikiMetadataOutput.put(mm, NULL_VALUE);
            }
            // Add the entire text to the document section of the table.
            // row is the partition, colf is 'd', colq is language\0articleid, value is Base64 encoded GZIP'd document
            m.put(DOCUMENT_COLUMN_FAMILY, colfPrefix + article.getId(), cv, article.getTimestamp(),
                    new Value(Base64.encodeBase64(article.getText().getBytes(UTF8))));
            context.write(tablename, m);

        } else {
            context.getCounter("wikipedia", "invalid articles").increment(1);
        }
        context.progress();
    }
}
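
The mapper above emits (table name, Mutation) pairs through the MapReduce framework and also writes the side tables itself, so a driver only has to route the map output to Accumulo and skip the reduce phase. The following is a minimal, hypothetical driver sketch, not part of the example (which configures its jobs through WikipediaConfiguration and its own ingester classes); it assumes Hadoop 2.x and the pre-2.0 Accumulo mapreduce package, and leaves the input format and the version-specific AccumuloOutputFormat connection settings as comments.

package org.apache.accumulo.examples.wikisearch.ingest;

import org.apache.accumulo.core.client.mapreduce.AccumuloOutputFormat;
import org.apache.accumulo.core.data.Mutation;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

// Hypothetical driver sketch: wires WikipediaPartitionedMapper into a map-only job whose
// (Text, Mutation) output is handed to AccumuloOutputFormat.
public class PartitionedIngestDriverSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumption: the wikisearch settings (table name, number of partitions, Accumulo
        // connection details) that WikipediaConfiguration reads are already present in conf.
        Job job = Job.getInstance(conf, "wikisearch partitioned ingest (sketch)");
        job.setJarByClass(WikipediaPartitionedMapper.class);

        // Input format omitted: it must supply the (Text language, Article) pairs produced by the
        // earlier partitioning pass.

        job.setMapperClass(WikipediaPartitionedMapper.class);
        job.setOutputKeyClass(Text.class);       // table name the mutation is destined for
        job.setOutputValueClass(Mutation.class);
        job.setNumReduceTasks(0);                // map-only: mutations go straight to Accumulo

        job.setOutputFormatClass(AccumuloOutputFormat.class);
        // AccumuloOutputFormat still needs instance, credential, and default-table settings; the
        // exact static configuration calls differ between Accumulo releases, so they are omitted.

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}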