Java tutorial: the WikipediaMapper class from the Apache Accumulo wikisearch ingest example. It parses Wikipedia articles and emits Accumulo Mutations for a partitioned document table (with an in-partition 'fi' index), global forward and reverse index tables, and a metadata table.
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.examples.wikisearch.ingest;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.IllegalFormatException;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.ColumnVisibility;
import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor.Article;
import org.apache.accumulo.examples.wikisearch.ingest.WikipediaInputFormat.WikipediaInputSplit;
import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer;
import org.apache.accumulo.examples.wikisearch.protobuf.Uid;
import org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.Builder;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Mutation> {

  private static final Logger log = Logger.getLogger(WikipediaMapper.class);

  public final static Charset UTF8 = Charset.forName("UTF-8");
  public static final String DOCUMENT_COLUMN_FAMILY = "d";
  public static final String METADATA_EVENT_COLUMN_FAMILY = "e";
  public static final String METADATA_INDEX_COLUMN_FAMILY = "i";
  public static final String TOKENS_FIELD_NAME = "TEXT";

  private final static Pattern languagePattern = Pattern.compile("([a-z_]+).*.xml(.bz2)?");
  private static final Value NULL_VALUE = new Value(new byte[0]);
  private static final String cvPrefix = "all|";

  private ArticleExtractor extractor;
  private String language;
  private int numPartitions = 0;
  private ColumnVisibility cv = null;

  private int myGroup = -1;
  private int numGroups = -1;

  private Text tablename = null;
  private Text indexTableName = null;
  private Text reverseIndexTableName = null;
  private Text metadataTableName = null;

  @Override
  public void setup(Context context) {
    Configuration conf = context.getConfiguration();
    tablename = new Text(WikipediaConfiguration.getTableName(conf));
    indexTableName = new Text(tablename + "Index");
    reverseIndexTableName = new Text(tablename + "ReverseIndex");
    metadataTableName = new Text(tablename + "Metadata");

    WikipediaInputSplit wiSplit = (WikipediaInputSplit) context.getInputSplit();
    myGroup = wiSplit.getPartition();
    numGroups = WikipediaConfiguration.getNumGroups(conf);

    FileSplit split = wiSplit.getFileSplit();
    String fileName = split.getPath().getName();
    Matcher matcher = languagePattern.matcher(fileName);
    if (matcher.matches()) {
      language = matcher.group(1).replace('_', '-').toLowerCase();
    } else {
      throw new RuntimeException("Unknown ingest language! " + fileName);
    }

    extractor = new ArticleExtractor();
    numPartitions = WikipediaConfiguration.getNumPartitions(conf);
    cv = new ColumnVisibility(cvPrefix + language);
  }

  /**
   * We will partition the documents based on the document id
   *
   * @param article
   *          the article whose id determines the partition
   * @param numPartitions
   *          the total number of partitions
   * @return The number of the partition for a given article.
   * @throws IllegalFormatException
   */
  public static int getPartitionId(Article article, int numPartitions) throws IllegalFormatException {
    return article.getId() % numPartitions;
  }

  static HashSet<String> metadataSent = new HashSet<String>();

  @Override
  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    Article article = extractor.extract(new InputStreamReader(new ByteArrayInputStream(value.getBytes()), UTF8));
    String NULL_BYTE = "\u0000";
    String colfPrefix = language + NULL_BYTE;
    String indexPrefix = "fi" + NULL_BYTE;
    if (article != null) {
      int groupId = WikipediaMapper.getPartitionId(article, numGroups);
      if (groupId != myGroup)
        return;

      Text partitionId = new Text(Integer.toString(WikipediaMapper.getPartitionId(article, numPartitions)));

      // Create the mutations for the document.
      // Row is partition id, colf is language\0articleid, colq is fieldName\0fieldValue
      Mutation m = new Mutation(partitionId);
      for (Entry<String, Object> entry : article.getFieldValues().entrySet()) {
        m.put(colfPrefix + article.getId(), entry.getKey() + NULL_BYTE + entry.getValue().toString(), cv, article.getTimestamp(), NULL_VALUE);
        // Create mutations for the metadata table.
        String metadataKey = entry.getKey() + METADATA_EVENT_COLUMN_FAMILY + language;
        if (!metadataSent.contains(metadataKey)) {
          Mutation mm = new Mutation(entry.getKey());
          mm.put(METADATA_EVENT_COLUMN_FAMILY, language, cv, article.getTimestamp(), NULL_VALUE);
          context.write(metadataTableName, mm);
          metadataSent.add(metadataKey);
        }
      }

      // Tokenize the content
      Set<String> tokens = getTokens(article);

      // We are going to put the fields to be indexed into a multimap. This allows us to iterate
      // over the entire set once.
      Multimap<String, String> indexFields = HashMultimap.create();
      // Add the normalized field values
      LcNoDiacriticsNormalizer normalizer = new LcNoDiacriticsNormalizer();
      for (Entry<String, String> index : article.getNormalizedFieldValues().entrySet())
        indexFields.put(index.getKey(), index.getValue());
      // Add the tokens
      for (String token : tokens)
        indexFields.put(TOKENS_FIELD_NAME, normalizer.normalizeFieldValue("", token));

      for (Entry<String, String> index : indexFields.entries()) {
        // Create mutations for the in partition index
        // Row is partition id, colf is 'fi'\0fieldName, colq is fieldValue\0language\0article id
        m.put(indexPrefix + index.getKey(), index.getValue() + NULL_BYTE + colfPrefix + article.getId(), cv, article.getTimestamp(), NULL_VALUE);

        // Create a UID object for the Value
        Builder uidBuilder = Uid.List.newBuilder();
        uidBuilder.setIGNORE(false);
        uidBuilder.setCOUNT(1);
        uidBuilder.addUID(Integer.toString(article.getId()));
        Uid.List uidList = uidBuilder.build();
        Value val = new Value(uidList.toByteArray());

        // Create mutations for the global index
        // Row is field value, colf is field name, colq is partitionid\0language, value is Uid.List object
        Mutation gm = new Mutation(index.getValue());
        gm.put(index.getKey(), partitionId + NULL_BYTE + language, cv, article.getTimestamp(), val);
        context.write(indexTableName, gm);

        // Create mutations for the global reverse index
        Mutation grm = new Mutation(StringUtils.reverse(index.getValue()));
        grm.put(index.getKey(), partitionId + NULL_BYTE + language, cv, article.getTimestamp(), val);
        context.write(reverseIndexTableName, grm);

        // Create mutations for the metadata table.
        String metadataKey = index.getKey() + METADATA_INDEX_COLUMN_FAMILY + language;
        if (!metadataSent.contains(metadataKey)) {
          Mutation mm = new Mutation(index.getKey());
          mm.put(METADATA_INDEX_COLUMN_FAMILY, language + NULL_BYTE + LcNoDiacriticsNormalizer.class.getName(), cv, article.getTimestamp(), NULL_VALUE);
          context.write(metadataTableName, mm);
          metadataSent.add(metadataKey);
        }
      }

      // Add the entire text to the document section of the table.
      // row is the partition, colf is 'd', colq is language\0articleid, value is Base64 encoded GZIP'd document
      m.put(DOCUMENT_COLUMN_FAMILY, colfPrefix + article.getId(), cv, article.getTimestamp(),
          new Value(Base64.encodeBase64(article.getText().getBytes())));
      context.write(tablename, m);

    } else {
      context.getCounter("wikipedia", "invalid articles").increment(1);
    }
    context.progress();
  }

  /**
   * Tokenize the wikipedia content
   *
   * @param article
   *          the article whose text will be tokenized
   * @return the set of distinct, non-empty tokens in the article text
   * @throws IOException
   */
  static Set<String> getTokens(Article article) throws IOException {
    Set<String> tokenList = new HashSet<String>();
    WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText()));
    TermAttribute term = tok.addAttribute(TermAttribute.class);
    try {
      while (tok.incrementToken()) {
        String token = term.term();
        if (!StringUtils.isEmpty(token))
          tokenList.add(token);
      }
    } catch (IOException e) {
      log.error("Error tokenizing text", e);
    } finally {
      try {
        tok.end();
      } catch (IOException e) {
        log.error("Error calling end()", e);
      } finally {
        try {
          tok.close();
        } catch (IOException e) {
          log.error("Error closing tokenizer", e);
        }
      }
    }
    return tokenList;
  }
}
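
The mapper only emits (table name, Mutation) pairs; a separate driver has to build the Hadoop job and point its output at Accumulo. The wikisearch project ships its own ingest driver, so the class below is just a minimal, hypothetical sketch of that wiring: it assumes the Accumulo 1.5/1.6 AccumuloOutputFormat API, and the class name, instance name, ZooKeeper string, credentials, input path, and default table name are all placeholders.

package org.apache.accumulo.examples.wikisearch.ingest;

import org.apache.accumulo.core.client.mapreduce.AccumuloOutputFormat;
import org.apache.accumulo.core.client.security.tokens.PasswordToken;
import org.apache.accumulo.core.data.Mutation;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class WikipediaIngestDriver { // hypothetical driver, not part of the example
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // The table name, partition count, and group count read by WikipediaMapper.setup()
    // via WikipediaConfiguration would be set on this Configuration before submission.

    Job job = new Job(conf, "wikipedia-ingest");
    job.setJarByClass(WikipediaIngestDriver.class);

    // Map-only job: each Mutation is keyed by the name of its destination table.
    job.setMapperClass(WikipediaMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Mutation.class);

    // The example uses its own WikipediaInputFormat to split the Wikipedia XML dump.
    job.setInputFormatClass(WikipediaInputFormat.class);
    FileInputFormat.addInputPath(job, new Path("/data/enwiki-pages-articles.xml.bz2")); // placeholder path

    // Accumulo 1.5/1.6-era output configuration; all connection values are placeholders.
    job.setOutputFormatClass(AccumuloOutputFormat.class);
    AccumuloOutputFormat.setConnectorInfo(job, "root", new PasswordToken("secret"));
    AccumuloOutputFormat.setZooKeeperInstance(job, "instance", "zkhost1:2181");
    AccumuloOutputFormat.setCreateTables(job, true);
    AccumuloOutputFormat.setDefaultTableName(job, "wikipedia");

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

AccumuloOutputFormat delivers each Mutation to the table named by the Text key it was emitted with, which is why a single map() call can write to the main, index, reverse index, and metadata tables.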
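
On the read side, every mutation above is written with ColumnVisibility "all|" + language, so a scan of these tables returns nothing unless the reading user holds a matching authorization ("all" or the language code). The snippet below is a minimal read-side sketch, again assuming the Accumulo 1.5/1.6 client API with placeholder instance, credential, and table names.

import java.util.Map.Entry;

import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.ZooKeeperInstance;
import org.apache.accumulo.core.client.security.tokens.PasswordToken;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;

public class ScanWikipediaTable { // hypothetical read-side example
  public static void main(String[] args) throws Exception {
    // Placeholder instance, credentials, and table name.
    Connector conn = new ZooKeeperInstance("instance", "zkhost1:2181")
        .getConnector("reader", new PasswordToken("secret"));

    // "all" satisfies the "all|<language>" visibility written by the mapper; the reader
    // account must already have been granted this authorization on the server side
    // (e.g. with SecurityOperations.changeUserAuthorizations).
    Scanner scanner = conn.createScanner("wikipedia", new Authorizations("all"));
    for (Entry<Key, Value> entry : scanner) {
      System.out.println(entry.getKey() + " -> " + entry.getValue());
    }
  }
}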