nl.gridline.free.taalserver.TokenizeMap.java Source code

Introduction

Here is the source code for nl.gridline.free.taalserver.TokenizeMap.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package nl.gridline.free.taalserver;

import java.io.IOException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import nl.gridline.free.taalserver.writable.WordVector;
import nl.gridline.zieook.mapreduce.HBaseTableConstants;
import nl.gridline.zieook.mapreduce.TaskConfig;
import nl.gridline.zieook.model.CollectionItem;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;

/**
 * Round - 1 - Tokenize
 * <b>in</b>: table result<br />
 * <b>out</b> words vector, doc id.
 * <p />
 * Project hadoop-freetaalserver<br />
 * WordCountMap.java created Sep 6, 2011
 * <p />
 * Copyright, all rights reserved 2011 GridLine Amsterdam
 * @author <a href="mailto:job@gridline.nl">Job</a>
 * @version $Revision:$, $Date:$
 */
public class TokenizeMap extends TableMapper<WordVector, LongWritable> {
    private static final Log LOG = LogFactory.getLog(TokenizeMap.class);

    private static final byte[] COLLUMN_INTR = Bytes.toBytes(HBaseTableConstants.COLLECTION_TABLE_COLUMN_INTR);

    /** Locale used for word splitting when the job configuration does not name one. */
    private static final Locale DEFAULT_LOCALE = new Locale("nl", "NL");

    // The initial values mirror the defaults that setup() applies when a parameter is missing.
    private boolean useDescription = true;
    private boolean useTitle = true;
    private boolean useKeywords = false;

    /** Reused output value holding the id of the item currently being mapped. */
    private final LongWritable outKey = new LongWritable();

    /** Word boundary detector, created in {@link #setup(Context)}. */
    private BreakIterator splitter;

    /**
     * Reads the text language and the three "use field" switches from the job configuration.
     * <p>
     * A missing or empty switch falls back to its default (description: true, title: true,
     * keywords: false) and a warning is logged.
     *
     * @param context the task context supplying the job configuration
     * @throws IOException declared by the overridden method
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        String loc = context.getConfiguration().get(TaskConfig.TEXT_LANGUAGE);
        splitter = BreakIterator.getWordInstance(parseLocale(loc));

        useDescription = readFlag(context, TaskConfig.TEXT_ANALYSIS_USE_DESCRIPTION, true);
        useTitle = readFlag(context, TaskConfig.TEXT_ANALYSIS_USE_TITLE, true);
        useKeywords = readFlag(context, TaskConfig.TEXT_ANALYSIS_USE_KEYWORDS, false);
    }

    /**
     * Splits the selected text fields of one collection item into tokens and emits them as a
     * {@link WordVector}, paired with the item id. Nothing is emitted when no tokens are found.
     *
     * @param key the row key (not used)
     * @param value the table row; its {@code COLLUMN_INTR} family is wrapped in a {@link CollectionItem}
     * @param context the task context the output is written to
     * @throws IOException if the item has no id, or if writing the output fails
     * @throws InterruptedException if writing the output is interrupted
     */
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context)
            throws IOException, InterruptedException {
        CollectionItem item = new CollectionItem(value.getFamilyMap(COLLUMN_INTR));

        // Reject items without an id before spending any work on their text.
        Long itemid = item.getId();
        if (itemid == null) {
            throw new IOException("item is invallid: " + item.toString());
        }
        outKey.set(itemid.longValue());

        String text = collectText(item);
        List<String> tokens = tokenize(text);

        LOG.info("vector size: " + tokens.size() + " text length: " + text.length());
        if (!tokens.isEmpty()) {
            // write new word vector to the output.
            context.write(new WordVector(tokens), outKey);
            context.progress();
        }
    }

    /**
     * Reads a boolean switch from the job configuration.
     *
     * @param context the task context supplying the job configuration
     * @param name the configuration key to read
     * @param fallback the value to use (and to report in the warning) when the key is missing or empty
     * @return the parsed value, or {@code fallback} when the key is missing or empty
     */
    private boolean readFlag(Context context, String name, boolean fallback) {
        String raw = context.getConfiguration().get(name);
        if (raw == null || raw.isEmpty()) {
            LOG.warn("parameter missing <" + name + "> defaulting to: " + fallback);
            return fallback;
        }
        return Boolean.parseBoolean(raw);
    }

    /**
     * Turns a locale tag such as {@code "nl"}, {@code "nl_NL"} or {@code "nl-NL"} into a
     * {@link Locale}. The single-argument {@code Locale} constructor treats its whole argument
     * as the language code, so a combined tag has to be split into its parts first.
     *
     * @param tag the configured locale tag, may be {@code null}
     * @return the matching locale, or the Dutch default when {@code tag} is {@code null} or blank
     */
    private static Locale parseLocale(String tag) {
        if (tag == null || tag.trim().isEmpty()) {
            return DEFAULT_LOCALE;
        }
        // language[_country[_variant]]; the limit keeps any further separators inside the variant.
        String[] parts = tag.trim().split("[_-]", 3);
        if (parts.length == 1) {
            return new Locale(parts[0]);
        }
        if (parts.length == 2) {
            return new Locale(parts[0], parts[1]);
        }
        return new Locale(parts[0], parts[1], parts[2]);
    }

    /**
     * Concatenates the enabled text fields of an item (title, description, keywords), in that order.
     *
     * @param item the item to read from
     * @return the combined text, possibly empty
     */
    private String collectText(CollectionItem item) {
        StringBuilder data = new StringBuilder();
        if (useTitle && item.getTitle() != null) {
            data.append(item.getTitle()).append(' ');
        }
        if (useDescription && item.getDescription() != null) {
            data.append(item.getDescription()).append(' ');
        }
        if (useKeywords) {
            data.append(getKeywords(item));
        }
        return data.toString();
    }

    /**
     * Cuts the text at every boundary reported by the word {@link BreakIterator}.
     * <p>
     * NOTE(review): every segment between two boundaries is kept, so whitespace and punctuation
     * segments end up in the list as well - confirm that later rounds expect or filter these.
     *
     * @param text the text to split
     * @return the segments in order of appearance; empty when {@code text} is empty
     */
    private List<String> tokenize(String text) {
        List<String> tokens = new ArrayList<String>();
        splitter.setText(text);
        int start = splitter.first();
        for (int end = splitter.next(); end != BreakIterator.DONE; start = end, end = splitter.next()) {
            tokens.add(text.substring(start, end));
        }
        return tokens;
    }

    /**
     * Collects the distinct keywords and categories of an item, one per line.
     *
     * @param item the item to read from; either list may be {@code null}
     * @return the newline separated terms without leading or trailing whitespace, possibly empty
     */
    private String getKeywords(CollectionItem item) {
        Set<String> unique = new HashSet<String>();
        List<String> keywords = item.getKeywords();
        if (keywords != null) {
            unique.addAll(keywords);
        }
        List<String> categories = item.getCategory();
        if (categories != null) {
            unique.addAll(categories);
        }

        // Iterate the merged set, not one of the source lists, so that both sources contribute
        // and duplicates appear only once.
        StringBuilder result = new StringBuilder();
        for (String term : unique) {
            if (term != null) {
                result.append(term).append('\n');
            }
        }

        return result.toString().trim();
    }
}