com.sematext.hbase.hut.UpdatesProcessingMrJob.java Source code

Introduction

Here is the source code for com.sematext.hbase.hut.UpdatesProcessingMrJob.java

Source

/**
 * Copyright 2010 Sematext International
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.sematext.hbase.hut;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

/**
 * Performs updates processing by running a MapReduce job, and hence utilizes data locality.
 * This is a map-only compaction job, which means better utilization of data locality
 * (when both reading and writing data), but writing into the same table from a Mapper
 * may cause issues according to some sources (TODO: verify whether that is really a problem).
 * NOTE: it may cause some spans of records to be compacted into multiple result
 * records, which is usually (always?) OK.
 */
public final class UpdatesProcessingMrJob {
    private UpdatesProcessingMrJob() {
    }

    public static class UpdatesProcessingMapper extends TableMapper<ImmutableBytesWritable, Put> {
        public static final String HTABLE_NAME_ATTR = "htable.name";
        public static final String HUT_MR_BUFFER_SIZE_ATTR = "hut.mr.buffer.size";
        public static final String HUT_MR_BUFFER_SIZE_IN_BYTES_ATTR = "hut.mr.buffer.size.bytes";
        public static final String HUT_PROCESSOR_CLASS_ATTR = "hut.processor.class";
        public static final String HUT_PROCESSOR_DETAILS_ATTR = "hut.processor.details";
        public static final String HUT_PROCESSOR_TSMOD_ATTR = "hut.processor.tsMod";
        public static final String HUT_PROCESSOR_MIN_RECORDS_TO_COMPACT_ATTR = "hut.processor.minRecordsToCompact";
        private static final Log LOG = LogFactory.getLog(UpdatesProcessingMapper.class);

        private HTable hTable;

        // max number of items buffered by map() before processing kicks in;
        // can be overridden by HUT_MR_BUFFER_SIZE_ATTR attribute in configuration
        private int bufferMaxSize = 1000;

        // max size (in bytes) of the items buffered by map();
        // can be overridden by HUT_MR_BUFFER_SIZE_IN_BYTES_ATTR attribute in configuration
        private int bufferMaxSizeInBytes = 32 * 1024 * 1024; // 32 MB

        // when > 0, records whose original keys fall into the same tsMod-sized time interval are
        // treated as one group for merging (see isMergeNeeded below); 0 means merge by original key only
        // can be overridden by HUT_PROCESSOR_TSMOD_ATTR attribute in configuration
        private long tsMod = 0;

        // minimum number of records in a group for compaction to be performed
        // can be overridden by HUT_PROCESSOR_MIN_RECORDS_TO_COMPACT_ATTR attribute in configuration
        private int minRecordsToCompact = 2;

        // queue with map input records to be fed into updates processor
        private LinkedList<Result> mapInputBuff;
        private long bytesInBuffer;
        // used to process updates TODO: consider reimplementing updates processing for the MR case so that it isn't tied to the scan-based flow
        private DetachedHutResultScanner resultScanner;
        // keeps the last processed update when the buffer was emptied (before being filled again) - see more comments in the code
        private Put readyToStoreButWaitingFurtherMerging = null;
        private List<byte[]> readyToDelete = new ArrayList<byte[]>();

        // map task state
        private volatile boolean failed;

        // map task counters
        private int writtenRecords = 0;
        private int deletedRecords = 0;

        /**
         * Scanner detached from HTable and ResultScanner that is fed with {@link org.apache.hadoop.hbase.client.Result} items
         * from the map input buffer. Differs from HutResultScanner, which uses a normal HBase ResultScanner to fetch the data.
         */
        class DetachedHutResultScanner extends HutResultScanner {
            private Put processedUpdatesToStore = null;
            private List<byte[]> rowsToDelete = new ArrayList<byte[]>();

            public DetachedHutResultScanner(UpdateProcessor updateProcessor) {
                super(null, updateProcessor, null, true);
            }

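            // when tsMod is set (> 0), two records are merged if their original keys fall into the
            // same tsMod-sized time interval; otherwise the default key-based check is used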
            @Override
            protected boolean isMergeNeeded(byte[] firstKey, byte[] secondKey) {
                if (tsMod <= 0) {
                    return super.isMergeNeeded(firstKey, secondKey);
                } else {
                    return HutRowKeyUtil.sameOriginalKeys(firstKey, secondKey, tsMod);
                }
            }

            @Override
            void store(Put put) throws IOException {
                processedUpdatesToStore = put;
            }

            @Override
            void deleteProcessedRecords(List<byte[]> rows) throws IOException {
                rowsToDelete = rows;
            }

            @Override
            protected void verifyInitParams(ResultScanner resultScanner, UpdateProcessor updateProcessor,
                    HTable hTable, boolean storeProcessedUpdates) {
                if (updateProcessor == null) {
                    throw new IllegalArgumentException("UpdateProcessor should NOT be null.");
                }
                // since this is "detached" scanner, ResultScanner and/or HTable can be null
            }

            @Override
            public Result next() throws IOException {
                processedUpdatesToStore = null;
                rowsToDelete.clear();

                return super.next();
            }

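            // invoked from HutResultScanner.next(): instead of fetching from an HBase scanner,
            // this implementation pulls records from the in-memory map input buffer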
            protected Result fetchNext() throws IOException {
                return fetchNextFromBuffer();
            }

            public Put getProcessedUpdatesToStore() {
                return processedUpdatesToStore;
            }

            public List<byte[]> getRowsToDelete() {
                return rowsToDelete;
            }
        }

        /**
         * Buffers the input record; once the buffer is full, processes (compacts) the buffered records.
         *
         * @param key  The current key.
         * @param value  The current value.
         * @param context  The current context.
         * @throws java.io.IOException When writing the record fails.
         * @throws InterruptedException When the job is aborted.
         */
        @Override
        public void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            addToMapInputBuffer(value);
            if (!isMapInputBufferFull()) {
                return;
            }

            // more or less reasonable: attempt to ping every time we switch to processing buffered records
            pingMap(context); // TODO: allow user to control pinging

            // processing buffered rows
            try {
                Result res = resultScanner.next();
                Result prev = null;
                Put processingResultToStore = null;
                List<byte[]> toDeleteAfterStoringProcessingResult = new ArrayList<byte[]>();
                while (res != null) {
                    // we save the previous record in case processingResultToStore is null and this is the last
                    // element in the buffer. In that case we will put it back into the buffer to give it a chance
                    // to merge with the next items coming into the map() method
                    prev = res;

                    // if merging occurred, it will be stored as processingResultToStore
                    processingResultToStore = resultScanner.getProcessedUpdatesToStore();
                    toDeleteAfterStoringProcessingResult.addAll(resultScanner.getRowsToDelete());

                    // the last processing result from the previously buffered records
                    // got a chance to be merged, but it looks like the next record is from a different group (i.e. no merge occurred)
                    if (processingResultToStore == null && readyToStoreButWaitingFurtherMerging != null) {
                        store(readyToStoreButWaitingFurtherMerging, readyToDelete);
                        readyToDelete.clear();
                    }
                    // setting to null in any case:
                    // * either it was written above or
                    // * was merged with next records and will be written below
                    readyToStoreButWaitingFurtherMerging = null;
                    // in case readyToStoreButWaitingFurtherMerging was merged with next records, we need to
                    // "transfer" those records we wanted to delete with it
                    toDeleteAfterStoringProcessingResult.addAll(readyToDelete);
                    readyToDelete.clear();

                    res = resultScanner.next();
                    boolean lastInBuffer = res == null;
                    // We don't want to store last processed result *now*,
                    // instead we postpone storing it to give it a chance to merge with next map input records down the road.
                    if (!lastInBuffer) {
                        if (processingResultToStore != null) {
                            store(processingResultToStore, toDeleteAfterStoringProcessingResult);
                            toDeleteAfterStoringProcessingResult.clear();
                        }
                    }
                }

                if (prev != null) {
                    // see explanation near assignment
                    boolean added = addToMapInputBufferIfSpaceAvailable(prev);

                    if (!added && processingResultToStore != null) {
                        store(processingResultToStore, toDeleteAfterStoringProcessingResult);
                        toDeleteAfterStoringProcessingResult.clear();
                        return;
                    }

                    readyToStoreButWaitingFurtherMerging = processingResultToStore;
                    readyToDelete.addAll(toDeleteAfterStoringProcessingResult);
                }
            } catch (IOException e) {
                LOG.error(e);
                // TODO: do we really want to fail the whole job? or just skip processing this group of updates?
                failed = true; // marking job as failed
            } catch (InterruptedException e) {
                LOG.error(e);
                // TODO: do we really want to fail the whole job? or just skip processing this group of updates?
                failed = true; // marking job as failed
            }

        }

        private Result fetchNextFromBuffer() {
            if (mapInputBuff.size() == 0) {
                return null;
            }

            Result r = mapInputBuff.poll();
            bytesInBuffer -= getSize(r);
            return r;
        }

        private void addToMapInputBuffer(Result value) {
            bytesInBuffer += getSize(value);
            mapInputBuff.addLast(value);
        }

        private boolean addToMapInputBufferIfSpaceAvailable(Result value) {
            if (bytesInBuffer + getSize(value) <= bufferMaxSizeInBytes) {
                addToMapInputBuffer(value);
                return true;
            }

            return false;
        }

        private static int getSize(Result value) {
            // TODO: is this the best way of calculating size? Tried using getBytes() but it sometimes returns null
            int size = 0;
            for (KeyValue kv : value.raw()) {
                size += kv.getLength();
            }

            return size;
        }

        private boolean isMapInputBufferFull() {
            return mapInputBuff.size() >= bufferMaxSize || bytesInBuffer >= bufferMaxSizeInBytes;
        }

        private void store(Put processingResultToStore, List<byte[]> rowsToDeleteAfterStoringProcessingResult)
                throws IOException, InterruptedException {
            hTable.put(processingResultToStore);
            writtenRecords++;
            HutResultScanner.deleteProcessedRecords(hTable, rowsToDeleteAfterStoringProcessingResult);
            deletedRecords += rowsToDeleteAfterStoringProcessingResult.size();
        }

        private long lastPingTimeMap = System.currentTimeMillis();

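        // reports progress (at most every 2 seconds) so that long stretches of buffer processing
        // don't get the task killed for inactivity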
        public void pingMap(Context context) {
            final long currtime = System.currentTimeMillis();
            if (currtime - lastPingTimeMap > 2000) {
                context.progress();
                lastPingTimeMap = currtime;
            }
        }

        @Override
        protected void setup(final Context context) throws IOException, InterruptedException {
            super.setup(context);

            String updatesProcessorClass = context.getConfiguration().get(HUT_PROCESSOR_CLASS_ATTR);
            if (updatesProcessorClass == null) {
                throw new IllegalStateException(HUT_PROCESSOR_CLASS_ATTR + " is missing from the configuration");
            }

            String updatesProcessorDetails = context.getConfiguration().get(HUT_PROCESSOR_DETAILS_ATTR);
            if (updatesProcessorDetails == null) {
                // TODO: throw an exception in future versions
                LOG.warn(HUT_PROCESSOR_DETAILS_ATTR
                        + " is missing from the configuration; the updates processor's serialized state is absent");
            }

            UpdateProcessor updateProcessor = convertStringToUpdateProcessor(updatesProcessorClass,
                    updatesProcessorDetails);
            LOG.info("Using updateProcessor: " + updateProcessor.toString());

            if (updateProcessor instanceof Configurable) {
                ((Configurable) updateProcessor).configure(context.getConfiguration());
            }

            if (updateProcessor instanceof MapContextAware) {
                ((MapContextAware) updateProcessor).setContext(context);
            }

            String bufferSizeValue = context.getConfiguration().get(HUT_MR_BUFFER_SIZE_ATTR);
            if (bufferSizeValue == null) {
                LOG.info(HUT_MR_BUFFER_SIZE_ATTR + " is missing from the configuration, using default value: "
                        + bufferMaxSize);
            } else {
                bufferMaxSize = Integer.parseInt(bufferSizeValue);
                LOG.info("Using bufferMaxSize: " + bufferMaxSize);
            }

            String bufferMaxSizeInBytesValue = context.getConfiguration().get(HUT_MR_BUFFER_SIZE_IN_BYTES_ATTR);
            if (bufferMaxSizeInBytesValue == null) {
                LOG.info(HUT_MR_BUFFER_SIZE_IN_BYTES_ATTR + " is missing from the configuration, using default value: "
                        + bufferMaxSizeInBytes);
            } else {
                bufferMaxSizeInBytes = Integer.parseInt(bufferMaxSizeInBytesValue);
                LOG.info("Using bufferMaxSizeInBytes: " + bufferMaxSizeInBytes);
            }

            String tsModValue = context.getConfiguration().get(HUT_PROCESSOR_TSMOD_ATTR);
            if (tsModValue == null) {
                LOG.info(HUT_PROCESSOR_TSMOD_ATTR + " is missing from the configuration, using default value: "
                        + tsMod);
            } else {
                tsMod = Long.parseLong(tsModValue);
                LOG.info("Using tsMod: " + tsMod);
            }

            String minRecordsToCompactValue = context.getConfiguration()
                    .get(HUT_PROCESSOR_MIN_RECORDS_TO_COMPACT_ATTR);
            if (minRecordsToCompactValue == null) {
                LOG.info(HUT_PROCESSOR_MIN_RECORDS_TO_COMPACT_ATTR
                        + " is missing from the configuration, using default value: " + minRecordsToCompact);
            } else {
                minRecordsToCompact = Integer.parseInt(minRecordsToCompactValue);
                LOG.info("Using minRecordsToCompact: " + minRecordsToCompact);
            }

            // TODO: add validation of configuration attributes
            String tableName = context.getConfiguration().get(HTABLE_NAME_ATTR);
            if (tableName == null) {
                throw new IllegalStateException(HTABLE_NAME_ATTR + " is missing from the configuration");
            }
            hTable = new HTable(context.getConfiguration(), tableName);
            // NOTE: we are OK with using client-side buffer as losing deletes will not corrupt the data
            // TODO: make these settings configurable
            hTable.setAutoFlush(false);
            // 4MB
            hTable.setWriteBufferSize(4 * 1024 * 1024);

            mapInputBuff = new LinkedList<Result>();
            bytesInBuffer = 0;
            resultScanner = new DetachedHutResultScanner(updateProcessor);
            resultScanner.setMinRecordsToProcess(minRecordsToCompact);
            failed = false;

            // map task counters
            writtenRecords = 0;
            deletedRecords = 0;
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            if (mapInputBuff.size() > 0) {
                Result res = resultScanner.next();
                while (res != null) {
                    Put processingResultToStore = resultScanner.getProcessedUpdatesToStore();
                    List<byte[]> toDeleteAfterStoringProcessingResult = resultScanner.getRowsToDelete();

                    // the last processing result from the previously buffered records
                    // got a chance to be merged, but it looks like the next record is from a different group (i.e. no merge occurred)
                    if (processingResultToStore == null && readyToStoreButWaitingFurtherMerging != null) {
                        store(readyToStoreButWaitingFurtherMerging, readyToDelete);
                        readyToDelete.clear();
                    }
                    // setting to null in any case: either it was written above or it was merged with
                    // next records and will be written below (mirrors the logic in map(), and avoids
                    // storing the same pending result twice)
                    readyToStoreButWaitingFurtherMerging = null;

                    if (processingResultToStore != null) {
                        // in case readyToStoreButWaitingFurtherMerging was merged with next records, we need to
                        // "transfer" those records we wanted to delete with it
                        toDeleteAfterStoringProcessingResult.addAll(readyToDelete);
                        readyToDelete.clear();
                        store(processingResultToStore, toDeleteAfterStoringProcessingResult);
                        toDeleteAfterStoringProcessingResult.clear();
                    }
                    res = resultScanner.next();
                }

            }

            mapInputBuff.clear();
            bytesInBuffer = 0;

            context.getCounter("hut_compaction", "writtenRecords").increment(writtenRecords);
            context.getCounter("hut_compaction", "deletedRecords").increment(deletedRecords);

            // close (and flush) the table before potentially throwing, so it is not leaked on failure
            hTable.close();

            if (failed) {
                throw new RuntimeException("Job was marked as failed");
            }

            super.cleanup(context);
        }
    }

    /**
     * Use this before submitting a TableMap job. It will appropriately set up
     * the job.
     *
     * @param table  The table name.
     * @param scan  The scan with the columns to scan.
     * @param up  The update processor implementation.
     * @param job  The job configuration.
     * @throws java.io.IOException When setting up the job fails.
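     *
     * <p>A minimal driver sketch (illustrative only; {@code MyUpdateProcessor} stands in for a real
     * {@link UpdateProcessor} implementation):
     * <pre>
     *   Configuration conf = HBaseConfiguration.create();
     *   Job job = new Job(conf, "hut-updates-processing");
     *   UpdatesProcessingMrJob.initJob("mytable", new Scan(), new MyUpdateProcessor(), job);
     *   job.waitForCompletion(true);
     * </pre>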
     */
    @SuppressWarnings("unchecked")
    public static void initJob(String table, Scan scan, UpdateProcessor up, Job job) throws IOException {
        TableMapReduceUtil.initTableMapperJob(table, scan, UpdatesProcessingMapper.class, null, null, job);
        job.setJarByClass(UpdatesProcessingMrJob.class);
        job.setOutputFormatClass(NullOutputFormat.class);
        job.setNumReduceTasks(0);

        job.getConfiguration().set(UpdatesProcessingMapper.HTABLE_NAME_ATTR, table);
        job.getConfiguration().set(UpdatesProcessingMapper.HUT_PROCESSOR_CLASS_ATTR, up.getClass().getName());
        job.getConfiguration().set(UpdatesProcessingMapper.HUT_PROCESSOR_DETAILS_ATTR,
                convertUpdateProcessorToString(up));

        job.getConfiguration().set("mapred.map.tasks.speculative.execution", "false"); // TODO: explain
    }

    /**
     * Writes the given updatesProcessor into a Base64 encoded string.
     *
     * @param up  The updatesProcessor to write out.
     * @return The updateProcessor saved in a Base64 encoded string.
     * @throws java.io.IOException When writing the updateProcessor fails.
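     *
     * <p>For illustration, the inverse of this method is {@link #convertStringToUpdateProcessor}, so
     * {@code convertStringToUpdateProcessor(up.getClass().getName(), convertUpdateProcessorToString(up))}
     * should yield an equivalent instance, assuming the processor's {@code write()}/{@code readFields()}
     * are implemented consistently.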
     */
    static String convertUpdateProcessorToString(UpdateProcessor up) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DataOutputStream dos = new DataOutputStream(out);
        up.write(dos);
        return Base64.encodeBytes(out.toByteArray());
    }

    /**
     * Converts the given Base64 string back into a UpdateProcessor instance.
     *
     * @param upClassName  The updateProcessor class name.
     * @param base64  The updateProcessor details.
     * @return The newly created updateProcessor instance.
     * @throws java.io.IOException When reading the updateProcessor instance fails.
     */
    static UpdateProcessor convertStringToUpdateProcessor(String upClassName, String base64) throws IOException {
        UpdateProcessor up = createInstance(upClassName, UpdateProcessor.class);
        if (base64 != null) {
            ByteArrayInputStream bis = new ByteArrayInputStream(Base64.decode(base64));
            DataInputStream dis = new DataInputStream(bis);
            up.readFields(dis);
        }
        return up;
    }

    @SuppressWarnings({ "unchecked", "unused" })
    private static <T> T createInstance(String className, Class<T> clazz) {
        try {
            Class<?> c = Class.forName(className);
            return (T) c.newInstance();
        } catch (InstantiationException e) {
            throw new RuntimeException("Could not create instance of class: " + className, e);
        } catch (IllegalAccessException e) {
            throw new RuntimeException("Could not create instance of class: " + className, e);
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("Could not create instance of class: " + className, e);
        }
    }
}
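
Usage example

For context, here is a minimal, hypothetical driver showing how this job is typically wired up. It is a sketch only: ExampleUpdateProcessor is a placeholder for a real UpdateProcessor implementation (the UpdateProcessor contract lives elsewhere in the com.sematext.hbase.hut package), and the table name is made up.

package com.sematext.hbase.hut.example;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.mapreduce.Job;

import com.sematext.hbase.hut.UpdatesProcessingMrJob;
import com.sematext.hbase.hut.UpdatesProcessingMrJob.UpdatesProcessingMapper;

public class UpdatesProcessingJobRunner {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        Job job = new Job(conf, "hut-updates-processing");

        Scan scan = new Scan();
        scan.setCaching(512);        // fetch more rows per RPC while scanning
        scan.setCacheBlocks(false);  // avoid polluting the block cache from an MR scan

        // ExampleUpdateProcessor is hypothetical; substitute your own UpdateProcessor
        UpdatesProcessingMrJob.initJob("mytable", scan, new ExampleUpdateProcessor(), job);

        // optionally tune the mapper-side buffering (defaults: 1000 items / 32 MB)
        job.getConfiguration().setInt(UpdatesProcessingMapper.HUT_MR_BUFFER_SIZE_ATTR, 1000);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}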