com.knewton.mapreduce.StudentEventAbstractMapper.java Source code

Java tutorial

Introduction

Here is the source code for com.knewton.mapreduce.StudentEventAbstractMapper.java

Source

/**
 * Copyright 2013 Knewton
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 *
 */
package com.knewton.mapreduce;

import com.knewton.mapreduce.util.CounterConstants;
import com.knewton.thrift.StudentEvent;
import com.knewton.thrift.StudentEventData;

import org.apache.cassandra.db.IColumn;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.thrift.TDeserializer;
import org.apache.thrift.TException;
import org.apache.thrift.protocol.TCompactProtocol;
import org.apache.thrift.protocol.TProtocolFactory;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.Interval;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.ByteBuffer;

import javax.annotation.Nullable;

/**
 * Abstract mapper that converts a cassandra column to a student event and a cassandra row key to a
 * student id.
 *
 * <p>
 * Events can optionally be restricted to a time range through the
 * {@link #START_DATE_PARAMETER_NAME} and {@link #END_DATE_PARAMETER_NAME} configuration
 * properties. Columns whose event id falls outside of that range, and columns whose value cannot be
 * deserialized, are mapped to {@code null}.
 *
 * @param <K>
 *            key type parameter forwarded to {@link SSTableColumnMapper} (presumably the map output
 *            key type; confirm against the base class)
 * @param <V>
 *            value type parameter forwarded to {@link SSTableColumnMapper} (presumably the map
 *            output value type; confirm against the base class)
 */
@SuppressWarnings("rawtypes")
public abstract class StudentEventAbstractMapper<K extends WritableComparable, V extends Writable>
        extends SSTableColumnMapper<Long, StudentEvent, K, V> {

    private final TDeserializer decoder;
    /**
     * Time range used for filtering events. Stays {@code null} when neither a start nor an end date
     * is configured, in which case no event is filtered out.
     */
    private Interval timeRange;
    private static final Logger LOG = LoggerFactory.getLogger(StudentEventAbstractMapper.class);

    /**
     * Pattern used for parsing the start and end date configuration values, e.g.
     * 2013-03-31T04:03:02.394-04:00
     */
    public static final String DATE_TIME_STRING_FORMAT = "yyyy'-'MM'-'dd'T'HH':'mm':'ss.SSSZ";
    /**
     * Configuration property holding the start date of the time range.
     */
    public static final String START_DATE_PARAMETER_NAME = "com.knewton.studentevents.date.start";
    /**
     * Configuration property holding the end date of the time range.
     */
    public static final String END_DATE_PARAMETER_NAME = "com.knewton.studentevents.date.end";
    // Not read anywhere in this class; kept because it is part of the public constants.
    public static final String IGNORE_TEST_UUIDS_PARAMETER_NAME = "com.knewton.studentevents.ignore_test_uuids";
    /**
     * Helper constant for avoiding date time overflows. Used as a safety margin that keeps the
     * default start and end instants one day away from the limits of {@code long}.
     */
    public static final long ONE_DAY_IN_MILLIS = 86400000L;

    /**
     * Creates the mapper with a thrift deserializer that uses the compact protocol, and turns on
     * skipping of deleted columns in the parent mapper.
     */
    public StudentEventAbstractMapper() {
        TProtocolFactory factory = new TCompactProtocol.Factory();
        decoder = new TDeserializer(factory);
        setSkipDeletedColumns(true);
    }

    /**
     * Setup time range for excluding student events.
     *
     * @param context
     *            the mapper context whose configuration holds the optional date parameters
     * @throws IllegalArgumentException
     *             if a configured date cannot be parsed with {@link #DATE_TIME_STRING_FORMAT}
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);
        setupTimeRange(context.getConfiguration());
    }

    /**
     * Get the student id from the row key.
     *
     * @param key
     *            the row key; its first eight bytes are read as the student id
     * @param context
     *            the mapper context (unused)
     * @return the student id
     */
    @Override
    protected Long getMapperKey(ByteBuffer key, Context context) {
        // Read from a slice so that the position of the caller's buffer is left untouched.
        return key.slice().getLong();
    }

    /**
     * Get a student event out of a cassandra column.
     *
     * @param iColumn
     *            the column to convert
     * @param context
     *            the mapper context used for updating the job counters
     * @return the student event, or {@code null} if the event is outside of the configured time
     *         range or could not be deserialized
     */
    @Nullable
    @Override
    protected StudentEvent getMapperValue(IColumn iColumn, Context context) {
        return getStudentEvent(iColumn, context);
    }

    /**
     * Make a student event out of a column in cassandra. The first eight bytes of the column name
     * are read as the event id and the column value is deserialized into the event data.
     *
     * <p>
     * Every column seen increments the {@code STUDENT_EVENTS_COUNT} counter and every event that
     * was successfully built increments the {@code STUDENT_EVENTS_PROCESSED} counter.
     *
     * @param value
     *            the column holding the serialized event
     * @param context
     *            the mapper context used for updating the job counters
     * @return the student event, or {@code null} if the event is outside of the configured time
     *         range or could not be deserialized
     */
    @Nullable
    private StudentEvent getStudentEvent(IColumn value, Context context) {
        context.getCounter(CounterConstants.STUDENT_EVENTS_JOB, CounterConstants.STUDENT_EVENTS_COUNT).increment(1);
        // Slices keep the positions of the column's own buffers untouched.
        long eventId = value.name().slice().getLong();
        if (!isInTimeRange(eventId, context)) {
            return null;
        }
        // Only touch the column value once we know that the event is not going to be skipped.
        ByteBuffer columnValueDup = value.value().slice();
        byte[] data = new byte[columnValueDup.remaining()];
        columnValueDup.get(data);
        StudentEventData studentEventData = new StudentEventData();
        try {
            decoder.deserialize(studentEventData, data);
        } catch (TException e) {
            // Pass the exception along so that the cause and stack trace are not lost.
            LOG.error(String.format("Could not deserialize byte array for event with id %d. Skipping Got: %s",
                    eventId, e.getMessage()), e);
            return null;
        }
        StudentEvent studentEvent = new StudentEvent();
        studentEvent.setId(eventId);
        studentEvent.setData(studentEventData);
        context.getCounter(CounterConstants.STUDENT_EVENTS_JOB, CounterConstants.STUDENT_EVENTS_PROCESSED)
                .increment(1);
        return studentEvent;
    }

    /**
     * Checks to see if the event is in the desired time range. The event id is interpreted as the
     * event time in milliseconds since the epoch. The range is inclusive of its start and exclusive
     * of its end. Events that are out of range increment the {@code STUDENT_EVENTS_SKIPPED}
     * counter.
     *
     * @param eventId
     *            the id of the event
     * @param context
     *            the mapper context used for updating the job counters
     * @return {@code true} if no time range is configured or the event falls inside of it,
     *         {@code false} otherwise
     */
    private boolean isInTimeRange(long eventId, Context context) {
        // Skip events outside of the desired time range. Comparing the raw millis gives the same
        // answer as comparing a DateTime, without allocating an object for every column.
        if (timeRange != null && !timeRange.contains(eventId)) {
            context.getCounter(CounterConstants.STUDENT_EVENTS_JOB, CounterConstants.STUDENT_EVENTS_SKIPPED)
                    .increment(1);
            return false;
        }
        return true;
    }

    /**
     * Sets up a DateTime interval for excluding student events. When start time is not set then it
     * defaults to the beginning of time. If end date is not specified then it defaults to
     * "the end of time". When neither is set no interval is created and no event is excluded.
     *
     * @param conf
     *            the job configuration holding the optional {@link #START_DATE_PARAMETER_NAME} and
     *            {@link #END_DATE_PARAMETER_NAME} values
     * @throws IllegalArgumentException
     *             if a configured date cannot be parsed with {@link #DATE_TIME_STRING_FORMAT}, or
     *             if the interval cannot be built from the two dates
     */
    private void setupTimeRange(Configuration conf) {
        String startDateStr = conf.get(START_DATE_PARAMETER_NAME);
        String endDateStr = conf.get(END_DATE_PARAMETER_NAME);
        // No need to instantiate timeRange.
        if (startDateStr == null && endDateStr == null) {
            return;
        }
        DateTimeFormatter dtf = DateTimeFormat.forPattern(DATE_TIME_STRING_FORMAT).withZoneUTC();
        DateTime startDate;
        if (startDateStr != null) {
            startDate = parseConfiguredDate(dtf, START_DATE_PARAMETER_NAME, startDateStr);
        } else {
            startDate = new DateTime(Long.MIN_VALUE + ONE_DAY_IN_MILLIS).withZone(DateTimeZone.UTC);
        }
        DateTime endDate;
        if (endDateStr != null) {
            endDate = parseConfiguredDate(dtf, END_DATE_PARAMETER_NAME, endDateStr);
        } else {
            endDate = new DateTime(Long.MAX_VALUE - ONE_DAY_IN_MILLIS).withZone(DateTimeZone.UTC);
        }
        this.timeRange = new Interval(startDate, endDate);
    }

    /**
     * Parses a configured date, reporting which configuration property was malformed on failure.
     *
     * @param dtf
     *            the formatter to parse with
     * @param parameterName
     *            the name of the configuration property the value came from
     * @param value
     *            the configured value
     * @return the parsed date
     * @throws IllegalArgumentException
     *             if the value cannot be parsed; the original parse failure is kept as the cause
     */
    private static DateTime parseConfiguredDate(DateTimeFormatter dtf, String parameterName, String value) {
        try {
            return dtf.parseDateTime(value);
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException(String.format(
                    "Invalid value \"%s\" for configuration parameter %s. Expected format: %s",
                    value, parameterName, DATE_TIME_STRING_FORMAT), e);
        }
    }

}