com.aliyun.odps.flume.sink.OdpsSink.java Source code

Introduction

Here is the source code for com.aliyun.odps.flume.sink.OdpsSink.java, an Apache Flume sink that takes events from a channel and writes them to an ODPS table through the tunnel stream client.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.aliyun.odps.flume.sink;

import com.aliyun.datahub.flume.sink.serializer.OdpsDelimitedTextSerializer;
import com.aliyun.datahub.flume.sink.serializer.OdpsEventSerializer;
import com.aliyun.odps.*;
import com.aliyun.odps.account.Account;
import com.aliyun.odps.account.AliyunAccount;
import com.aliyun.odps.flume.sink.dataobject.OdpsRowDO;
import com.aliyun.odps.tunnel.StreamClient;
import com.aliyun.odps.tunnel.TableTunnel;
import com.aliyun.odps.tunnel.TunnelException;
import com.aliyun.odps.tunnel.io.StreamWriter;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.lang.StringUtils;
import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.formatter.output.BucketPath;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.sink.AbstractSink;
import org.apache.mina.core.RuntimeIoException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.*;

/**
 * A simple sink which reads events from a channel and writes them to ODPS.
 */
public class OdpsSink extends AbstractSink implements Configurable {

    private static final Logger logger = LoggerFactory.getLogger(OdpsSink.class);

    private static final String DEFAULT_ODPS_END_POINT = "http://service.odps.aliyun.com/api";
    private static final String DEFAULT_ODPS_DATAHUB_END_POINT = "http://dh.odps.aliyun.com";
    private static final String DEFAULT_ODPS_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";

    private static final int DEFAULT_ODPS_BATCHSIZE = 100;
    private static final int DEFAULT_SHARD_NUMBER = 1;

    /**
     * Default timeout for stop waiting shard load, unit is second.
     */
    private static final int DEFAULT_LOAD_SHARD_TIME = 60;
    private static final boolean DEFAULT_AUTO_CREATE_PARTITION = true;
    private static final boolean DEFAULT_NEED_ROUNDING = false;

    private String accessId;
    private String accessKey;
    private String odpsEndPoint;
    private String datahubEndPoint;
    private String project;
    private String table;
    private List<String> partitionValues;
    private boolean autoCreatePartition;
    private String dateFormat;

    private StreamClient streamClient;
    private StreamWriter[] streamWriters;
    private OdpsWriter odpsWriter;
    private List<String> partitionFields;
    private Table odpsTable;
    private Map<String, Boolean> partitionMap;

    private String serializerType;
    private OdpsEventSerializer serializer;

    private int batchSize;
    private int shardNumber;
    private int maxLoadShardTime;

    private boolean useLocalTime;
    private TimeZone timeZone;
    private boolean needRounding;
    private int roundUnit;
    private Integer roundValue;

    private SinkCounter sinkCounter;

    @Override
    public void configure(Context context) {
        accessId = Preconditions.checkNotNull(context.getString(ConfigConstants.ACCESS_ID),
                "%s config setting is not specified for sink %s", ConfigConstants.ACCESS_ID, getName());
        accessKey = Preconditions.checkNotNull(context.getString(ConfigConstants.ACCESS_KEY),
                "%s config setting is not specified for sink %s", ConfigConstants.ACCESS_KEY, getName());
        odpsEndPoint = context.getString(ConfigConstants.ODPS_END_POINT, DEFAULT_ODPS_END_POINT);
        datahubEndPoint = context.getString(ConfigConstants.ODPS_DATAHUB_END_POINT, DEFAULT_ODPS_DATAHUB_END_POINT);
        project = Preconditions.checkNotNull(context.getString(ConfigConstants.ODPS_PROJECT),
                "%s config setting is not specified for sink %s", ConfigConstants.ODPS_PROJECT, getName());
        table = Preconditions.checkNotNull(context.getString(ConfigConstants.ODPS_TABLE),
                "%s config setting is not specified for sink %s", ConfigConstants.ODPS_TABLE, getName());
        autoCreatePartition = context.getBoolean(ConfigConstants.AUTO_CREATE_PARTITION,
                DEFAULT_AUTO_CREATE_PARTITION);
        dateFormat = context.getString(ConfigConstants.DATE_FORMAT, DEFAULT_ODPS_DATE_FORMAT);

        shardNumber = context.getInteger(ConfigConstants.SHARD_NUMBER, DEFAULT_SHARD_NUMBER);
        if (shardNumber < 1) {
            logger.warn("{}.shardNumber must be greater than 0. Defaulting to {}", getName(), DEFAULT_SHARD_NUMBER);
            shardNumber = DEFAULT_SHARD_NUMBER;
        }

        String partitions = context.getString(ConfigConstants.ODPS_PARTITION);
        if (partitions != null) {
            partitionValues = Arrays.asList(partitions.split(","));
        }

        serializerType = Preconditions.checkNotNull(context.getString(ConfigConstants.SERIALIZER),
                "%s config setting is not specified for sink %s", ConfigConstants.SERIALIZER, getName());
        serializer = this.createSerializer(serializerType);
        Context serializerContext = new Context();
        serializerContext.putAll(context.getSubProperties(ConfigConstants.SERIALIZER_PREFIX));
        serializer.configure(serializerContext);

        batchSize = context.getInteger(ConfigConstants.BATCH_SIZE, DEFAULT_ODPS_BATCHSIZE);
        if (batchSize <= 0) {
            logger.warn("{}.batchSize must be a positive number. Defaulting to {}", getName(),
                    DEFAULT_ODPS_BATCHSIZE);
            batchSize = DEFAULT_ODPS_BATCHSIZE;
        }

        maxLoadShardTime = context.getInteger(ConfigConstants.MAX_LOAD_SHARD_TIME, DEFAULT_LOAD_SHARD_TIME);
        if (maxLoadShardTime <= 0) {
            logger.warn("{}.maxLoadShardTime must be positive number. Defaulting to {}", getName(),
                    DEFAULT_LOAD_SHARD_TIME);
            maxLoadShardTime = DEFAULT_LOAD_SHARD_TIME;
        }

        useLocalTime = context.getBoolean(ConfigConstants.USE_LOCAL_TIME_STAMP, false);
        String tzName = context.getString(ConfigConstants.TIME_ZONE);
        timeZone = (StringUtils.isEmpty(tzName)) ? null : TimeZone.getTimeZone(tzName);
        needRounding = context.getBoolean(ConfigConstants.NEED_ROUNDING, DEFAULT_NEED_ROUNDING);
        String unit = context.getString(ConfigConstants.ROUND_UNIT, ConfigConstants.MINUTE);
        if (unit.equalsIgnoreCase(ConfigConstants.HOUR)) {
            this.roundUnit = Calendar.HOUR_OF_DAY;
        } else if (unit.equalsIgnoreCase(ConfigConstants.MINUTE)) {
            this.roundUnit = Calendar.MINUTE;
        } else if (unit.equalsIgnoreCase(ConfigConstants.SECOND)) {
            this.roundUnit = Calendar.SECOND;
        } else {
            logger.warn(getName() + ". Rounding unit is not valid, please set one of "
                    + "minute, hour or second. Rounding will be disabled");
            needRounding = false;
        }
        this.roundValue = context.getInteger(ConfigConstants.ROUND_VALUE, 1);
        if (roundUnit == Calendar.SECOND || roundUnit == Calendar.MINUTE) {
            Preconditions.checkArgument(roundValue > 0 && roundValue <= 60, "Round value must be > 0 and <= 60");
        } else if (roundUnit == Calendar.HOUR_OF_DAY) {
            Preconditions.checkArgument(roundValue > 0 && roundValue <= 24, "Round value must be > 0 and <= 24");
        }

        if (sinkCounter == null) {
            sinkCounter = new SinkCounter(getName());
        }
    }

    private OdpsEventSerializer createSerializer(String serializerType) {
        if (serializerType.compareToIgnoreCase(OdpsDelimitedTextSerializer.ALIAS) == 0
                || serializerType.compareTo(OdpsDelimitedTextSerializer.class.getName()) == 0) {
            return new OdpsDelimitedTextSerializer();
        }

        try {
            return (OdpsEventSerializer) Class.forName(serializerType).newInstance();
        } catch (Exception e) {
            throw new IllegalArgumentException(
                    "Unable to instantiate serializer: " + serializerType + " on sink: " + getName(), e);
        }
    }

    @Override
    public void start() {
        logger.info("Odps Sink {}: starting...", getName());
        super.start();
        // Sleep a random time (<= 60s)
        try {
            Thread.sleep((new Random()).nextInt(60000));
        } catch (InterruptedException e) {
            // DO NOTHING
        }
        // Initial datahub writers
        Account account = new AliyunAccount(accessId, accessKey);
        Odps odps = new Odps(account);
        odps.setUserAgent("odps-flume-1.0.0");
        odps.setDefaultProject(project);
        odps.setEndpoint(odpsEndPoint);
        odpsTable = odps.tables().get(table);
        if (partitionValues != null) {
            List<Column> partitionCols = odpsTable.getSchema().getPartitionColumns();
            if (partitionCols.size() != partitionValues.size()) {
                logger.error("Odps partition fields number not equals input partition values number");
                throw new FlumeException("Odps partition fields number not equals input partition values number");
            }
            partitionFields = Lists.newArrayList();
            for (Column partCol : partitionCols) {
                partitionFields.add(partCol.getName());
            }
            buildPartitionMap();
        }
        TableTunnel tunnel = new TableTunnel(odps);
        tunnel.setEndpoint(datahubEndPoint);

        try {
            streamClient = tunnel.createStreamClient(project, table);
            streamClient.loadShard(shardNumber);
        } catch (TunnelException e) {
            logger.error(
                    "Error initializing stream client from tunnel. Project: " + project + ", Table: " + table + ".",
                    e);
            throw new FlumeException("Error initializing stream client from tunnel. Project: " + project
                    + ", Table: " + "" + table + ".", e);
        }

        try {
            buildStreamWriters();
        } catch (Exception e) {
            logger.error("Error build ODPS stream writers.", e);
            throw new FlumeException("Error build ODPS stream writers.", e);
        }
        try {
            odpsWriter = serializer.createOdpsWriter(odpsTable, streamWriters, dateFormat);
        } catch (UnsupportedEncodingException e) {
            logger.error("Error create ODPS stream writers.", e);
            throw new FlumeException("create odps writer failed!");
        }
        sinkCounter.start();
        logger.info("Odps Sink {}: started", getName());
    }

    private void buildPartitionMap() {
        partitionMap = Maps.newHashMap();
        for (Partition partition : odpsTable.getPartitions()) {
            partitionMap.put(partition.getPartitionSpec().toString(), true);
        }
    }

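    /**
     * Waits up to {@code maxLoadShardTime} seconds for the requested shards to reach the
     * {@code LOADED} state, then opens one {@link StreamWriter} per loaded shard. Fails with a
     * {@link RuntimeException} if no shard is loaded within the timeout.
     */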
    private void buildStreamWriters() throws IOException, TunnelException {
        final StreamClient.ShardState finish = StreamClient.ShardState.LOADED;
        long now = System.currentTimeMillis();
        long endTime = now + maxLoadShardTime * 1000L;
        List<Long> shardIDList = null;
        while (now < endTime) {
            HashMap<Long, StreamClient.ShardState> shardStatus = streamClient.getShardStatus();
            shardIDList = new ArrayList<Long>();
            for (Map.Entry<Long, StreamClient.ShardState> entry : shardStatus.entrySet()) {
                if (entry.getValue().equals(finish)) {
                    shardIDList.add(entry.getKey());
                }
            }
            now = System.currentTimeMillis();
            if (shardIDList.size() == shardNumber) {
                break;
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // DO NOTHING...
            }
        }
        if (shardIDList != null && shardIDList.size() > 0) {
            this.streamWriters = new StreamWriter[shardIDList.size()];
            for (int i = 0; i < shardIDList.size(); i++) {
                this.streamWriters[i] = streamClient.openStreamWriter(shardIDList.get(i));
            }
        } else {
            throw new RuntimeException(
                    "Odps Sink " + getName() + " buildStreamWriters() error: no shards were loaded.");
        }
    }

    @Override
    public void stop() {
        sinkCounter.stop();
        super.stop();
        logger.info("Odps Sink {}: stopped", getName());
    }

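    /**
     * Drains up to {@code batchSize} events from the channel within a single transaction,
     * serializes each event into an {@link OdpsRowDO} together with its resolved partition spec,
     * and writes the whole batch through the {@link OdpsWriter}. Returns {@code BACKOFF} when the
     * channel is empty or when events cannot be read from it.
     */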
    @Override
    public Status process() throws EventDeliveryException {
        logger.debug("Sink {} processing...", getName());
        Status status = Status.READY;
        Channel channel = getChannel();
        Transaction transaction = channel.getTransaction();

        try {
            transaction.begin();
            List<OdpsRowDO> rowList = Lists.newLinkedList();
            for (int i = 0; i < batchSize; i++) {
                Event event = channel.take();
                if (event == null) {
                    break;
                }
                serializer.initialize(event);
                OdpsRowDO odpsRowDO = new OdpsRowDO();
                odpsRowDO.setRowMap(serializer.getRow());
                String partitionSpec = buildPartitionSpec(partitionValues, event.getHeaders(), timeZone,
                        needRounding, roundUnit, roundValue, useLocalTime);
                odpsRowDO.setPartitionSpec(partitionSpec);
                rowList.add(odpsRowDO);
            }
            sinkCounter.addToEventDrainAttemptCount(rowList.size());
            if (rowList.size() == 0) {
                sinkCounter.incrementBatchEmptyCount();
                logger.debug("No events in channel {}", getChannel().getName());
                status = Status.BACKOFF;
            } else {
                // Write batch to ODPS table
                odpsWriter.write(rowList);
                logger.info("Write success. Sink: {}, Event count: {}", getName(), rowList.size());
                if (batchSize == rowList.size()) {
                    sinkCounter.incrementBatchCompleteCount();
                } else {
                    sinkCounter.incrementBatchUnderflowCount();
                }
                sinkCounter.addToEventDrainSuccessCount(rowList.size());
            }
            transaction.commit();
        } catch (Throwable t) {
            transaction.rollback();
            if (t instanceof Error) {
                throw (Error) t;
            } else if (t instanceof ChannelException) {
                logger.error("Odps Sink " + getName() + ": Unable to get event from channel " + channel.getName()
                        + "" + ". Exception follows.", t);
                status = Status.BACKOFF;
            } else {
                throw new EventDeliveryException("Failed to send events", t);
            }
        } finally {
            transaction.close();
        }
        return status;
    }

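    /**
     * Builds an ODPS partition spec of the form {@code field1='value1',field2='value2'} by
     * resolving each configured partition value against the event headers via
     * {@link BucketPath#escapeString} and pairing it with the matching partition field. If
     * {@code autoCreatePartition} is enabled, the partition is created on first use.
     */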
    private String buildPartitionSpec(List<String> partitionValues, Map<String, String> headers, TimeZone timeZone,
            boolean needRounding, int roundUnit, Integer roundValue, boolean useLocalTime) throws OdpsException {
        if (partitionValues == null || partitionValues.size() == 0) {
            return StringUtils.EMPTY;
        }
        if (partitionValues.size() != partitionFields.size()) {
            throw new RuntimeException("Odps partition fields number not equals input partition values number");
        }
        StringBuilder sb = new StringBuilder();
        String sep = "";
        for (int i = 0; i < partitionValues.size(); i++) {
            String realPartVal = BucketPath.escapeString(partitionValues.get(i), headers, timeZone, needRounding,
                    roundUnit, roundValue, useLocalTime);
            sb.append(sep).append(partitionFields.get(i)).append("='").append(realPartVal).append("'");
            sep = ",";
        }
        String partitionSpec = sb.toString();
        if (autoCreatePartition && !partitionMap.containsKey(partitionSpec)) {
            odpsTable.createPartition(new PartitionSpec(partitionSpec), true);
            partitionMap.put(partitionSpec, true);
        }
        return partitionSpec;
    }

}
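
Example usage

For reference, below is a minimal sketch of how this sink might be wired up and driven directly, for example in a local test. It is not part of the original source: the class name OdpsSinkExample, the placeholder credential/project/table/partition values, and the assumption that ConfigConstants is a public class in the com.aliyun.odps.flume.sink package are illustrative only. A real run also needs valid ODPS settings and the serializer's own sub-properties (under ConfigConstants.SERIALIZER_PREFIX), which are omitted here.

import com.aliyun.datahub.flume.sink.serializer.OdpsDelimitedTextSerializer;
import com.aliyun.odps.flume.sink.ConfigConstants; // assumed package and visibility
import com.aliyun.odps.flume.sink.OdpsSink;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.Sink;
import org.apache.flume.Transaction;
import org.apache.flume.channel.MemoryChannel;
import org.apache.flume.conf.Configurables;
import org.apache.flume.event.EventBuilder;

import java.nio.charset.StandardCharsets;

public class OdpsSinkExample {

    public static void main(String[] args) throws Exception {
        // Channel the sink will drain; a MemoryChannel is enough for a local sketch.
        MemoryChannel channel = new MemoryChannel();
        Configurables.configure(channel, new Context());

        // Sink settings use the same ConfigConstants keys that OdpsSink.configure() reads.
        // All values below are placeholders.
        Context sinkContext = new Context();
        sinkContext.put(ConfigConstants.ACCESS_ID, "<your-access-id>");
        sinkContext.put(ConfigConstants.ACCESS_KEY, "<your-access-key>");
        sinkContext.put(ConfigConstants.ODPS_PROJECT, "<your-project>");
        sinkContext.put(ConfigConstants.ODPS_TABLE, "<your-table>");
        sinkContext.put(ConfigConstants.ODPS_PARTITION, "<partition-value>");
        sinkContext.put(ConfigConstants.SERIALIZER, OdpsDelimitedTextSerializer.ALIAS);
        // The serializer normally needs its own sub-properties (field names, delimiter, ...)
        // under the ConfigConstants.SERIALIZER_PREFIX namespace; they are omitted in this sketch.

        OdpsSink sink = new OdpsSink();
        Configurables.configure(sink, sinkContext);
        sink.setChannel(channel);

        channel.start();
        // Note: start() sleeps a random interval of up to 60 seconds and then connects to ODPS,
        // so this call blocks for a while and requires valid settings to succeed.
        sink.start();

        // Put one event into the channel inside a transaction, then let the sink drain it.
        Transaction tx = channel.getTransaction();
        tx.begin();
        Event event = EventBuilder.withBody("field1,field2,field3", StandardCharsets.UTF_8);
        channel.put(event);
        tx.commit();
        tx.close();

        Sink.Status status = sink.process();
        System.out.println("Sink status: " + status);

        sink.stop();
        channel.stop();
    }
}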