com.nearinfinity.blur.mapreduce.BlurTask.java Source code

Java tutorial

Introduction

Here is the source code for com.nearinfinity.blur.mapreduce.BlurTask.java

Source

/*
 * Copyright (C) 2011 Near Infinity Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.nearinfinity.blur.mapreduce;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.List;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.thrift.TException;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.transport.TIOStreamTransport;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper;

import com.nearinfinity.blur.log.Log;
import com.nearinfinity.blur.log.LogFactory;
import com.nearinfinity.blur.manager.clusterstatus.ZookeeperClusterStatus;
import com.nearinfinity.blur.manager.clusterstatus.ZookeeperPathConstants;
import com.nearinfinity.blur.thrift.generated.TableDescriptor;
import com.nearinfinity.blur.utils.BlurConstants;
import com.nearinfinity.blur.utils.BlurUtil;

/**
 * Describes a Blur indexing job and carries its settings from the job client
 * to the tasks.
 *
 * <p>
 * {@link #configureJob(Configuration)} serializes this object (via
 * {@link #write(DataOutput)}), Base64 encodes it and stores it in the Hadoop
 * configuration under the {@code blur.blurtask} key;
 * {@link #read(Configuration)} performs the reverse operation.
 *
 * <p>
 * NOTE(review): {@code _maxRecordsPerRow} is not part of the serialized form,
 * so an instance restored through {@link #read(Configuration)} always reports
 * the default value. Confirm whether that is intended before changing the wire
 * format.
 */
public class BlurTask implements Writable {

    /** The kind of indexing run a task performs. */
    public enum INDEXING_TYPE {
        REBUILD, UPDATE
    }

    /** Configuration key under which the Base64 encoded task is stored. */
    private static final String BLUR_BLURTASK = "blur.blurtask";
    private static final Log LOG = LogFactory.getLog(BlurTask.class);

    /**
     * Charset used for every String/byte conversion in this class. It is fixed
     * so that the JVM that writes the task and the JVM that reads it agree even
     * when their platform default charsets differ.
     */
    private static final String UTF_8 = "UTF-8";

    /** @return the counter group name, {@code "Blur"}. */
    public static String getCounterGroupName() {
        return "Blur";
    }

    /** @return the row counter name, {@code "Rows"}. */
    public static String getRowCounterName() {
        return "Rows";
    }

    /** @return the field counter name, {@code "Fields"}. */
    public static String getFieldCounterName() {
        return "Fields";
    }

    /** @return the record counter name, {@code "Records"}. */
    public static String getRecordCounterName() {
        return "Records";
    }

    /** @return the row break counter name, {@code "Row Retries"}. */
    public static String getRowBreakCounterName() {
        return "Row Retries";
    }

    /** @return the row failure counter name, {@code "Row Failures"}. */
    public static String getRowFailureCounterName() {
        return "Row Failures";
    }

    private int _ramBufferSizeMB = 256;
    private long _maxRecordCount = Long.MAX_VALUE;
    private TableDescriptor _tableDescriptor;
    private int _maxRecordsPerRow = 16384;
    private boolean _optimize = true;
    private INDEXING_TYPE _indexingType = INDEXING_TYPE.REBUILD;
    // Nothing in this class assigns this field; checkTable() guards against
    // it being null.
    private transient ZooKeeper _zooKeeper;

    /**
     * Builds the shard name for the task that the given attempt belongs to,
     * from {@link BlurConstants#SHARD_PREFIX} and the numeric task id.
     *
     * @param context the task attempt context
     * @return the shard name
     */
    public String getShardName(TaskAttemptContext context) {
        TaskAttemptID taskAttemptID = context.getTaskAttemptID();
        int id = taskAttemptID.getTaskID().getId();
        return BlurUtil.getShardName(BlurConstants.SHARD_PREFIX, id);
    }

    /**
     * Resolves the shard directory of the given task attempt, i.e. the shard
     * name appended to the table URI of the table descriptor.
     *
     * @param context the task attempt context
     * @return the path of the shard directory
     */
    public Path getDirectoryPath(TaskAttemptContext context) {
        String shardName = getShardName(context);
        return new Path(new Path(_tableDescriptor.tableUri), shardName);
    }

    /**
     * Determines how many reducers the job should run.
     *
     * <p>
     * The shard count of the table descriptor is used unless the table
     * directory already exists and contains at least one directory whose name
     * starts with {@link BlurConstants#SHARD_PREFIX}; in that case the number
     * of those directories wins, and a warning is logged when it differs from
     * the requested shard count.
     *
     * @param configuration the configuration used to obtain the file system
     * @return the number of reducers to use
     * @throws RuntimeException if the file system cannot be accessed
     */
    public int getNumReducers(Configuration configuration) {
        Path tablePath = new Path(_tableDescriptor.tableUri);
        try {
            int num = _tableDescriptor.shardCount;
            FileSystem fileSystem = FileSystem.get(tablePath.toUri(), configuration);
            if (!fileSystem.exists(tablePath)) {
                return num;
            }
            FileStatus[] files = fileSystem.listStatus(tablePath);
            int shardCount = 0;
            for (FileStatus fileStatus : files) {
                if (fileStatus.isDir()) {
                    String name = fileStatus.getPath().getName();
                    if (name.startsWith(BlurConstants.SHARD_PREFIX)) {
                        shardCount++;
                    }
                }
            }

            if (shardCount == 0) {
                return num;
            }
            if (shardCount != num) {
                LOG.warn("Asked for " + num + " reducers, but existing table " + _tableDescriptor.name + " has "
                        + shardCount + " shards. Using " + shardCount + " reducers");
            }
            return shardCount;
        } catch (IOException e) {
            throw new RuntimeException("Unable to connect to filesystem", e);
        }
    }

    public int getRamBufferSizeMB() {
        return _ramBufferSizeMB;
    }

    public void setRamBufferSizeMB(int ramBufferSizeMB) {
        _ramBufferSizeMB = ramBufferSizeMB;
    }

    public long getMaxRecordCount() {
        return _maxRecordCount;
    }

    public void setMaxRecordCount(long maxRecordCount) {
        _maxRecordCount = maxRecordCount;
    }

    public void setTableDescriptor(TableDescriptor tableDescriptor) {
        _tableDescriptor = tableDescriptor;
    }

    public TableDescriptor getTableDescriptor() {
        return _tableDescriptor;
    }

    /**
     * Stores this task in the given configuration and creates the indexing
     * job from it.
     *
     * <p>
     * For {@link INDEXING_TYPE#UPDATE} the table is first checked for
     * existence and for write locks. The returned job is named
     * {@code "Blur Indexer"}, uses {@link BlurReducer} as reducer,
     * {@link BytesWritable} / {@link BlurMutate} as output key / value classes
     * and {@link #getNumReducers(Configuration)} reduce tasks.
     *
     * @param configuration the configuration that receives the serialized task
     * @return the configured job
     * @throws IOException if the task cannot be serialized or the job cannot
     *             be created
     * @throws RuntimeException if the table check of an update run fails
     */
    public Job configureJob(Configuration configuration) throws IOException {
        if (getIndexingType() == INDEXING_TYPE.UPDATE) {
            checkTable();
        }
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        DataOutputStream output = new DataOutputStream(os);
        write(output);
        output.close();
        // Decode with an explicit charset instead of the platform default.
        String blurTask = new String(Base64.encodeBase64(os.toByteArray()), UTF_8);
        // The task is stored before the job is constructed from the
        // configuration, same order as before.
        configuration.set(BLUR_BLURTASK, blurTask);

        Job job = new Job(configuration, "Blur Indexer");
        job.setReducerClass(BlurReducer.class);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(BlurMutate.class);
        job.setNumReduceTasks(getNumReducers(configuration));
        return job;
    }

    /**
     * Verifies that the table of the descriptor exists in its cluster and
     * that its lock path has no children.
     *
     * @throws IllegalStateException if no ZooKeeper connection is available
     * @throws RuntimeException if the table does not exist, has write locks,
     *             or ZooKeeper cannot be queried
     */
    private void checkTable() {
        if (_zooKeeper == null) {
            // Fail with an explanation instead of an anonymous
            // NullPointerException further down.
            throw new IllegalStateException("No ZooKeeper connection available, cannot check table for update.");
        }
        ZookeeperClusterStatus status = new ZookeeperClusterStatus(_zooKeeper);
        // check if table exists
        String cluster = _tableDescriptor.cluster;
        String table = _tableDescriptor.name;
        if (!status.exists(false, cluster, table)) {
            throw new RuntimeException("Table [" + table + "] in cluster [" + cluster + "] does not exist.");
        }
        // check if table is locked
        try {
            List<String> children = _zooKeeper.getChildren(ZookeeperPathConstants.getLockPath(cluster, table),
                    false);
            if (!children.isEmpty()) {
                throw new RuntimeException("Table [" + table + "] in cluster [" + cluster
                        + "] has write locks enabled, cannot perform update.");
            }
        } catch (KeeperException e) {
            throw new RuntimeException(e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up the stack can
            // still observe the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /**
     * Restores the task that {@link #configureJob(Configuration)} stored in
     * the given configuration.
     *
     * @param configuration the configuration holding the serialized task
     * @return the restored task
     * @throws IOException if the configuration holds no task or the stored
     *             data cannot be read
     */
    public static BlurTask read(Configuration configuration) throws IOException {
        String encoded = configuration.get(BLUR_BLURTASK);
        if (encoded == null) {
            throw new IOException("No BlurTask found in configuration under [" + BLUR_BLURTASK + "]");
        }
        byte[] blurTaskBs = Base64.decodeBase64(encoded);
        BlurTask blurTask = new BlurTask();
        blurTask.readFields(new DataInputStream(new ByteArrayInputStream(blurTaskBs)));
        return blurTask;
    }

    /**
     * Reads the fields in the order {@link #write(DataOutput)} emits them:
     * max record count, RAM buffer size, optimize flag, indexing type name and
     * the length-prefixed Thrift binary form of the table descriptor.
     *
     * @param input the source to read from
     * @throws IOException if the input is truncated or corrupt, or the table
     *             descriptor cannot be decoded
     */
    @Override
    public void readFields(DataInput input) throws IOException {
        _maxRecordCount = input.readLong();
        _ramBufferSizeMB = input.readInt();
        _optimize = input.readBoolean();
        _indexingType = INDEXING_TYPE.valueOf(readString(input));
        byte[] data = readBytes(input);
        ByteArrayInputStream is = new ByteArrayInputStream(data);
        TIOStreamTransport trans = new TIOStreamTransport(is);
        TBinaryProtocol protocol = new TBinaryProtocol(trans);
        _tableDescriptor = new TableDescriptor();
        try {
            _tableDescriptor.read(protocol);
        } catch (TException e) {
            throw new IOException(e);
        }
    }

    /** Reads a length-prefixed UTF-8 string. */
    private String readString(DataInput input) throws IOException {
        return new String(readBytes(input), UTF_8);
    }

    /**
     * Reads an int length followed by that many bytes, rejecting a negative
     * length as corrupt input.
     */
    private byte[] readBytes(DataInput input) throws IOException {
        int length = input.readInt();
        if (length < 0) {
            throw new IOException("Invalid length [" + length + "] found while reading BlurTask.");
        }
        byte[] buf = new byte[length];
        input.readFully(buf);
        return buf;
    }

    /**
     * Writes max record count, RAM buffer size, optimize flag, indexing type
     * name and the length-prefixed Thrift binary form of the table descriptor.
     *
     * @param output the sink to write to
     * @throws IOException if writing fails or the table descriptor cannot be
     *             encoded
     */
    @Override
    public void write(DataOutput output) throws IOException {
        output.writeLong(_maxRecordCount);
        output.writeInt(_ramBufferSizeMB);
        output.writeBoolean(_optimize);
        writeString(output, _indexingType.name());
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        TIOStreamTransport trans = new TIOStreamTransport(os);
        TBinaryProtocol protocol = new TBinaryProtocol(trans);
        try {
            _tableDescriptor.write(protocol);
        } catch (TException e) {
            throw new IOException(e);
        }
        writeBytes(output, os.toByteArray());
    }

    /** Writes a string as length-prefixed UTF-8 bytes. */
    private void writeString(DataOutput output, String s) throws IOException {
        writeBytes(output, s.getBytes(UTF_8));
    }

    /** Writes the array length as an int followed by the bytes themselves. */
    private void writeBytes(DataOutput output, byte[] bs) throws IOException {
        output.writeInt(bs.length);
        output.write(bs);
    }

    public int getMaxRecordsPerRow() {
        return _maxRecordsPerRow;
    }

    public void setMaxRecordsPerRow(int maxRecordsPerRow) {
        _maxRecordsPerRow = maxRecordsPerRow;
    }

    public boolean getOptimize() {
        return _optimize;
    }

    public void setOptimize(boolean optimize) {
        _optimize = optimize;
    }

    public INDEXING_TYPE getIndexingType() {
        return _indexingType;
    }

    public void setIndexingType(INDEXING_TYPE indexingType) {
        _indexingType = indexingType;
    }
}