gridool.db.partitioning.csv.CsvHashPartitioningJob.java Source code

Java tutorial

Introduction

Here is the source code for gridool.db.partitioning.csv.CsvHashPartitioningJob.java

Source

/*
 * @(#)$Id$
 *
 * Copyright 2006-2008 Makoto YUI
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * Contributors:
 *     Makoto YUI - initial implementation
 */
package gridool.db.partitioning.csv;

import gridool.GridException;
import gridool.GridNode;
import gridool.GridResourceRegistry;
import gridool.GridTask;
import gridool.annotation.GridRegistryResource;
import gridool.communication.payload.GridNodeInfo;
import gridool.construct.GridJobBase;
import gridool.db.catalog.DistributionCatalog;
import gridool.db.helpers.ForeignKey;
import gridool.db.helpers.PrimaryKey;
import gridool.db.partitioning.DBPartitioningJobConf;
import gridool.db.partitioning.FileAppendTask;
import gridool.dht.ILocalDirectory;
import gridool.routing.GridTaskRouter;

import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import javax.annotation.Nonnull;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import xbird.config.Settings;
import xbird.storage.DbException;
import xbird.storage.index.BTreeCallback;
import xbird.storage.index.Value;
import xbird.util.collections.FixedArrayList;
import xbird.util.collections.LRUMap;
import xbird.util.csv.CsvUtils;
import xbird.util.io.FastByteArrayOutputStream;
import xbird.util.primitive.MutableInt;
import xbird.util.primitive.Primitives;
import xbird.util.string.StringUtils;
import xbird.util.struct.Pair;

import com.sun.istack.internal.Nullable;

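/**
 * Grid job that spreads the records of a CSV chunk over grid nodes.
 * <DIV lang="en">Each record is routed by the bytes of its primary-key fields and of
 * each foreign-key's fields; the rows bound to a node are shipped to it as a
 * {@code FileAppendTask}, and the per-node record counts are returned by {@code reduce()}.</DIV>
 * <DIV lang="ja"></DIV>
 * 
 * @author Makoto YUI (yuin405@gmail.com)
 */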
public final class CsvHashPartitioningJob extends GridJobBase<PartitioningJobConf, Map<GridNode, MutableInt>> {
    private static final long serialVersionUID = 149683992715077498L;
    private static final Log LOG = LogFactory.getLog(CsvHashPartitioningJob.class);
    /**
     * Cache size handed to {@code ILocalDirectory#setCacheSize} for every foreign-key index.
     * Read from the {@code gridool.db.partitioning.fk_index_caches} setting; 8192 is the second
     * argument of {@code Primitives.parseInt} (presumably the fallback when the setting is
     * missing or unparsable -- confirm against Primitives).
     */
    private static final int FK_INDEX_CACHE_SIZE = Primitives
            .parseInt(Settings.get("gridool.db.partitioning.fk_index_caches"), 8192);

    /** Number of records assigned to each node; set at the end of map() and returned by reduce(). */
    private transient Map<GridNode, MutableInt> assignedRecMap;

    /**
     * Registry from which map() obtains the distribution catalog and the local directory.
     * NOTE(review): presumably injected by the framework through the annotation -- confirm.
     */
    @GridRegistryResource
    private transient GridResourceRegistry registry;

    /** Creates a job with no state; all working data is built inside map(). */
    public CsvHashPartitioningJob() {
    }

    /**
     * Always returns {@code true}.
     * NOTE(review): presumably this asks the framework to populate the
     * {@code @GridRegistryResource} field before map() runs -- confirm against GridJobBase.
     */
    @Override
    public boolean injectResources() {
        return true;
    }

    /**
     * Binds every CSV line of {@code ops} to one or more grid nodes and builds one
     * {@link FileAppendTask} per node that received at least one record.
     * <p>
     * For each line the method
     * <ol>
     * <li>routes the combined primary-key fields through {@code router} ("primary" fragment),</li>
     * <li>when the primary key has exported keys, looks the same key up in each parent
     *     foreign-key index and adds the nodes stored there ("derived by parent" fragment),</li>
     * <li>routes the combined fields of every foreign key ("derived by child" fragment) and
     *     records, in that foreign key's index, the other nodes this record was mapped to.</li>
     * </ol>
     * The value kept per node (the "hidden value") is the bitwise OR of the partition numbers
     * that selected the node; it is appended to each row by {@code mapRecord}.
     * As a side effect, the per-node record counts (zero for nodes without records) are stored
     * for {@link #reduce()}, and the entries of {@code ops.getLines()} are nulled out as they
     * are consumed.
     *
     * @param router used to select the node for a key and to enumerate all nodes
     * @param ops supplies the lines, the target file name, the key definitions and the job configuration
     * @return the tasks to run, keyed by identity, with the node each one is bound to
     * @throws GridException if no node can be selected or an index operation fails
     * @throws IllegalStateException if a record ends up mapped to no node
     */
    public Map<GridTask, GridNode> map(final GridTaskRouter router, final PartitioningJobConf ops)
            throws GridException {
        assert (registry != null);
        final String[] lines = ops.getLines();
        final String csvFileName = ops.getFileName();
        final boolean append = !ops.isFirst();
        final DBPartitioningJobConf jobConf = ops.getJobConf();

        // partitioning resources
        final String tableName;
        final int tablePartitionNo;
        final PrimaryKey primaryKey;
        final Collection<ForeignKey> foreignKeys;
        final int[] pkeyIndicies;
        final String[] parentTableFkIndexNames;
        final int numParentForeignKeys;
        final boolean hasParentTable;
        final int numForeignKeys;
        final String[] fkIdxNames;
        final int[][] fkPositions;
        final int[] childTablesPartitionNo;
        final LRUMap<String, List<NodeWithPartitionNo>>[] fkCaches;
        {
            tableName = jobConf.getTableName();
            DistributionCatalog catalog = registry.getDistributionCatalog();
            tablePartitionNo = catalog.getTablePartitionNo(tableName, true);
            Pair<PrimaryKey, Collection<ForeignKey>> primaryForeignKeys = ops.getPrimaryForeignKeys();
            primaryKey = primaryForeignKeys.getFirst();
            foreignKeys = primaryForeignKeys.getSecond();
            pkeyIndicies = primaryKey.getColumnPositions(true);
            parentTableFkIndexNames = getParentTableFkIndexNames(primaryKey);
            hasParentTable = (parentTableFkIndexNames != null);
            numParentForeignKeys = hasParentTable ? parentTableFkIndexNames.length : 0;
            numForeignKeys = foreignKeys.size();
            // A table without exported keys legitimately has zero parent foreign keys, so only
            // require a non-empty name array when one was actually returned.
            assert (!hasParentTable || numParentForeignKeys != 0);
            fkIdxNames = getFkIndexNames(foreignKeys, numForeignKeys);
            fkPositions = getFkPositions(foreignKeys, numForeignKeys);
            childTablesPartitionNo = getChildTablesPartitionNo(foreignKeys, numForeignKeys, catalog);
            fkCaches = getFkIndexCaches(numForeignKeys); // null when there are no foreign keys
        }

        // COPY INTO control resources 
        final char filedSeparator = jobConf.getFieldSeparator();
        final char quoteChar = jobConf.getStringQuote();
        // working resources
        final ILocalDirectory index = registry.getDirectory();
        for (String idxName : fkIdxNames) {
            setFkIndexCacheSizes(index, idxName, FK_INDEX_CACHE_SIZE);
        }
        // 'fields' backs 'fieldList': retrieveFields() fills the list, combineFields() reads the array
        final String[] fields = new String[getMaxColumnCount(primaryKey, foreignKeys)];
        assert (fields.length > 0);
        final FixedArrayList<String> fieldList = new FixedArrayList<String>(fields);
        final Charset charset = Charset.forName("UTF-8");
        final StringBuilder strBuf = new StringBuilder(64);
        final int totalRecords = lines.length;
        final String[] fkeysFields = new String[numForeignKeys];
        final byte[][] distkeys = new byte[numForeignKeys][];
        final GridNode[] fkMappedNodes = new GridNode[numForeignKeys];

        final int numNodes = router.getGridSize();
        // node -> (number of records, concatenated rows)
        final Map<GridNode, Pair<MutableInt, FastByteArrayOutputStream>> nodeAssignMap = new HashMap<GridNode, Pair<MutableInt, FastByteArrayOutputStream>>(
                numNodes);
        // node -> hidden value for the record currently being processed; cleared after every record
        final Map<GridNode, MutableInt> mappedNodes = new HashMap<GridNode, MutableInt>(numNodes);
        for (int i = 0; i < totalRecords; i++) {
            String line = lines[i];
            lines[i] = null; // drop the reference held by the input array
            final byte[] lineBytes = line.getBytes(charset);
            {
                CsvUtils.retrieveFields(line, pkeyIndicies, fieldList, filedSeparator, quoteChar);
                fieldList.trimToZero();
                String pkeysField = combineFields(fields, pkeyIndicies.length, strBuf);
                // "primary" fragment mapping
                final byte[] distkey = StringUtils.getBytes(pkeysField);
                mapPrimaryFragment(distkey, mappedNodes, tablePartitionNo, router);
                if (hasParentTable) {
                    // "derived by parent" fragment mapping
                    for (int kk = 0; kk < numParentForeignKeys; kk++) {
                        String idxName = parentTableFkIndexNames[kk];
                        mapDerivedByParentFragment(distkey, mappedNodes, index, idxName);
                    }
                }
            }
            if (numForeignKeys > 0) {
                // "derived by child" fragment mapping
                for (int jj = 0; jj < numForeignKeys; jj++) {
                    int[] pos = fkPositions[jj];
                    CsvUtils.retrieveFields(line, pos, fieldList, filedSeparator, quoteChar);
                    fieldList.trimToZero();
                    String fkeysField = combineFields(fields, pos.length, strBuf);
                    byte[] distkey = StringUtils.getBytes(fkeysField);
                    fkMappedNodes[jj] = mapDerivedByChildFragment(distkey, mappedNodes, childTablesPartitionNo[jj],
                            router);
                    fkeysFields[jj] = fkeysField;
                    distkeys[jj] = distkey;
                }
                // store information for derived fragment mapping
                for (int kk = 0; kk < numForeignKeys; kk++) {
                    final String fkIdxName = fkIdxNames[kk];
                    final String fkeysField = fkeysFields[kk];
                    final byte[] distkey = distkeys[kk];
                    final GridNode fkMappedNode = fkMappedNodes[kk];
                    final LRUMap<String, List<NodeWithPartitionNo>> fkCache = fkCaches[kk];
                    // entries already written to the index for this key, as far as the cache remembers
                    List<NodeWithPartitionNo> storedNodeInfo = fkCache.get(fkeysField);
                    for (final Map.Entry<GridNode, MutableInt> e : mappedNodes.entrySet()) {
                        final GridNode node = e.getKey();
                        if (node.equals(fkMappedNode)) {
                            continue; // no need to map
                        }
                        int hiddenValue = e.getValue().intValue();
                        NodeWithPartitionNo nodeInfo = new NodeWithPartitionNo(node, hiddenValue);
                        if (storedNodeInfo == null) {
                            storedNodeInfo = new ArrayList<NodeWithPartitionNo>(8);
                            fkCache.put(fkeysField, storedNodeInfo);
                        } else if (storedNodeInfo.contains(nodeInfo)) {// Note that node has unique hiddenValue to persist
                            continue;
                        }
                        storedNodeInfo.add(nodeInfo);
                        byte[] value = serialize(node, hiddenValue);
                        storeDerivedFragmentationInfo(distkey, value, index, fkIdxName);
                    }
                }
            }
            if (mappedNodes.isEmpty()) {
                throw new IllegalStateException("Could not map records for table: '" + tableName + '\'');
            }
            // bind a record to nodes
            mapRecord(lineBytes, totalRecords, numNodes, nodeAssignMap, mappedNodes, filedSeparator);
            mappedNodes.clear();
        }

        final Map<GridTask, GridNode> taskmap = new IdentityHashMap<GridTask, GridNode>(numNodes);
        final Map<GridNode, MutableInt> recordCounts = new HashMap<GridNode, MutableInt>(numNodes);
        for (final Map.Entry<GridNode, Pair<MutableInt, FastByteArrayOutputStream>> e : nodeAssignMap.entrySet()) {
            GridNode node = e.getKey();
            Pair<MutableInt, FastByteArrayOutputStream> pair = e.getValue();
            MutableInt numRecords = pair.first;
            recordCounts.put(node, numRecords);
            FastByteArrayOutputStream rows = pair.second;
            byte[] b = rows.toByteArray();
            pair.clear(); // the rows now live in 'b' only
            GridTask task = new FileAppendTask(this, csvFileName, b, append, true);
            taskmap.put(task, node);
        }

        // report a zero count for nodes that received nothing
        for (final GridNode node : router.getAllNodes()) {
            if (!recordCounts.containsKey(node)) {
                recordCounts.put(node, new MutableInt(0));
            }
        }
        this.assignedRecMap = recordCounts;
        return taskmap;
    }

    /**
     * Routes a record by its primary-key bytes and registers the selected node, with
     * {@code tablePartitionNo} as its hidden value, in {@code mappedNodes}.
     *
     * @param distkey bytes of the combined primary-key fields
     * @param mappedNodes per-record node map; expected to be empty on entry
     * @param tablePartitionNo partition number of the table being loaded
     * @param router selects the node for {@code distkey}
     * @return the selected node
     * @throws GridException if the router returns no node
     */
    private static GridNode mapPrimaryFragment(final byte[] distkey, final Map<GridNode, MutableInt> mappedNodes,
            final int tablePartitionNo, final GridTaskRouter router) throws GridException {
        assert (mappedNodes.isEmpty());
        final GridNode target = router.selectNode(distkey);
        if (target != null) {
            mappedNodes.put(target, new MutableInt(tablePartitionNo));
            return target;
        }
        throw new GridException("Could not find any node in cluster.");
    }

    /**
     * Routes a record by the bytes of one foreign key and merges {@code tablePartitionNo}
     * into the hidden value of the selected node.
     *
     * @param distkey bytes of the combined foreign-key fields
     * @param mappedNodes per-record node map, updated in place
     * @param tablePartitionNo partition number OR-ed into (or stored as) the node's hidden value
     * @param router selects the node for {@code distkey}
     * @return the selected node
     * @throws GridException if the router returns no node
     */
    private static GridNode mapDerivedByChildFragment(final byte[] distkey,
            final Map<GridNode, MutableInt> mappedNodes, final int tablePartitionNo, final GridTaskRouter router)
            throws GridException {
        final GridNode target = router.selectNode(distkey);
        if (target == null) {
            throw new GridException("Could not find any node in cluster.");
        }
        final MutableInt current = mappedNodes.get(target);
        if (current != null) {
            // node already selected for this record: accumulate the partition bits
            current.setValue(current.intValue() | tablePartitionNo);
            return target;
        }
        mappedNodes.put(target, new MutableInt(tablePartitionNo));
        return target;
    }

    /**
     * Looks {@code distkey} up in a parent table's foreign-key index and merges every
     * (node, partition number) entry found there into {@code mappedNodes}.
     *
     * @param distkey bytes of the combined primary-key fields of the current record
     * @param mappedNodes per-record node map, updated in place by the search callback
     * @param index directory holding the foreign-key indexes
     * @param parentTableFkIndex name of the index to search
     * @throws GridException wrapping any {@code DbException} raised by the search
     */
    private static void mapDerivedByParentFragment(final byte[] distkey,
            final Map<GridNode, MutableInt> mappedNodes, final ILocalDirectory index,
            final String parentTableFkIndex) throws GridException {
        final BTreeCallback collector = new BTreeCallback() {
            public boolean indexInfo(Value key, byte[] value) {
                // value layout: partition number followed by the serialized node
                final int partitionNo = deserializePartitionNo(value);
                final GridNode node = deserializeGridNode(value);

                final MutableInt current = mappedNodes.get(node);
                if (current == null) {
                    mappedNodes.put(node, new MutableInt(partitionNo));
                    return true;
                }
                final int bits = current.intValue();
                if (bits != partitionNo) {
                    current.setValue(bits | partitionNo);
                }
                return true;
            }

            public boolean indexInfo(Value value, long pointer) {
                throw new UnsupportedOperationException();
            }
        };
        try {
            index.exactSearch(parentTableFkIndex, distkey, collector);
        } catch (DbException dbe) {
            throw new GridException(dbe);
        }
    }

    /**
     * Appends one record to the row buffer of every node in {@code mappedNodes} and bumps
     * that node's record counter.
     * <p>
     * Each emitted row is the raw line bytes, followed by the decimal digits of the node's
     * hidden value, the field separator and a {@code '\n'}.
     * The first time a node is seen, its buffer is pre-sized from an estimate of
     * {@code line.length * (totalRecords / numNodes) * 1.3}, capped at 200MB.
     *
     * @param line encoded bytes of the record
     * @param totalRecords number of records in the chunk, used only for the size estimate
     * @param numNodes number of grid nodes, used only for the size estimate
     * @param nodeAssignMap node to (record count, rows) accumulator, updated in place
     * @param mappedNodes nodes the record is bound to, with their hidden values
     * @param filedSeparator field separator written after the hidden value
     * @throws IllegalStateException if a node's hidden value is 0
     */
    private static void mapRecord(final byte[] line, final int totalRecords, final int numNodes,
            final Map<GridNode, Pair<MutableInt, FastByteArrayOutputStream>> nodeAssignMap,
            final Map<GridNode, MutableInt> mappedNodes, final char filedSeparator) {
        final int maxBufSize = 209715200; // max 200MB
        final int lineSize = line.length;
        for (Map.Entry<GridNode, MutableInt> e : mappedNodes.entrySet()) {
            final GridNode node = e.getKey();
            final int hiddenValue = e.getValue().intValue();
            // validate first so that a bad value never leaves a half-written row or a bumped counter
            if (hiddenValue == 0) {
                throw new IllegalStateException("Illegal hidden value was detected");
            }

            final FastByteArrayOutputStream rowsBuf;
            final Pair<MutableInt, FastByteArrayOutputStream> pair = nodeAssignMap.get(node);
            if (pair == null) {
                // long arithmetic: lineSize * recordsPerNode can exceed the int range;
                // Math.max guards against a zero node count
                final long recordsPerNode = totalRecords / Math.max(1, numNodes);
                final long expected = (long) ((lineSize * recordsPerNode) * 1.3d);
                if (expected > maxBufSize) {
                    LOG.warn("Expected record buffer for shuffling is too large: " + expected + " bytes");
                }
                // never start below the room needed for this row (line + digits + separator + newline)
                final long floor = lineSize + 16L;
                final int initialSize = (int) Math.min(Math.max(expected, floor), maxBufSize);
                rowsBuf = new FastByteArrayOutputStream(initialSize);
                Pair<MutableInt, FastByteArrayOutputStream> newPair = new Pair<MutableInt, FastByteArrayOutputStream>(
                        new MutableInt(1), rowsBuf);
                nodeAssignMap.put(node, newPair);
            } else {
                rowsBuf = pair.second;
                MutableInt cnt = pair.first;
                cnt.increment();
            }
            rowsBuf.write(line, 0, lineSize);
            final String str = Integer.toString(hiddenValue);
            final int strlen = str.length();
            for (int i = 0; i < strlen; i++) {
                char c = str.charAt(i);
                rowsBuf.write(c);
            }
            rowsBuf.write(filedSeparator); // REVIEWME this is monetdb workaround
            rowsBuf.write('\n'); // TODO FIXME support other record separator 
        }
    }

    /**
     * Adds a {@code distkey -> nodeWithPartitionNo} mapping to the named index.
     *
     * @param distkey key bytes
     * @param nodeWithPartitionNo value bytes as produced by {@code serialize}
     * @param index directory that owns the index
     * @param idxName name of the index to write to
     * @throws GridException wrapping any {@code DbException} raised by the directory
     */
    private static void storeDerivedFragmentationInfo(final byte[] distkey, final byte[] nodeWithPartitionNo,
            final ILocalDirectory index, final String idxName) throws GridException {
        try {
            index.addMapping(idxName, distkey, nodeWithPartitionNo);
        } catch (DbException dbError) {
            throw new GridException(dbError);
        }
    }

    /**
     * Returns the per-node record counts stored by map().
     *
     * @return the map assigned at the end of map(); {@code null} if map() has not stored one
     */
    public Map<GridNode, MutableInt> reduce() throws GridException {
        return assignedRecMap;
    }

    /**
     * Returns the largest number of columns found in the primary key or in any foreign key.
     *
     * @param pkey primary key whose column count is the starting value
     * @param fkeys foreign keys to compare against
     * @return the maximum column count
     */
    private static int getMaxColumnCount(@Nonnull final PrimaryKey pkey,
            @Nonnull final Collection<ForeignKey> fkeys) {
        int widest = pkey.getColumnNames().size();
        for (final ForeignKey fkey : fkeys) {
            final int columns = fkey.getColumnNames().size();
            if (columns > widest) {
                widest = columns;
            }
        }
        return widest;
    }

    /**
     * Joins the first {@code numFields} entries of {@code fields} with {@code '|'}.
     * A single field is returned as is, without touching {@code buf}.
     *
     * @param fields source values
     * @param numFields number of leading entries to join; must be at least 1
     * @param buf scratch buffer, cleared before use
     * @return the joined string
     * @throws IllegalArgumentException if {@code numFields} is less than 1
     */
    private static String combineFields(@Nonnull final String[] fields, final int numFields,
            @Nonnull final StringBuilder buf) {
        if (numFields < 1) {
            throw new IllegalArgumentException("Illegal numField: " + numFields);
        }
        if (numFields == 1) {
            return fields[0];
        }
        StringUtils.clear(buf);
        for (int i = 0; i < numFields; i++) {
            if (i > 0) {
                buf.append('|');
            }
            buf.append(fields[i]);
        }
        return buf.toString();
    }

    /**
     * Builds the index name of each foreign key, in iteration order.
     *
     * @param fkeys foreign keys to name
     * @param numFkeys number of keys to take from {@code fkeys}
     * @return one index name per foreign key
     */
    private static String[] getFkIndexNames(final Collection<ForeignKey> fkeys, final int numFkeys) {
        final String[] names = new String[numFkeys];
        final Iterator<ForeignKey> keys = fkeys.iterator();
        int slot = 0;
        while (slot < numFkeys) {
            final ForeignKey fk = keys.next();
            names[slot++] = getIndexName(fk.getFkTableName(), fk.getFkColumnNames());
        }
        return names;
    }

    /**
     * Collects the column positions of each foreign key, in iteration order.
     *
     * @param fkeys foreign keys to inspect
     * @param numFkeys number of keys to take from {@code fkeys}
     * @return one position array per foreign key
     */
    private static int[][] getFkPositions(final Collection<ForeignKey> fkeys, final int numFkeys) {
        final int[][] result = new int[numFkeys][];
        final Iterator<ForeignKey> keys = fkeys.iterator();
        int slot = 0;
        while (slot < numFkeys) {
            result[slot++] = keys.next().getFkColumnPositions(true);
        }
        return result;
    }

    /**
     * Looks up, for each foreign key, the partition number the catalog holds for the table
     * named by {@code getPkTableName()}.
     *
     * @param fkeys foreign keys to inspect
     * @param numFkeys number of keys to take from {@code fkeys}
     * @param catalog source of the partition numbers
     * @return one partition number per foreign key, in iteration order
     */
    private static int[] getChildTablesPartitionNo(final Collection<ForeignKey> fkeys, final int numFkeys,
            final DistributionCatalog catalog) {
        final int[] numbers = new int[numFkeys];
        final Iterator<ForeignKey> keys = fkeys.iterator();
        int slot = 0;
        while (slot < numFkeys) {
            final String referenced = keys.next().getPkTableName();
            numbers[slot++] = catalog.getTablePartitionNo(referenced, true);
        }
        return numbers;
    }

    /**
     * Creates one LRU cache (constructed with 1000) per foreign key.
     *
     * @param numFks number of foreign keys
     * @return the caches, or {@code null} when {@code numFks} is 0
     */
    @SuppressWarnings("unchecked")
    @Nullable
    private static LRUMap<String, List<NodeWithPartitionNo>>[] getFkIndexCaches(final int numFks) {
        if (numFks == 0) {
            return null;
        }
        // generic array creation is not allowed, hence the raw array and the suppressed warning
        final LRUMap<String, List<NodeWithPartitionNo>>[] perKey = new LRUMap[numFks];
        int slot = 0;
        while (slot < numFks) {
            perKey[slot] = new LRUMap<String, List<NodeWithPartitionNo>>(1000);
            slot++;
        }
        return perKey;
    }

    /**
     * Builds the index names for the exported keys of a primary key.
     *
     * @param childTablePkey primary key whose exported keys are inspected
     * @return one index name per exported key, or {@code null} when there are none
     */
    @Nullable
    private static String[] getParentTableFkIndexNames(final PrimaryKey childTablePkey) {
        final List<ForeignKey> exported = childTablePkey.getExportedKeys();
        if (exported == null) {
            return null;
        }
        final int count = exported.size();
        if (count == 0) {
            return null;
        }
        final String[] names = new String[count];
        for (int i = 0; i < count; i++) {
            final ForeignKey fk = exported.get(i);
            names[i] = getIndexName(fk.getFkTableName(), fk.getFkColumnNames());
        }
        return names;
    }

    /**
     * Builds an index name of the form {@code <table>.<col1>_<col2>....fktbl}.
     *
     * @param tableName table part of the name
     * @param columnNames columns joined with {@code '_'}; must not be empty
     * @return the index name
     * @throws IllegalArgumentException if {@code columnNames} is empty
     */
    private static String getIndexName(final String tableName, final List<String> columnNames) {
        final int numColumns = columnNames.size();
        if (numColumns == 0) {
            throw new IllegalArgumentException("No columns was specified for table: " + tableName);
        }
        final StringBuilder name = new StringBuilder(32);
        name.append(tableName).append('.').append(columnNames.get(0));
        for (int i = 1; i < numColumns; i++) {
            name.append('_').append(columnNames.get(i));
        }
        return name.append(".fktbl").toString();
    }

    /**
     * Encodes a node and a partition number as a 2-byte partition number followed by the
     * bytes of {@code node.toBytes()}.
     *
     * @param node node to encode
     * @param partitionNo value in the range 1..{@code Short.MAX_VALUE}
     * @return the encoded bytes
     * @throws IllegalArgumentException if {@code partitionNo} is out of range
     */
    private static byte[] serialize(final GridNode node, final int partitionNo) {
        final boolean inRange = partitionNo >= 1 && partitionNo <= Short.MAX_VALUE;
        if (!inRange) {
            throw new IllegalArgumentException("Illeal PartitionNo: " + partitionNo);
        }
        final byte[] nodeBytes = node.toBytes();
        final byte[] encoded = new byte[2 + nodeBytes.length];
        Primitives.putShort(encoded, 0, (short) partitionNo);
        System.arraycopy(nodeBytes, 0, encoded, 2, nodeBytes.length);
        return encoded;
    }

    /**
     * Reads the partition number stored at offset 0 of a value written by {@code serialize}.
     *
     * @param b encoded value
     * @return the partition number, in the range 1..{@code Short.MAX_VALUE}
     * @throws IllegalArgumentException if the decoded number is out of range
     */
    private static int deserializePartitionNo(final byte[] b) {
        final int decoded = Primitives.getShortAsInt(b, 0);
        final boolean inRange = decoded >= 1 && decoded <= Short.MAX_VALUE;
        if (!inRange) {
            throw new IllegalArgumentException("Illeal PartitionNo: " + decoded);
        }
        return decoded;
    }

    /**
     * Rebuilds the node stored after the 2-byte prefix of a value written by {@code serialize}.
     *
     * @param b encoded value
     * @return the node decoded by {@code GridNodeInfo.fromBytes}
     */
    private static GridNode deserializeGridNode(final byte[] b) {
        final int payloadLength = b.length - 2; // skip the partition number
        assert (payloadLength > 0) : payloadLength;
        final byte[] payload = new byte[payloadLength];
        System.arraycopy(b, 2, payload, 0, payloadLength);
        return GridNodeInfo.fromBytes(payload);
    }

    /**
     * Forwards {@code cacheSize} to {@code index.setCacheSize} for the named index.
     *
     * @param index directory that owns the index
     * @param idxName name of the index to configure
     * @param cacheSize value passed through unchanged
     */
    private static void setFkIndexCacheSizes(final ILocalDirectory index, final String idxName,
            final int cacheSize) {
        index.setCacheSize(idxName, cacheSize);
    }

    /**
     * Immutable (node, partition number) pair; two instances are equal when both parts are equal.
     */
    private static final class NodeWithPartitionNo {

        final GridNode node;
        final int partitionNo;

        NodeWithPartitionNo(@Nonnull GridNode node, int partitionNo) {
            assert (node != null);
            this.node = node;
            this.partitionNo = partitionNo;
        }

        @Override
        public int hashCode() {
            // same value as the usual 31-based accumulation starting from 1
            final int nodePart = 31 + node.hashCode();
            return 31 * nodePart + partitionNo;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            final NodeWithPartitionNo that = (NodeWithPartitionNo) obj;
            return partitionNo == that.partitionNo && node.equals(that.node);
        }

        @Override
        public String toString() {
            final StringBuilder text = new StringBuilder(node.toString());
            text.append(" [partitionNo=").append(partitionNo).append("]");
            return text.toString();
        }
    }
}