Java tutorial
/* * @(#)$Id$ * * Copyright 2006-2008 Makoto YUI * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Contributors: * Makoto YUI - initial implementation */ package gridool.db.partitioning.phihash.csv.dist; import gridool.GridConfiguration; import gridool.GridException; import gridool.GridNode; import gridool.GridResourceRegistry; import gridool.GridTask; import gridool.Settings; import gridool.annotation.GridConfigResource; import gridool.annotation.GridRegistryResource; import gridool.construct.GridJobBase; import gridool.db.catalog.DistributionCatalog; import gridool.db.helpers.ForeignKey; import gridool.db.helpers.GridDbUtils; import gridool.db.helpers.PrimaryKey; import gridool.db.partitioning.phihash.DBPartitioningJobConf; import gridool.db.partitioning.phihash.NodeWithPartitionNo; import gridool.db.partitioning.phihash.csv.PartitioningJobConf; import gridool.dfs.GridXferClient; import gridool.dht.ILocalDirectory; import gridool.dht.btree.CallbackHandler; import gridool.dht.btree.IndexException; import gridool.dht.btree.Value; import gridool.routing.GridRouter; import gridool.util.collections.FixedArrayList; import gridool.util.collections.LRUMap; import gridool.util.concurrent.ExecutorFactory; import gridool.util.concurrent.ExecutorUtils; import gridool.util.csv.CsvUtils; import gridool.util.io.FastByteArrayOutputStream; import gridool.util.io.IOUtils; import gridool.util.lang.ArrayUtils; import gridool.util.primitive.MutableInt; import gridool.util.primitive.Primitives; import gridool.util.string.StringUtils; import gridool.util.struct.Pair; import java.io.Externalizable; import java.io.IOException; import java.io.ObjectInput; import java.io.ObjectOutput; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.IdentityHashMap; import java.util.List; import java.util.Map; import java.util.concurrent.ExecutorService; import javax.annotation.Nonnull; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * * <DIV lang="en"></DIV> * <DIV lang="ja"></DIV> * * @author Makoto YUI (yuin405@gmail.com) */ public final class LocalCsvHashPartitioningJob extends GridJobBase<Pair<PartitioningJobConf, GridRouter>, HashMap<GridNode, MutableInt>> { private static final long serialVersionUID = 149683992715077498L; private static final Log LOG = LogFactory.getLog(LocalCsvHashPartitioningJob.class); private static final int FK_INDEX_CACHE_SIZE = Primitives .parseInt(Settings.get("gridool.db.partitioning.fk_index_caches"), 8192); private transient HashMap<GridNode, MutableInt> assignedRecMap; @GridConfigResource private transient GridConfiguration config; @GridRegistryResource private transient GridResourceRegistry registry; public LocalCsvHashPartitioningJob() { } @Override public boolean injectResources() { return true; } @Override public boolean handleNodeFailure() { return false; } public Map<GridTask, GridNode> map(final GridRouter router, final Pair<PartitioningJobConf, GridRouter> args) throws GridException { assert (registry != null); final PartitioningJobConf ops = args.getFirst(); final String[] lines = ops.getLines(); final String csvFileName = ops.getFileName(); final DBPartitioningJobConf jobConf = ops.getJobConf(); final GridRouter origRouter = args.getSecond(); // partitioning resources final String tableName; final int tablePartitionNo; final PrimaryKey primaryKey; final Collection<ForeignKey> foreignKeys; final int[] pkeyIndicies; final String[] parentTableFkIndexNames; final int numParentForeignKeys; final boolean hasParentTable; final int numForeignKeys; final String[] fkIdxNames; final int[][] fkPositions; final int[] childTablesPartitionNo; final LRUMap<String, List<NodeWithPartitionNo>>[] fkCaches; { tableName = jobConf.getTableName(); DistributionCatalog catalog = registry.getDistributionCatalog(); tablePartitionNo = catalog.getTablePartitionNo(tableName, true); Pair<PrimaryKey, Collection<ForeignKey>> primaryForeignKeys = ops.getPrimaryForeignKeys(); primaryKey = primaryForeignKeys.getFirst(); foreignKeys = primaryForeignKeys.getSecond(); pkeyIndicies = primaryKey.getColumnPositions(true); parentTableFkIndexNames = GridDbUtils.getParentTableFkIndexNames(primaryKey); hasParentTable = (parentTableFkIndexNames != null); numParentForeignKeys = hasParentTable ? parentTableFkIndexNames.length : 0; numForeignKeys = foreignKeys.size(); assert (numParentForeignKeys != 0); fkIdxNames = GridDbUtils.getFkIndexNames(foreignKeys, numForeignKeys); fkPositions = GridDbUtils.getFkPositions(foreignKeys, numForeignKeys); childTablesPartitionNo = GridDbUtils.getChildTablesPartitionNo(foreignKeys, numForeignKeys, catalog); fkCaches = GridDbUtils.getFkIndexCaches(numForeignKeys); } // COPY INTO control resources final char filedSeparator = jobConf.getFieldSeparator(); final char quoteChar = jobConf.getStringQuote(); // working resources final ILocalDirectory index = registry.getDirectory(); for (String idxName : fkIdxNames) { configureFkIndex(index, idxName, FK_INDEX_CACHE_SIZE); } final GridNode localNode = config.getLocalNode(); final String[] fields = new String[GridDbUtils.getMaxColumnCount(primaryKey, foreignKeys)]; assert (fields.length > 0); final FixedArrayList<String> fieldList = new FixedArrayList<String>(fields); final Charset charset = Charset.forName("UTF-8"); final StringBuilder strBuf = new StringBuilder(64); final int totalRecords = lines.length; final String[] fkeysFields = new String[numForeignKeys]; final byte[][] distkeys = new byte[numForeignKeys][]; final GridNode[] fkMappedNodes = new GridNode[numForeignKeys]; final int numNodes = origRouter.getGridSize(); final Map<GridNode, Pair<MutableInt, FastByteArrayOutputStream>> nodeAssignMap = new HashMap<GridNode, Pair<MutableInt, FastByteArrayOutputStream>>( numNodes); final Map<GridNode, MutableInt> mappedNodes = new HashMap<GridNode, MutableInt>(numNodes); final Map<GridNode, List<DerivedFragmentInfo>> idxShippingMap = new HashMap<GridNode, List<DerivedFragmentInfo>>( numNodes); for (int i = 0; i < totalRecords; i++) { String line = lines[i]; lines[i] = null; // "primary" fragment mapping mapPrimaryFragment(mappedNodes, tablePartitionNo, localNode); if (hasParentTable) { // "derived by parent" fragment mapping CsvUtils.retrieveFields(line, pkeyIndicies, fieldList, filedSeparator, quoteChar); fieldList.trimToZero(); String pkeysField = GridDbUtils.combineFields(fields, pkeyIndicies.length, strBuf); final byte[] distkey = StringUtils.getBytes(pkeysField); for (int kk = 0; kk < numParentForeignKeys; kk++) { String idxName = parentTableFkIndexNames[kk]; mapDerivedByParentFragment(distkey, mappedNodes, index, idxName); } } if (numForeignKeys > 0) { // "derived by child" fragment mapping for (int jj = 0; jj < numForeignKeys; jj++) { int[] pos = fkPositions[jj]; CsvUtils.retrieveFields(line, pos, fieldList, filedSeparator, quoteChar); fieldList.trimToZero(); String fkeysField = GridDbUtils.combineFields(fields, pos.length, strBuf); byte[] distkey = StringUtils.getBytes(fkeysField); fkMappedNodes[jj] = mapDerivedByChildFragment(distkey, mappedNodes, childTablesPartitionNo[jj], origRouter); fkeysFields[jj] = fkeysField; distkeys[jj] = distkey; } // store information for derived fragment mapping for (int kk = 0; kk < numForeignKeys; kk++) { final String fkIdxName = fkIdxNames[kk]; final String fkeysField = fkeysFields[kk]; final byte[] distkey = distkeys[kk]; final GridNode fkMappedNode = fkMappedNodes[kk]; List<DerivedFragmentInfo> shipIdxList = idxShippingMap.get(fkMappedNode); if (shipIdxList == null) { shipIdxList = new ArrayList<DerivedFragmentInfo>(2000); idxShippingMap.put(fkMappedNode, shipIdxList); } final LRUMap<String, List<NodeWithPartitionNo>> fkCache = fkCaches[kk]; List<NodeWithPartitionNo> storedNodeInfo = fkCache.get(fkeysField); for (final Map.Entry<GridNode, MutableInt> e : mappedNodes.entrySet()) { final GridNode node = e.getKey(); //if(node.equals(fkMappedNode)) continue; // no need to map int hiddenValue = e.getValue().intValue(); NodeWithPartitionNo nodeInfo = new NodeWithPartitionNo(node, hiddenValue); if (storedNodeInfo == null) { storedNodeInfo = new ArrayList<NodeWithPartitionNo>(8); fkCache.put(fkeysField, storedNodeInfo); } else if (storedNodeInfo.contains(nodeInfo)) {// Note that node has unique hiddenValue to persist continue; } storedNodeInfo.add(nodeInfo); byte[] value = NodeWithPartitionNo.serialize(node, hiddenValue); // index shipping DerivedFragmentInfo fragInfo = new DerivedFragmentInfo(fkIdxName, distkey, value); shipIdxList.add(fragInfo); } } } if (mappedNodes.isEmpty()) { throw new IllegalStateException("Could not map records for table: '" + tableName + '\''); } // bind a record to nodes byte[] lineBytes = line.getBytes(charset); mapRecord(lineBytes, totalRecords, numNodes, nodeAssignMap, mappedNodes, filedSeparator); mappedNodes.clear(); } final int numTasks = idxShippingMap.size(); final Map<GridTask, GridNode> taskmap = new IdentityHashMap<GridTask, GridNode>(numTasks); for (final Map.Entry<GridNode, List<DerivedFragmentInfo>> e : idxShippingMap.entrySet()) { GridNode node = e.getKey(); List<DerivedFragmentInfo> storeList = e.getValue(); GridTask task = new GridBuildIndexTask(this, storeList); taskmap.put(task, node); } final ExecutorService sendExecs = ExecutorFactory.newFixedThreadPool(GridXferClient.SENDER_CONCURRENCY, "FileSender", true); final GridXferClient dfsClient = registry.getDfsService().getDFSClient(); final int recvPort = config.getFileReceiverPort(); final HashMap<GridNode, MutableInt> assignedRecMap = new HashMap<GridNode, MutableInt>(numNodes); for (final Map.Entry<GridNode, Pair<MutableInt, FastByteArrayOutputStream>> e : nodeAssignMap.entrySet()) { GridNode node = e.getKey(); GridNode trgNode = origRouter.resolve(node); Pair<MutableInt, FastByteArrayOutputStream> pair = e.getValue(); MutableInt numRecords = pair.first; assignedRecMap.put(trgNode, numRecords); FastByteArrayOutputStream rows = pair.second; pair.clear(); GridDbUtils.sendfile(sendExecs, dfsClient, csvFileName, rows, trgNode, recvPort); } for (final GridNode node : origRouter.getAllNodes()) { if (!assignedRecMap.containsKey(node)) { assignedRecMap.put(node, new MutableInt(0)); } } this.assignedRecMap = assignedRecMap; ExecutorUtils.shutdownAndAwaitTermination(sendExecs); return taskmap; } private static void mapPrimaryFragment(final Map<GridNode, MutableInt> mappedNodes, final int tablePartitionNo, final GridNode localNode) throws GridException { MutableInt newHidden = new MutableInt(tablePartitionNo); mappedNodes.put(localNode, newHidden); } private static GridNode mapDerivedByChildFragment(final byte[] distkey, final Map<GridNode, MutableInt> mappedNodes, final int tablePartitionNo, final GridRouter router) throws GridException { GridNode mappedNode = router.selectNode(distkey); if (mappedNode == null) { throw new GridException("Could not find any node in cluster."); } final MutableInt hiddenValue = mappedNodes.get(mappedNode); if (hiddenValue == null) { mappedNodes.put(mappedNode, new MutableInt(tablePartitionNo)); } else { int oldValue = hiddenValue.intValue(); int newValue = oldValue | tablePartitionNo; hiddenValue.setValue(newValue); } return mappedNode; } private static void mapDerivedByParentFragment(final byte[] distkey, final Map<GridNode, MutableInt> mappedNodes, final ILocalDirectory index, final String parentTableFkIndex) throws GridException { final CallbackHandler handler = new CallbackHandler() { public boolean indexInfo(Value key, byte[] value) { int partitionNo = NodeWithPartitionNo.deserializePartitionNo(value); GridNode node = NodeWithPartitionNo.deserializeGridNode(value); final MutableInt hiddenValue = mappedNodes.get(node); if (hiddenValue == null) { MutableInt newHidden = new MutableInt(partitionNo); mappedNodes.put(node, newHidden); } else { final int oldValue = hiddenValue.intValue(); if (oldValue != partitionNo) { int newValue = oldValue | partitionNo; hiddenValue.setValue(newValue); } } return true; } public boolean indexInfo(Value value, long pointer) { throw new UnsupportedOperationException(); } }; try { index.exactSearch(parentTableFkIndex, distkey, handler); } catch (IndexException e) { throw new GridException(e); } } private static void mapRecord(final byte[] line, final int totalRecords, final int numNodes, final Map<GridNode, Pair<MutableInt, FastByteArrayOutputStream>> nodeAssignMap, final Map<GridNode, MutableInt> mappedNodes, final char filedSeparator) { final int lineSize = line.length; for (final Map.Entry<GridNode, MutableInt> e : mappedNodes.entrySet()) { final GridNode node = e.getKey(); final int hiddenValue = e.getValue().intValue(); final FastByteArrayOutputStream rowsBuf; final Pair<MutableInt, FastByteArrayOutputStream> pair = nodeAssignMap.get(node); if (pair == null) { int expected = (int) ((lineSize * (totalRecords / numNodes)) * 1.3f); if (expected > 209715200) { LOG.warn("Expected record buffer for shuffling is too large: " + expected + " bytes"); } rowsBuf = new FastByteArrayOutputStream(Math.min(expected, 209715200)); //max 200MB Pair<MutableInt, FastByteArrayOutputStream> newPair = new Pair<MutableInt, FastByteArrayOutputStream>( new MutableInt(1), rowsBuf); nodeAssignMap.put(node, newPair); } else { rowsBuf = pair.second; MutableInt cnt = pair.first; cnt.increment(); } rowsBuf.write(line, 0, lineSize); if (hiddenValue == 0) { throw new IllegalStateException("Illegal hidden value was detected"); } final String str = Integer.toString(hiddenValue); final int strlen = str.length(); for (int i = 0; i < strlen; i++) { char c = str.charAt(i); rowsBuf.write(c); } rowsBuf.write(filedSeparator); // REVIEWME this is monetdb workaround rowsBuf.write('\n'); // TODO FIXME support other record separator } } public HashMap<GridNode, MutableInt> reduce() throws GridException { return assignedRecMap; } private static void configureFkIndex(final ILocalDirectory index, final String idxName, final int cacheSize) { index.setCacheSize(idxName, cacheSize); index.setBulkloading(true, idxName); } static final class DerivedFragmentInfo implements Externalizable, Comparable<DerivedFragmentInfo> { private static final long serialVersionUID = -4472952971698389386L; private/* final */String fkIdxName; private/* final */byte[] distkey; private/* final */byte[] value; public DerivedFragmentInfo() { } // Externalizable DerivedFragmentInfo(@Nonnull String fkIdxName, @Nonnull byte[] distkey, byte[] value) { this.fkIdxName = fkIdxName; this.distkey = distkey; this.value = value; } String getFkIdxName() { return fkIdxName; } byte[] getDistkey() { return distkey; } byte[] getValue() { return value; } public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException { this.fkIdxName = IOUtils.readString(in); this.distkey = IOUtils.readBytes(in); this.value = IOUtils.readBytes(in); } public void writeExternal(ObjectOutput out) throws IOException { IOUtils.writeString(fkIdxName, out); IOUtils.writeBytes(distkey, out); IOUtils.writeBytes(value, out); } static DerivedFragmentInfo readFrom(ObjectInput in) throws IOException, ClassNotFoundException { DerivedFragmentInfo info = new DerivedFragmentInfo(); info.readExternal(in); return info; } @Override public int compareTo(DerivedFragmentInfo other) { return ArrayUtils.compareTo(distkey, other.distkey); } } }