gridool.db.partitioning.phihash.csv.grace.CsvGraceHashPartitioningTask.java Source code

Introduction

Here is the source code for gridool.db.partitioning.phihash.csv.grace.CsvGraceHashPartitioningTask.java, a gridool grid task that parses a local CSV file and shuffles its records to grid nodes using grace hash partitioning.

Source

/*
 * @(#)$Id$
 *
 * Copyright 2006-2008 Makoto YUI
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * Contributors:
 *     Makoto YUI - initial implementation
 */
package gridool.db.partitioning.phihash.csv.grace;

import gridool.GridException;
import gridool.GridJob;
import gridool.GridJobFuture;
import gridool.GridKernel;
import gridool.GridNode;
import gridool.GridResourceRegistry;
import gridool.Settings;
import gridool.annotation.GridKernelResource;
import gridool.annotation.GridRegistryResource;
import gridool.construct.GridTaskAdapter;
import gridool.db.helpers.DBAccessor;
import gridool.db.helpers.ForeignKey;
import gridool.db.helpers.GridDbUtils;
import gridool.db.helpers.PrimaryKey;
import gridool.db.partitioning.phihash.DBPartitioningJobConf;
import gridool.db.partitioning.phihash.PartitioningJobType;
import gridool.db.partitioning.phihash.csv.PartitioningJobConf;
import gridool.util.GridUtils;
import gridool.util.collections.ArrayQueue;
import gridool.util.collections.BoundedArrayQueue;
import gridool.util.collections.FixedArrayList;
import gridool.util.collections.ObservableLRUMap;
import gridool.util.concurrent.DirectExecutorService;
import gridool.util.concurrent.ExecutorFactory;
import gridool.util.concurrent.ExecutorUtils;
import gridool.util.csv.CsvUtils;
import gridool.util.csv.CsvReader;
import gridool.util.csv.SimpleCsvReader;
import gridool.util.datetime.TextLongProgressBar;
import gridool.util.datetime.TextProgressBar;
import gridool.util.hashes.FNVHash;
import gridool.util.hashes.HashUtils;
import gridool.util.io.FastBufferedInputStream;
import gridool.util.io.FastBufferedOutputStream;
import gridool.util.io.FileDeletionThread;
import gridool.util.io.FileUtils;
import gridool.util.io.IOUtils;
import gridool.util.primitive.MutableInt;
import gridool.util.primitive.MutableLong;
import gridool.util.primitive.Primitives;
import gridool.util.string.StringUtils;
import gridool.util.struct.Pair;
import gridool.util.system.SystemUtils;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.ThreadPoolExecutor;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * <DIV lang="en">A task that parses a local CSV file and shuffles its records
 * to the responsible grid nodes, using two-pass grace hash partitioning when
 * the base table has a parent table and a single pass otherwise.</DIV>
 * <DIV lang="ja"></DIV>
 * 
 * @author Makoto YUI (yuin405@gmail.com)
 */
public class CsvGraceHashPartitioningTask extends GridTaskAdapter {
    private static final long serialVersionUID = -4477383489963213348L;
    private static final Log LOG = LogFactory.getLog(CsvGraceHashPartitioningTask.class);

    private static final int csvInputBufSize;
    private static final int DEFAULT_SHUFFLE_UNITS;
    private static final int DEFAULT_SHUFFLE_THREADS;
    static {
        csvInputBufSize = Primitives.parseInt(Settings.get("gridool.db.partitioning.csv_reader.bufsize"),
                32 * 1024);
        DEFAULT_SHUFFLE_UNITS = Primitives.parseInt(Settings.get("gridool.db.partitioning.shuffle_units"), 20000);
        int defaultNumThread = Math.max(2, SystemUtils.availableProcessors() - 1);
        DEFAULT_SHUFFLE_THREADS = Primitives.parseInt(Settings.get("gridool.db.partitioning.shuffle_threads"),
                defaultNumThread);
    }

    @Nonnull
    protected final DBPartitioningJobConf jobConf;

    private int shuffleUnits = DEFAULT_SHUFFLE_UNITS; // e.g., 200 bytes/line * 100 nodes * 20,000 units * 4 threads = 1600MB
    private int shuffleThreads = DEFAULT_SHUFFLE_THREADS;

    // ------------------------
    // injected resources

    @GridKernelResource
    protected transient GridKernel kernel;

    @GridRegistryResource
    private transient GridResourceRegistry registry;

    // ------------------------
    // working resources

    private transient ExecutorService shuffleExecPool;
    private transient BoundedArrayQueue<String> shuffleSink;

    protected transient String csvFileName;
    private transient boolean isFirstShuffle = true;
    private transient Pair<PrimaryKey, Collection<ForeignKey>> primaryForeignKeys;

    protected transient HashMap<GridNode, MutableLong> assignMap;
    @Nullable
    private transient Map<String, OutputStream> outputMap;

    @SuppressWarnings("unchecked")
    public CsvGraceHashPartitioningTask(GridJob job, DBPartitioningJobConf jobConf) {
        super(job, false);
        this.jobConf = jobConf;
    }

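    /**
     * Returning true requests annotation-driven resource injection, which
     * populates the annotated kernel and registry fields.
     */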
    @Override
    public boolean injectResources() {
        return true;
    }

    public int shuffleUnits() {
        return shuffleUnits;
    }

    public void setShuffleUnits(int shuffleUnits) {
        this.shuffleUnits = shuffleUnits;
    }

    public int shuffleThreads() {
        return shuffleThreads;
    }

    public void setShuffleThreads(int shuffleThreads) {
        this.shuffleThreads = shuffleThreads;
    }

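    /**
     * Sets up the shuffle thread pool and record sink, looks up the primary and
     * foreign keys of the base table, and then parses and shuffles the CSV input:
     * two passes (grace hash partitioning) when the table is involved in a
     * parent-child relationship, a single pass otherwise. Returns the number of
     * records assigned to each grid node.
     */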
    @Override
    protected HashMap<GridNode, MutableLong> execute() throws GridException {
        int numShuffleThreads = shuffleThreads();
        this.shuffleExecPool = (numShuffleThreads <= 0) ? new DirectExecutorService()
                : ExecutorFactory.newBoundedWorkQueueFixedThreadPool(numShuffleThreads, "Gridool#Shuffle", true,
                        new ThreadPoolExecutor.CallerRunsPolicy());
        this.shuffleSink = new BoundedArrayQueue<String>(shuffleUnits());
        this.csvFileName = generateCsvFileName();
        this.assignMap = new HashMap<GridNode, MutableLong>(64);

        // look up the primary key and foreign keys of the table being partitioned
        DBAccessor dba = registry.getDbAccessor();
        String templateTableName = jobConf.getBaseTableName();
        this.primaryForeignKeys = GridDbUtils.getPrimaryForeignKeys(dba, templateTableName);

        ObservableLRUMap.Cleaner<String, OutputStream> cleaner = new ObservableLRUMap.Cleaner<String, OutputStream>() {
            public void cleanup(String key, OutputStream out) {
                try {
                    out.flush();
                } catch (IOException e) {
                    throw new IllegalStateException("failed to flush: " + key, e);
                }
                IOUtils.closeQuietly(out);
            }
        };
        // the cache must stay small because gzip output streams consume non-heap memory
        final Map<String, OutputStream> outputMap = new ObservableLRUMap<String, OutputStream>(128, cleaner);
        this.outputMap = outputMap;

        final int numShuffled;
        if (GridDbUtils.hasParentTable(primaryForeignKeys.getFirst())) {
            numShuffled = twoPassParseAndShuffle();
        } else {
            numShuffled = onePassParseAndShuffle();
        }
        postShuffle(numShuffled);

        for (OutputStream os : outputMap.values()) {
            try {
                os.flush();
            } catch (IOException ioe) {
                throw new GridException(ioe);
            }
            IOUtils.closeQuietly(os);
        }
        this.outputMap = null;

        return assignMap;
    }

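    /**
     * Derives a file name that is unique per table and sender node by appending
     * the sender's physical host address to the table name.
     */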
    private String generateCsvFileName() {
        String tblName = jobConf.getTableName();
        GridNode senderNode = getSenderNode();
        String addr = senderNode.getPhysicalAdress().getHostAddress();
        return tblName + addr + ".csv";
    }

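    /**
     * Single-pass variant: reads the input CSV once, shuffling every record
     * directly and reporting progress in (approximate) bytes consumed.
     */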
    private int onePassParseAndShuffle() throws GridException {
        String csvfile = jobConf.getCsvFilePath();
        char fieldSeparator = jobConf.getFieldSeparator();
        char quoteChar = jobConf.getStringQuote();

        final long filesize = FileUtils.getFileSize(new File(csvfile));
        final PartitioningLongProgressBar progress = new PartitioningLongProgressBar(
                "[Partitioning] progress of CSV parsing and shuffling", filesize);

        final CsvReader reader = getCsvReader(csvfile, fieldSeparator, quoteChar);
        int numShuffled = 0;
        try {
            String line;
            while ((line = reader.getNextLine()) != null) {
                shuffle(line);
                numShuffled++;
                long bytes = line.length() + 1; // approximate byte count: assumes 1-byte chars plus a newline
                progress.inc(bytes);
            }
        } catch (IOException e) {
            LOG.error(e);
            throw new GridException(e);
        } finally {
            IOUtils.closeQuietly(reader);
        }
        progress.finish();
        return numShuffled;
    }

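    /**
     * Grace hash partitioning in two passes. Pass #1 splits the input CSV into
     * 2^k bucket files keyed by a hash of the primary-key fields; pass #2 reads
     * each bucket file in turn, shuffles its records, and schedules the chunk
     * file for deletion. The bucket count must be a power of two so that bucket
     * assignment reduces to cheap bit operations.
     */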
    private int twoPassParseAndShuffle() throws GridException {
        final int numBuckets = jobConf.getNumberOfBuckets();
        if (numBuckets <= 0) {
            throw new GridException("Illegal number of buckets for grace hash partitioning: " + numBuckets);
        }
        if (!HashUtils.isPowerOfTwo(numBuckets)) {
            throw new GridException("Number of buckets is not a power of two: " + numBuckets);
        }
        // #1 divide the input CSV file
        int totalShuffled = divideInputCsvFile(primaryForeignKeys, numBuckets);

        String csvfile = jobConf.getCsvFilePath();
        final String csvFileName = FileUtils.basename(csvfile);
        final char fieldSeparator = jobConf.getFieldSeparator();
        final char quoteChar = jobConf.getStringQuote();

        // #2 run partitioning
        final PartitioningProgressBar progress = new PartitioningProgressBar(
                "[Partitioning] Phase #2: progress of CSV parsing and shuffling", totalShuffled);
        int numShuffled = 0;
        for (int bucketNo = 0; bucketNo < numBuckets; bucketNo++) { // for each bucket
            File workdir = GridUtils.getWorkDir(false);
            File chunkfile = new File(workdir, csvFileName + "." + bucketNo);
            if (!chunkfile.exists()) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("CSV chunk file is not found: " + chunkfile.getAbsolutePath());
                }
                continue;
            }
            final CsvReader reader = getCsvReader(chunkfile.getAbsolutePath(), fieldSeparator, quoteChar);
            try {
                String line;
                while ((line = reader.getNextLine()) != null) {
                    shuffle(line, bucketNo);
                    numShuffled++;
                    progress.inc();
                }
            } catch (IOException e) {
                LOG.error(e);
                throw new GridException(e);
            } finally {
                IOUtils.closeQuietly(reader);
                new FileDeletionThread(chunkfile, LOG).start();
            }
            forceShuffle(bucketNo);
        }
        progress.finish();
        return numShuffled;
    }

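    /**
     * Pass #1 of grace hash partitioning: extracts the primary-key fields from
     * each line, maps their 32-bit FNV hash onto a bucket by XOR-folding, and
     * appends the raw line to that bucket's chunk file.
     */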
    private int divideInputCsvFile(final Pair<PrimaryKey, Collection<ForeignKey>> primaryForeignKeys,
            final int numBuckets) throws GridException {
        PrimaryKey primaryKey = primaryForeignKeys.getFirst();
        final int[] pkeyIndices = primaryKey.getColumnPositions(true);
        assert (pkeyIndices.length > 0);
        final String[] fields = new String[pkeyIndices.length];
        final FixedArrayList<String> fieldList = new FixedArrayList<String>(fields);

        // CSV control resources (field separator, quote character, output charset)
        final char fieldSeparator = jobConf.getFieldSeparator();
        final char quoteChar = jobConf.getStringQuote();
        final StringBuilder strBuf = new StringBuilder(64);
        final Charset charset = Charset.forName("UTF-8");

        final Map<String, OutputStream> outputMap = new HashMap<String, OutputStream>(numBuckets * 10);
        String csvfile = jobConf.getCsvFilePath();
        final String csvFileName = FileUtils.basename(csvfile);

        final long filesize = FileUtils.getFileSize(new File(csvfile));
        final PartitioningLongProgressBar progress = new PartitioningLongProgressBar(
                "[Partitioning] Phase #1: progress of CSV division", filesize);

        final int bucketShift = HashUtils.shiftsForNextPowerOfTwo(numBuckets);
        final CsvReader reader = getCsvReader(csvfile, fieldSeparator, quoteChar);
        int numShuffled = 0;
        try {
            String line;
            while ((line = reader.getNextLine()) != null) {
                CsvUtils.retrieveFields(line, pkeyIndices, fieldList, fieldSeparator, quoteChar);
                fieldList.trimToZero();
                String pkeysField = GridDbUtils.combineFields(fields, pkeyIndices.length, strBuf);
                // "primary" fragment mapping
                byte[] distkey = StringUtils.getBytes(pkeysField);
                int hashcode = FNVHash.hash32(distkey);
                int bucket = HashUtils.positiveXorFolding(hashcode, bucketShift);
                OutputStream out = prepareOutput(csvFileName, bucket, outputMap);
                byte[] lineBytes = line.getBytes(charset);
                out.write(lineBytes);
                out.write('\n');
                progress.inc(lineBytes.length + 1);
                numShuffled++;
            }
        } catch (IOException e) {
            LOG.error(e);
            throw new GridException(e);
        } finally {
            IOUtils.closeQuietly(reader);
            for (OutputStream os : outputMap.values()) {
                try {
                    os.flush();
                } catch (IOException ioe) {
                    throw new GridException(ioe);
                }
                IOUtils.closeQuietly(os);
            }
        }
        progress.finish();
        return numShuffled;
    }

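    /**
     * Lazily opens, and caches in outputMap, a buffered output stream for the
     * given bucket's chunk file under the gridool work directory.
     */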
    private static OutputStream prepareOutput(final String csvFileName, final int bucket,
            final Map<String, OutputStream> outputMap) {
        String fname = csvFileName + "." + bucket;
        OutputStream out = outputMap.get(fname);
        if (out == null) {
            File dir = GridUtils.getWorkDir(false);
            File file = new File(dir, fname);
            final FileOutputStream fos;
            try {
                fos = new FileOutputStream(file, false);
            } catch (FileNotFoundException fe) {
                throw new IllegalStateException("Failed to create FileOutputStream: " + file.getAbsolutePath(), fe);
            }
            out = new FastBufferedOutputStream(fos, 16384);
            outputMap.put(fname, out);
        }
        return out;
    }

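    /**
     * Buffers a record in the bounded shuffle sink; when the sink is full, the
     * whole batch is handed to invokeShuffle() and a fresh sink is allocated.
     */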
    private void shuffle(@Nonnull final String record) {
        shuffle(record, -1);
    }

    private void shuffle(@Nonnull final String record, final int bucket) {
        assert (shuffleSink != null);
        if (!shuffleSink.offer(record)) {
            invokeShuffle(shuffleExecPool, shuffleSink, bucket);
            this.shuffleSink = new BoundedArrayQueue<String>(shuffleUnits());
            shuffleSink.offer(record);
        }
    }

    private void forceShuffle(final int bucket) {
        if (!shuffleSink.isEmpty()) {
            invokeShuffle(shuffleExecPool, shuffleSink, bucket);
            this.shuffleSink = new BoundedArrayQueue<String>(shuffleUnits());
        }
    }

    protected void postShuffle(final int numShuffled) {
        if (!shuffleSink.isEmpty()) {
            invokeShuffle(shuffleExecPool, shuffleSink, -1);
        }
        ExecutorUtils.shutdownAndAwaitTermination(shuffleExecPool);
    }

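    /**
     * Dispatches one batch of buffered records as a partitioning job. The first
     * batch is flagged as such and executed synchronously on the caller thread;
     * subsequent batches run asynchronously on the shuffle pool.
     */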
    private final void invokeShuffle(@Nonnull final ExecutorService shuffleExecPool,
            @Nonnull final ArrayQueue<String> queue, final int bucket) {
        assert (kernel != null);
        final String[] lines = queue.toArray(String.class);
        final String fileName = csvFileName;
        if (isFirstShuffle) {
            PartitioningJobConf conf = new PartitioningJobConf(lines, fileName, true, primaryForeignKeys, jobConf,
                    bucket);
            runShuffleJob(kernel, conf, assignMap, outputMap, deploymentGroup);
            this.isFirstShuffle = false;
        } else {
            shuffleExecPool.execute(new Runnable() {
                public void run() {
                    PartitioningJobConf conf = new PartitioningJobConf(lines, fileName, false, primaryForeignKeys,
                            jobConf, bucket);
                    runShuffleJob(kernel, conf, assignMap, outputMap, deploymentGroup);
                }
            });
        }
    }

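    /**
     * Executes the configured partitioning job on the grid, waits for its
     * result, and merges the returned per-node record counts into recMap.
     */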
    private static void runShuffleJob(final GridKernel kernel, final PartitioningJobConf conf,
            final Map<GridNode, MutableLong> recMap, final Map<String, OutputStream> outputMap,
            final String deploymentGroup) {
        if (outputMap != null) {
            conf.setOutputMap(outputMap);
        }
        PartitioningJobType jobType = conf.getJobConf().getJobType();
        Class<? extends GridJob<PartitioningJobConf, Map<GridNode, MutableInt>>> jobClass = jobType
                .getFirstPartitioningJobClass();
        //final GridJobFuture<Map<GridNode, MutableInt>> future = kernel.execute(CsvGraceHashPartitioningJob.class, conf);
        final GridJobFuture<Map<GridNode, MutableInt>> future = kernel.execute(jobClass, conf);
        final Map<GridNode, MutableInt> map;
        try {
            map = future.get(); // wait for execution
        } catch (InterruptedException ie) {
            LOG.error(ie.getMessage(), ie);
            throw new IllegalStateException(ie);
        } catch (ExecutionException ee) {
            LOG.error(ee.getMessage(), ee);
            throw new IllegalStateException(ee);
        }
        synchronized (recMap) {
            for (final Map.Entry<GridNode, MutableInt> e : map.entrySet()) {
                GridNode node = e.getKey();
                MutableInt assigned = e.getValue();
                long v = assigned.longValue();
                MutableLong prev = recMap.get(node);
                if (prev == null) {
                    recMap.put(node, new MutableLong(v));
                } else {
                    prev.add(v);
                }
            }
        }
    }

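    /**
     * Opens a buffered UTF-8 reader over the given CSV file, wrapped in the
     * PushbackReader that SimpleCsvReader consumes.
     */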
    private static final CsvReader getCsvReader(final String csvPath, final char fieldSeparator,
            final char quoteChar) throws GridException {
        final Reader reader;
        try {
            FileInputStream fis = new FileInputStream(csvPath);
            FastBufferedInputStream bis = new FastBufferedInputStream(fis, csvInputBufSize);
            reader = new InputStreamReader(bis, "UTF-8");
        } catch (FileNotFoundException fne) {
            LOG.error(fne);
            throw new GridException("CSV file not found: " + csvPath, fne);
        } catch (UnsupportedEncodingException uee) {
            LOG.error(uee);
            throw new IllegalStateException(uee); // should never happen
        }
        PushbackReader pushback = new PushbackReader(reader);
        return new SimpleCsvReader(pushback, fieldSeparator, quoteChar);
    }

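    /** Progress bar (int steps) that reports through the task log roughly every 30 seconds. */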
    private static final class PartitioningProgressBar extends TextProgressBar {

        PartitioningProgressBar(String title, int totalSteps) {
            super(title, totalSteps);
            setRefreshTime(30000L);
            setRefreshFluctations(10);
        }

        @Override
        protected void show() {
            if (LOG.isInfoEnabled()) {
                LOG.info(getInfo());
            }
        }
    }

    private static final class PartitioningLongProgressBar extends TextLongProgressBar {

        PartitioningLongProgressBar(String title, long totalSteps) {
            super(title, totalSteps);
            setRefreshTime(30000L);
            setRefreshFluctations(10);
        }

        @Override
        protected void show() {
            if (LOG.isInfoEnabled()) {
                LOG.info(getInfo());
            }
        }
    }
}
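
For reference, here is a minimal usage sketch; it is not part of the original source. The task is normally created by a partitioning GridJob, with the kernel and registry injected by the grid at runtime, so the job and jobConf variables below are assumed to be available in such a context.

    // Hypothetical sketch: constructing and tuning the task inside a GridJob.
    // `job` (a GridJob) and `jobConf` (a DBPartitioningJobConf) are assumed to exist.
    CsvGraceHashPartitioningTask task = new CsvGraceHashPartitioningTask(job, jobConf);
    task.setShuffleUnits(40000); // number of records buffered per shuffle batch
    task.setShuffleThreads(4);   // a value <= 0 runs each shuffle on the caller thread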