com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerLocal.java Source code

Introduction

Here is the source code for com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerLocal.java
Source

/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
*/

package com.ibm.bi.dml.runtime.controlprogram.parfor;

import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map.Entry;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.DMLUnsupportedOperationException;
import com.ibm.bi.dml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
import com.ibm.bi.dml.runtime.controlprogram.caching.MatrixObject;
import com.ibm.bi.dml.runtime.controlprogram.parfor.util.Cell;
import com.ibm.bi.dml.runtime.controlprogram.parfor.util.IDSequence;
import com.ibm.bi.dml.runtime.controlprogram.parfor.util.StagingFileUtils;
import com.ibm.bi.dml.runtime.io.MatrixReader;
import com.ibm.bi.dml.runtime.matrix.data.IJV;
import com.ibm.bi.dml.runtime.matrix.data.InputInfo;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.MatrixCell;
import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes;
import com.ibm.bi.dml.runtime.matrix.data.OutputInfo;
import com.ibm.bi.dml.runtime.matrix.data.SparseRowsIterator;
import com.ibm.bi.dml.runtime.util.FastStringTokenizer;
import com.ibm.bi.dml.runtime.util.LocalFileUtils;

/**
 * Partitions a given matrix into row or column partitions with a two pass-approach.
 * In the first phase the input matrix is read from HDFS and sorted into block partitions
 * in a staging area in the local file system according to the partition format. 
 * In order to allow for scalable partitioning, we process one block at a time. 
 * Furthermore, in the second phase, all blocks of a partition are append to a sequence file
 * on HDFS. Block-wise partitioning and write-once semantics of sequence files require the
 * indirection over the local staging area. For scalable computation, we process one 
 * sequence file at a time.
 *
 * NOTE: For the resulting partitioned matrix, we store block and cell indexes wrt partition boundaries.
 *       This means that the partitioned matrix CANNOT be read as a traditional matrix because there are
 *       for example multiple blocks with same index (while the actual index is encoded in the path).
 *       In order to enable full read of partition matrices, data converter would need to handle negative
 *       row/col offsets for partitioned read. Currently not done in order to avoid overhead from normal read
 *       and since partitioning only applied if exclusively indexed access.
 *
 *
 */
public class DataPartitionerLocal extends DataPartitioner {

    private static final boolean PARALLEL = true;

    private IDSequence _seq = null;
    private MatrixBlock _reuseBlk = null;

    private int _par = -1;

    /**
     * 
     * @param dpf
     * @param n
     * @param par -1 for serial otherwise number of threads, can be ignored by implementation
     * @throws DMLRuntimeException
     */
    public DataPartitionerLocal(PDataPartitionFormat dpf, int n, int par) throws DMLRuntimeException {
        super(dpf, n);

        //TODO
        if (dpf == PDataPartitionFormat.ROW_BLOCK_WISE_N || dpf == PDataPartitionFormat.COLUMN_BLOCK_WISE_N)
            throw new DMLRuntimeException(
                    "Data partitioning formt '" + dpf + "' not supported by DataPartitionerLocal");

        _seq = new IDSequence();
        _par = (par > 0) ? par : 1;
    }

    @Override
    protected void partitionMatrix(MatrixObject in, String fnameNew, InputInfo ii, OutputInfo oi, long rlen,
            long clen, int brlen, int bclen) throws DMLRuntimeException {
        //force writing to disk (typically not required since partitioning only applied if dataset exceeds CP size)
        in.exportData(); //written to disk iff dirty

        String fname = in.getFileName();
        String fnameStaging = LocalFileUtils.getUniqueWorkingDir(LocalFileUtils.CATEGORY_PARTITIONING);

        //reblock input matrix
        if (ii == InputInfo.TextCellInputInfo)
            partitionTextCell(fname, fnameStaging, fnameNew, rlen, clen, brlen, bclen);
        else if (ii == InputInfo.BinaryCellInputInfo)
            partitionBinaryCell(fname, fnameStaging, fnameNew, rlen, clen, brlen, bclen);
        else if (ii == InputInfo.BinaryBlockInputInfo) {
            if (oi == OutputInfo.BinaryBlockOutputInfo)
                partitionBinaryBlock(fname, fnameStaging, fnameNew, rlen, clen, brlen, bclen);
            else if (oi == OutputInfo.BinaryCellOutputInfo)
                partitionBinaryBlock2BinaryCell(fname, fnameStaging, fnameNew, rlen, clen, brlen, bclen);
        } else
            throw new DMLRuntimeException("Cannot create data partitions of format: " + ii.toString());

        LocalFileUtils.cleanupWorkingDirectory(fnameStaging);
    }

    /**
     * 
     * @param fname
     * @param fnameStaging
     * @param fnameNew
     * @param brlen
     * @param bclen
     * @throws DMLRuntimeException
     */
    private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen,
            int brlen, int bclen) throws DMLRuntimeException {
        long row = -1;
        long col = -1;

        try {
            //STEP 1: read matrix from HDFS and write blocks to local staging area         
            //check and add input path
            JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
            Path path = new Path(fname);
            FileInputFormat.addInputPath(job, path);
            TextInputFormat informat = new TextInputFormat();
            informat.configure(job);
            InputSplit[] splits = informat.getSplits(job, 1);

            LinkedList<Cell> buffer = new LinkedList<Cell>();
            LongWritable key = new LongWritable();
            Text value = new Text();
            FastStringTokenizer st = new FastStringTokenizer(' ');

            for (InputSplit split : splits) {
                RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
                try {
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); //reset tokenizer
                        row = st.nextLong();
                        col = st.nextLong();
                        double lvalue = st.nextDouble();
                        Cell tmp = new Cell(row, col, lvalue);

                        buffer.addLast(tmp);
                        if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                        {
                            appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                            buffer.clear();
                        }
                    }

                    //final flush
                    if (!buffer.isEmpty()) {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                } finally {
                    if (reader != null)
                        reader.close();
                }
            }

            //STEP 2: read matrix blocks from staging area and write matrix to HDFS
            String[] fnamesPartitions = new File(fnameStaging).list();
            if (PARALLEL) {
                int len = Math.min(fnamesPartitions.length, _par);
                Thread[] threads = new Thread[len];
                for (int i = 0; i < len; i++) {
                    int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                    int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                    end = Math.min(end, fnamesPartitions.length - 1);
                    threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging,
                            fnamesPartitions, start, end));
                    threads[i].start();
                }

                for (Thread t : threads)
                    t.join();
            } else {
                for (String pdir : fnamesPartitions)
                    writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
            }
        } catch (Exception e) {
            //post-mortem error handling and bounds checking
            if (row < 1 || row > rlen || col < 1 || col > clen) {
                throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] "
                        + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
            } else
                throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
        }
    }

    /**
     * 
     * @param fname
     * @param fnameStaging
     * @param fnameNew
     * @param brlen
     * @param bclen
     * @throws DMLRuntimeException
     */
    @SuppressWarnings("deprecation")
    private void partitionBinaryCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen,
            int brlen, int bclen) throws DMLRuntimeException {
        long row = -1;
        long col = -1;

        try {
            //STEP 1: read matrix from HDFS and write blocks to local staging area
            //check and add input path
            JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
            Path path = new Path(fname);
            FileSystem fs = FileSystem.get(job);

            //prepare sequence file reader, and write to local staging area   
            LinkedList<Cell> buffer = new LinkedList<Cell>();
            MatrixIndexes key = new MatrixIndexes();
            MatrixCell value = new MatrixCell();

            for (Path lpath : MatrixReader.getSequenceFilePaths(fs, path)) {
                SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);
                try {
                    while (reader.next(key, value)) {
                        row = key.getRowIndex();
                        col = key.getColumnIndex();
                        Cell tmp = new Cell(row, col, value.getValue());

                        buffer.addLast(tmp);
                        if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                        {
                            appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                            buffer.clear();
                        }
                    }

                    //final flush
                    if (!buffer.isEmpty()) {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                } finally {
                    if (reader != null)
                        reader.close();
                }
            }

            //STEP 2: read matrix blocks from staging area and write matrix to HDFS
            String[] fnamesPartitions = new File(fnameStaging).list();
            if (PARALLEL) {
                int len = Math.min(fnamesPartitions.length, _par);
                Thread[] threads = new Thread[len];
                for (int i = 0; i < len; i++) {
                    int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                    int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                    end = Math.min(end, fnamesPartitions.length - 1);
                    threads[i] = new Thread(new DataPartitionerWorkerBinaryCell(job, fnameNew, fnameStaging,
                            fnamesPartitions, start, end));
                    threads[i].start();
                }

                for (Thread t : threads)
                    t.join();
            } else {
                for (String pdir : fnamesPartitions)
                    writeBinaryCellSequenceFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
            }
        } catch (Exception e) {
            //post-mortem error handling and bounds checking
            if (row < 1 || row > rlen || col < 1 || col > clen) {
                throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] "
                        + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
            } else
                throw new DMLRuntimeException("Unable to partition binary cell matrix.", e);
        }
    }

    /**
     * 
     * @param fname
     * @param fnameStaging
     * @param fnameNew
     * @param brlen
     * @param bclen
     * @throws DMLRuntimeException
     */
    @SuppressWarnings("deprecation")
    private void partitionBinaryBlock(String fname, String fnameStaging, String fnameNew, long rlen, long clen,
            int brlen, int bclen) throws DMLRuntimeException {
        try {
            //create reuse object
            _reuseBlk = DataPartitioner.createReuseMatrixBlock(_format, brlen, bclen);

            //STEP 1: read matrix from HDFS and write blocks to local staging area   
            //check and add input path
            JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
            Path path = new Path(fname);
            FileSystem fs = FileSystem.get(job);

            //prepare sequence file reader, and write to local staging area
            MatrixIndexes key = new MatrixIndexes();
            MatrixBlock value = new MatrixBlock();

            for (Path lpath : MatrixReader.getSequenceFilePaths(fs, path)) {
                SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);
                try {
                    while (reader.next(key, value)) //for each block
                    {
                        long row_offset = (key.getRowIndex() - 1) * brlen;
                        long col_offset = (key.getColumnIndex() - 1) * bclen;
                        long rows = value.getNumRows();
                        long cols = value.getNumColumns();

                        //bound check per block
                        if (row_offset + rows < 1 || row_offset + rows > rlen || col_offset + cols < 1
                                || col_offset + cols > clen) {
                            throw new IOException("Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows)
                                    + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] "
                                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
                        }

                        appendBlockToStagingArea(fnameStaging, value, row_offset, col_offset, brlen, bclen);
                    }
                } finally {
                    if (reader != null)
                        reader.close();
                }
            }

            //STEP 2: read matrix blocks from staging area and write matrix to HDFS
            String[] fnamesPartitions = new File(fnameStaging).list();
            if (PARALLEL) {
                int len = Math.min(fnamesPartitions.length, _par);
                Thread[] threads = new Thread[len];
                for (int i = 0; i < len; i++) {
                    int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                    int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                    end = Math.min(end, fnamesPartitions.length - 1);
                    threads[i] = new Thread(new DataPartitionerWorkerBinaryBlock(job, fnameNew, fnameStaging,
                            fnamesPartitions, start, end));
                    threads[i].start();
                }

                for (Thread t : threads)
                    t.join();
            } else {
                for (String pdir : fnamesPartitions)
                    writeBinaryBlockSequenceFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir, false);
            }
        } catch (Exception e) {
            throw new DMLRuntimeException("Unable to partition binary block matrix.", e);
        }
    }

    /**
     * 
     * @param fname
     * @param fnameStaging
     * @param fnameNew
     * @param brlen
     * @param bclen
     * @throws DMLRuntimeException
     */
    @SuppressWarnings("deprecation")
    private void partitionBinaryBlock2BinaryCell(String fname, String fnameStaging, String fnameNew, long rlen,
            long clen, int brlen, int bclen) throws DMLRuntimeException {
        try {
            //STEP 1: read matrix from HDFS and write blocks to local staging area   
            //check and add input path
            JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
            Path path = new Path(fname);
            FileSystem fs = FileSystem.get(job);

            //prepare sequence file reader, and write to local staging area
            MatrixIndexes key = new MatrixIndexes();
            MatrixBlock value = new MatrixBlock();

            LinkedList<Cell> buffer = new LinkedList<Cell>();

            for (Path lpath : MatrixReader.getSequenceFilePaths(fs, path)) {
                SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);
                try {
                    while (reader.next(key, value)) //for each block
                    {
                        long row_offset = (key.getRowIndex() - 1) * brlen;
                        long col_offset = (key.getColumnIndex() - 1) * bclen;
                        long rows = value.getNumRows();
                        long cols = value.getNumColumns();

                        //bound check per block
                        if (row_offset + rows < 1 || row_offset + rows > rlen || col_offset + cols < 1
                                || col_offset + cols > clen) {
                            throw new IOException("Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows)
                                    + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] "
                                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
                        }

                        boolean sparse = value.isInSparseFormat();
                        if (sparse) //SPARSE
                        {
                            SparseRowsIterator iter = value.getSparseRowsIterator();
                            while (iter.hasNext()) {
                                IJV lcell = iter.next();
                                Cell tmp = new Cell(row_offset + lcell.i + 1, col_offset + lcell.j + 1, lcell.v);
                                buffer.addLast(tmp);
                            }
                        } else //DENSE
                        {
                            for (int i = 0; i < rows; i++)
                                for (int j = 0; j < cols; j++) {
                                    double lvalue = value.getValueDenseUnsafe(i, j);
                                    if (lvalue != 0) //for nnz
                                    {
                                        Cell tmp = new Cell(row_offset + i + 1, col_offset + j + 1, lvalue);
                                        buffer.addLast(tmp);
                                    }
                                }
                        }

                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                } finally {
                    if (reader != null)
                        reader.close();
                }
            }

            //STEP 2: read matrix blocks from staging area and write matrix to HDFS
            String[] fnamesPartitions = new File(fnameStaging).list();
            if (PARALLEL) {
                int len = Math.min(fnamesPartitions.length, _par);
                Thread[] threads = new Thread[len];
                for (int i = 0; i < len; i++) {
                    int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                    int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                    end = Math.min(end, fnamesPartitions.length - 1);
                    threads[i] = new Thread(new DataPartitionerWorkerBinaryCell(job, fnameNew, fnameStaging,
                            fnamesPartitions, start, end));
                    threads[i].start();
                }

                for (Thread t : threads)
                    t.join();
            } else {
                for (String pdir : fnamesPartitions)
                    writeBinaryCellSequenceFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
            }
        } catch (Exception e) {
            throw new DMLRuntimeException("Unable to partition binary block matrix.", e);
        }
    }

    /**
     * 
     * @param dir
     * @param mb
     * @param row_offset
     * @param col_offset
     * @param brlen
     * @param bclen
     * @throws DMLRuntimeException
     * @throws IOException
     * @throws DMLUnsupportedOperationException 
     */
    private void appendBlockToStagingArea(String dir, MatrixBlock mb, long row_offset, long col_offset, long brlen,
            long bclen) throws DMLRuntimeException, IOException, DMLUnsupportedOperationException {
        //NOTE: for temporary block we always create dense representations
        boolean sparse = mb.isInSparseFormat();
        long nnz = mb.getNonZeros();
        long rows = mb.getNumRows();
        long cols = mb.getNumColumns();
        double sparsity = ((double) nnz) / (rows * cols);

        if (_format == PDataPartitionFormat.ROW_WISE) {
            _reuseBlk.reset(1, (int) cols, sparse, (int) (cols * sparsity));
            for (int i = 0; i < rows; i++) {
                String pdir = LocalFileUtils.checkAndCreateStagingDir(dir + "/" + (row_offset + 1 + i));
                String pfname = pdir + "/" + "block_" + (col_offset / bclen + 1);
                mb.sliceOperations(i, i, 0, (int) (cols - 1), _reuseBlk);
                LocalFileUtils.writeMatrixBlockToLocal(pfname, _reuseBlk);
                _reuseBlk.reset();
            }
        } else if (_format == PDataPartitionFormat.ROW_BLOCK_WISE) {
            String pdir = LocalFileUtils.checkAndCreateStagingDir(dir + "/" + (row_offset / brlen + 1));
            String pfname = pdir + "/" + "block_" + (col_offset / bclen + 1);
            LocalFileUtils.writeMatrixBlockToLocal(pfname, mb);
        } else if (_format == PDataPartitionFormat.COLUMN_WISE) {
            //create object for reuse
            _reuseBlk.reset((int) rows, 1, false);

            for (int i = 0; i < cols; i++) {
                String pdir = LocalFileUtils.checkAndCreateStagingDir(dir + "/" + (col_offset + 1 + i));
                String pfname = pdir + "/" + "block_" + (row_offset / brlen + 1);
                mb.sliceOperations(0, (int) (rows - 1), i, i, _reuseBlk);
                LocalFileUtils.writeMatrixBlockToLocal(pfname, _reuseBlk);
                _reuseBlk.reset();
            }
        } else if (_format == PDataPartitionFormat.COLUMN_BLOCK_WISE) {
            String pdir = LocalFileUtils.checkAndCreateStagingDir(dir + "/" + (col_offset / bclen + 1));
            String pfname = pdir + "/" + "block_" + (row_offset / brlen + 1);
            LocalFileUtils.writeMatrixBlockToLocal(pfname, mb);
        }
    }

    /**
     * 
     * @param dir
     * @param buffer
     * @param brlen
     * @param bclen
     * @throws DMLRuntimeException
     * @throws IOException
     */
    private void appendCellBufferToStagingArea(String dir, LinkedList<Cell> buffer, int brlen, int bclen)
            throws DMLRuntimeException, IOException {
        HashMap<Long, LinkedList<Cell>> sortedBuffer = new HashMap<Long, LinkedList<Cell>>();

        //sort cells in buffer wrt key
        long key = -1;
        for (Cell c : buffer) {
            switch (_format) {
            case ROW_WISE:
                key = c.getRow();
                c.setRow(1);
                break;
            case ROW_BLOCK_WISE:
                key = (c.getRow() - 1) / brlen + 1;
                c.setRow((c.getRow() - 1) % brlen + 1);
                break;
            case COLUMN_WISE:
                key = c.getCol();
                c.setCol(1);
                break;
            case COLUMN_BLOCK_WISE:
                key = (c.getCol() - 1) / bclen + 1;
                c.setCol((c.getCol() - 1) % bclen + 1);
                break;
            default:
                //do nothing
            }

            if (!sortedBuffer.containsKey(key))
                sortedBuffer.put(key, new LinkedList<Cell>());
            sortedBuffer.get(key).addLast(c);
        }

        //write lists of cells to local files
        for (Entry<Long, LinkedList<Cell>> e : sortedBuffer.entrySet()) {
            String pdir = LocalFileUtils.checkAndCreateStagingDir(dir + "/" + e.getKey());
            String pfname = pdir + "/" + "block_" + _seq.getNextID();
            StagingFileUtils.writeCellListToLocal(pfname, e.getValue());
        }
    }

    /////////////////////////////////////
    //     Helper methods for HDFS     //
    // read/write in different formats //
    /////////////////////////////////////

    @SuppressWarnings("deprecation")
    public void writeBinaryBlockSequenceFileToHDFS(JobConf job, String dir, String lpdir, boolean threadsafe)
            throws IOException {
        long key = getKeyFromFilePath(lpdir);
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(dir + "/" + key);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class); //beware ca 50ms

        try {
            String[] fnameBlocks = new File(lpdir).list();
            for (String fnameBlock : fnameBlocks) {
                long key2 = getKey2FromFileName(fnameBlock);
                MatrixBlock tmp = null;
                if (threadsafe)
                    tmp = LocalFileUtils.readMatrixBlockFromLocal(lpdir + "/" + fnameBlock);
                else
                    tmp = LocalFileUtils.readMatrixBlockFromLocal(lpdir + "/" + fnameBlock, _reuseBlk);

                if (_format == PDataPartitionFormat.ROW_WISE || _format == PDataPartitionFormat.ROW_BLOCK_WISE) {
                    writer.append(new MatrixIndexes(1, key2), tmp);
                } else if (_format == PDataPartitionFormat.COLUMN_WISE
                        || _format == PDataPartitionFormat.COLUMN_BLOCK_WISE) {
                    writer.append(new MatrixIndexes(key2, 1), tmp);
                }
            }
        } finally {
            if (writer != null)
                writer.close();
        }
    }

    @SuppressWarnings("deprecation")
    public void writeBinaryCellSequenceFileToHDFS(JobConf job, String dir, String lpdir) throws IOException {
        long key = getKeyFromFilePath(lpdir);
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(dir + "/" + key);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixCell.class); //beware ca 50ms

        try {
            MatrixIndexes indexes = new MatrixIndexes();
            MatrixCell cell = new MatrixCell();

            String[] fnameBlocks = new File(lpdir).list();
            for (String fnameBlock : fnameBlocks) {
                LinkedList<Cell> tmp = StagingFileUtils.readCellListFromLocal(lpdir + "/" + fnameBlock);
                for (Cell c : tmp) {
                    indexes.setIndexes(c.getRow(), c.getCol());
                    cell.setValue(c.getValue());
                    writer.append(indexes, cell);
                }
            }
        } finally {
            if (writer != null)
                writer.close();
        }
    }

    public void writeTextCellFileToHDFS(JobConf job, String dir, String lpdir) throws IOException {
        long key = getKeyFromFilePath(lpdir);
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(dir + "/" + key);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));
        try {
            //for obj reuse and preventing repeated buffer re-allocations
            StringBuilder sb = new StringBuilder();

            String[] fnameBlocks = new File(lpdir).list();
            for (String fnameBlock : fnameBlocks) {
                LinkedList<Cell> tmp = StagingFileUtils.readCellListFromLocal(lpdir + "/" + fnameBlock);
                for (Cell c : tmp) {
                    sb.append(c.getRow());
                    sb.append(' ');
                    sb.append(c.getCol());
                    sb.append(' ');
                    sb.append(c.getValue());
                    sb.append('\n');
                    out.write(sb.toString());
                    sb.setLength(0);
                }
            }
        } finally {
            if (out != null)
                out.close();
        }
    }

    /////////////////////////////////
    // Helper methods for local fs //
    //         read/write          //
    /////////////////////////////////

    /**
     * 
     * @param dir
     * @return
     */
    private long getKeyFromFilePath(String dir) {
        String[] dirparts = dir.split("/");
        long key = Long.parseLong(dirparts[dirparts.length - 1]);
        return key;
    }

    /**
     * 
     * @param fname
     * @return
     */
    private long getKey2FromFileName(String fname) {
        return Long.parseLong(fname.split("_")[1]);
    }

    private abstract class DataPartitionerWorker implements Runnable {
        private JobConf _job = null;
        private String _fnameNew = null;
        private String _fnameStaging = null;
        private String[] _fnamesPartitions = null;
        private int _start = -1;
        private int _end = -1;

        public DataPartitionerWorker(JobConf job, String fnameNew, String fnameStaging, String[] fnamesPartitions,
                int start, int end) {
            _job = job;
            _fnameNew = fnameNew;
            _fnameStaging = fnameStaging;
            _fnamesPartitions = fnamesPartitions;
            _start = start;
            _end = end;
        }

        @Override
        public void run() {
            //read each input if required
            try {
                for (int i = _start; i <= _end; i++) {
                    String pdir = _fnamesPartitions[i];
                    writeFileToHDFS(_job, _fnameNew, _fnameStaging + "/" + pdir);
                }
            } catch (Exception ex) {
                throw new RuntimeException("Failed on parallel data partitioning.", ex);
            }
        }

        public abstract void writeFileToHDFS(JobConf job, String fnameNew, String stagingDir) throws IOException;
    }

    private class DataPartitionerWorkerTextCell extends DataPartitionerWorker {
        public DataPartitionerWorkerTextCell(JobConf job, String fnameNew, String fnameStaging,
                String[] fnamesPartitions, int start, int end) {
            super(job, fnameNew, fnameStaging, fnamesPartitions, start, end);
        }

        @Override
        public void writeFileToHDFS(JobConf job, String fnameNew, String stagingDir) throws IOException {
            writeTextCellFileToHDFS(job, fnameNew, stagingDir);
        }
    }

    private class DataPartitionerWorkerBinaryCell extends DataPartitionerWorker {
        public DataPartitionerWorkerBinaryCell(JobConf job, String fnameNew, String fnameStaging,
                String[] fnamesPartitions, int start, int end) {
            super(job, fnameNew, fnameStaging, fnamesPartitions, start, end);
        }

        @Override
        public void writeFileToHDFS(JobConf job, String fnameNew, String stagingDir) throws IOException {
            writeBinaryCellSequenceFileToHDFS(job, fnameNew, stagingDir);
        }
    }

    private class DataPartitionerWorkerBinaryBlock extends DataPartitionerWorker {
        public DataPartitionerWorkerBinaryBlock(JobConf job, String fnameNew, String fnameStaging,
                String[] fnamesPartitions, int start, int end) {
            super(job, fnameNew, fnameStaging, fnamesPartitions, start, end);
        }

        @Override
        public void writeFileToHDFS(JobConf job, String fnameNew, String stagingDir) throws IOException {
            writeBinaryBlockSequenceFileToHDFS(job, fnameNew, stagingDir, true);
        }
    }

}