com.ibm.bi.dml.udf.lib.RemoveEmptyRows.java Source code


Introduction

Here is the source code for com.ibm.bi.dml.udf.lib.RemoveEmptyRows.java, a deprecated SystemML external (package) function that copies a matrix stored in textcell (row column value) format to a new HDFS file while renumbering the row indices consecutively, thereby removing empty rows.
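The interesting part is the single-pass row renumbering. The following self-contained sketch shows the same first-seen remapping on a few in-memory textcell lines; it is illustrative only (plain Java, no Hadoop), and the RowCompactionSketch class and sample cells are not part of the original file:

import java.util.HashMap;
import java.util.Map;

public class RowCompactionSketch {
    public static void main(String[] args) {
        //textcell input "row col value"; rows 2 and 4 are empty
        String[] cells = { "1 1 7.0", "3 2 5.0", "5 1 2.0" };

        Map<Long, Long> keyMap = new HashMap<Long, Long>(); //old,new rowID
        long nextId = 1;

        for (String cell : cells) {
            String[] parts = cell.trim().split(" ");
            long row = Long.parseLong(parts[0]);

            //first-seen rows get the next dense ID, so gaps (empty rows) vanish
            if (!keyMap.containsKey(row))
                keyMap.put(row, nextId++);

            System.out.println(keyMap.get(row) + " " + parts[1] + " " + parts[2]);
        }
        //prints: 1 1 7.0 / 2 2 5.0 / 3 1 2.0
    }
}

The real class applies exactly this mapping, but reads the cells via Hadoop input splits and streams the remapped cells back to HDFS.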

Source

/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
*/

package com.ibm.bi.dml.udf.lib;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.runtime.util.MapReduceTool;
import com.ibm.bi.dml.udf.FunctionParameter;
import com.ibm.bi.dml.udf.Matrix;
import com.ibm.bi.dml.udf.PackageFunction;
import com.ibm.bi.dml.udf.Matrix.ValueType;

/**
 * Copies a matrix in textcell (row column value) format to a new HDFS file,
 * renumbering row indices consecutively in order of first appearance and
 * thereby removing empty rows. Returns the result as a single Matrix output.
 */
@Deprecated
public class RemoveEmptyRows extends PackageFunction {

    private static final long serialVersionUID = 1L;
    private static final String OUTPUT_FILE = "TMP";

    private Matrix _ret;

    @Override
    public int getNumFunctionOutputs() {
        return 1;
    }

    @Override
    public FunctionParameter getFunctionOutput(int pos) {
        return _ret;
    }

    @Override
    public void execute() {
        Matrix mat = (Matrix) this.getFunctionInput(0);
        String fnameOld = mat.getFilePath();

        HashMap<Long, Long> keyMap = new HashMap<Long, Long>(); //old,new rowID

        try {
            //prepare input
            JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
            Path path = new Path(fnameOld);
            FileSystem fs = FileSystem.get(job);
            if (!fs.exists(path))
                throw new IOException("File " + fnameOld + " does not exist on HDFS.");
            FileInputFormat.addInputPath(job, path);
            TextInputFormat informat = new TextInputFormat();
            informat.configure(job);

            //prepare output
            String fnameNew = createOutputFilePathAndName(OUTPUT_FILE);
            DataOutputStream ostream = MapReduceTool.getHDFSDataOutputStream(fnameNew, true);

            //read cells and write them back with remapped row IDs
            InputSplit[] splits = informat.getSplits(job, 1);

            LongWritable key = new LongWritable();
            Text value = new Text();
            long ID = 1;

            try {
                //for obj reuse and preventing repeated buffer re-allocations
                StringBuilder sb = new StringBuilder();

                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            String cellStr = value.toString().trim();
                            StringTokenizer st = new StringTokenizer(cellStr, " ");
                            long row = Long.parseLong(st.nextToken());
                            long col = Long.parseLong(st.nextToken());
                            double lvalue = Double.parseDouble(st.nextToken());

                            //assign the next dense row ID on first sight of this row
                            if (!keyMap.containsKey(row))
                                keyMap.put(row, ID++);
                            long rowNew = keyMap.get(row);

                            sb.append(rowNew);
                            sb.append(' ');
                            sb.append(col);
                            sb.append(' ');
                            sb.append(lvalue);
                            sb.append('\n');

                            ostream.writeBytes(sb.toString());
                            sb.setLength(0);
                        }
                    } finally {
                        if (reader != null)
                            reader.close();
                    }
                }

                _ret = new Matrix(fnameNew, keyMap.size(), mat.getNumCols(), ValueType.Double);
            } finally {
                if (ostream != null)
                    ostream.close();
            }
        } catch (Exception ex) {
            throw new RuntimeException("Unable to execute external function.", ex);
        }
    }
}
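Two things are worth noting about the implementation above. First, new row IDs are handed out in order of first appearance, so the relative order of the surviving rows is preserved only if the input cells arrive sorted by row index; with several unsorted splits, the output rows may come back permuted. Second, the class is marked @Deprecated: later SystemML releases expose the same operation directly in DML as the built-in removeEmpty (with margin "rows"), which avoids the external-function round trip through HDFS entirely.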