// Java tutorial
/** * (C) Copyright IBM Corp. 2010, 2015 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package com.ibm.bi.dml.runtime.transform; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.nio.charset.CharacterCodingException; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; import org.apache.wink.json4j.JSONArray; import org.apache.wink.json4j.JSONException; import org.apache.wink.json4j.JSONObject; import scala.Tuple2; import com.ibm.bi.dml.runtime.transform.MVImputeAgent.MVMethod; import com.ibm.bi.dml.runtime.util.UtilFunctions; public class BinAgent extends TransformationAgent { private static final long serialVersionUID = 1917445005206076078L; public static final String MIN_PREFIX = "min"; public static final String MAX_PREFIX = "max"; public static final String NBINS_PREFIX = "nbins"; private int[] _binList = null; //private byte[] _binMethodList = null; // Not used, since only equi-width is supported for now. 
private int[] _numBins = null; private double[] _min = null, _max = null; // min and max among non-missing values private double[] _binWidths = null; // width of a bin for each attribute BinAgent() { } BinAgent(JSONObject parsedSpec) throws JSONException { if (!parsedSpec.containsKey(TX_METHOD.BIN.toString())) return; JSONObject obj = (JSONObject) parsedSpec.get(TX_METHOD.BIN.toString()); JSONArray attrs = (JSONArray) obj.get(JSON_ATTRS); //JSONArray mthds = (JSONArray) obj.get(JSON_MTHD); JSONArray nbins = (JSONArray) obj.get(JSON_NBINS); assert (attrs.size() == nbins.size()); _binList = new int[attrs.size()]; _numBins = new int[attrs.size()]; for (int i = 0; i < _binList.length; i++) { _binList[i] = UtilFunctions.toInt(attrs.get(i)); _numBins[i] = UtilFunctions.toInt(nbins.get(i)); } // initialize internal transformation metadata _min = new double[_binList.length]; Arrays.fill(_min, Double.MAX_VALUE); _max = new double[_binList.length]; Arrays.fill(_max, -Double.MAX_VALUE); _binWidths = new double[_binList.length]; } public void prepare(String[] words, TfUtils agents) { if (_binList == null) return; for (int i = 0; i < _binList.length; i++) { int colID = _binList[i]; String w = null; double d = 0; // equi-width w = UtilFunctions.unquote(words[colID - 1].trim()); if (!agents.isNA(w)) { d = UtilFunctions.parseToDouble(w); if (d < _min[i]) _min[i] = d; if (d > _max[i]) _max[i] = d; } } } private DistinctValue prepMinOutput(int idx) throws CharacterCodingException { String s = MIN_PREFIX + Double.toString(_min[idx]); return new DistinctValue(s, -1L); } private DistinctValue prepMaxOutput(int idx) throws CharacterCodingException { String s = MAX_PREFIX + Double.toString(_max[idx]); return new DistinctValue(s, -1L); } private DistinctValue prepNBinsOutput(int idx) throws CharacterCodingException { String s = NBINS_PREFIX + Double.toString(_numBins[idx]); return new DistinctValue(s, -1L); } /** * Method to output transformation metadata from the mappers. 
* This information is collected and merged by the reducers. * * @param out * @throws IOException */ @Override public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents) throws IOException { if (_binList == null) return; try { for (int i = 0; i < _binList.length; i++) { int colID = _binList[i]; IntWritable iw = new IntWritable(-colID); out.collect(iw, prepMinOutput(i)); out.collect(iw, prepMaxOutput(i)); out.collect(iw, prepNBinsOutput(i)); } } catch (Exception e) { throw new IOException(e); } } public ArrayList<Tuple2<Integer, DistinctValue>> mapOutputTransformationMetadata(int taskID, ArrayList<Tuple2<Integer, DistinctValue>> list, TfUtils agents) throws IOException { if (_binList == null) return list; try { for (int i = 0; i < _binList.length; i++) { int colID = _binList[i]; Integer iw = -colID; list.add(new Tuple2<Integer, DistinctValue>(iw, prepMinOutput(i))); list.add(new Tuple2<Integer, DistinctValue>(iw, prepMaxOutput(i))); list.add(new Tuple2<Integer, DistinctValue>(iw, prepNBinsOutput(i))); } } catch (Exception e) { throw new IOException(e); } return list; } private void writeTfMtd(int colID, String min, String max, String binwidth, String nbins, String tfMtdDir, FileSystem fs, TfUtils agents) throws IOException { Path pt = new Path(tfMtdDir + "/Bin/" + agents.getName(colID) + BIN_FILE_SUFFIX); BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true))); br.write(colID + TXMTD_SEP + min + TXMTD_SEP + max + TXMTD_SEP + binwidth + TXMTD_SEP + nbins + "\n"); br.close(); } /** * Method to merge map output transformation metadata. 
* * @param values * @return * @throws IOException */ @Override public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException { double min = Double.MAX_VALUE; double max = -Double.MAX_VALUE; int nbins = 0; DistinctValue val = new DistinctValue(); String w = null; double d; while (values.hasNext()) { val.reset(); val = values.next(); w = val.getWord(); if (w.startsWith(MIN_PREFIX)) { d = UtilFunctions.parseToDouble(w.substring(MIN_PREFIX.length())); if (d < min) min = d; } else if (w.startsWith(MAX_PREFIX)) { d = UtilFunctions.parseToDouble(w.substring(MAX_PREFIX.length())); if (d > max) max = d; } else if (w.startsWith(NBINS_PREFIX)) { nbins = (int) UtilFunctions.parseToLong(w.substring(NBINS_PREFIX.length())); } else throw new RuntimeException("MVImputeAgent: Invalid prefix while merging map output: " + w); } // write merged metadata double binwidth = (max - min) / nbins; writeTfMtd(colID, Double.toString(min), Double.toString(max), Double.toString(binwidth), Integer.toString(nbins), outputDir, fs, agents); } public void outputTransformationMetadata(String outputDir, FileSystem fs, TfUtils agents) throws IOException { if (_binList == null) return; MVImputeAgent mvagent = agents.getMVImputeAgent(); for (int i = 0; i < _binList.length; i++) { int colID = _binList[i]; // If the column is imputed with a constant, then adjust min and max based the value of the constant. 
if (mvagent.isImputed(colID) != -1 && mvagent.getMethod(colID) == MVMethod.CONSTANT) { double cst = UtilFunctions.parseToDouble(mvagent.getReplacement(colID)); if (cst < _min[i]) _min[i] = cst; if (cst > _max[i]) _max[i] = cst; } double binwidth = (_max[i] - _min[i]) / _numBins[i]; writeTfMtd(colID, Double.toString(_min[i]), Double.toString(_max[i]), Double.toString(binwidth), Integer.toString(_numBins[i]), outputDir, fs, agents); } } // ------------------------------------------------------------------------------------------------ public int[] getBinList() { return _binList; } public int[] getNumBins() { return _numBins; } public double[] getMin() { return _min; } public double[] getBinWidths() { return _binWidths; } /** * Method to load transform metadata for all attributes * * @param job * @throws IOException */ @Override public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException { if (_binList == null) return; if (fs.isDirectory(txMtdDir)) { for (int i = 0; i < _binList.length; i++) { int colID = _binList[i]; Path path = new Path(txMtdDir + "/Bin/" + agents.getName(colID) + BIN_FILE_SUFFIX); TfUtils.checkValidInputFile(fs, path, true); BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path))); // format: colID,min,max,nbins String[] fields = br.readLine().split(TXMTD_SEP); double min = UtilFunctions.parseToDouble(fields[1]); //double max = UtilFunctions.parseToDouble(fields[2]); double binwidth = UtilFunctions.parseToDouble(fields[3]); int nbins = UtilFunctions.parseToInt(fields[4]); _numBins[i] = nbins; _min[i] = min; _binWidths[i] = binwidth; // (max-min)/nbins; br.close(); } } else { fs.close(); throw new RuntimeException("Path to recode maps must be a directory: " + txMtdDir); } } /** * Method to apply transformations. 
* * @param words * @return */ @Override public String[] apply(String[] words, TfUtils agents) { if (_binList == null) return words; for (int i = 0; i < _binList.length; i++) { int colID = _binList[i]; try { double val = UtilFunctions.parseToDouble(words[colID - 1]); int binid = 1; double tmp = _min[i] + _binWidths[i]; while (val > tmp && binid < _numBins[i]) { tmp += _binWidths[i]; binid++; } words[colID - 1] = Integer.toString(binid); } catch (NumberFormatException e) { throw new RuntimeException("Encountered \"" + words[colID - 1] + "\" in column ID \"" + colID + "\", when expecting a numeric value. Consider adding \"" + words[colID - 1] + "\" to na.strings, along with an appropriate imputation method."); } } return words; } /** * Check if the given column ID is subjected to this transformation. * */ public int isBinned(int colID) { if (_binList == null) return -1; int idx = Arrays.binarySearch(_binList, colID); return (idx >= 0 ? idx : -1); } @Override public void print() { System.out.print("Binning List (Equi-width): \n "); for (int i : _binList) { System.out.print(i + " "); } System.out.print("\n "); for (int b : _numBins) { System.out.print(b + " "); } System.out.println(); } }