Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.sysml.runtime.transform; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.nio.charset.CharacterCodingException; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; import org.apache.commons.lang.ArrayUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; import org.apache.wink.json4j.JSONArray; import org.apache.wink.json4j.JSONException; import org.apache.wink.json4j.JSONObject; import org.apache.sysml.lops.Lop; import org.apache.sysml.runtime.matrix.data.FrameBlock; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.Pair; import org.apache.sysml.runtime.transform.MVImputeAgent.MVMethod; import org.apache.sysml.runtime.transform.encode.Encoder; import org.apache.sysml.runtime.transform.meta.TfMetaUtils; import org.apache.sysml.runtime.util.UtilFunctions; public class BinAgent extends Encoder { private static final long serialVersionUID = 1917445005206076078L; public static final String MIN_PREFIX = "min"; public static final String MAX_PREFIX = "max"; public static final String NBINS_PREFIX = "nbins"; private int[] _numBins = null; private double[] _min = null, _max = null; // min and max among non-missing values private double[] _binWidths = null; // width of a bin for each attribute //frame transform-apply attributes private double[][] _binMins = null; private double[][] _binMaxs = null; public BinAgent(JSONObject parsedSpec, String[] colnames, int clen) throws JSONException, IOException { this(parsedSpec, colnames, clen, false); } public BinAgent(JSONObject parsedSpec, String[] colnames, int clen, boolean colsOnly) throws JSONException, IOException { super(null, clen); if (!parsedSpec.containsKey(TfUtils.TXMETHOD_BIN)) return; if (colsOnly) { List<Integer> collist = TfMetaUtils.parseBinningColIDs(parsedSpec, colnames); initColList(ArrayUtils.toPrimitive(collist.toArray(new Integer[0]))); } else { JSONObject obj = (JSONObject) parsedSpec.get(TfUtils.TXMETHOD_BIN); JSONArray attrs = (JSONArray) obj.get(TfUtils.JSON_ATTRS); JSONArray nbins = (JSONArray) obj.get(TfUtils.JSON_NBINS); initColList(attrs); _numBins = new int[attrs.size()]; for (int i = 0; i < _numBins.length; i++) _numBins[i] = UtilFunctions.toInt(nbins.get(i)); // initialize internal transformation metadata _min = new double[_colList.length]; Arrays.fill(_min, Double.MAX_VALUE); _max = new double[_colList.length]; Arrays.fill(_max, -Double.MAX_VALUE); _binWidths = new double[_colList.length]; } } public int[] getNumBins() { return _numBins; } public double[] getMin() { return _min; } public double[] getBinWidths() { return _binWidths; } public void prepare(String[] words, TfUtils agents) { if (!isApplicable()) return; for (int i = 0; i < _colList.length; i++) { int colID = _colList[i]; String w = null; double d = 0; // equi-width w = UtilFunctions.unquote(words[colID - 1].trim()); if (!TfUtils.isNA(agents.getNAStrings(), w)) { d = UtilFunctions.parseToDouble(w); if (d < _min[i]) _min[i] = d; if (d > _max[i]) _max[i] = d; } } } private DistinctValue prepMinOutput(int idx) throws CharacterCodingException { String s = MIN_PREFIX + Double.toString(_min[idx]); return new DistinctValue(s, -1L); } private DistinctValue prepMaxOutput(int idx) throws CharacterCodingException { String s = MAX_PREFIX + Double.toString(_max[idx]); return new DistinctValue(s, -1L); } private DistinctValue prepNBinsOutput(int idx) throws CharacterCodingException { String s = NBINS_PREFIX + Double.toString(_numBins[idx]); return new DistinctValue(s, -1L); } /** * Method to output transformation metadata from the mappers. * This information is collected and merged by the reducers. */ @Override public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents) throws IOException { if (!isApplicable()) return; try { for (int i = 0; i < _colList.length; i++) { int colID = _colList[i]; IntWritable iw = new IntWritable(-colID); out.collect(iw, prepMinOutput(i)); out.collect(iw, prepMaxOutput(i)); out.collect(iw, prepNBinsOutput(i)); } } catch (Exception e) { throw new IOException(e); } } public ArrayList<Pair<Integer, DistinctValue>> mapOutputTransformationMetadata(int taskID, ArrayList<Pair<Integer, DistinctValue>> list, TfUtils agents) throws IOException { if (!isApplicable()) return list; try { for (int i = 0; i < _colList.length; i++) { int colID = _colList[i]; Integer iw = -colID; list.add(new Pair<Integer, DistinctValue>(iw, prepMinOutput(i))); list.add(new Pair<Integer, DistinctValue>(iw, prepMaxOutput(i))); list.add(new Pair<Integer, DistinctValue>(iw, prepNBinsOutput(i))); } } catch (Exception e) { throw new IOException(e); } return list; } private void writeTfMtd(int colID, String min, String max, String binwidth, String nbins, String tfMtdDir, FileSystem fs, TfUtils agents) throws IOException { Path pt = new Path(tfMtdDir + "/Bin/" + agents.getName(colID) + TfUtils.TXMTD_BIN_FILE_SUFFIX); BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true))); br.write(colID + TfUtils.TXMTD_SEP + min + TfUtils.TXMTD_SEP + max + TfUtils.TXMTD_SEP + binwidth + TfUtils.TXMTD_SEP + nbins + "\n"); br.close(); } /** * Method to merge map output transformation metadata. */ @Override public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException { double min = Double.MAX_VALUE; double max = -Double.MAX_VALUE; int nbins = 0; DistinctValue val = new DistinctValue(); String w = null; double d; while (values.hasNext()) { val.reset(); val = values.next(); w = val.getWord(); if (w.startsWith(MIN_PREFIX)) { d = UtilFunctions.parseToDouble(w.substring(MIN_PREFIX.length())); if (d < min) min = d; } else if (w.startsWith(MAX_PREFIX)) { d = UtilFunctions.parseToDouble(w.substring(MAX_PREFIX.length())); if (d > max) max = d; } else if (w.startsWith(NBINS_PREFIX)) { nbins = (int) UtilFunctions.parseToLong(w.substring(NBINS_PREFIX.length())); } else throw new RuntimeException("MVImputeAgent: Invalid prefix while merging map output: " + w); } // write merged metadata double binwidth = (max - min) / nbins; writeTfMtd(colID, Double.toString(min), Double.toString(max), Double.toString(binwidth), Integer.toString(nbins), outputDir, fs, agents); } public void outputTransformationMetadata(String outputDir, FileSystem fs, TfUtils agents) throws IOException { if (!isApplicable()) return; MVImputeAgent mvagent = agents.getMVImputeAgent(); for (int i = 0; i < _colList.length; i++) { int colID = _colList[i]; // If the column is imputed with a constant, then adjust min and max based the value of the constant. if (mvagent.isApplicable(colID) != -1 && mvagent.getMethod(colID) == MVMethod.CONSTANT) { double cst = UtilFunctions.parseToDouble(mvagent.getReplacement(colID)); if (cst < _min[i]) _min[i] = cst; if (cst > _max[i]) _max[i] = cst; } double binwidth = (_max[i] - _min[i]) / _numBins[i]; writeTfMtd(colID, Double.toString(_min[i]), Double.toString(_max[i]), Double.toString(binwidth), Integer.toString(_numBins[i]), outputDir, fs, agents); } } // ------------------------------------------------------------------------------------------------ /** * Method to load transform metadata for all attributes */ @Override public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException { if (!isApplicable()) return; if (fs.isDirectory(txMtdDir)) { for (int i = 0; i < _colList.length; i++) { int colID = _colList[i]; Path path = new Path(txMtdDir + "/Bin/" + agents.getName(colID) + TfUtils.TXMTD_BIN_FILE_SUFFIX); TfUtils.checkValidInputFile(fs, path, true); BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path))); // format: colID,min,max,nbins String[] fields = br.readLine().split(TfUtils.TXMTD_SEP); double min = UtilFunctions.parseToDouble(fields[1]); //double max = UtilFunctions.parseToDouble(fields[2]); double binwidth = UtilFunctions.parseToDouble(fields[3]); int nbins = UtilFunctions.parseToInt(fields[4]); _numBins[i] = nbins; _min[i] = min; _binWidths[i] = binwidth; // (max-min)/nbins; br.close(); } } else { fs.close(); throw new RuntimeException("Path to recode maps must be a directory: " + txMtdDir); } } @Override public MatrixBlock encode(FrameBlock in, MatrixBlock out) { build(in); return apply(in, out); } @Override public void build(FrameBlock in) { // TODO Auto-generated method stub } /** * Method to apply transformations. */ @Override public String[] apply(String[] words) { if (!isApplicable()) return words; for (int i = 0; i < _colList.length; i++) { int colID = _colList[i]; try { double val = UtilFunctions.parseToDouble(words[colID - 1]); int binid = 1; double tmp = _min[i] + _binWidths[i]; while (val > tmp && binid < _numBins[i]) { tmp += _binWidths[i]; binid++; } words[colID - 1] = Integer.toString(binid); } catch (NumberFormatException e) { throw new RuntimeException("Encountered \"" + words[colID - 1] + "\" in column ID \"" + colID + "\", when expecting a numeric value. Consider adding \"" + words[colID - 1] + "\" to na.strings, along with an appropriate imputation method."); } } return words; } @Override public MatrixBlock apply(FrameBlock in, MatrixBlock out) { for (int j = 0; j < _colList.length; j++) { int colID = _colList[j]; for (int i = 0; i < in.getNumRows(); i++) { double inVal = UtilFunctions.objectToDouble(in.getSchema()[colID - 1], in.get(i, colID - 1)); int ix = Arrays.binarySearch(_binMaxs[j], inVal); int binID = ((ix < 0) ? Math.abs(ix + 1) : ix) + 1; out.quickSetValue(i, colID - 1, binID); } } return out; } @Override public FrameBlock getMetaData(FrameBlock meta) { return meta; } @Override public void initMetaData(FrameBlock meta) { _binMins = new double[_colList.length][]; _binMaxs = new double[_colList.length][]; for (int j = 0; j < _colList.length; j++) { int colID = _colList[j]; //1-based int nbins = (int) meta.getColumnMetadata()[colID - 1].getNumDistinct(); _binMins[j] = new double[nbins]; _binMaxs[j] = new double[nbins]; for (int i = 0; i < nbins; i++) { String[] tmp = meta.get(i, colID - 1).toString().split(Lop.DATATYPE_PREFIX); _binMins[j][i] = Double.parseDouble(tmp[0]); _binMaxs[j][i] = Double.parseDouble(tmp[1]); } } } }