// Apache SystemML transform: RecodeAgent (recode-map encoder)
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.sysml.runtime.transform; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map.Entry; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; import org.apache.sysml.lops.Lop; import org.apache.sysml.runtime.io.IOUtilFunctions; import org.apache.sysml.runtime.matrix.data.FrameBlock; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.Pair; import org.apache.sysml.runtime.transform.MVImputeAgent.MVMethod; import org.apache.sysml.runtime.transform.decode.DecoderRecode; import org.apache.sysml.runtime.transform.encode.Encoder; import org.apache.sysml.runtime.transform.meta.TfMetaUtils; import org.apache.sysml.runtime.util.UtilFunctions; import org.apache.wink.json4j.JSONException; import 
org.apache.wink.json4j.JSONObject; public class RecodeAgent extends Encoder { private static final long serialVersionUID = 8213163881283341874L; private int[] _mvrcdList = null; private int[] _fullrcdList = null; //recode maps and custom map for partial recode maps private HashMap<Integer, HashMap<String, Long>> _rcdMaps = new HashMap<Integer, HashMap<String, Long>>(); private HashMap<Integer, HashMap<String, String>> _finalMaps = null; private HashMap<Integer, HashSet<Object>> _rcdMapsPart = null; public RecodeAgent(JSONObject parsedSpec, String[] colnames, int clen) throws JSONException { super(null, clen); int rcdCount = 0; if (parsedSpec.containsKey(TfUtils.TXMETHOD_RECODE)) { int[] collist = TfMetaUtils.parseJsonIDList(parsedSpec, colnames, TfUtils.TXMETHOD_RECODE); rcdCount = initColList(collist); } if (parsedSpec.containsKey(TfUtils.TXMETHOD_MVRCD)) { _mvrcdList = TfMetaUtils.parseJsonIDList(parsedSpec, colnames, TfUtils.TXMETHOD_MVRCD); rcdCount += _mvrcdList.length; } if (rcdCount > 0) { _fullrcdList = new int[rcdCount]; int idx = -1; if (_colList != null) for (int i = 0; i < _colList.length; i++) _fullrcdList[++idx] = _colList[i]; if (_mvrcdList != null) for (int i = 0; i < _mvrcdList.length; i++) _fullrcdList[++idx] = _mvrcdList[i]; } } public HashMap<Integer, HashMap<String, Long>> getCPRecodeMaps() { return _rcdMaps; } public HashMap<Integer, HashSet<Object>> getCPRecodeMapsPartial() { return _rcdMapsPart; } public HashMap<Integer, HashMap<String, String>> getRecodeMaps() { return _finalMaps; } void prepare(String[] words, TfUtils agents) { if (_colList == null && _mvrcdList == null) return; String w = null; for (int colID : _fullrcdList) { w = UtilFunctions.unquote(words[colID - 1].trim()); if (_rcdMaps.get(colID) == null) _rcdMaps.put(colID, new HashMap<String, Long>()); HashMap<String, Long> map = _rcdMaps.get(colID); Long count = map.get(w); if (count == null) map.put(w, new Long(1)); else map.put(w, count + 1); } } private HashMap<String, Long> 
handleMVConstant(int colID, TfUtils agents, HashMap<String, Long> map) { MVImputeAgent mvagent = agents.getMVImputeAgent(); if (mvagent.getMethod(colID) == MVMethod.CONSTANT) { // check if the "replacement" is part of the map. If not, add it. String repValue = mvagent.getReplacement(colID); if (repValue == null) throw new RuntimeException("Expecting a constant replacement value for column ID " + colID); repValue = UtilFunctions.unquote(repValue); Long count = map.get(repValue); long mvCount = agents.getValid() - mvagent.getNonMVCount(colID); if (count == null) map.put(repValue, mvCount); else map.put(repValue, count + mvCount); } return map; } /** * Method to output transformation metadata from the mappers. * This information is collected and merged by the reducers. */ @Override public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents) throws IOException { mapOutputHelper(taskID, out, null, agents); } public ArrayList<Pair<Integer, DistinctValue>> mapOutputTransformationMetadata(int taskID, ArrayList<Pair<Integer, DistinctValue>> list, TfUtils agents) throws IOException { mapOutputHelper(taskID, null, list, agents); return list; } public void mapOutputHelper(int taskID, OutputCollector<IntWritable, DistinctValue> out, ArrayList<Pair<Integer, DistinctValue>> list, TfUtils agents) throws IOException { if (_colList == null && _mvrcdList == null) return; try { for (int i = 0; i < _fullrcdList.length; i++) { int colID = _fullrcdList[i]; HashMap<String, Long> map = _rcdMaps.get(colID); if (map != null) { map = handleMVConstant(colID, agents, map); if (out != null) { IntWritable iw = new IntWritable(colID); for (String s : map.keySet()) out.collect(iw, new DistinctValue(s, map.get(s))); } else if (list != null) { for (String s : map.keySet()) list.add(new Pair<Integer, DistinctValue>(colID, new DistinctValue(s, map.get(s)))); } } } } catch (Exception e) { throw new IOException(e); } } /** * Function to output 
transformation metadata, including: * - recode maps, * - number of distinct values, * - mode, and * - imputation value (in the case of global_mode) * * The column for which this function is invoked can be one of the following: * - just recoded (write .map, .ndistinct, .mode) * - just mv imputed (w/ global_mode) (write .impute) * - both recoded and mv imputed (write .map, .ndistinct, .mode, .impute) * * @param map recode maps * @param outputDir output directory * @param colID column id * @param fs file system * @param agents ? * @param fromCP ? * @throws IOException if IOException occurs */ private void writeMetadata(HashMap<String, Long> map, String outputDir, int colID, FileSystem fs, TfUtils agents, boolean fromCP) throws IOException { // output recode maps and mode MVImputeAgent mvagent = agents.getMVImputeAgent(); String mode = null; Long count = null; int rcdIndex = 0, modeIndex = 0; long maxCount = Long.MIN_VALUE; boolean isRecoded = (isApplicable(colID) != -1); boolean isModeImputed = (mvagent.getMethod(colID) == MVMethod.GLOBAL_MODE); Path pt = new Path(outputDir + "/Recode/" + agents.getName(colID) + TfUtils.TXMTD_RCD_MAP_SUFFIX); BufferedWriter br = null; try { if (isRecoded) br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true))); // remove NA strings if (agents.getNAStrings() != null) for (String naword : agents.getNAStrings()) map.remove(naword); if (fromCP) map = handleMVConstant(colID, agents, map); if (map.size() == 0) throw new RuntimeException("Can not proceed since \"" + agents.getName(colID) + "\" (id=" + colID + ") contains only the missing values, and not a single valid value -- set imputation method to \"constant\"."); // Order entries by category (string) value List<String> newNames = new ArrayList<String>(map.keySet()); Collections.sort(newNames); for (String w : newNames) { //map.keySet()) { count = map.get(w); ++rcdIndex; // output (w, count, rcdIndex) if (br != null) br.write(UtilFunctions.quote(w) + TfUtils.TXMTD_SEP + 
rcdIndex + TfUtils.TXMTD_SEP + count + "\n"); if (maxCount < count) { maxCount = count; mode = w; modeIndex = rcdIndex; } // Replace count with recode index (useful when invoked from CP) map.put(w, (long) rcdIndex); } } finally { IOUtilFunctions.closeSilently(br); } if (mode == null) { mode = ""; maxCount = 0; } if (isRecoded) { // output mode pt = new Path(outputDir + "/Recode/" + agents.getName(colID) + TfUtils.MODE_FILE_SUFFIX); try (BufferedWriter br2 = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)))) { br2.write(UtilFunctions.quote(mode) + "," + modeIndex + "," + maxCount); } // output number of distinct values pt = new Path(outputDir + "/Recode/" + agents.getName(colID) + TfUtils.TXMTD_RCD_DISTINCT_SUFFIX); try (BufferedWriter br2 = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)))) { br2.write("" + map.size()); } } if (isModeImputed) { pt = new Path(outputDir + "/Impute/" + agents.getName(colID) + TfUtils.TXMTD_MV_FILE_SUFFIX); try (BufferedWriter br2 = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)))) { br2.write(colID + "," + UtilFunctions.quote(mode)); } } } public void outputTransformationMetadata(String outputDir, FileSystem fs, TfUtils agents) throws IOException { if (_colList == null && _mvrcdList == null) return; for (int i = 0; i < _fullrcdList.length; i++) { int colID = _fullrcdList[i]; writeMetadata(_rcdMaps.get(colID), outputDir, colID, fs, agents, true); } } /** * Method to merge map output transformation metadata. 
*/ @Override public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException { HashMap<String, Long> map = new HashMap<String, Long>(); DistinctValue d = new DistinctValue(); String word = null; Long count = null, val = null; while (values.hasNext()) { d.reset(); d = values.next(); word = d.getWord(); count = d.getCount(); val = map.get(word); if (val == null) map.put(word, count); else map.put(word, val + count); } writeMetadata(map, outputDir, colID, fs, agents, false); } /** * Method to load recode maps of all attributes, at once. */ @Override public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException { if (!isApplicable()) return; _finalMaps = new HashMap<Integer, HashMap<String, String>>(); if (fs.isDirectory(txMtdDir)) { for (int i = 0; i < _colList.length; i++) { int colID = _colList[i]; Path path = new Path(txMtdDir + "/Recode/" + agents.getName(colID) + TfUtils.TXMTD_RCD_MAP_SUFFIX); TfUtils.checkValidInputFile(fs, path, true); HashMap<String, String> map = new HashMap<String, String>(); Pair<String, String> pair = new Pair<String, String>(); String line = null; try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)))) { // Example line to parse: "WN (1)67492",1,61975 while ((line = br.readLine()) != null) { DecoderRecode.parseRecodeMapEntry(line, pair); map.put(pair.getKey(), pair.getValue()); } } _finalMaps.put(colID, map); } } else { throw new RuntimeException("Path to recode maps must be a directory: " + txMtdDir); } } private String lookupRCDMap(int colID, String key) { if (_finalMaps != null) return _finalMaps.get(colID).get(key); else { //used for cp Long tmp = _rcdMaps.get(colID).get(key); return (tmp != null) ? 
Long.toString(tmp) : null; } } @Override public MatrixBlock encode(FrameBlock in, MatrixBlock out) { if (!isApplicable()) return out; //build and apply recode maps build(in); apply(in, out); return out; } @Override public void build(FrameBlock in) { if (!isApplicable()) return; Iterator<String[]> iter = in.getStringRowIterator(); while (iter.hasNext()) { String[] row = iter.next(); for (int j = 0; j < _colList.length; j++) { int colID = _colList[j]; //1-based //allocate column map if necessary if (!_rcdMaps.containsKey(colID)) _rcdMaps.put(colID, new HashMap<String, Long>()); //probe and build column map HashMap<String, Long> map = _rcdMaps.get(colID); String key = row[colID - 1]; if (key != null && !key.isEmpty() && !map.containsKey(key)) map.put(key, Long.valueOf(map.size() + 1)); } } } public void buildPartial(FrameBlock in) { if (!isApplicable()) return; //ensure allocated partial recode map if (_rcdMapsPart == null) _rcdMapsPart = new HashMap<Integer, HashSet<Object>>(); //construct partial recode map (tokens w/o codes) //iterate over columns for sequential access for (int j = 0; j < _colList.length; j++) { int colID = _colList[j]; //1-based //allocate column map if necessary if (!_rcdMapsPart.containsKey(colID)) _rcdMapsPart.put(colID, new HashSet<Object>()); HashSet<Object> map = _rcdMapsPart.get(colID); //probe and build column map for (int i = 0; i < in.getNumRows(); i++) map.add(in.get(i, colID - 1)); //cleanup unnecessary entries once map.remove(null); map.remove(""); } } /** * Method to apply transformations. */ @Override public String[] apply(String[] words) { if (!isApplicable()) return words; //apply recode maps on relevant columns of given row for (int i = 0; i < _colList.length; i++) { //prepare input and get code int colID = _colList[i]; String key = UtilFunctions.unquote(words[colID - 1].trim()); String val = lookupRCDMap(colID, key); // replace unseen keys with NaN words[colID - 1] = (val != null) ? 
val : "NaN"; } return words; } @Override public MatrixBlock apply(FrameBlock in, MatrixBlock out) { //apply recode maps column wise for (int j = 0; j < _colList.length; j++) { int colID = _colList[j]; for (int i = 0; i < in.getNumRows(); i++) { Object okey = in.get(i, colID - 1); String key = (okey != null) ? okey.toString() : null; String val = lookupRCDMap(colID, key); out.quickSetValue(i, colID - 1, (val != null) ? Double.parseDouble(val) : Double.NaN); } } return out; } @Override public FrameBlock getMetaData(FrameBlock meta) { if (!isApplicable()) return meta; //inverse operation to initRecodeMaps //allocate output rows int maxDistinct = 0; for (int j = 0; j < _colList.length; j++) if (_rcdMaps.containsKey(_colList[j])) maxDistinct = Math.max(maxDistinct, _rcdMaps.get(_colList[j]).size()); meta.ensureAllocatedColumns(maxDistinct); //create compact meta data representation for (int j = 0; j < _colList.length; j++) { int colID = _colList[j]; //1-based int rowID = 0; if (_rcdMaps.containsKey(_colList[j])) for (Entry<String, Long> e : _rcdMaps.get(colID).entrySet()) { String tmp = constructRecodeMapEntry(e.getKey(), e.getValue()); meta.set(rowID++, colID - 1, tmp); } meta.getColumnMetadata(colID - 1).setNumDistinct(_rcdMaps.get(colID).size()); } return meta; } /** * Construct the recodemaps from the given input frame for all * columns registered for recode. * * @param meta frame block */ public void initMetaData(FrameBlock meta) { if (meta == null || meta.getNumRows() <= 0) return; for (int j = 0; j < _colList.length; j++) { int colID = _colList[j]; //1-based _rcdMaps.put(colID, meta.getRecodeMap(colID - 1)); } } /** * Returns the Recode map entry which consists of concatenation of code, delimiter and token. 
* @param token is part of Recode map * @param code is code for token * @return the concatenation of code and token with delimiter in between */ public static String constructRecodeMapEntry(String token, Long code) { return token + Lop.DATATYPE_PREFIX + code.toString(); } }