Java tutorial: TfUtils (com.ibm.bi.dml.runtime.transform)
/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ibm.bi.dml.runtime.transform;

import java.io.BufferedReader;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Arrays;
import java.util.regex.Pattern;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.parser.DataExpression;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import com.ibm.bi.dml.runtime.io.MatrixReader;
import com.ibm.bi.dml.runtime.matrix.CSVReblockMR;
import com.ibm.bi.dml.runtime.matrix.CSVReblockMR.OffsetCount;
import com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration;
import com.ibm.bi.dml.runtime.util.MapReduceTool;
import com.ibm.bi.dml.runtime.util.UtilFunctions;
import com.ibm.bi.dml.utils.JSONHelper;

@SuppressWarnings("deprecation")
public class TfUtils implements Serializable {

    private static final long serialVersionUID = 526252850872633125L;

    private OmitAgent _oa = null;
    private MVImputeAgent _mia = null;
    private RecodeAgent _ra = null;
    private BinAgent _ba = null;
    private DummycodeAgent _da = null;

    private long _numRecordsInPartFile;   // Total number of records in the data file
    private long _numValidRecords;        // (_numRecordsInPartFile - #of omitted records)
    private long _numTransformedRows;     // Number of rows after applying transformations
    private long _numTransformedColumns;  // Number of columns after applying transformations

    private String _headerLine = null;
    private boolean _hasHeader;
    private Pattern _delim = null;
    private String _delimString = null;
    private String[] _NAstrings = null;
    private String[] _outputColumnNames = null;
    private long _numInputCols = -1;

    private String _tfMtdDir = null;
    private String _specFile = null;
    private String _offsetFile = null;
    private String _tmpDir = null;
    private String _outputPath = null;

    protected static boolean checkValidInputFile(FileSystem fs, Path path, boolean err) throws IOException {
        // check non-existing file
        if (!fs.exists(path))
            if (err)
                throw new IOException("File " + path.toString() + " does not exist on HDFS/LFS.");
            else
                return false;

        // check for empty file
        if (MapReduceTool.isFileEmpty(fs, path.toString()))
            if (err)
                throw new EOFException("Empty input file " + path.toString() + ".");
            else
                return false;

        return true;
    }

    public static String getPartFileName(JobConf job) throws IOException {
        FileSystem fs = FileSystem.get(job);
        Path thisPath = new Path(job.get("map.input.file")).makeQualified(fs);
        return thisPath.toString();
    }

    public static boolean isPartFileWithHeader(JobConf job) throws IOException {
        FileSystem fs = FileSystem.get(job);
        String thisfile = getPartFileName(job);
        Path smallestFilePath = new Path(job.get(MRJobConfiguration.TF_SMALLEST_FILE)).makeQualified(fs);
        if (thisfile.toString().equals(smallestFilePath.toString()))
            return true;
        else
            return false;
    }

    public static JSONObject readSpec(FileSystem fs, String specFile) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(specFile))));
        JSONObject obj = JSONHelper.parse(br);
        br.close();
        return obj;
    }

    /**
     * Prepare NA strings so that they can be sent to workers via JobConf.
     * A "dummy" string is added at the end to handle the case of empty strings.
     * @param na
     * @return
     */
    public static String prepNAStrings(String na) {
        return na + DataExpression.DELIM_NA_STRING_SEP + "dummy";
    }

    public static String[] parseNAStrings(String na) {
        if (na == null)
            return null;

        String[] tmp = Pattern.compile(Pattern.quote(DataExpression.DELIM_NA_STRING_SEP)).split(na, -1);
        return tmp; //Arrays.copyOf(tmp, tmp.length-1);
    }

    public static String[] parseNAStrings(JobConf job) {
        return parseNAStrings(job.get(MRJobConfiguration.TF_NA_STRINGS));
    }

    private void createAgents(JSONObject spec) throws IOException, JSONException {
        _oa = new OmitAgent(spec);
        _mia = new MVImputeAgent(spec);
        _ra = new RecodeAgent(spec);
        _ba = new BinAgent(spec);
        _da = new DummycodeAgent(spec, _numInputCols);
    }

    public void setupAgents(OmitAgent oa, MVImputeAgent mia, RecodeAgent ra, BinAgent ba, DummycodeAgent da) {
        _oa = oa;
        _mia = mia;
        _ra = ra;
        _ba = ba;
        _da = da;
    }

    private void parseColumnNames() {
        _outputColumnNames = _delim.split(_headerLine, -1);
        for (int i = 0; i < _outputColumnNames.length; i++)
            _outputColumnNames[i] = UtilFunctions.unquote(_outputColumnNames[i]);
    }

    private void init(String headerLine, boolean hasHeader, String delim, String[] naStrings, JSONObject spec,
            long numCols, String offsetFile, String tmpPath, String outputPath) throws IOException, JSONException {
        _numRecordsInPartFile = 0;
        _numValidRecords = 0;
        _numTransformedRows = 0;
        _numTransformedColumns = 0;

        _headerLine = headerLine;
        _hasHeader = hasHeader;
        _delimString = delim;
        _delim = Pattern.compile(Pattern.quote(delim));
        _NAstrings = naStrings;
        _numInputCols = numCols;
        _offsetFile = offsetFile;
        _tmpDir = tmpPath;
        _outputPath = outputPath;

        parseColumnNames();
        createAgents(spec);
    }

    public TfUtils(JobConf job, boolean minimal) throws IOException, JSONException {
        if (!InfrastructureAnalyzer.isLocalMode(job)) {
            ConfigurationManager.setCachedJobConf(job);
        }

        _NAstrings = TfUtils.parseNAStrings(job);
        _specFile = job.get(MRJobConfiguration.TF_SPEC_FILE);

        FileSystem fs = FileSystem.get(job);
        JSONObject spec = TfUtils.readSpec(fs, _specFile);

        _oa = new OmitAgent(spec);
    }

    // called from GenTFMtdMapper, ApplyTf (Hadoop)
    public TfUtils(JobConf job) throws IOException, JSONException {
        if (!InfrastructureAnalyzer.isLocalMode(job)) {
            ConfigurationManager.setCachedJobConf(job);
        }

        boolean hasHeader = Boolean.parseBoolean(job.get(MRJobConfiguration.TF_HAS_HEADER));
        //Pattern delim = Pattern.compile(Pattern.quote(job.get(MRJobConfiguration.TF_DELIM)));
        String[] naStrings = TfUtils.parseNAStrings(job);
        long numCols = UtilFunctions.parseToLong(job.get(MRJobConfiguration.TF_NUM_COLS)); // #of columns in input data
        String specFile = job.get(MRJobConfiguration.TF_SPEC_FILE);
        String offsetFile = job.get(MRJobConfiguration.TF_OFFSETS_FILE);
        String tmpPath = job.get(MRJobConfiguration.TF_TMP_LOC);
        String outputPath = FileOutputFormat.getOutputPath(job).toString();

        FileSystem fs = FileSystem.get(job);
        JSONObject spec = TfUtils.readSpec(fs, specFile);

        init(job.get(MRJobConfiguration.TF_HEADER), hasHeader, job.get(MRJobConfiguration.TF_DELIM),
                naStrings, spec, numCols, offsetFile, tmpPath, outputPath);
    }

    // called from GenTfMtdReducer
    public TfUtils(JobConf job, String tfMtdDir) throws IOException, JSONException {
        this(job);
        _tfMtdDir = tfMtdDir;
    }

    // called from GenTFMtdReducer and ApplyTf (Spark)
    public TfUtils(String headerLine, boolean hasHeader, String delim, String[] naStrings, JSONObject spec,
            long ncol, String tfMtdDir, String offsetFile, String tmpPath) throws IOException, JSONException {
        init(headerLine, hasHeader, delim, naStrings, spec, ncol, offsetFile, tmpPath, null);
        _tfMtdDir = tfMtdDir;
    }

    public void incrValid() { _numValidRecords++; }
    public long getValid() { return _numValidRecords; }
    public long getTotal() { return _numRecordsInPartFile; }
    public long getNumTransformedRows() { return _numTransformedRows; }
    public long getNumTransformedColumns() { return _numTransformedColumns; }

    public String getHeader() { return _headerLine; }
    public boolean hasHeader() { return _hasHeader; }
    public String getDelimString() { return _delimString; }
    public Pattern getDelim() { return _delim; }
    public String[] getNAStrings() { return _NAstrings; }
    public long getNumCols() { return _numInputCols; }

    public String getSpecFile() { return _specFile; }
    public String getTfMtdDir() { return _tfMtdDir; }
    public String getOffsetFile() { return _offsetFile; }
    public String getTmpDir() { return _tmpDir; }
    public String getOutputPath() { return _outputPath; }

    public String getName(int colID) { return _outputColumnNames[colID - 1]; }

    public void setValid(long n) { _numValidRecords = n; }
    public void incrTotal() { _numRecordsInPartFile++; }
    public void setTotal(long n) { _numRecordsInPartFile = n; }

    public OmitAgent getOmitAgent() { return _oa; }
    public MVImputeAgent getMVImputeAgent() { return _mia; }
    public RecodeAgent getRecodeAgent() { return _ra; }
    public BinAgent getBinAgent() { return _ba; }
    public DummycodeAgent getDummycodeAgent() { return _da; }

    /**
     * Function that checks if the given string is one of NA strings.
     * @param w
     * @return
     */
    public boolean isNA(String w) {
        if (_NAstrings == null)
            return false;

        for (String na : _NAstrings) {
            if (w.equals(na))
                return true;
        }
        return false;
    }

    public String[] getWords(Text line) {
        return getWords(line.toString());
    }

    public String[] getWords(String line) {
        return getDelim().split(line.trim(), -1);
    }

    /**
     * Process a given row to construct transformation metadata.
     * @param line
     * @return
     * @throws IOException
     */
    public String[] prepareTfMtd(String line) throws IOException {
        String[] words = getWords(line);
        if (!getOmitAgent().omit(words, this)) {
            getMVImputeAgent().prepare(words, this);
            getRecodeAgent().prepare(words, this);
            getBinAgent().prepare(words, this);
            incrValid();
        }
        incrTotal();
        return words;
    }

    public void loadTfMetadata() throws IOException {
        JobConf job = ConfigurationManager.getCachedJobConf();
        loadTfMetadata(job, false);
    }

    public void loadTfMetadata(JobConf job, boolean fromLocalFS) throws IOException {
        Path tfMtdDir = null;
        FileSystem fs = null;

        if (fromLocalFS) {
            // metadata must be read from local file system (e.g., distributed cache in the case of Hadoop)
            tfMtdDir = (DistributedCache.getLocalCacheFiles(job))[0];
            fs = FileSystem.getLocal(job);
        } else {
            fs = FileSystem.get(job);
            tfMtdDir = new Path(getTfMtdDir());
        }

        // load transformation metadata
        getMVImputeAgent().loadTxMtd(job, fs, tfMtdDir, this);
        getRecodeAgent().loadTxMtd(job, fs, tfMtdDir, this);
        getBinAgent().loadTxMtd(job, fs, tfMtdDir, this);

        // associate recode maps and bin definitions with dummycoding agent,
        // as recoded and binned columns are typically dummycoded
        getDummycodeAgent().setRecodeMaps(getRecodeAgent().getRecodeMaps());
        getDummycodeAgent().setNumBins(getBinAgent().getBinList(), getBinAgent().getNumBins());
        getDummycodeAgent().loadTxMtd(job, fs, tfMtdDir, this);
    }

    /*public void loadTfMetadata() throws IOException {
        Path tfMtdDir = (DistributedCache.getLocalCacheFiles(_rJob))[0];
        FileSystem localFS = FileSystem.getLocal(_rJob);
        loadTfMetadata(_rJob, localFS, tfMtdDir);

        FileSystem fs;
        fs = FileSystem.get(_rJob);
        Path thisPath = new Path(_rJob.get("map.input.file")).makeQualified(fs);
        String thisfile = thisPath.toString();
        Path smallestFilePath = new Path(_rJob.get(MRJobConfiguration.TF_SMALLEST_FILE)).makeQualified(fs);
        if (thisfile.toString().equals(smallestFilePath.toString()))
            _partFileWithHeader = true;
        else
            _partFileWithHeader = false;
    }*/

    public String processHeaderLine() throws IOException {
        FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
        String dcdHeader = getDummycodeAgent().constructDummycodedHeader(getHeader(), getDelim());
        getDummycodeAgent().genDcdMapsAndColTypes(fs, getTmpDir(), (int) getNumCols(), this);

        // write header information (before and after transformation) to temporary path;
        // these files are copied into txMtdPath, once the ApplyTf job is complete.
        DataTransform.generateHeaderFiles(fs, getTmpDir(), getHeader(), dcdHeader);

        return dcdHeader;
        //_numTransformedColumns = getDelim().split(dcdHeader, -1).length;
        //return _numTransformedColumns;
    }

    public boolean omit(String[] words) {
        if (getOmitAgent() == null)
            return false;
        return getOmitAgent().omit(words, this);
    }

    public String[] apply(String[] words) {
        return apply(words, false);
    }

    /**
     * Function to apply transformation metadata on a given row.
     * @param words
     * @param optimizeMaps
     * @return
     */
    public String[] apply(String[] words, boolean optimizeMaps) {
        words = getMVImputeAgent().apply(words, this);
        if (optimizeMaps)
            // specific case of transform() invoked from CP (to save boxing and unboxing)
            words = getRecodeAgent().cp_apply(words, this);
        else
            words = getRecodeAgent().apply(words, this);
        words = getBinAgent().apply(words, this);
        words = getDummycodeAgent().apply(words, this);

        _numTransformedRows++;
        return words;
    }

    public void check(String[] words) throws DMLRuntimeException {
        boolean checkEmptyString = (getNAStrings() != null);
        if (checkEmptyString) {
            final String msg = "When na.strings are provided, empty string \"\" is considered as a missing value, and it must be imputed appropriately. Encountered an unhandled empty string in column ID: ";
            for (int i = 0; i < words.length; i++)
                if (words[i] != null && words[i].equals(""))
                    throw new DMLRuntimeException(msg + getDummycodeAgent().mapDcdColumnID(i + 1));
        }
    }

    public String checkAndPrepOutputString(String[] words) throws DMLRuntimeException {
        return checkAndPrepOutputString(words, new StringBuilder());
    }

    public String checkAndPrepOutputString(String[] words, StringBuilder sb) throws DMLRuntimeException {
        /*
         * Check if empty strings ("") have to be handled.
         *
         * Unless na.strings are provided, empty strings are (implicitly) considered as value zero.
         * When na.strings are provided, then "" is considered a missing value indicator, and the
         * user is expected to provide an appropriate imputation method. Therefore, when na.strings
         * are provided, "" encountered in any column (after all transformations are applied)
         * denotes an erroneous condition.
         */
        boolean checkEmptyString = (getNAStrings() != null); //&& !MVImputeAgent.isNA("", TransformationAgent.NAstrings) ) {

        //StringBuilder sb = new StringBuilder();
        sb.setLength(0);
        int i = 0;

        if (checkEmptyString) {
            final String msg = "When na.strings are provided, empty string \"\" is considered as a missing value, and it must be imputed appropriately. Encountered an unhandled empty string in column ID: ";
            if (words[0] != null)
                if (words[0].equals(""))
                    throw new DMLRuntimeException(msg + getDummycodeAgent().mapDcdColumnID(1));
                else
                    sb.append(words[0]);
            else
                sb.append("0");

            for (i = 1; i < words.length; i++) {
                sb.append(_delimString);

                if (words[i] != null)
                    if (words[i].equals(""))
                        throw new DMLRuntimeException(msg + getDummycodeAgent().mapDcdColumnID(i + 1));
                    else
                        sb.append(words[i]);
                else
                    sb.append("0");
            }
        } else {
            sb.append(words[0] != null ? words[0] : "0");
            for (i = 1; i < words.length; i++) {
                sb.append(_delimString);
                sb.append(words[i] != null ? words[i] : "0");
            }
        }

        return sb.toString();
    }

    private Reader initOffsetsReader(JobConf job) throws IOException {
        Path path = new Path(job.get(CSVReblockMR.ROWID_FILE_NAME));
        FileSystem fs = FileSystem.get(job);
        Path[] files = MatrixReader.getSequenceFilePaths(fs, path);
        if (files.length != 1)
            throw new IOException("Expecting a single file under counters file: " + path.toString());

        Reader reader = new SequenceFile.Reader(fs, files[0], job);
        return reader;
    }

    /**
     * Function to generate custom file names (transform-part-.....) for
     * mappers' output for ApplyTfCSV job. The idea is to find the index
     * of (thisfile, fileoffset) in the list of all offsets from the
     * counters/offsets file, which was generated from either GenTfMtdMR
     * or AssignRowIDMR job.
     */
    public String getPartFileID(JobConf job, long offset) throws IOException {
        Reader reader = initOffsetsReader(job);
        ByteWritable key = new ByteWritable();
        OffsetCount value = new OffsetCount();
        String thisFile = TfUtils.getPartFileName(job);

        int id = 0;
        while (reader.next(key, value)) {
            if (thisFile.equals(value.filename) && value.fileOffset == offset)
                break;
            id++;
        }
        reader.close();

        String sid = Integer.toString(id);
        char[] carr = new char[5 - sid.length()];
        Arrays.fill(carr, '0');
        String ret = (new String(carr)).concat(sid);

        return ret;
    }
}
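For context, here is a minimal driver-side usage sketch of the apply path. It only uses methods shown in the listing above (readSpec, prepNAStrings/parseNAStrings, the Spark-path constructor, loadTfMetadata, getWords, omit, apply, checkAndPrepOutputString); the spec file, header line, column count, and metadata/temp paths are hypothetical placeholders, and a real run assumes the transformation metadata has already been generated under the given tfMtdDir.

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapred.JobConf;
import org.apache.wink.json4j.JSONObject;

import com.ibm.bi.dml.runtime.transform.TfUtils;

public class TfUtilsExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();

        // Hypothetical spec path and NA string; a real spec is the transform JSON used by ApplyTf.
        JSONObject spec = TfUtils.readSpec(FileSystem.get(job), "/tmp/tf.spec.json");
        String[] naStrings = TfUtils.parseNAStrings(TfUtils.prepNAStrings("NA"));

        // Constructor used by GenTFMtdReducer and ApplyTf (Spark) in the listing above;
        // header, delimiter, column count, and directories are made-up examples.
        TfUtils tf = new TfUtils("id,age,zipcode", true, ",", naStrings, spec, 3,
                "/tmp/tfMtd", "/tmp/offsets", "/tmp/tfTmp");

        tf.loadTfMetadata(job, false);             // read recode maps, bin boundaries, impute values from tfMtdDir
        String[] cols = tf.getWords("7,NA,95120"); // split one CSV row on the configured delimiter
        if (!tf.omit(cols)) {                      // skip rows flagged by the omit specification
            // impute, recode, bin, and dummycode the row, then re-join it with the delimiter
            System.out.println(tf.checkAndPrepOutputString(tf.apply(cols)));
        }
    }
}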