List of usage examples for org.apache.hadoop.fs FileSystem open
public FSDataInputStream open(Path f) throws IOException
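Before the project-specific examples below, here is a minimal, self-contained sketch of the basic pattern they all share: build a Configuration, obtain a FileSystem, call open(Path) to get an FSDataInputStream, and wrap it in a reader. The path /tmp/example.txt is a placeholder chosen for illustration, not something taken from the examples.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemOpenExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();   // picks up core-site.xml / hdfs-site.xml if on the classpath
        Path input = new Path("/tmp/example.txt");  // placeholder path
        FileSystem fs = FileSystem.get(conf);
        // open() returns an FSDataInputStream; wrap it for line-oriented reading
        try (FSDataInputStream in = fs.open(input);
             BufferedReader br = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}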
From source file:com.ibm.bi.dml.runtime.io.WriterTextCSV.java
License:Open Source License
/**
 * Merges part files (if any), prepends a generated header line, and writes the result to the destination path.
 *
 * @param srcFileName  source file or directory on HDFS
 * @param destFileName destination file on HDFS
 * @param rlen         number of rows
 * @param clen         number of columns
 * @throws IOException
 */
@SuppressWarnings("unchecked")
public void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen) throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path destFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (!_props.hasHeader()) {
        // simply move srcFile to destFile
        /*
         * TODO: Remove this roundabout way!
         * For example: destFilePath = /user/biadmin/csv/temp/out/file.csv
         * & the only path that exists already on HDFS is /user/biadmin/csv/.
         * In this case, the directory structure /user/biadmin/csv/temp/out must be created.
         * A simple hdfs.rename() does not seem to create this directory structure.
         */

        // delete the destination file, if it exists already
        //boolean ret1 = hdfs.delete(destFilePath, true);

        // Create /user/biadmin/csv/temp/out/file.csv so that ..../temp/out/ is created.
        //boolean ret2 = hdfs.createNewFile(destFilePath);

        // delete the file "file.csv" but preserve the directory structure /user/biadmin/csv/temp/out/
        //boolean ret3 = hdfs.delete(destFilePath, true);

        // finally, move the data to destFilePath = /user/biadmin/csv/temp/out/file.csv
        //boolean ret4 = hdfs.rename(srcFilePath, destFilePath);

        //System.out.println("Return values = del:" + ret1 + ", createNew:" + ret2 + ", del:" + ret3 + ", rename:" + ret4);

        return;
    }

    // construct the header line
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
        sb.append("C" + (i + 1));
        if (i < clen - 1)
            sb.append(_props.getDelim());
    }
    sb.append('\n');

    if (hdfs.isDirectory(srcFilePath)) {
        // compute sorted order among part files
        ArrayList<Path> files = new ArrayList<Path>();
        for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);

        // first part file path
        Path firstpart = files.get(0);

        // create a temp file, and add header and contents of first part
        Path tmp = new Path(firstpart.toString() + ".tmp");
        OutputStream out = hdfs.create(tmp, true);
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy rest of the data from firstpart
        InputStream in = null;
        try {
            in = hdfs.open(firstpart);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }

        // rename tmp to firstpart
        hdfs.delete(firstpart, true);
        hdfs.rename(tmp, firstpart);

        // rename srcfile to destFile
        hdfs.delete(destFilePath, true);
        hdfs.createNewFile(destFilePath);       // force the creation of directory structure
        hdfs.delete(destFilePath, true);        // delete the file, but preserve the directory structure
        hdfs.rename(srcFilePath, destFilePath); // move the data
    } else if (hdfs.isFile(srcFilePath)) {
        // create destination file
        OutputStream out = hdfs.create(destFilePath, true);

        // write header
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy the data from srcFile
        InputStream in = null;
        try {
            in = hdfs.open(srcFilePath);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
}
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfHelper.java
License:Open Source License
public JSONObject parseSpec() throws IOException {
    FileSystem fs = FileSystem.get(_rJob);
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(_specFile))));
    JSONObject obj = JSONHelper.parse(br);
    br.close();
    return obj;
}
From source file:com.ibm.bi.dml.runtime.transform.BinAgent.java
License:Open Source License
/**
 * Method to load transform metadata for all attributes
 *
 * @param job
 * @throws IOException
 */
@Override
public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
    if (_binList == null)
        return;

    if (fs.isDirectory(txMtdDir)) {
        for (int i = 0; i < _binList.length; i++) {
            int colID = _binList[i];

            Path path = new Path(txMtdDir + "/Bin/" + agents.getName(colID) + BIN_FILE_SUFFIX);
            TfUtils.checkValidInputFile(fs, path, true);

            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
            // format: colID,min,max,binwidth,nbins
            String[] fields = br.readLine().split(TXMTD_SEP);
            double min = UtilFunctions.parseToDouble(fields[1]);
            //double max = UtilFunctions.parseToDouble(fields[2]);
            double binwidth = UtilFunctions.parseToDouble(fields[3]);
            int nbins = UtilFunctions.parseToInt(fields[4]);

            _numBins[i] = nbins;
            _min[i] = min;
            _binWidths[i] = binwidth; // (max-min)/nbins;

            br.close();
        }
    } else {
        fs.close();
        throw new RuntimeException("Path to recode maps must be a directory: " + txMtdDir);
    }
}
From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java
License:Open Source License
/**
 * Method to read the header line from the input data file.
 *
 * @param fs
 * @param prop
 * @param smallestFile
 * @return
 * @throws IOException
 */
private static String readHeaderLine(FileSystem fs, CSVFileFormatProperties prop, String smallestFile)
        throws IOException {
    String line = null;

    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(smallestFile))));
    line = br.readLine();
    br.close();

    if (!prop.hasHeader()) {
        // construct header with default column names, V1, V2, etc.
        int ncol = Pattern.compile(Pattern.quote(prop.getDelim())).split(line, -1).length;
        line = null;

        StringBuilder sb = new StringBuilder();
        sb.append("V1");
        for (int i = 2; i <= ncol; i++)
            sb.append(prop.getDelim() + "V" + i);
        line = sb.toString();
    }

    return line;
}
From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java
License:Open Source License
/**
 * Convert input transformation specification file with column names into a
 * specification with corresponding column IDs. This file is sent to all the
 * relevant MR jobs.
 *
 * @param fs
 * @param inputPath
 * @param smallestFile
 * @param colNames
 * @param prop
 * @param specFileWithNames
 * @return
 * @throws IllegalArgumentException
 * @throws IOException
 * @throws JSONException
 */
private static String processSpecFile(FileSystem fs, String inputPath, String smallestFile,
        HashMap<String, Integer> colNames, CSVFileFormatProperties prop, String specFileWithNames)
        throws IllegalArgumentException, IOException, JSONException {
    // load input spec file with Names
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(specFileWithNames))));
    JSONObject inputSpec = JSONHelper.parse(br);
    br.close();

    final String NAME = "name";
    final String ID = "id";
    final String METHOD = "method";
    final String VALUE = "value";
    final String MV_METHOD_MEAN = "global_mean";
    final String MV_METHOD_MODE = "global_mode";
    final String MV_METHOD_CONSTANT = "constant";
    final String BIN_METHOD_WIDTH = "equi-width";
    final String BIN_METHOD_HEIGHT = "equi-height";
    final String SCALE_METHOD_Z = "z-score";
    final String SCALE_METHOD_M = "mean-subtraction";
    final String JSON_BYPOS = "ids";

    String stmp = null;
    JSONObject entry = null;
    byte btmp = 0;

    final int[] mvList;
    int[] rcdList, dcdList, omitList;
    final int[] binList;
    final int[] scaleList;
    byte[] mvMethods = null, binMethods = null, scaleMethods = null;
    Object[] numBins = null;
    Object[] mvConstants = null;

    boolean byPositions = (inputSpec.containsKey(JSON_BYPOS)
            && ((Boolean) inputSpec.get(JSON_BYPOS)).booleanValue() == true);

    // --------------------------------------------------------------------------
    // Omit
    if (inputSpec.containsKey(TX_METHOD.OMIT.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.OMIT.toString());
        omitList = new int[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            if (byPositions)
                omitList[i] = UtilFunctions.toInt(arrtmp.get(i));
            else {
                stmp = UtilFunctions.unquote((String) arrtmp.get(i));
                omitList[i] = colNames.get(stmp);
            }
        }
        Arrays.sort(omitList);
    } else
        omitList = null;

    // --------------------------------------------------------------------------
    // Missing value imputation
    if (inputSpec.containsKey(TX_METHOD.IMPUTE.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.IMPUTE.toString());

        mvList = new int[arrtmp.size()];
        mvMethods = new byte[arrtmp.size()];
        mvConstants = new Object[arrtmp.size()];

        for (int i = 0; i < arrtmp.size(); i++) {
            entry = (JSONObject) arrtmp.get(i);

            if (byPositions) {
                mvList[i] = UtilFunctions.toInt(entry.get(ID));
            } else {
                stmp = UtilFunctions.unquote((String) entry.get(NAME));
                mvList[i] = colNames.get(stmp);
            }

            stmp = UtilFunctions.unquote((String) entry.get(METHOD));
            if (stmp.equals(MV_METHOD_MEAN))
                btmp = (byte) 1;
            else if (stmp.equals(MV_METHOD_MODE))
                btmp = (byte) 2;
            else if (stmp.equals(MV_METHOD_CONSTANT))
                btmp = (byte) 3;
            else
                throw new IOException("Unknown missing value imputation method (" + stmp
                        + ") in transformation specification file: " + specFileWithNames);
            mvMethods[i] = btmp; //txMethods.add( btmp );

            mvConstants[i] = null;
            if (entry.containsKey(VALUE))
                mvConstants[i] = entry.get(VALUE);
        }

        Integer[] idx = new Integer[mvList.length];
        for (int i = 0; i < mvList.length; i++)
            idx[i] = i;
        Arrays.sort(idx, new Comparator<Integer>() {
            @Override
            public int compare(Integer o1, Integer o2) {
                return (mvList[o1] - mvList[o2]);
            }
        });

        // rearrange mvList, mvMethods, and mvConstants according to permutation idx
        inplacePermute(mvList, mvMethods, mvConstants, idx);
    } else
        mvList = null;

    // --------------------------------------------------------------------------
    // Recoding
    if (inputSpec.containsKey(TX_METHOD.RECODE.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.RECODE.toString());
        rcdList = new int[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            if (byPositions)
                rcdList[i] = UtilFunctions.toInt(arrtmp.get(i));
            else {
                stmp = UtilFunctions.unquote((String) arrtmp.get(i));
                rcdList[i] = colNames.get(stmp);
            }
        }
        Arrays.sort(rcdList);
    } else
        rcdList = null;

    // --------------------------------------------------------------------------
    // Binning
    if (inputSpec.containsKey(TX_METHOD.BIN.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.BIN.toString());

        binList = new int[arrtmp.size()];
        binMethods = new byte[arrtmp.size()];
        numBins = new Object[arrtmp.size()];

        for (int i = 0; i < arrtmp.size(); i++) {
            entry = (JSONObject) arrtmp.get(i);

            if (byPositions) {
                binList[i] = UtilFunctions.toInt(entry.get(ID));
            } else {
                stmp = UtilFunctions.unquote((String) entry.get(NAME));
                binList[i] = colNames.get(stmp);
            }

            stmp = UtilFunctions.unquote((String) entry.get(METHOD));
            if (stmp.equals(BIN_METHOD_WIDTH))
                btmp = (byte) 1;
            else if (stmp.equals(BIN_METHOD_HEIGHT))
                throw new IOException("Equi-height binning method is not yet supported, in transformation specification file: "
                        + specFileWithNames);
            else
                throw new IOException("Unknown binning method (" + stmp
                        + ") in transformation specification file: " + specFileWithNames);
            binMethods[i] = btmp;

            numBins[i] = entry.get(TransformationAgent.JSON_NBINS);
            if (((Integer) numBins[i]).intValue() <= 1)
                throw new IllegalArgumentException("Invalid transformation on column \""
                        + (String) entry.get(NAME) + "\". Number of bins must be greater than 1.");
        }

        Integer[] idx = new Integer[binList.length];
        for (int i = 0; i < binList.length; i++)
            idx[i] = i;
        Arrays.sort(idx, new Comparator<Integer>() {
            @Override
            public int compare(Integer o1, Integer o2) {
                return (binList[o1] - binList[o2]);
            }
        });

        // rearrange binList and binMethods according to permutation idx
        inplacePermute(binList, binMethods, numBins, idx);
    } else
        binList = null;

    // --------------------------------------------------------------------------
    // Dummycoding
    if (inputSpec.containsKey(TX_METHOD.DUMMYCODE.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.DUMMYCODE.toString());
        dcdList = new int[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            if (byPositions)
                dcdList[i] = UtilFunctions.toInt(arrtmp.get(i));
            else {
                stmp = UtilFunctions.unquote((String) arrtmp.get(i));
                dcdList[i] = colNames.get(stmp);
            }
        }
        Arrays.sort(dcdList);
    } else
        dcdList = null;

    // --------------------------------------------------------------------------
    // Scaling
    if (inputSpec.containsKey(TX_METHOD.SCALE.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.SCALE.toString());

        scaleList = new int[arrtmp.size()];
        scaleMethods = new byte[arrtmp.size()];

        for (int i = 0; i < arrtmp.size(); i++) {
            entry = (JSONObject) arrtmp.get(i);

            if (byPositions) {
                scaleList[i] = UtilFunctions.toInt(entry.get(ID));
            } else {
                stmp = UtilFunctions.unquote((String) entry.get(NAME));
                scaleList[i] = colNames.get(stmp);
            }

            stmp = UtilFunctions.unquote((String) entry.get(METHOD));
            if (stmp.equals(SCALE_METHOD_M))
                btmp = (byte) 1;
            else if (stmp.equals(SCALE_METHOD_Z))
                btmp = (byte) 2;
            else
                throw new IOException("Unknown scaling method (" + stmp
                        + ") in transformation specification file: " + specFileWithNames);
            scaleMethods[i] = btmp;
        }

        Integer[] idx = new Integer[scaleList.length];
        for (int i = 0; i < scaleList.length; i++)
            idx[i] = i;
        Arrays.sort(idx, new Comparator<Integer>() {
            @Override
            public int compare(Integer o1, Integer o2) {
                return (scaleList[o1] - scaleList[o2]);
            }
        });

        // rearrange scaleList and scaleMethods according to permutation idx
        inplacePermute(scaleList, scaleMethods, null, idx);
    } else
        scaleList = null;

    // --------------------------------------------------------------------------
    // check for column IDs that are imputed with mode, but not recoded
    // These columns have to be handled separately, because the computation of mode
    // requires the computation of distinct values (i.e., recode maps)
    ArrayList<Integer> tmpList = new ArrayList<Integer>();
    if (mvList != null)
        for (int i = 0; i < mvList.length; i++) {
            int colID = mvList[i];
            if (mvMethods[i] == 2 && (rcdList == null || Arrays.binarySearch(rcdList, colID) < 0))
                tmpList.add(colID);
        }

    int[] mvrcdList = null;
    if (tmpList.size() > 0) {
        mvrcdList = new int[tmpList.size()];
        for (int i = 0; i < tmpList.size(); i++)
            mvrcdList[i] = tmpList.get(i);
    }

    // Perform Validity Checks
    /*
     *      OMIT MVI RCD BIN DCD SCL
     * OMIT  -    x   *   *   *   *
     * MVI   x    -   *   *   *   *
     * RCD   *    *   -   x   *   x
     * BIN   *    *   x   -   *   x
     * DCD   *    *   *   *   -   x
     * SCL   *    *   x   x   x   -
     */
    if (mvList != null)
        for (int i = 0; i < mvList.length; i++) {
            int colID = mvList[i];

            if (omitList != null && Arrays.binarySearch(omitList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                        + ". A column can not be both omitted and imputed.");

            if (mvMethods[i] == 1) {
                if (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0)
                    throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                            + ". A numeric column can not be recoded.");

                if (dcdList != null && Arrays.binarySearch(dcdList, colID) >= 0)
                    // throw an error only if the column is not binned
                    if (binList == null || Arrays.binarySearch(binList, colID) < 0)
                        throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                                + ". A numeric column can not be dummycoded.");
            }
        }

    if (scaleList != null)
        for (int i = 0; i < scaleList.length; i++) {
            int colID = scaleList[i];
            if (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                        + ". A column can not be recoded and scaled.");
            if (binList != null && Arrays.binarySearch(binList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                        + ". A column can not be binned and scaled.");
            if (dcdList != null && Arrays.binarySearch(dcdList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                        + ". A column can not be dummycoded and scaled.");
        }

    if (rcdList != null)
        for (int i = 0; i < rcdList.length; i++) {
            int colID = rcdList[i];
            if (binList != null && Arrays.binarySearch(binList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                        + ". A column can not be recoded and binned.");
        }

    // Check if dummycoded columns are either recoded or binned.
    // If not, add them to recode list.
    ArrayList<Integer> addToRcd = new ArrayList<Integer>();
    if (dcdList != null)
        for (int i = 0; i < dcdList.length; i++) {
            int colID = dcdList[i];
            boolean isRecoded = (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0);
            boolean isBinned = (binList != null && Arrays.binarySearch(binList, colID) >= 0);
            // If colID is neither recoded nor binned, then add it to rcdList.
            if (!isRecoded && !isBinned)
                addToRcd.add(colID);
        }
    if (addToRcd.size() > 0) {
        int[] newRcdList = null;
        if (rcdList != null)
            newRcdList = Arrays.copyOf(rcdList, rcdList.length + addToRcd.size());
        else
            newRcdList = new int[addToRcd.size()];
        int i = (rcdList != null ? rcdList.length : 0);
        for (int idx = 0; i < newRcdList.length; i++, idx++)
            newRcdList[i] = addToRcd.get(idx);
        Arrays.sort(newRcdList);
        rcdList = newRcdList;
    }

    // -----------------------------------------------------------------------------
    // Prepare output spec
    JSONObject outputSpec = new JSONObject();

    if (omitList != null) {
        JSONObject rcdSpec = new JSONObject();
        rcdSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(omitList));
        outputSpec.put(TX_METHOD.OMIT.toString(), rcdSpec);
    }

    if (mvList != null) {
        JSONObject mvSpec = new JSONObject();
        mvSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(mvList));
        mvSpec.put(TransformationAgent.JSON_MTHD, toJSONArray(mvMethods));
        mvSpec.put(TransformationAgent.JSON_CONSTS, toJSONArray(mvConstants));
        outputSpec.put(TX_METHOD.IMPUTE.toString(), mvSpec);
    }

    if (rcdList != null) {
        JSONObject rcdSpec = new JSONObject();
        rcdSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(rcdList));
        outputSpec.put(TX_METHOD.RECODE.toString(), rcdSpec);
    }

    if (binList != null) {
        JSONObject binSpec = new JSONObject();
        binSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(binList));
        binSpec.put(TransformationAgent.JSON_MTHD, toJSONArray(binMethods));
        binSpec.put(TransformationAgent.JSON_NBINS, toJSONArray(numBins));
        outputSpec.put(TX_METHOD.BIN.toString(), binSpec);
    }

    if (dcdList != null) {
        JSONObject dcdSpec = new JSONObject();
        dcdSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(dcdList));
        outputSpec.put(TX_METHOD.DUMMYCODE.toString(), dcdSpec);
    }

    if (scaleList != null) {
        JSONObject scaleSpec = new JSONObject();
        scaleSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(scaleList));
        scaleSpec.put(TransformationAgent.JSON_MTHD, toJSONArray(scaleMethods));
        outputSpec.put(TX_METHOD.SCALE.toString(), scaleSpec);
    }

    if (mvrcdList != null) {
        JSONObject mvrcd = new JSONObject();
        mvrcd.put(TransformationAgent.JSON_ATTRS, toJSONArray(mvrcdList));
        outputSpec.put(TX_METHOD.MVRCD.toString(), mvrcd);
    }

    // write out the spec with IDs
    String specFileWithIDs = MRJobConfiguration.constructTempOutputFilename();
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(specFileWithIDs), true)));
    out.write(outputSpec.toString());
    out.close();

    return specFileWithIDs;
}
From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java
License:Open Source License
/**
 * Helper function to determine the number of columns after applying
 * transformations. Note that dummycoding changes the number of columns.
 *
 * @param fs
 * @param header
 * @param delim
 * @param tfMtdPath
 * @return
 * @throws IllegalArgumentException
 * @throws IOException
 * @throws DMLRuntimeException
 * @throws JSONException
 */
private static int getNumColumnsTf(FileSystem fs, String header, String delim, String tfMtdPath)
        throws IllegalArgumentException, IOException, DMLRuntimeException, JSONException {
    String[] columnNames = Pattern.compile(Pattern.quote(delim)).split(header, -1);
    int ret = columnNames.length;

    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(tfMtdPath + "/spec.json"))));
    JSONObject spec = JSONHelper.parse(br);
    br.close();

    // fetch relevant attribute lists
    if (!spec.containsKey(TX_METHOD.DUMMYCODE.toString()))
        return ret;

    JSONArray dcdList = (JSONArray) ((JSONObject) spec.get(TX_METHOD.DUMMYCODE.toString()))
            .get(TransformationAgent.JSON_ATTRS);

    // look for numBins among binned columns
    for (Object o : dcdList) {
        int id = UtilFunctions.toInt(o);

        Path binpath = new Path(tfMtdPath + "/Bin/" + UtilFunctions.unquote(columnNames[id - 1])
                + TransformationAgent.BIN_FILE_SUFFIX);
        Path rcdpath = new Path(tfMtdPath + "/Recode/" + UtilFunctions.unquote(columnNames[id - 1])
                + TransformationAgent.NDISTINCT_FILE_SUFFIX);

        if (TfUtils.checkValidInputFile(fs, binpath, false)) {
            br = new BufferedReader(new InputStreamReader(fs.open(binpath)));
            int nbins = UtilFunctions.parseToInt(br.readLine().split(TransformationAgent.TXMTD_SEP)[4]);
            br.close();
            ret += (nbins - 1);
        } else if (TfUtils.checkValidInputFile(fs, rcdpath, false)) {
            br = new BufferedReader(new InputStreamReader(fs.open(rcdpath)));
            int ndistinct = UtilFunctions.parseToInt(br.readLine());
            br.close();
            ret += (ndistinct - 1);
        } else
            throw new DMLRuntimeException("Relevant transformation metadata for column (id=" + id + ", name="
                    + columnNames[id - 1] + ") is not found.");
    }
    //System.out.println("Number of columns in transformed data: " + ret);

    return ret;
}
From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java
License:Open Source License
private static String getOutputHeader(FileSystem fs, String headerLine, TransformOperands oprnds)
        throws IOException {
    String ret = null;

    if (oprnds.isApply) {
        BufferedReader br = new BufferedReader(new InputStreamReader(
                fs.open(new Path(oprnds.applyTxPath + "/" + TransformationAgent.OUT_HEADER))));
        ret = br.readLine();
        br.close();
    } else {
        if (oprnds.outNamesFile == null)
            ret = headerLine;
        else {
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(fs.open(new Path(oprnds.outNamesFile))));
            ret = br.readLine();
            br.close();
        }
    }

    return ret;
}
From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java
License:Open Source License
private static int[] countNumRows(ArrayList<Path> files, CSVFileFormatProperties prop, FileSystem fs,
        TfUtils agents) throws IOException {
    int[] rows = new int[2];
    int numRows = 0, numRowsTf = 0;

    OmitAgent oa = agents.getOmitAgent();

    if (!oa.isApplicable()) {
        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
            if (fileNo == 0 && prop.hasHeader())
                br.readLine(); //ignore header

            while (br.readLine() != null)
                numRows++;
            br.close();
        }
        numRowsTf = numRows;
    } else {
        String line = null;
        String[] words;

        Pattern delim = Pattern.compile(Pattern.quote(prop.getDelim()));

        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
            if (fileNo == 0 && prop.hasHeader())
                br.readLine(); //ignore header

            while ((line = br.readLine()) != null) {
                numRows++;

                words = delim.split(line, -1);
                if (!oa.omit(words, agents))
                    numRowsTf++;
            }
            br.close();
        }
    }

    rows[0] = numRows;
    rows[1] = numRowsTf;

    return rows;
}
From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java
License:Open Source License
/**
 * Main method to create and/or apply transformation metadata in-memory, on a single node.
 *
 * @param job
 * @param fs
 * @param inputPath
 * @param ncols
 * @param prop
 * @param specFileWithIDs
 * @param tfMtdPath
 * @param isApply
 * @param result
 * @param headerLine
 * @param isBB
 * @param isCSV
 * @throws IOException
 * @throws DMLRuntimeException
 * @throws JSONException
 * @throws IllegalArgumentException
 */
private static JobReturn performTransform(JobConf job, FileSystem fs, String inputPath, int ncols,
        CSVFileFormatProperties prop, String specFileWithIDs, String tfMtdPath, boolean isApply,
        MatrixObject result, String headerLine, boolean isBB, boolean isCSV)
        throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException {
    String[] na = TfUtils.parseNAStrings(prop.getNAStrings());

    JSONObject spec = TfUtils.readSpec(fs, specFileWithIDs);
    TfUtils agents = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(), na, spec, ncols, tfMtdPath,
            null, null);

    MVImputeAgent _mia = agents.getMVImputeAgent();
    RecodeAgent _ra = agents.getRecodeAgent();
    BinAgent _ba = agents.getBinAgent();
    DummycodeAgent _da = agents.getDummycodeAgent();

    // List of files to read
    ArrayList<Path> files = collectInputFiles(inputPath, fs);

    // ---------------------------------
    // Construct transformation metadata
    // ---------------------------------

    String line = null;
    String[] words = null;

    int numColumnsTf = 0;
    BufferedReader br = null;

    if (!isApply) {
        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
            if (fileNo == 0 && prop.hasHeader())
                br.readLine(); //ignore header

            line = null;
            while ((line = br.readLine()) != null) {
                agents.prepareTfMtd(line);
            }
            br.close();
        }

        if (agents.getValid() == 0)
            throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);

        _mia.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ba.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ra.outputTransformationMetadata(tfMtdPath, fs, agents);

        // prepare agents for the subsequent phase of applying transformation metadata

        // no need to loadTxMtd for _ra, since the maps are already present in memory
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);

        _da.setRecodeMapsCP(_ra.getCPRecodeMaps());
        _da.setNumBins(_ba.getBinList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    } else {
        // Count the number of rows
        int rows[] = countNumRows(files, prop, fs, agents);
        agents.setTotal(rows[0]);
        agents.setValid(rows[1]);

        if (agents.getValid() == 0)
            throw new DMLRuntimeException(
                    "Number of rows in the transformed output (potentially, after omitting the ones with missing values) is zero. Cannot proceed.");

        // Load transformation metadata
        // prepare agents for the subsequent phase of applying transformation metadata
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ra.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);

        _da.setRecodeMaps(_ra.getRecodeMaps());
        _da.setNumBins(_ba.getBinList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    }

    // -----------------------------
    // Apply transformation metadata
    // -----------------------------

    numColumnsTf = getNumColumnsTf(fs, headerLine, prop.getDelim(), tfMtdPath);

    MapReduceTool.deleteFileIfExistOnHDFS(result.getFileName());
    BufferedWriter out = new BufferedWriter(
            new OutputStreamWriter(fs.create(new Path(result.getFileName()), true)));

    StringBuilder sb = new StringBuilder();

    MatrixBlock mb = null;
    if (isBB) {
        int estNNZ = (int) agents.getValid() * ncols;
        mb = new MatrixBlock((int) agents.getValid(), numColumnsTf, estNNZ);

        if (mb.isInSparseFormat())
            mb.allocateSparseRowsBlock();
        else
            mb.allocateDenseBlock();
    }

    int rowID = 0; // rowid to be used in filling the matrix block

    for (int fileNo = 0; fileNo < files.size(); fileNo++) {
        br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
        if (fileNo == 0) {
            String header = null;
            if (prop.hasHeader())
                br.readLine(); // ignore the header line from data file

            header = headerLine;
            String dcdHeader = _da.constructDummycodedHeader(header, agents.getDelim());
            numColumnsTf = _da.genDcdMapsAndColTypes(fs, tfMtdPath, ncols, agents);
            DataTransform.generateHeaderFiles(fs, tfMtdPath, header, dcdHeader);
        }

        line = null;
        while ((line = br.readLine()) != null) {
            words = agents.getWords(line);

            if (!agents.omit(words)) {
                words = agents.apply(words, !isApply);

                if (isCSV) {
                    out.write(agents.checkAndPrepOutputString(words, sb));
                    out.write("\n");
                }

                if (isBB) {
                    agents.check(words);
                    for (int c = 0; c < words.length; c++) {
                        if (words[c] != null && !words[c].isEmpty())
                            mb.appendValue(rowID, c, UtilFunctions.parseToDouble(words[c]));
                    }
                }
                rowID++;
            }
        }
        br.close();
    }
    out.close();

    if (mb != null) {
        mb.recomputeNonZeros();
        mb.examSparsity();

        result.acquireModify(mb);
        result.release();
        result.exportData();
    }

    MatrixCharacteristics mc = new MatrixCharacteristics(agents.getValid(), numColumnsTf,
            (int) result.getNumRowsPerBlock(), (int) result.getNumColumnsPerBlock());
    JobReturn ret = new JobReturn(new MatrixCharacteristics[] { mc }, true);

    return ret;
}
From source file:com.ibm.bi.dml.runtime.transform.MVImputeAgent.java
License:Open Source License
private String readReplacement(int colID, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
    Path path = new Path(txMtdDir + "/Impute/" + agents.getName(colID) + MV_FILE_SUFFIX);
    TfUtils.checkValidInputFile(fs, path, true);

    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    String replacement = UtilFunctions.unquote(line.split(TXMTD_SEP)[1]);
    br.close();

    return replacement;
}