Java tutorial: the SystemML MapReduceTool HDFS utility class
/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ibm.bi.dml.runtime.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;

import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.parser.DataExpression;
import com.ibm.bi.dml.parser.Expression.ValueType;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.io.MatrixReader;
import com.ibm.bi.dml.runtime.io.MatrixReaderFactory;
import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics;
import com.ibm.bi.dml.runtime.matrix.data.CSVFileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.FileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.InputInfo;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.NumItemsByEachReducerMetaData;
import com.ibm.bi.dml.runtime.matrix.data.OutputInfo;
import com.ibm.bi.dml.runtime.matrix.sort.ReadWithZeros;

public class MapReduceTool {

    private static final Log LOG = LogFactory.getLog(MapReduceTool.class.getName());

    private static JobConf _rJob = null; //cached job conf for read-only operations

    static {
        _rJob = ConfigurationManager.getCachedJobConf();
    }

    public static String getUniqueKeyPerTask(JobConf job, boolean inMapper) {
        //TODO: investigate ID pattern, required for parallel jobs
        /*String nodePrefix = job.get("mapred.task.id");
        return String.valueOf(IDHandler.extractLongID(nodePrefix));*/

        String nodePrefix = job.get("mapred.task.id");
        int i;
        if (inMapper)
            i = nodePrefix.indexOf("_m_");
        else
            i = nodePrefix.indexOf("_r_");
        int j = nodePrefix.lastIndexOf("_");
        nodePrefix = nodePrefix.substring(i + 3, j);
        // remove all the leading 0s
        return String.valueOf(Long.parseLong(nodePrefix));
    }

    @Deprecated
    public static String getUniqueKeyPerTaskWithLeadingZros(JobConf job, boolean inMapper) {
        String nodePrefix = job.get("mapred.task.id");
        int i;
        if (inMapper)
            i = nodePrefix.indexOf("_m_");
        else
            i = nodePrefix.indexOf("_r_");
        int j = nodePrefix.lastIndexOf("_");
        nodePrefix = nodePrefix.substring(i + 3, j);
        return nodePrefix;
    }

    public static int getUniqueTaskId(JobConf job) {
        //TODO: investigate ID pattern, required for parallel jobs
        /*String nodePrefix = job.get("mapred.task.id");
        return IDHandler.extractIntID(nodePrefix);*/

        String nodePrefix = job.get("mapred.task.id");
        int j = nodePrefix.lastIndexOf("_");
        int i =
nodePrefix.lastIndexOf("_", j - 1); nodePrefix = nodePrefix.substring(i + 1, j); // System.out.println("nodePrefix = " + nodePrefix) ; return Integer.valueOf(nodePrefix); } public static String getGloballyUniqueName(JobConf job) { return job.get("mapred.task.id"); } public static boolean existsFileOnHDFS(String fname) { boolean ret = true; try { Path outpath = new Path(fname); ret = FileSystem.get(_rJob).exists(outpath); } catch (Exception ex) { LOG.error("Exception caught in existsFileOnHDFS", ex); ret = false; } return ret; } public static void deleteFileIfExistOnHDFS(Path outpath, JobConf job) throws IOException { if (FileSystem.get(job).exists(outpath)) { FileSystem.get(job).delete(outpath, true); } } public static void deleteFileIfExistOnLFS(Path outpath, JobConf job) throws IOException { if (FileSystem.getLocal(job).exists(outpath)) { FileSystem.getLocal(job).delete(outpath, true); } } public static void deleteFileIfExistOnHDFS(String dir) throws IOException { Path outpath = new Path(dir); FileSystem fs = FileSystem.get(_rJob); if (fs.exists(outpath)) { //System.err.println("Deleting " + outpath + " ... "); fs.delete(outpath, true); } } public static boolean isHDFSDirectory(String dir) throws IOException { FileSystem fs = FileSystem.get(_rJob); Path pth = new Path(dir); FileStatus fstat = fs.getFileStatus(pth); return fstat.isDirectory(); } public static boolean isHDFSFileEmpty(String dir) throws IOException { FileSystem fs = FileSystem.get(_rJob); return isFileEmpty(fs, dir); } public static boolean isFileEmpty(FileSystem fs, String dir) throws IOException { Path pth = new Path(dir); FileStatus fstat = fs.getFileStatus(pth); if (fstat.isDirectory()) { // it is a directory FileStatus[] stats = fs.listStatus(pth); if (stats != null) { for (FileStatus stat : stats) { if (stat.getLen() > 0) return false; } return true; } else { return true; } } else { // it is a regular file if (fstat.getLen() == 0) return true; else return false; } } public static void renameFileOnHDFS(String originalDir, String newDir) throws IOException { Path originalpath = new Path(originalDir); deleteFileIfExistOnHDFS(newDir); Path newpath = new Path(newDir); FileSystem fs = FileSystem.get(_rJob); if (fs.exists(originalpath)) { fs.rename(originalpath, newpath); } else { throw new FileNotFoundException(originalDir); } } public static void mergeIntoSingleFile(String originalDir, String newFile) throws IOException { FileSystem fs = FileSystem.get(_rJob); FileUtil.copyMerge(fs, new Path(originalDir), fs, new Path(newFile), true, _rJob, null); } public static void copyFileOnHDFS(String originalDir, String newDir) throws IOException { Path originalPath = new Path(originalDir); Path newPath = new Path(newDir); boolean deleteSource = false; boolean overwrite = true; JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); FileSystem fs = FileSystem.get(job); if (fs.exists(originalPath)) { FileUtil.copy(fs, originalPath, fs, newPath, deleteSource, overwrite, job); } } /** * * @param dir * @return * @throws IOException */ public static String getSubDirs(String dir) throws IOException { FileSystem fs = FileSystem.get(_rJob); FileStatus[] files = fs.listStatus(new Path(dir)); StringBuilder sb = new StringBuilder(); for (FileStatus file : files) { if (sb.length() > 0) sb.append(","); sb.append(file.getPath().toString()); } return sb.toString(); } /** * * @param dir * @return * @throws IOException */ public static String getSubDirsIgnoreLogs(String dir) throws IOException { FileSystem fs = FileSystem.get(_rJob); 
        FileStatus[] files = fs.listStatus(new Path(dir));
        StringBuilder sb = new StringBuilder();
        for (FileStatus file : files) {
            String name = file.getPath().toString();
            if (name.contains("_logs"))
                continue;
            if (sb.length() > 0)
                sb.append(",");
            sb.append(name);
        }
        return sb.toString();
    }

    /**
     * Returns the size of a file or directory on hdfs in bytes.
     *
     * @param path
     * @return
     * @throws IOException
     */
    public static long getFilesizeOnHDFS(Path path) throws IOException {
        FileSystem fs = FileSystem.get(_rJob);
        long ret = 0; //in bytes
        if (fs.isDirectory(path))
            ret = fs.getContentSummary(path).getLength();
        else
            ret = fs.getFileStatus(path).getLen();
        //note: filestatus would return 0 on directories
        return ret;
    }

    private static BufferedReader setupInputFile(String filename) throws IOException {
        Path pt = new Path(filename);
        FileSystem fs = FileSystem.get(_rJob);
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(pt)));
        return br;
    }

    public static double readDoubleFromHDFSFile(String filename) throws IOException {
        BufferedReader br = setupInputFile(filename);
        String line = br.readLine();
        br.close();
        if (line == null)
            throw new IOException("Empty file on hdfs: " + filename);
        return Double.parseDouble(line);
    }

    public static long readIntegerFromHDFSFile(String filename) throws IOException {
        BufferedReader br = setupInputFile(filename);
        String line = br.readLine();
        br.close();
        if (line == null)
            throw new IOException("Empty file on hdfs: " + filename);
        return Long.parseLong(line);
    }

    public static boolean readBooleanFromHDFSFile(String filename) throws IOException {
        BufferedReader br = setupInputFile(filename);
        String line = br.readLine();
        br.close();
        if (line == null)
            throw new IOException("Empty file on hdfs: " + filename);
        return Boolean.parseBoolean(line);
    }

    public static String readStringFromHDFSFile(String filename) throws IOException {
        BufferedReader br = setupInputFile(filename);
        // handle multi-line strings in the HDFS file
        StringBuilder sb = new StringBuilder();
        String line = null;
        while ((line = br.readLine()) != null) {
            sb.append(line);
            sb.append("\n");
        }
        br.close();

        //return string without last character
        return sb.substring(0, sb.length() - 1);
    }

    private static BufferedWriter setupOutputFile(String filename) throws IOException {
        Path pt = new Path(filename);
        FileSystem fs = FileSystem.get(_rJob);
        BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
        return br;
    }

    public static void writeDoubleToHDFS(double d, String filename) throws IOException {
        BufferedWriter br = setupOutputFile(filename);
        String line = "" + d;
        br.write(line);
        br.close();
    }

    public static void writeIntToHDFS(long i, String filename) throws IOException {
        BufferedWriter br = setupOutputFile(filename);
        String line = "" + i;
        br.write(line);
        br.close();
    }

    public static void writeBooleanToHDFS(boolean b, String filename) throws IOException {
        BufferedWriter br = setupOutputFile(filename);
        String line = "" + b;
        br.write(line);
        br.close();
    }

    public static void writeStringToHDFS(String s, String filename) throws IOException {
        BufferedWriter br = setupOutputFile(filename);
        String line = "" + s;
        br.write(line);
        br.close();
    }

    public static void writeDimsFile(String filename, byte[] unknownFlags, long[] maxRows, long[] maxCols) throws IOException {
        BufferedWriter br = setupOutputFile(filename);
        StringBuilder line = new StringBuilder();
        for (int i = 0; i < unknownFlags.length; i++) {
            if (unknownFlags[i] != (byte) 0) {
                line.append(i);
                line.append(" " + maxRows[i]);
                line.append(" " + maxCols[i]);
line.append("\n"); } } br.write(line.toString()); br.close(); //System.out.println("Finished writing dimsFile: " + filename); } public static MatrixCharacteristics[] processDimsFiles(String dir, MatrixCharacteristics[] stats) throws IOException { Path pt = new Path(dir); FileSystem fs = FileSystem.get(_rJob); if (!fs.exists(pt)) return stats; FileStatus fstat = fs.getFileStatus(pt); if (fstat.isDirectory()) { FileStatus[] files = fs.listStatus(pt); for (int i = 0; i < files.length; i++) { Path filePath = files[i].getPath(); //System.out.println("Processing dims file: " + filePath.toString()); BufferedReader br = setupInputFile(filePath.toString()); String line = ""; while ((line = br.readLine()) != null) { String[] parts = line.split(" "); int resultIndex = Integer.parseInt(parts[0]); long maxRows = Long.parseLong(parts[1]); long maxCols = Long.parseLong(parts[2]); stats[resultIndex].setDimension( (stats[resultIndex].getRows() < maxRows ? maxRows : stats[resultIndex].getRows()), (stats[resultIndex].getCols() < maxCols ? maxCols : stats[resultIndex].getCols())); } br.close(); } } else { throw new IOException(dir + " is expected to be a folder!"); } return stats; } public static void writeMetaDataFile(String mtdfile, ValueType v, MatrixCharacteristics mc, OutputInfo outinfo) throws IOException { writeMetaDataFile(mtdfile, v, mc, outinfo, null); } public static void writeMetaDataFile(String mtdfile, ValueType v, MatrixCharacteristics mc, OutputInfo outinfo, FileFormatProperties formatProperties) throws IOException { Path pt = new Path(mtdfile); FileSystem fs = FileSystem.get(_rJob); BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true))); formatProperties = (formatProperties == null && outinfo == OutputInfo.CSVOutputInfo) ? 
                new CSVFileFormatProperties() : formatProperties;

        String line = "";
        try {
            line += "{ \n"
                    + " \"" + DataExpression.DATATYPEPARAM + "\": \"matrix\"\n"
                    + " ,\"" + DataExpression.VALUETYPEPARAM + "\": ";
            switch (v) {
                case DOUBLE:
                    line += "\"double\"\n";
                    break;
                case INT:
                    line += "\"int\"\n";
                    break;
                case BOOLEAN:
                    line += "\"boolean\"\n";
                    break;
                case STRING:
                    line += "\"string\"\n";
                    break;
                case UNKNOWN:
                    line += "\"unknown\"\n";
                    break;
                case OBJECT:
                    line += "\"object\"\n";
                    break;
            }

            line += " ,\"" + DataExpression.READROWPARAM + "\": " + mc.getRows() + "\n"
                    + " ,\"" + DataExpression.READCOLPARAM + "\": " + mc.getCols() + "\n";
            // only output rows_in_block and cols_in_block for binary format
            if (outinfo == OutputInfo.BinaryBlockOutputInfo) {
                line += " ,\"" + DataExpression.ROWBLOCKCOUNTPARAM + "\": " + mc.getRowsPerBlock() + "\n"
                        + " ,\"" + DataExpression.COLUMNBLOCKCOUNTPARAM + "\": " + mc.getColsPerBlock() + "\n";
            }
            line += " ,\"" + DataExpression.READNUMNONZEROPARAM + "\": " + mc.getNonZeros() + "\n"
                    + " ,\"" + DataExpression.FORMAT_TYPE + "\": ";

            if (outinfo == OutputInfo.TextCellOutputInfo) {
                line += "\"text\"\n";
            } else if (outinfo == OutputInfo.BinaryBlockOutputInfo || outinfo == OutputInfo.BinaryCellOutputInfo) {
                line += "\"binary\"\n"; // currently, there is no way to differentiate between them
            } else if (outinfo == OutputInfo.CSVOutputInfo) {
                line += "\"csv\"\n";
            } else {
                line += "\"specialized\"\n";
            }

            if (outinfo == OutputInfo.CSVOutputInfo) {
                CSVFileFormatProperties csvProperties = (CSVFileFormatProperties) formatProperties;
                line += " ,\"" + DataExpression.DELIM_HAS_HEADER_ROW + "\": " + csvProperties.hasHeader() + "\n";
                line += " ,\"" + DataExpression.DELIM_DELIMITER + "\": \"" + csvProperties.getDelim() + "\"\n";
            }

            line += " ,\"description\": { \"author\": \"SystemML\" } \n" + "}";

            br.write(line);
            br.close();
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    public static void writeScalarMetaDataFile(String mtdfile, ValueType v) throws IOException {
        Path pt = new Path(mtdfile);
        FileSystem fs = FileSystem.get(_rJob);
        BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));

        try {
            String line = "";
            line += "{ \n"
                    + " \"" + DataExpression.DATATYPEPARAM + "\": \"scalar\"\n"
                    + " ,\"" + DataExpression.VALUETYPEPARAM + "\": ";
            switch (v) {
                case DOUBLE:
                    line += "\"double\"\n";
                    break;
                case INT:
                    line += "\"int\"\n";
                    break;
                case BOOLEAN:
                    line += "\"boolean\"\n";
                    break;
                case STRING:
                    line += "\"string\"\n";
                    break;
                case UNKNOWN:
                    line += "\"unknown\"\n";
                    break;
                case OBJECT:
                    throw new IOException("Write of generic object types not supported.");
            }

            line += " ,\"" + DataExpression.FORMAT_TYPE + "\": \"text\"\n"
                    + " ,\"description\": { \"author\": \"SystemML\" } \n" + " }";

            br.write(line);
            br.close();
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    public static double[][] readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException {
        MatrixReader reader = MatrixReaderFactory.createMatrixReader(inputinfo);
        MatrixBlock mb = reader.readMatrixFromHDFS(dir, rlen, clen, brlen, bclen, rlen * clen);
        return DataConverter.convertToDoubleMatrix(mb);
    }

    public static double[] readColumnVectorFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException {
        MatrixReader reader = MatrixReaderFactory.createMatrixReader(inputinfo);
        MatrixBlock mb = reader.readMatrixFromHDFS(dir, rlen, clen, brlen, bclen, rlen * clen);
        return
                DataConverter.convertToDoubleVector(mb);
    }

    public static double median(String dir, NumItemsByEachReducerMetaData metadata) throws IOException {
        long[] counts = metadata.getNumItemsArray();
        long[] ranges = new long[counts.length];
        ranges[0] = counts[0];
        for (int i = 1; i < counts.length; i++)
            ranges[i] = ranges[i - 1] + counts[i];
        long total = ranges[ranges.length - 1];
        return pickValueWeight(dir, metadata, 0.5, total % 2 == 0)[0];
    }

    public static double pickValue(String dir, NumItemsByEachReducerMetaData metadata, double p) throws IOException {
        return pickValueWeight(dir, metadata, p, false)[0];
    }

    public static double[] pickValueWeight(String dir, NumItemsByEachReducerMetaData metadata, double p, boolean average) throws IOException {
        long[] counts = metadata.getNumItemsArray();
        long[] ranges = new long[counts.length];
        ranges[0] = counts[0];
        for (int i = 1; i < counts.length; i++)
            ranges[i] = ranges[i - 1] + counts[i];
        long total = ranges[ranges.length - 1];

        // do averaging only if it is asked for; and sum_wt is even
        average = average && (total % 2 == 0);

        int currentPart = 0;
        double cum_weight = 0;
        long pos = (long) Math.ceil(total * p);
        while (ranges[currentPart] < pos) {
            currentPart++;
            cum_weight += ranges[currentPart];
        }

        int offset;
        if (currentPart > 0)
            offset = (int) (pos - ranges[currentPart - 1] - 1);
        else
            offset = (int) pos - 1;

        FileSystem fs = FileSystem.get(_rJob);
        Path path = new Path(dir);
        FileStatus[] files = fs.listStatus(path);
        Path fileToRead = null;
        for (FileStatus file : files)
            if (file.getPath().toString().endsWith(Integer.toString(currentPart))) {
                fileToRead = file.getPath();
                break;
            }

        if (fileToRead == null)
            throw new RuntimeException("cannot read partition " + currentPart);

        FSDataInputStream currentStream = fs.open(fileToRead);
        DoubleWritable readKey = new DoubleWritable();
        IntWritable readValue = new IntWritable();

        boolean contain0s = false;
        long numZeros = 0;
        if (currentPart == metadata.getPartitionOfZero()) {
            contain0s = true;
            numZeros = metadata.getNumberOfZero();
        }
        ReadWithZeros reader = new ReadWithZeros(currentStream, contain0s, numZeros);

        int numRead = 0;
        while (numRead <= offset) {
            reader.readNextKeyValuePairs(readKey, readValue);
            numRead += readValue.get();
            cum_weight += readValue.get();
        }

        double ret = readKey.get();
        if (average) {
            if (numRead <= offset + 1) {
                reader.readNextKeyValuePairs(readKey, readValue);
                cum_weight += readValue.get();
                ret = (ret + readKey.get()) / 2;
            }
        }
        currentStream.close();
        return new double[] { ret, (average ? -1 : readValue.get()), (average ? -1 : cum_weight) };
    }

    /**
     * Extracts the numeric part index from an output file name of the form "part-NNNNN".
     *
     * @param name
     * @return
     */
    public static int extractNumberFromOutputFile(String name) {
        int i = name.indexOf("part-");
        assert (i >= 0);
        return Integer.parseInt(name.substring(i + 5));
    }

    /**
     * Creates the given directory on HDFS, if it does not exist yet, with the requested
     * permissions given as a three-digit octal string (e.g., "755").
     *
     * @param dir
     * @param permissions
     * @throws IOException
     */
    public static void createDirIfNotExistOnHDFS(String dir, String permissions) throws IOException {
        Path path = new Path(dir);
        try {
            FileSystem fs = FileSystem.get(_rJob);
            if (!fs.exists(path)) {
                char[] c = permissions.toCharArray();
                short sU = (short) ((c[0] - 48) * 64);
                short sG = (short) ((c[1] - 48) * 8);
                short sO = (short) ((c[2] - 48));
                short mode = (short) (sU + sG + sO);
                FsPermission perm = new FsPermission(mode);
                fs.mkdirs(path, perm);
            }
        } catch (Exception ex) {
            throw new IOException("Failed in creating a non existing dir on HDFS", ex);
        }

        //NOTE: we depend on the configured umask, setting umask in job or fspermission has no effect
        //similarly, setting dfs.datanode.data.dir.perm has no effect either.
    }

    /**
     * Returns an FSDataOutputStream for the given HDFS file, optionally overwriting an existing file.
     *
     * @param filename
     * @param overwrite
     * @return
     * @throws IOException
     */
    public static FSDataOutputStream getHDFSDataOutputStream(String filename, boolean overwrite) throws IOException {
        FileSystem fs = FileSystem.get(_rJob);
        Path path = new Path(filename);
        return fs.create(path, overwrite);
    }
}
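
Because MapReduceTool consists entirely of static helpers, it can be exercised from a small driver program. The following is a minimal usage sketch, not part of the original source: the class name MapReduceToolExample and the HDFS paths are hypothetical, and it assumes a reachable Hadoop file system configured through the cached JobConf that MapReduceTool uses internally.

import java.io.IOException;

import org.apache.hadoop.fs.Path;

import com.ibm.bi.dml.runtime.util.MapReduceTool;

public class MapReduceToolExample {
    public static void main(String[] args) throws IOException {
        // hypothetical HDFS locations; adjust to the target cluster
        String dir = "/tmp/systemml-example";
        String file = dir + "/pi.scalar";

        // create the directory with rwxr-xr-x permissions if it does not exist yet
        MapReduceTool.createDirIfNotExistOnHDFS(dir, "755");

        // write a double as a one-line text file and read it back
        MapReduceTool.writeDoubleToHDFS(3.14159, file);
        double d = MapReduceTool.readDoubleFromHDFSFile(file);
        System.out.println("read back: " + d
                + ", exists: " + MapReduceTool.existsFileOnHDFS(file)
                + ", size in bytes: " + MapReduceTool.getFilesizeOnHDFS(new Path(file)));

        // clean up the example directory
        MapReduceTool.deleteFileIfExistOnHDFS(dir);
    }
}

All of these helpers resolve the file system via FileSystem.get on the cached read-only JobConf, so the driver typically only needs the standard Hadoop client configuration on its classpath to reach the intended cluster.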