/*****************************************************************************
 *                                                                           *
 *  Copyright 2014 Rice University                                           *
 *                                                                           *
 *  Licensed under the Apache License, Version 2.0 (the "License");          *
 *  you may not use this file except in compliance with the License.         *
 *  You may obtain a copy of the License at                                  *
 *                                                                           *
 *      http://www.apache.org/licenses/LICENSE-2.0                           *
 *                                                                           *
 *  Unless required by applicable law or agreed to in writing, software      *
 *  distributed under the License is distributed on an "AS IS" BASIS,        *
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
 *  See the License for the specific language governing permissions and      *
 *  limitations under the License.                                           *
 *                                                                           *
 *****************************************************************************/

package simsql.runtime;

import java.util.*;
import java.io.*;
import simsql.shell.RuntimeParameter;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.io.*;
import simsql.shell.PhysicalDatabase;

/**
 * A general relational operator class.
 *
 * @author Luis.
 */
public abstract class RelOp {

  // every relational operation has two "pipe" networks... these are networks of additional,
  // pipelined operations that run on the input and on the output of the operation.
  private PipeNetwork myInputNetwork;
  private PipeNetwork myOutputNetwork;

  // used to allow access to the physical database
  private PhysicalDatabase myDB;

  // gets access to the input network
  protected PipeNetwork getNetworkOnInputSide() { return myInputNetwork; }

  // gets access to the output network
  protected PipeNetwork getNetworkOnOutputSide() { return myOutputNetwork; }

  // gets access to the physical database -- could be NULL
  protected PhysicalDatabase getDB() { return myDB; }

  // for replacements.
  protected void replaceNetworkOnInputSide(PipeNetwork newInputNetwork) { myInputNetwork = newInputNetwork; }

  // check to see if we can move part of this guy's operation over to be appended to run
  // on the output pipeline of another operation
  public boolean isAppendable(String[] inputFiles) {
    // the default is a no!
    return false;
  }

  // check to see if the current operation can accept another
  // operation on its output pipe.
  public boolean acceptsAppendable() {
    // the default is a yes
    return true;
  }

  // check to see if the current operation can accept another
  // operation on its input pipe.
  public boolean acceptsPipelineable() {
    // the default is a yes
    return true;
  }

  // this returns a pipe network that can be appended to an earlier operation in order to run part
  // of this operation at the same time that we run another operation. For example, in the case of
  // an aggregate, the aggregate would create a pipe network that consisted of (a) its own input
  // pipe network, and (b) a new APipeNode object at the end of the pipe network that could run its
  // mapper. Note that after a call to getAppendableNetwork () has been made, the RelOp should never
  // again return "true" in response to a call to "isAppendable".
  public PipeNetwork getAppendableNetwork() {
    // default is to throw an exception
    throw new RuntimeException("This operation is not appendable!!");
  }

  // this appends the given pipe network to the output
  public void appendNetworkOnOutputSide(PipeNetwork appendMe) {
    appendMe.appendNetworkOnto(myOutputNetwork);
    myOutputNetwork = appendMe;
  }

  // returns the set of existing sorting attributes for our input
  // relations, as known from the database. this method is to be
  // invoked at the beginning of the scheduling, so as to "recover"
  // the sorting attributes of base relations and/or previous
  // iterations.
  public Map<String, Set<Set<String>>> getExistingSortAtts(PhysicalDatabase myDBase) {
    // default: do nothing.
    return new HashMap<String, Set<Set<String>>>();
  }

  // this is a helper for the above method -- just pass the keys to
  // get the list of existing sorting attributes.
  protected Map<String, Set<Set<String>>> getBaseSortAtts(PhysicalDatabase myDBase, String inFilesKey, String inAttsKey) {

    Map<String, Set<Set<String>>> ret = new HashMap<String, Set<Set<String>>>();

    // get the input file name.
    String inFile = getValue(inFilesKey).getStringList().get(0);

    // check if it is sorted.
    if (myDBase.isTableSorted(myDBase.getTableName(inFile))) {

      // if so, get the name from the list of input atts.
      int sortedAttPos = myDBase.getTableSortingAttribute(myDBase.getTableName(inFile));
      ArrayList<String> atts = getValue(inAttsKey).getIdentifierList();

      if (sortedAttPos < atts.size()) {
        Set<Set<String>> mx = new HashSet<Set<String>>();
        Set<String> kx = new HashSet<String>();
        kx.add(atts.get(sortedAttPos));
        mx.add(kx);
        ret.put(inFile, mx);
      }
    }

    return ret;
  }

  // this creates a simple pipe network taking exactly the files and type codes of this relational operation
  public final void setupPipeNetwork(PhysicalDatabase myDBase) {

    // first set up the input pipe network
    myDB = myDBase;
    String[] files = getInputs();
    Short[] typeCodes = new Short[files.length];
    for (int i = 0; i < files.length; i++) {
      typeCodes[i] = myDBase.getTypeCode(myDBase.getTableName(files[i]));
    }
    myInputNetwork = new PipeNetwork(files, typeCodes);

    // and now set up the output pipe network
    files = new String[1];
    files[0] = getOutput();
    typeCodes = new Short[1];
    typeCodes[0] = myDBase.getTypeCode(myDBase.getTableName(files[0]));
    myOutputNetwork = new PipeNetwork(files, typeCodes);
  }

  // this puts a new pipelined operation into the INPUT pipe network for the relational op, that will be run on
  // the relational op's mapper
  public final PipeNetwork addPipelinedOperationToInputPipeNetwork(PipeNode addMe) {
    myInputNetwork.addPipelinedOperation(addMe);
    return myInputNetwork;
  }

  // prepends the specified pipe network onto the INPUT pipe network for the relational op
  public final void suckUpNetworkOnInput(PipeNetwork addMe) {
    // myInputNetwork is the one after, who needs to have something pre-pended
    myInputNetwork.appendNetworkOnto(addMe);
  }
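  // --------------------------------------------------------------------------
  // Illustrative sketch (not part of the original SimSQL source): a scheduler
  // that uses the appendable hooks above might wire two hypothetical operators,
  // "producer" and "consumer", together roughly like this. The handshake order
  // shown here is an assumption based on the method comments:
  //
  //   String[] files = { producer.getOutput() };   // the file the producer will generate
  //   if (consumer.isAppendable(files) && producer.acceptsAppendable()) {
  //     // run part of the consumer inside the producer's job, on its output pipe
  //     producer.appendNetworkOnOutputSide(consumer.getAppendableNetwork());
  //   }
  // --------------------------------------------------------------------------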
  // this gets the list of input files to the relational operation... taking into account all of
  // the pipelined operations that have been stuffed into the operation, and might require their
  // own input files, or might be masking other input files because those files are now not materialized
  final public String[] getInputFiles() {
    return myInputNetwork.getInputFiles();
  }

  final public String[] getPipelinedInputFiles() {
    return myInputNetwork.getPipelinedInputFiles();
  }

  final public String[] getNonPipelinedInputFiles() {
    return myInputNetwork.getNonPipelinedInputFiles();
  }

  // returns true if this operation can be pipelined, given the specified amount of
  // memory... the default is no, since most operations cannot be pipelined
  public boolean isPipelineable(int numMegs) {
    return false;
  }

  // prunes out any input files that we do not actually want to be processed by the mapper of the
  // corresponding MapReduce job... this is used in the case that we know one input file is already
  // sorted, so we can just merge it... the default is to not exclude anything
  public String[] excludeAnyWhoWillNotBeMapped(String[] inFiles) {
    return inFiles;
  }

  // this returns a pipelined version of the relational operator... the default is to
  // just return a null, since it is not possible to pipeline most operations
  public PipeNetwork getPipelinedVersion() {
    return null;
  }

  // returns a set of necessary mappings.
  public abstract Set<String> getNecessaryValues();

  // returns the name of the operation
  public abstract String getOperatorName();

  // returns the set of inputs to the "bare" relational operation, with no pipe network
  protected abstract String[] getInputs();

  // returns the output of this operation
  public abstract String getOutput();

  // this asks the relational operation to decide which algorithm it will use (for example, if
  // this is a join, it might decide just to do a merge of its two input files, assuming that
  // they are both sorted). The inputs are as follows:
  //
  // votes: this gives all of the sets of sort attributes that another operation using this operation's
  //   output has indicated that it would like to see. For example, say this guy produces the file "x".
  //   Two other ops use "x" as input. One would like to see "x" sorted on att1, and another would like
  //   to see "x" sorted on att1 and att2. In this case, "votes" would contain {{att1}, {att1, att2}}.
  //
  // fds: this is the list of all functional dependencies that hold among attributes. It is a map from
  //   an attribute name to the set of all attributes that are functionally determined by that att
  //
  // sortingAtts: this is a map from a file name to the set of all atts that the file is sorted on.
  //
  public void determineWhatAlgorithmToRun(Set<Set<String>> votes, Map<String, Set<String>> fds,
                                          Map<String, Set<Set<String>>> sortingAtts) {
  }

  // this asks the relational operation for the set (or sets) of attributes that its output will be sorted
  // on (it just returns null if there is not a known sort order that this op will produce). Note
  // that this method will always be called AFTER a call to determineWhatAlgorithmToRun
  public Set<Set<String>> getSortedOutputAtts() {
    return null;
  }

  // returns the set of output attribute names
  public abstract String[] getOutputAttNames();

  // returns the output type code
  public abstract short getOutputTypeCode();

  // returns the set of macro replacements
  public abstract Map<String, String> buildMacroReplacements();

  // returns the name of the template Java source file.
  public abstract String getTemplateFile();

  // returns the mapper class.
  public abstract Class<? extends Mapper> getMapperClass();

  // returns the reducer class.
  public abstract Class<? extends Reducer> getReducerClass();

  // returns the set of functions
  // default -- override if necessary.
  public String[] getFunctions() { return new String[0]; }

  // returns the set of vg functions
  // default -- override if necessary.
  public String[] getVGFunctions() { return new String[0]; }

  // returns the number of reduce tasks
  // default -- override if necessary.
  public int getNumReducers(RuntimeParameter params) {
    ExampleRuntimeParameter p = (ExampleRuntimeParameter) params;
    return p.getNumCPUs();
  }

  // used to obtain the length of a set of paths
  // returns zero if not possible.
  public static long getPathsTotalSize(String[] paths) {
    try {
      // get a configuration and a fileSystem
      Configuration conf = new Configuration();
      FileSystem dfs = FileSystem.get(conf);

      long totalSize = 0;
      for (String s : paths) {
        Path path = new Path(s);
        if (dfs.exists(path)) {
          totalSize += dfs.getContentSummary(path).getLength();
        }
      }

      // return
      return totalSize;
    } catch (Exception e) {
      return 0;
    }
  }

  // this is called in order to give the pipelined operations that are attached to this
  // op one last chance to fix up problems that can occur, and are associated with pipelining
  // tiny input files
  public void reEvaluateWhichFilesArePipelined() {
    myInputNetwork.reEvaluateWhichFilesArePipelined();
  }

  // this tries to use the physicalDatabase to obtain the *uncompressed* size of a relation.
  public long getPathsActualSize(String[] paths) {

    // use the HDFS one if we fail....
    if (myDB == null) {
      return getPathsTotalSize(paths) * 2;
    }

    long count = 0;
    for (int i = 0; i < paths.length; i++) {
      count += myDB.getTableSize(myDB.getTableName(paths[i]));
    }

    return count;
  }

  // returns the max split size, in bytes.
  // default -- override if necessary.
  public long getSplitSize(RuntimeParameter params) {

    // default value = fileSize / numProcessors
    ExampleRuntimeParameter p = (ExampleRuntimeParameter) params;
    Configuration conf = new Configuration();
    long dfsBlockSize = (long) conf.getInt("dfs.blocksize", 128 * 1024 * 1024);

    try {
      // get a configuration and a fileSystem
      FileSystem dfs = FileSystem.get(conf);

      long totalSize = 0;
      for (String s : myInputNetwork.getPipelinedInputFiles()) {
        Path path = new Path(s);
        if (dfs.exists(path)) {
          totalSize += dfs.getContentSummary(path).getLength();
        }
      }

      // if it's too small, just use a block.
      if (totalSize < dfsBlockSize)
        return dfsBlockSize;

      // otherwise, divide
      return totalSize / p.getNumCPUs();
    } catch (Exception e) {
      // if we fail, just return the DFS block size!!!
      return (long) dfsBlockSize;
    }
  }

  // returns the amount of memory that we can afford to allocate to the pipeline
  // the default assumes that the RelOp uses 1/2 of the RAM for its own stuff
  public long getRemainingMemForPipelinePerMapper(RuntimeParameter params) {
    return (getMemPerMapper(params) / 2) - (myInputNetwork.getRAMToRun());
  }

  // returns the memory to allocate per map task, in MB.
  // default -- override if necessary.
  public int getMemPerMapper(RuntimeParameter params) {
    ExampleRuntimeParameter p = (ExampleRuntimeParameter) params;
    return p.getMemoryPerCPUInMB();
  }

  // returns the memory to allocate per reduce task, in MB.
  // default -- override if necessary.
  public int getMemPerReducer(RuntimeParameter params) {
    ExampleRuntimeParameter p = (ExampleRuntimeParameter) params;
    return p.getMemoryPerCPUInMB();
  }
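  // --------------------------------------------------------------------------
  // Illustrative sketch (not part of the original SimSQL source): a concrete
  // operator can tune its job by overriding the defaults above. For instance, a
  // reduce-heavy operator might (hypothetically) request more reducers and more
  // reducer memory like this:
  //
  //   @Override
  //   public int getNumReducers(RuntimeParameter params) {
  //     return 2 * ((ExampleRuntimeParameter) params).getNumCPUs();
  //   }
  //
  //   @Override
  //   public int getMemPerReducer(RuntimeParameter params) {
  //     return 2 * ((ExampleRuntimeParameter) params).getMemoryPerCPUInMB();
  //   }
  // --------------------------------------------------------------------------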
  // sets up additional configuration parameters.
  // default -- override if necessary.
  public void setConfigurations(Configuration conf, RuntimeParameter params) {
    // do nothing.
  }

  // returns the serializations string.
  // default value -- override if necessary.
  public String getSerializations() {
    return "simsql.runtime.RecordSerialization,simsql.runtime.RecordKeySerialization,org.apache.hadoop.io.serializer.WritableSerialization";
  }

  // returns the map output key class.
  // default value -- override if necessary
  public Class getMapOutputKeyClass() { return RecordKey.class; }

  // returns the map output value class.
  // default value -- override if necessary.
  public Class getMapOutputValueClass() { return RecordWrapper.class; }

  // returns the output key class.
  // default value -- override if necessary.
  public Class getOutputKeyClass() { return Nothing.class; }

  // returns the output value class.
  // default value -- override if necessary.
  public Class getOutputValueClass() { return Record.class; }

  // returns the input format class.
  // default value -- override if necessary.
  public Class<? extends InputFormat> getInputFormatClass() { return RecordInputFormat.class; }

  // returns the output format class.
  // default value -- override if necessary.
  public Class<? extends OutputFormat> getOutputFormatClass() { return RecordOutputFormat.class; }

  // returns the grouping comparator class
  // default value -- override if necessary.
  public Class<? extends RawComparator> getGroupingComparatorClass() { return RecordKeyGroupingComparator.class; }

  // returns the partitioner class.
  // default value -- override if necessary.
  public Class<? extends Partitioner> getPartitionerClass() { return RecordPartitioner.class; }

  // returns the sort comparator class.
  // default value -- override if necessary.
  public Class<? extends RawComparator> getSortComparatorClass() { return RecordKeySortComparator.class; }

  // retrieves a given value, following dotted mappings like "leftInput.inAtts"
  public ParsedRHS getValue(String mappingName) {

    String curMapping = mappingName;
    Map<String, ParsedRHS> topRHS = values;

    // keep on moving down until we get the last one.
    while (curMapping.indexOf('.') >= 0) {
      String nextMapping = curMapping.substring(0, curMapping.indexOf('.'));
      topRHS = topRHS.get(nextMapping).getVarList();

      // check validity.
      if (topRHS == null)
        return null;

      curMapping = curMapping.substring(curMapping.indexOf('.') + 1);
    }

    // and then, get the final RHS.
    return topRHS.get(curMapping);
  }

  // returns the set of function declarations
  public String buildFunctionDeclarations() {

    String outStr = "";
    for (String s : getFunctions()) {
      outStr += "simsql.functions." + s + " func_" + s + " = new simsql.functions." + s + "();\n ";
    }

    return outStr;
  }

  // generic macro builder for a list of attributes (inAtts,
  // groupByAtts, etc.). transforms the attribute names sequentially
  // into atts[i] corresponding to an AbstractRecord type, and
  // returns the mappings.
  // call like this: buildAttributeReplacements("leftInput.inAtts", "this.");
  // returns something like: {("o_orderkey", "this.atts[0]"), ...}
  public Map<String, String> buildAttributeReplacements(String valueKey, String prefix) {

    ArrayList<String> atts = getValue(valueKey).getIdentifierList();
    Map<String, String> attReplacements = new HashMap<String, String>();

    int counter = 0;
    for (String s : atts) {
      attReplacements.put(s, prefix + "atts[" + counter + "]");
      counter++;
    }

    return attReplacements;
  }

  // generic macro builder for a selection predicate.
  // call it like this: buildSelectionReplacement("leftInput.selection", inAttReplacements);
  public String buildSelectionReplacement(String valueKey, Map<String, String> inAttReplacements) {

    // check if it's around
    ParsedRHS selRHS = getValue(valueKey);
    if (selRHS == null) {
      return "BitstringWithSingleValue.TRUE";
    }

    return selRHS.getExpression().print(inAttReplacements);
  }

  // generic macro builder for a list of assignments (outAtts). it
  // uses existing attribute replacement names to transform things like
  // "output.o_orderkey = o_orderkey" into "output.atts[0] = atts[1]".
  // call like this:
  //   buildAssignmentReplacements("leftInput.outAtts", "returnVal.", inputAttReplacements);
  //
  // returns a String with a bunch of Java assignment statements.
  public String buildAssignmentReplacements(String valueKey, String prefix, Map<String, String> inAttReplacements) {

    String outStr = "";
    int counter = 0;
    for (Assignment a : getValue(valueKey).getAssignmentList()) {
      outStr += prefix + "atts[" + counter + "] = " + a.getExpression().print(inAttReplacements) + ";\n ";
      counter++;
    }

    return outStr;
  }

  // same as above, but looks up the LHS attribute in a map instead of generating the counters.
  public String buildMappedAssignmentReplacements(String valueKey, String lhsPrefix, String rhsPrefix,
                                                  Map<String, String> lhsMap, Map<String, String> rhsMap) {

    String outStr = "";
    for (Assignment a : getValue(valueKey).getAssignmentList()) {
      outStr += lhsPrefix + lhsMap.get(a.getIdentifier()) + " = " + rhsPrefix + a.getExpression().print(rhsMap) + ";\n ";
    }

    return outStr;
  }
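  // --------------------------------------------------------------------------
  // Illustrative note (not part of the original SimSQL source): following the
  // comment above, for a hypothetical outAtts list such as
  //   {output.o_custkey = o_custkey, output.o_comment = o_comment}
  // and inAttReplacements mapping o_custkey -> "atts[1]" and o_comment -> "atts[6]",
  // a call to buildAssignmentReplacements(..., "returnVal.", inAttReplacements)
  // would emit macro text along the lines of:
  //
  //   returnVal.atts[0] = atts[1];
  //   returnVal.atts[1] = atts[6];
  //
  // (the exact rendering of each right-hand side depends on Expression.print()).
  // --------------------------------------------------------------------------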
  // this returns the set of all possible sorting attribute combinations that this operation could produce
  public Set<Set<String>> getAllPossibleSortingAtts() {

    // this is the output
    Set<Set<String>> retVal = new HashSet<Set<String>>();

    // we look for everything that has a "hashAtts" tag, then produce all subsets of the hashAtts
    for (Map.Entry<String, ParsedRHS> i : values.entrySet()) {

      ParsedRHS temp = null;
      if (i.getKey().equals("hashAtts"))
        temp = i.getValue();
      else if (i.getValue().getVarList() != null) {
        temp = i.getValue().getVarList().get("hashAtts");
      }

      if (temp != null) {
        if (temp.getIdentifierList() == null)
          continue;

        // we do, so get all possible subsets of the hash atts
        retVal.addAll(SubsetMachine.getAllSubsets(temp.getIdentifierList()));
      }
    }

    return retVal;
  }

  // this remembers any functional dependencies that are created by the relation
  public void addAnyNewFDs(Map<String, Set<String>> fds, Map<String, Set<Set<String>>> esa) {

    // the default implementation just looks for any seed () operations, since this creates FDs.
    // we look for everything that has an "outAtts" tag
    for (String s : getNecessaryValues()) {

      if (!s.contains("outAtts"))
        continue;

      ParsedRHS temp = getValue(s);
      if (temp != null) {

        // see if we have an identifier list in here
        if (temp.getIdentifierList() != null)
          continue;

        // we do, so look for a "seed" operation
        ArrayList<Assignment> myList = temp.getAssignmentList();
        String lhs = null;
        Set<String> rhs = new HashSet<String>();

        // loop through all of the assignments in the "outAtts" list
        for (Assignment a : myList) {

          // see if we got an assignment from a "seed" function
          Expression e = a.getExpression();
          if (e.getType().equals("func") && e.getValue().contains("seed")) {

            // we did, so remember what we are assigning to
            lhs = a.getIdentifier();
          } else {

            // we did not, so there is a functional dependency from the seed
            rhs.add(a.getIdentifier());
          }
        }

        // if we got a seed operation, then add this to the output
        if (lhs != null) {

          // also, check if this comes with sorting attributes so that
          // we can set the equivalence.
          /***
          for (String ff : esa.keySet()) {
            boolean gotThem = false;
            for (Set<String> ss : esa.get(ff)) {
              if (rhs.containsAll(ss)) {
                gotThem = true;
                break;
              }
            }

            if (gotThem) {
              for (Set<String> ss : esa.get(ff)) {
                rhs.addAll(ss);
              }
            }
          }
          ***/

          // add output
          fds.put(lhs, rhs);
        }
      }
    }
  }

  // this asks the relational operation whether it has a preferred sort order for the input atts
  public Set<String> getPreferredSortOrder(String fName, Map<String, Set<String>> fds,
                                           Set<Set<String>> allPossibleSortingAtts) {

    // the default implementation gets all possible sorting atts... it then tries to find
    // the largest set of sorting atts that matches up with a set in allPossibleSortingAtts
    Set<Set<String>> myPossibleSortingAtts = getAllPossibleSortingAtts();
    System.out.println("possible sorting atts: " + myPossibleSortingAtts);

    Set<String> best = null;
    for (Set<String> s : myPossibleSortingAtts) {
      if (allPossibleSortingAtts.contains(s)) {
        if (best == null || s.size() > best.size()) {
          best = s;
        }
      }
    }

    return best;
  }

  // generic macro builder for a list of hash attributes (hashAtts)
  // that is used to build a HashableRecord.
  // call it like this:
  //   buildHashReplacements("leftInput.hashAtts", "leftInput.outAtts", "this.");
  // returns a string with the hash expression.
  public String buildHashReplacements(String valueKey, String outAttsValueKey, String prefix) {

    // check
    if (getValue(valueKey) == null)
      return "";

    String outStr = "";

    // for each of the hash atts, we have to find the position in the atts array.
    for (String h : getValue(valueKey).getIdentifierList()) {
      int counter = 0;
      for (Assignment a : getValue(outAttsValueKey).getAssignmentList()) {
        if (h.equals(a.getIdentifier())) {
          outStr += " ^ " + prefix + "atts[" + counter + "].getHashCode ()";
        }
        counter++;
      }
    }

    return outStr;
  }

  public String buildHashReplacements(Set<String> values, String outAttsValueKey, String prefix) {

    // check
    if (values.size() == 0)
      return "";

    String outStr = "";

    // for each of the hash atts, we have to find the position in the atts array.
    for (String h : values) {
      int counter = 0;
      for (Assignment a : getValue(outAttsValueKey).getAssignmentList()) {
        if (h.equals(a.getIdentifier())) {
          outStr += " ^ " + prefix + "atts[" + counter + "].getHashCode ()";
        }
        counter++;
      }
    }

    System.out.println(outStr);
    return outStr;
  }

  // similar to the previous one, but returns the positions of the
  // respective attributes as an array.
  public String buildAttPositionsArray(String valueKey, String attKey) {

    // check for nulls
    if (getValue(valueKey) == null)
      return "";

    String outStr = "";
    boolean first = true;
    for (String h : getValue(valueKey).getIdentifierList()) {
      int ct = 0;
      for (Assignment a : getValue(attKey).getAssignmentList()) {
        if (h.equals(a.getIdentifier())) {
          if (!first) {
            outStr += ", " + ct;
          } else {
            outStr += ct;
            first = false;
          }
        }
        // advance the position with each assignment we scan
        ct++;
      }
    }

    return outStr;
  }

  public String buildAttPositionsArray(Set<String> values, String attKey) {

    String outStr = "";
    boolean first = true;
    for (String h : values) {
      int ct = 0;
      for (Assignment a : getValue(attKey).getAssignmentList()) {
        if (h.equals(a.getIdentifier())) {
          if (!first) {
            outStr += ", " + ct;
          } else {
            outStr += ct;
            first = false;
          }
        }
        ct++;
      }
    }

    return outStr;
  }
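  // --------------------------------------------------------------------------
  // Illustrative note (not part of the original SimSQL source): for a call such
  // as buildHashReplacements("leftInput.hashAtts", "leftInput.outAtts", "this."),
  // where the hash attributes land in positions 0 and 3 of the output record,
  // the generated macro text would look roughly like:
  //
  //    ^ this.atts[0].getHashCode () ^ this.atts[3].getHashCode ()
  //
  // which is presumably appended to an existing hash expression in the template.
  // --------------------------------------------------------------------------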
  // generic macro builder for a list of attributes based on their
  // LHS assignment identifier (e.g. outAtts).
  // call it like this:
  //   buildAttributeAssignReplacements("leftInput.outAtts", "left.", "leftRec.");
  // returns something like: {("left.o_orderkey", "leftRec.atts[0]"), ...}.
  public Map<String, String> buildAttributeAssignReplacements(String valueKey, String lhsPrefix, String rhsPrefix) {

    Map<String, String> retMap = new HashMap<String, String>();

    int counter = 0;
    for (Assignment a : getValue(valueKey).getAssignmentList()) {
      retMap.put(lhsPrefix + a.getIdentifier(), rhsPrefix + "atts[" + counter + "]");
      counter++;
    }

    return retMap;
  }

  // prints information about this operator.
  public String print() {

    String returnVal = getOperatorName();
    for (Map.Entry<String, ParsedRHS> i : values.entrySet()) {
      returnVal += "<" + i.getKey();
      returnVal += ": " + i.getValue().print() + ">\n";
    }

    return returnVal;
  }

  // used to unlink an entire subdirectory structure
  protected void rmdir(File f) {

    // does it exist?
    if (!f.exists())
      return;

    // is it a directory?
    if (f.isDirectory()) {

      // then, recursively delete its contents
      for (File ff : f.listFiles()) {
        rmdir(ff);
      }
    }

    // destroy it
    if (!f.delete()) {
      throw new RuntimeException("Could not prepare/delete the work directories for macro replacement");
    }
  }

  // builds the jar file for this operation.
  public String buildJarFile(RuntimeParameter paramsIn) {

    // cast the parameters.
    ExampleRuntimeParameter params = (ExampleRuntimeParameter) paramsIn;

    // get a name for the jar file, java template names and work directory
    String newJarFileName = getOperatorName() + "_" + RelOp.COUNT_OP + ".jar";
    String javaFileName = "/simsql/runtime/Macro" + getOperatorName() + RelOp.COUNT_OP + ".java";
    String workDirectory = "work_" + getOperatorName() + "_" + RelOp.COUNT_OP;
    RelOp.COUNT_OP++;

    // destroy the temp directory, if it's there
    rmdir(new File(workDirectory));

    // make the directories with their runtime/functions substructure
    if (!(new File(workDirectory + "/simsql/runtime").mkdirs()) ||
        !(new File(workDirectory + "/simsql/functions").mkdirs())) {
      throw new RuntimeException("Could not prepare/create the work directories for macro replacement");
    }

    return buildJarFile(paramsIn, newJarFileName, workDirectory, javaFileName, new String[] { javaFileName });
  }

  // helper for building jars
  final protected String buildJarFile(RuntimeParameter paramsIn, String newJarFileName, String workDirectory,
                                      String javaFileName, String[] filesToCompile) {

    // first, we combine the input and output networks into the same network
    PipeNetwork myNetwork = new PipeNetwork(myInputNetwork, myOutputNetwork);

    // cast the parameters.
    ExampleRuntimeParameter params = (ExampleRuntimeParameter) paramsIn;

    // first, write out all of the Java source codes associated with the network of pipelined ops
    // that will run in the mapper
    ArrayList<String> fNames = myNetwork.writeJavaCodes(workDirectory);

    // get the macro replacements and write out the java files
    Map<String, String> macroReplacements = buildMacroReplacements();

    // this will allow us to deserialize any record types that will be used as input to a pipelined op
    String recTypesString = myNetwork.getPossibleRecordTypesString();
    if (!recTypesString.trim().equals(""))
      recTypesString = recTypesString + ", ";
    macroReplacements.put("possRecordTypesString", recTypesString);

    // now we write out the java source file for the actual mapper
    MacroReplacer myReplacer = new MacroReplacer(getTemplateFile(), workDirectory + "/" + javaFileName, macroReplacements);

    // write the functions
    params.writeAllFunctionsToDirectory(workDirectory + "/simsql/functions");

    // write the VG functions
    for (String vf : getVGFunctions()) {
      params.writeVGFunctionToFile(vf, new File(workDirectory + "/simsql/functions", vf + ".so"));
    }

    // now we write out the PipeNetwork object
    try {
      FileOutputStream fileOut = new FileOutputStream(workDirectory + "/simsql/runtime/PipeNetwork.obj");
      ObjectOutputStream out = new ObjectOutputStream(fileOut);
      out.writeObject(myNetwork);
      out.close();
      fileOut.close();
    } catch (IOException e) {
      e.printStackTrace();
      throw new RuntimeException("Got an error when I was trying to serialize the network of pipelined operations");
    }

    // get the current jar/resource path
    String jarPath = null;
    try {
      jarPath = RelOp.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath();
    } catch (Exception e) {
      throw new RuntimeException("Unable to figure out the jar file being used when creating the operators: " + e);
    }

    // and compile the resulting file.
    String[] files = new String[filesToCompile.length + fNames.size()];
    for (int i = 0; i < files.length; i++) {
      if (i < filesToCompile.length)
        files[i] = filesToCompile[i];
      else
        files[i] = fNames.get(i - filesToCompile.length);
    }

    Compiler temp = new Compiler();
    String[] compilationJars = { jarPath };
    temp.getNewJar(workDirectory, jarPath, newJarFileName, compilationJars, files);

    // destroy the work directory.
    // rmdir(new File(workDirectory));

    // return the file name.
    return newJarFileName;
  }

  // runs this operation.
  public boolean run(RuntimeParameter params, boolean verbose) {

    ExampleRuntimeParameter pp = (ExampleRuntimeParameter) params;

    // build the jar.
    String jarFile = buildJarFile(params);

    // Get the default configuration object
    Configuration conf = new Configuration();

    // set quiet mode on/off
    conf.setQuietMode(!verbose);

    /***
    conf.setBoolean("mapred.task.profile", true);
    conf.set("mapred.task.profile.params", "-agentlib:hprof=cpu=samples," +
             "heap=sites,depth=8,force=n,thread=y,verbose=n,file=%s");
    ***/

    // tell it how to serialize and deserialize records and recordkeys
    conf.set("io.serializations", getSerializations());
    conf.setBoolean("mapred.compress.map.output", true);

    int ioSortMB = conf.getInt("io.sort.mb", 256);
    conf.set("mapred.map.child.java.opts", "-Xmx" + (getMemPerMapper(params) + ioSortMB) + "m -Xms" + (getMemPerMapper(params)) + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");
    conf.set("mapred.reduce.child.java.opts", "-Xmx" + (getMemPerReducer(params) + ioSortMB) + "m -Xms" + (getMemPerMapper(params)) + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.setInt("simsql.input.numSplits", pp.getNumCPUs());
    conf.setInt("mapred.job.reuse.jvm.num.tasks", 1);
    // conf.setBoolean ("mapred.map.tasks.speculative.execution", false);
    // conf.setBoolean ("mapred.reduce.tasks.speculative.execution", false);

    // tell it to use the jar that we just created
    conf.set("mapred.jar", jarFile);

    // conf.set("tmpjars", "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-core.jar");

    conf.setBoolean("mapred.output.compress", true);
    conf.setStrings("mapred.output.compression.type", new String[] { "RECORD" });

    // use snappy for the intermediate stuff
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());

    // do some additional operator-specific configurations
    setConfigurations(conf, params);

    // collect statistics for final relations always
    conf.setBoolean("simsql.collectStats", isFinal || collectStats);

    // figure out what file to map
    String[] inDirs = myInputNetwork.getPipelinedInputFiles();
    inDirs = excludeAnyWhoWillNotBeMapped(inDirs);

    String inSingleString = inDirs[0];
    conf.set("simsql.fileToMap", inSingleString);
    for (int i = 1; i < inDirs.length; i++) {
      inSingleString += "," + inDirs[i];
    }

    // create and name the job
    Job job;
    try {
      job = new Job(conf);
    } catch (Exception e) {
      throw new RuntimeException("Unable to create a new job!", e);
    }
    job.setJobName(getJobName());

    // set the map-reduce input and output types
    job.setMapOutputKeyClass(getMapOutputKeyClass());
    job.setMapOutputValueClass(getMapOutputValueClass());
    job.setOutputKeyClass(getOutputKeyClass());
    job.setOutputValueClass(getOutputValueClass());

    int numReducers = getNumReducers(params);

    job.setMapperClass(getMapperClass());
    job.setReducerClass(getReducerClass());

    // set the number of reducers
    job.setNumReduceTasks(numReducers);

    // set the input and the output formats... these extend FileInputFormat and FileOutputFormat
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(getOutputFormatClass());

    // set the input and output paths
    try {
      System.out.println("input file: " + inSingleString);
      FileInputFormat.setInputPaths(job, inSingleString);
      FileInputFormat.setInputPathFilter(job, TableFileFilter.class);
      FileOutputFormat.setOutputPath(job, new Path(getOutput()));
    } catch (Exception e) {
      throw new RuntimeException("Unable to set up the input/output path for the job.", e);
    }

    // set the split size
    FileInputFormat.setMinInputSplitSize(job, getSplitSize(params));
    FileInputFormat.setMaxInputSplitSize(job, getSplitSize(params));

    // set the various sorting/grouping/mapping classes
    job.setGroupingComparatorClass(getGroupingComparatorClass());
    job.setPartitionerClass(getPartitionerClass());
    job.setSortComparatorClass(getSortComparatorClass());

    // and now, submit the job and wait for things to finish
    int exitCode;
    try {
      exitCode = job.waitForCompletion(verbose) ? 0 : 1;

      // get the output bytes counter.
      Counters c = job.getCounters();
      Counter mx = c.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);

      // and use them to set the size of the output relation.
      if (myDB != null) {
        myDB.setTableSize(myDB.getTableName(getOutput()), mx.getValue());
        myDB.setNumAtts(myDB.getTableName(getOutput()), getOutputAttNames().length);
      }
    } catch (Exception e) {
      e.printStackTrace();
      throw new RuntimeException("Unable to run the job", e);
    }

    // now, delete all the empty part files
    try {

      // get a filesystem
      FileSystem dfs = FileSystem.get(conf);
      Path outPath = new Path(getOutput());
      if (dfs.exists(outPath) && dfs.isDirectory(outPath)) {
        FileStatus fstatus[] = dfs.listStatus(outPath, new TableFileFilter());
        for (FileStatus ff : fstatus) {
          if (dfs.getContentSummary(ff.getPath()).getLength() <= 4) {
            // snappy leaves 4-byte long files around...
            dfs.delete(ff.getPath(), true);
          }
        }
      }
    } catch (Exception e) {
      // this isn't disastrous
    }

    return (exitCode == 0);
  }

  // returns the name of the job in Hadoop
  public String getJobName() {

    String jobName = getOperatorName() + " {";

    boolean first = true;
    for (String i : myInputNetwork.getPipelinedInputFiles()) {
      if (first)
        jobName += "'" + i + "'";
      else
        jobName += ", '" + i + "'";
      first = false;
    }

    jobName += "} || [";

    first = true;
    for (String i : myInputNetwork.getNonPipelinedInputFiles()) {
      if (first)
        jobName += "'" + i + "'";
      else
        jobName += ", '" + i + "'";
      first = false;
    }

    jobName += "] => ";
    jobName += "'" + getOutput() + "'";

    return jobName;
  }

  // returns true if the relation is final
  protected boolean isFinal = false;

  public boolean isFinal() {
    return isFinal;
  }

  protected boolean collectStats = false;

  public void setCollectStats(boolean value) {
    // this cannot be set back to false -- we need it for pipelined
    // operations on the output.
    collectStats |= value;
  }

  // returns true if we are collecting statistics.
  public boolean collectStats() {
    return isFinal || collectStats;
  }

  // builds the operator from all the value mappings.
  public RelOp(Map<String, ParsedRHS> valuesIn) {

    // make sure that all the necessary values are represented here.
    values = valuesIn;
    for (String val : getNecessaryValues()) {
      if (getValue(val) == null)
        throw new RuntimeException("Value " + val + " not found in " + getOperatorName() + " description!");
    }

    // see if we have an "isFinal" here...
    ParsedRHS outputVal = getValue("output");
    if (outputVal != null) {
      ParsedRHS finalVal = getValue("output.isFinal");
      isFinal = (finalVal != null && finalVal.getStringLiteral().equals("true"));
    }
  }

  // counter for the jars
  protected static int COUNT_OP = 1;

  // the mappings.
  private Map<String, ParsedRHS> values;
}
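// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original SimSQL source): a minimal,
// hypothetical concrete operator that shows which members a RelOp subclass has
// to supply. The value keys ("scanInput.inFiles", ...), the macro names, the
// template file name and the placeholder type code are invented for
// illustration; real operators plug in their own generated mapper/reducer
// classes and macro replacements.
// ---------------------------------------------------------------------------
class ExampleScanOp extends RelOp {

  public ExampleScanOp(Map<String, ParsedRHS> valuesIn) {
    super(valuesIn);
  }

  // the mappings that must be present in the operator description
  public Set<String> getNecessaryValues() {
    Set<String> needed = new HashSet<String>();
    needed.add("scanInput.inFiles");
    needed.add("scanInput.inAtts");
    needed.add("scanInput.outAtts");
    needed.add("scanOutput.outFile");
    return needed;
  }

  public String getOperatorName() {
    return "ExampleScan";
  }

  protected String[] getInputs() {
    return new String[] { getValue("scanInput.inFiles").getStringList().get(0) };
  }

  public String getOutput() {
    return getValue("scanOutput.outFile").getStringList().get(0);
  }

  public String[] getOutputAttNames() {
    ArrayList<String> atts = new ArrayList<String>();
    for (Assignment a : getValue("scanInput.outAtts").getAssignmentList()) {
      atts.add(a.getIdentifier());
    }
    return atts.toArray(new String[0]);
  }

  public short getOutputTypeCode() {
    return 0; // placeholder type code, for illustration only
  }

  // combine the generic macro builders from RelOp into the template replacements
  public Map<String, String> buildMacroReplacements() {
    Map<String, String> replacements = new HashMap<String, String>();
    Map<String, String> inAtts = buildAttributeReplacements("scanInput.inAtts", "");
    replacements.put("functionDeclarations", buildFunctionDeclarations());
    replacements.put("selection", buildSelectionReplacement("scanInput.selection", inAtts));
    replacements.put("outputAssignments", buildAssignmentReplacements("scanInput.outAtts", "returnVal.", inAtts));
    return replacements;
  }

  public String getTemplateFile() {
    return "/simsql/runtime/ExampleScanTemplate.java"; // hypothetical template
  }

  public Class<? extends Mapper> getMapperClass() {
    return Mapper.class; // identity mapper, as a stand-in for a generated one
  }

  public Class<? extends Reducer> getReducerClass() {
    return Reducer.class; // identity reducer, as a stand-in for a generated one
  }
}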