Java tutorial
/***************************************************************************** * * * Copyright 2014 Rice University * * * * Licensed under the Apache License, Version 2.0 (the "License"); * * you may not use this file except in compliance with the License. * * You may obtain a copy of the License at * * * * http://www.apache.org/licenses/LICENSE-2.0 * * * * Unless required by applicable law or agreed to in writing, software * * distributed under the License is distributed on an "AS IS" BASIS, * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * * See the License for the specific language governing permissions and * * limitations under the License. * * * *****************************************************************************/ package simsql.runtime; import java.util.*; import java.io.*; import simsql.shell.RuntimeParameter; import simsql.shell.PhysicalDatabase; import simsql.code_generator.*; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.filecache.DistributedCache; import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.io.*; /** * The VGWrapper operator. * * @author Luis. */ public class VGWrapperOp extends RelOp { // returns a set of necessary mappings. 
/**
 * Lists the value names that the parsed description of this operator
 * has to provide.
 *
 * @return a new set with the required value names.
 */
public Set<String> getNecessaryValues() {
    return new HashSet<String>(Arrays.asList(
        "operation", "inAtts", "seedAtt", "outerInput", "function", "output",
        "outerInput.inFiles", "outerInput.typeCode", "outerInput.outAtts",
        "function.functionName", "function.vgInAtts", "function.vgOutAtts",
        "output.outFile", "output.typeCode", "output.outAtts"));
}

/**
 * @return the name of this operation, "VGWrapper".
 */
public String getOperatorName() {
    return "VGWrapper";
}

/**
 * Collects the input files of the outer input followed by the input
 * files of every inner input (if any inner inputs are declared).
 *
 * @return the input file names; an empty array if there are none.
 */
public String[] getInputs() {
    ArrayList<String> inputSet = new ArrayList<String>();
    inputSet.addAll(getValue("outerInput.inFiles").getStringList());

    // add the inner inputs
    ParsedRHS innerInputs = getValue("innerInputs");
    if (innerInputs != null) {
        for (String innerName : innerInputs.getVarList().keySet()) {
            inputSet.addAll(getValue("innerInputs." + innerName + ".inFiles").getStringList());
        }
    }

    // a zero-length seed array is used on purpose: with a one-element
    // seed, toArray() hands back that same array holding a single null
    // when the list is empty.
    return inputSet.toArray(new String[0]);
}

/**
 * Gathers the existing sort attributes of the outer input and of all
 * the inner inputs, as reported by getBaseSortAtts().
 *
 * @param myDBase the physical database passed on to getBaseSortAtts().
 * @return the union of the maps obtained for every input.
 */
public Map<String, Set<Set<String>>> getExistingSortAtts(PhysicalDatabase myDBase) {
    Map<String, Set<Set<String>>> ret = new HashMap<String, Set<Set<String>>>();

    // add the ones for the outer input.
    ret.putAll(getBaseSortAtts(myDBase, "outerInput.inFiles", "outerInput.inAtts"));

    // now, do the same with all the inners.
    for (String innerName : getInnerInputs()) {
        String prefix = "innerInputs." + innerName;
        ret.putAll(getBaseSortAtts(myDBase, prefix + ".inFiles", prefix + ".inAtts"));
    }

    return ret;
}

/**
 * @return the output file of this operation ("output.outFile").
 */
public String getOutput() {
    return getValue("output.outFile").getStringLiteral();
}

/**
 * @return the type code of the output ("output.typeCode") as a short.
 */
public short getOutputTypeCode() {
    return getValue("output.typeCode").getInteger().shortValue();
}

/**
 * @return the identifiers of the output assignments, in list order.
 */
public String[] getOutputAttNames() {
    ArrayList<String> out = new ArrayList<String>();
    for (Assignment a : getValue("output.outAtts").getAssignmentList()) {
        out.add(a.getIdentifier());
    }

    return out.toArray(new String[0]);
}

/**
 * Collects the functions referenced by the selections and the
 * assignments of the outer input, the inner inputs and the output.
 *
 * @return the distinct function names.
 */
public String[] getFunctions() {
    HashSet<String> functions = new HashSet<String>();

    // outer selections
    ParsedRHS outerSelRHS = getValue("outerInput.selection");
    if (outerSelRHS != null) {
        functions.addAll(outerSelRHS.getExpression().getAllFunctions());
    }

    // outer output
    for (Assignment a : getValue("outerInput.outAtts").getAssignmentList()) {
        functions.addAll(a.getExpression().getAllFunctions());
    }

    // go through all the inner inputs.
    ParsedRHS innerInputs = getValue("innerInputs");
    if (innerInputs != null) {
        for (String innerName : innerInputs.getVarList().keySet()) {
            String prefix = "innerInputs." + innerName;

            ParsedRHS innerSelRHS = getValue(prefix + ".selection");
            if (innerSelRHS != null) {
                functions.addAll(innerSelRHS.getExpression().getAllFunctions());
            }

            for (Assignment a : getValue(prefix + ".outAtts").getAssignmentList()) {
                functions.addAll(a.getExpression().getAllFunctions());
            }
        }
    }

    // output selection
    ParsedRHS outputSelRHS = getValue("output.selection");
    if (outputSelRHS != null) {
        functions.addAll(outputSelRHS.getExpression().getAllFunctions());
    }

    // output assignments
    for (Assignment a : getValue("output.outAtts").getAssignmentList()) {
        functions.addAll(a.getExpression().getAllFunctions());
    }

    return functions.toArray(new String[0]);
}

/**
 * @return a one-element array with the VG function name
 *         ("function.functionName").
 */
public String[] getVGFunctions() {
    return new String[] { getValue("function.functionName").getStringLiteral() };
}

/**
 * Number of reducers for the job.
 *
 * @param params runtime parameters; must be an ExampleRuntimeParameter
 *               when a reducer is needed.
 * @return zero for a map-only job, otherwise the number of CPUs.
 */
public int getNumReducers(RuntimeParameter params) {

    // map-only job if there are no relations to sort.
    if (!runVGWrapperReducer) {
        return 0;
    }

    // otherwise, use reducers.
    ExampleRuntimeParameter p = (ExampleRuntimeParameter) params;
    return p.getNumCPUs();
}

/**
 * Sets up the additional "simsql.*" configuration parameters: number of
 * iterations, VG function file, data buffer size, the excluded cross
 * product and sorted relations, and the map-side/reduce-side flag.
 *
 * @param conf   the configuration to write into.
 * @param params runtime parameters; must be an ExampleRuntimeParameter.
 */
public void setConfigurations(Configuration conf, RuntimeParameter params) {
    ExampleRuntimeParameter p = (ExampleRuntimeParameter) params;

    // task profiling can be enabled here when needed:
    //   conf.setBoolean("mapred.task.profile", true);
    //   conf.set("mapred.task.profile.params", "-agentlib:hprof=cpu=samples," +
    //            "heap=sites,depth=10,force=n,thread=y,verbose=n,file=%s");

    // set the number of iterations
    conf.setInt("simsql.numIterations", p.getNumIterations());

    // set the file name of the VG function.
    conf.setStrings("simsql.functionFile",
        new String[] { "/simsql/functions/" + getVGFunctions()[0] + ".so" });

    // set the buffer size for data exchange: half of the per-CPU memory,
    // capped at Integer.MAX_VALUE because the setting is stored as an int.
    long halfMemoryInBytes = (p.getMemoryPerCPUInMB() / 2) * 1024L * 1024L;
    int bSize;
    if (halfMemoryInBytes > (long) Integer.MAX_VALUE) {
        bSize = Integer.MAX_VALUE;
    } else {
        bSize = (int) halfMemoryInBytes;
    }
    conf.setInt("simsql.dataBufferSize", bSize);

    // set the cross product relations.
    if (crossFiles.size() > 0) {
        conf.setStrings("simsql.crossFiles", crossFiles.toArray(new String[0]));
        conf.setStrings("simsql.crossTypeCodes", crossTypeCodes.toArray(new String[0]));
        conf.setStrings("simsql.crossAttCounts", crossAttCounts.toArray(new String[0]));
    }

    // set the sorted input relations.
    if (sortedInnerFiles.size() > 0) {
        conf.setStrings("simsql.sortedFiles", sortedInnerFiles.toArray(new String[0]));
        conf.setStrings("simsql.sortedTypeCodes", sortedTypeCodes.toArray(new String[0]));
        conf.setStrings("simsql.sortedAttCounts", sortedAttCounts.toArray(new String[0]));
    }

    conf.setBoolean("simsql.runVGWrapperReducer", runVGWrapperReducer);
}

/**
 * Builds the text that is appended to the primary hash of a record:
 * one " ^ <att>.getHashCode()" term per sorted output attribute when
 * the record has all of them, or " ^ getSortAttribute()" otherwise.
 *
 * @param atts attribute replacements of the record (name -> replacement).
 * @return the hash expression fragment.
 */
private String primaryHashExpression(Map<String, String> atts) {
    if (!atts.keySet().containsAll(sortedOutputAtts) || sortedOutputAtts.isEmpty()) {
        return " ^ getSortAttribute()";
    }

    StringBuilder hash = new StringBuilder();
    for (String sa : sortedOutputAtts) {
        hash.append(" ^ ").append(atts.get(sa)).append(".getHashCode()");
    }

    return hash.toString();
}

/**
 * Builds the macro replacements for the main template
 * (see getTemplateFile()).
 *
 * @return map from macro name to replacement text.
 */
public Map<String, String> buildMacroReplacements() {
    HashMap<String, String> replacements = new HashMap<String, String>();

    // get the replacements for the 'merged input' (a.k.a. vgrecord)
    Map<String, String> mergedAtts = buildAttributeReplacements("inAtts", "");
    replacements.put("numAttsVG", "" + mergedAtts.size());
    replacements.put("vgSeedAtt", mergedAtts.get(getValue("seedAtt").getIdentifier()));

    // and the order of the vg call attributes
    StringBuilder vgInputAtts = new StringBuilder();
    for (String vgInAtt : getValue("function.vgInAtts").getIdentifierList()) {
        if (vgInputAtts.length() > 0) {
            vgInputAtts.append(", ");
        }
        // String.valueOf keeps a missing mapping rendered as "null" and,
        // unlike append(null), makes the length() check above reliable.
        vgInputAtts.append(String.valueOf(mergedAtts.get(vgInAtt)));
    }
    replacements.put("vgInputAtts", vgInputAtts.toString());

    // now, let's do the outer input record.
    Map<String, String> outerAtts = buildAttributeReplacements("outerInput.inAtts", "");
    replacements.put("numAttsOuter", "" + outerAtts.size());
    replacements.put("outerTypeCode", "" + getValue("outerInput.typeCode").getInteger());
    replacements.put("outerInputSelection",
        buildSelectionReplacement("outerInput.selection", outerAtts));
    replacements.put("outerInputAssignments",
        buildMappedAssignmentReplacements("outerInput.outAtts", "outRec.", "", mergedAtts, outerAtts));

    // get the primary hash for the outer input record.
    replacements.put("outerPrimaryHash", primaryHashExpression(outerAtts));

    // and then, the final output record.
    replacements.put("outputTypeCode", "" + getValue("output.typeCode").getInteger());

    // the attributes available to this guy are: all of inAtts + the output of the VGF.
    HashMap<String, String> finalAtts = new HashMap<String, String>();
    finalAtts.putAll(buildAttributeReplacements("function.vgOutAtts", "input_"));
    finalAtts.putAll(buildAttributeReplacements("inAtts", "seedRec."));

    replacements.put("outputAssignments",
        buildAssignmentReplacements("output.outAtts", "", finalAtts));
    replacements.put("numAttsOutput", "" + getValue("output.outAtts").getAssignmentList().size());
    replacements.put("outputSelection", buildSelectionReplacement("output.selection", finalAtts));
    replacements.put("functionDeclarations", "" + buildFunctionDeclarations());

    // then, finally, the names of all the VG inner input classes.
    ParsedRHS innerInputs = getValue("innerInputs");
    if (innerInputs != null) {
        StringBuilder innerNames = new StringBuilder();
        for (String s : innerInputs.getVarList().keySet()) {
            innerNames.append(", VGInnerInput_").append(s).append(".class");
        }

        replacements.put("innerInputClassNames", innerNames.toString());
    }

    return replacements;
}

/**
 * Builds the macro replacements for the template of one inner input
 * (see getInnerTemplateFile()).
 *
 * @param innerInputName name of the inner input under "innerInputs".
 * @param innerInputID   numeric identifier written as "innerInputID".
 * @return map from macro name to replacement text.
 * @throws RuntimeException if the inner input lacks inFiles, inAtts,
 *                          typeCode or outAtts.
 */
public Map<String, String> buildInnerMacroReplacements(String innerInputName, int innerInputID) {
    HashMap<String, String> replacements = new HashMap<String, String>();
    String prefix = "innerInputs." + innerInputName;

    Map<String, ParsedRHS> input = getValue(prefix).getVarList();
    if (!input.containsKey("inFiles") || !input.containsKey("inAtts")
            || !input.containsKey("typeCode") || !input.containsKey("outAtts")) {
        throw new RuntimeException("Incomplete inner input description!");
    }

    // now, fill the replacements
    replacements.put("innerInputName", innerInputName);
    replacements.put("innerInputTypeCode", "" + input.get("typeCode").getInteger());

    Map<String, String> mergedAtts = buildAttributeReplacements("inAtts", "");
    Map<String, String> innerAtts = buildAttributeReplacements(prefix + ".inAtts", "");

    replacements.put("innerInputSelection",
        buildSelectionReplacement(prefix + ".selection", innerAtts));
    replacements.put("innerInputAssignments",
        buildMappedAssignmentReplacements(prefix + ".outAtts", "outRec.", "", mergedAtts, innerAtts));
    replacements.put("numInnerInputAtts", "" + innerAtts.size());
    replacements.put("functionDeclarations", "" + buildFunctionDeclarations());
    replacements.put("innerInputID", "" + innerInputID);

    // get the primary hash for the inner input record.
    replacements.put("innerPrimaryHash", primaryHashExpression(innerAtts));

    return replacements;
}

/**
 * Builds the jar file for this operator. The jar building operation is
 * overridden because several Java files are involved: one per inner
 * input plus the main one.
 *
 * @param paramsIn runtime parameters; must be an ExampleRuntimeParameter.
 * @return whatever the five-argument buildJarFile() returns.
 * @throws RuntimeException if the work directories cannot be created.
 */
public String buildJarFile(RuntimeParameter paramsIn) {

    // cast the parameters; kept so that a wrong parameter type still
    // fails here with a ClassCastException.
    ExampleRuntimeParameter params = (ExampleRuntimeParameter) paramsIn;

    // get a name for the jar file, java template names and work directory
    String newJarFileName = getOperatorName() + "_" + RelOp.COUNT_OP + ".jar";
    String javaFileName = "/simsql/runtime/Macro" + getOperatorName() + RelOp.COUNT_OP + ".java";
    String workDirectory = "work_" + getOperatorName() + "_" + RelOp.COUNT_OP;
    RelOp.COUNT_OP++;

    // destroy the temp directory, if it's there
    rmdir(new File(workDirectory));

    // make the directories with their runtime/functions substructure
    if (!(new File(workDirectory + "/simsql/runtime").mkdirs())
            || !(new File(workDirectory + "/simsql/functions").mkdirs())) {
        throw new RuntimeException("Could not prepare/create the work directories for macro replacement");
    }

    // get the macro replacements for the inner.
    ArrayList<String> javaFileNames = new ArrayList<String>();
    ParsedRHS innerInputs = getValue("innerInputs");
    if (innerInputs != null) {
        int innerID = 1; // id=0 for the outer input.
        for (String innerName : innerInputs.getVarList().keySet()) {

            // NOTE(review): COUNT_OP has already been incremented here, so
            // these names carry a different counter than javaFileName.
            String newFileName = "/simsql/runtime/Macro" + getOperatorName()
                + RelOp.COUNT_OP + innerName + ".java";

            // the instance itself is not used afterwards; presumably the
            // constructor performs the replacement -- TODO confirm.
            new MacroReplacer(getInnerTemplateFile(), workDirectory + "/" + newFileName,
                buildInnerMacroReplacements(innerName, innerID));

            javaFileNames.add(newFileName);
            innerID++;
        }
    }

    javaFileNames.add(javaFileName);
    return buildJarFile(paramsIn, newJarFileName, workDirectory, javaFileName,
        javaFileNames.toArray(new String[0]));
}

/**
 * @return the name of the template Java source file.
 */
public String getTemplateFile() {
    return "VGWrapperRecords.javat";
}

/**
 * @return the name of the template used for each inner input.
 */
public String getInnerTemplateFile() {
    return "VGWrapperInnerRecord.javat";
}

/**
 * @return the mapper class.
 */
public Class<? extends Mapper> getMapperClass() {
    return VGWrapperMapper.class;
}

/**
 * @return the reducer class.
 */
public Class<? extends Reducer> getReducerClass() {
    return VGWrapperReducer.class;
}

/**
 * @return the class of the output values.
 */
public Class getOutputValueClass() {
    return VGOutputRecord.class;
}

/**
 * Creates the operator from its parsed values. The default sort order
 * of the output is the seed attribute alone.
 *
 * @param inValues parsed values, handed to the superclass constructor.
 */
public VGWrapperOp(Map<String, ParsedRHS> inValues) {
    super(inValues);

    // get the seed/merge attribute name
    seedAtt = getValue("seedAtt").getIdentifier();

    // the default sorting is on the seed.
    sortedOutputAtts.add(seedAtt);
    allSortedOutputAtts.add(sortedOutputAtts);
}

// seed attribute name
String seedAtt;

// set of cross product relations.
private ArrayList<String> crossFiles = new ArrayList<String>();
private ArrayList<String> crossTypeCodes = new ArrayList<String>();
private ArrayList<String> crossAttCounts = new ArrayList<String>();

// set of inner relations.
private ArrayList<String> sortedInnerFiles = new ArrayList<String>();
private ArrayList<String> sortedTypeCodes = new ArrayList<String>();
private ArrayList<String> sortedAttCounts = new ArrayList<String>();

// set of sorted output attributes. by default, our seed!
private Set<String> sortedOutputAtts = new HashSet<String>();
private Set<Set<String>> allSortedOutputAtts = new HashSet<Set<String>>();

// false means "run a map-side VGW."
private boolean runVGWrapperReducer = true;

/**
 * Collects every known sort order of an input file: the ones registered
 * for its feeder in sortingAttsIn plus whatever the input-side network
 * reports through findAnyAdditionalSortAttributes().
 *
 * @param fName         the input file.
 * @param sortingAttsIn known sort orders, keyed by feeder file.
 * @return a new set with all the sort orders found.
 */
public Set<Set<String>> getAllSortAtts(String fName, Map<String, Set<Set<String>>> sortingAttsIn) {
    Set<Set<String>> ret = new HashSet<Set<String>>();
    String feeder = getNetworkOnInputSide().getFeeder(fName);

    if (sortingAttsIn.containsKey(feeder)) {
        ret.addAll(sortingAttsIn.get(feeder));
    }

    ret.addAll(getNetworkOnInputSide().findAnyAdditionalSortAttributes(feeder, ret));
    return ret;
}

// selects which inner inputs will be read directly from the pipeline.
public void determineWhatAlgorithmToRun(Set<Set<String>> votes, Map<String, Set<String>> fds, Map<String, Set<Set<String>>> sortingAttsIn) { System.out.println("-------------------------------------------"); System.out.println("VGWRAPPER DECIDING ON WHAT ALGORITHM TO USE"); // first, we'll get all the sorting attributes from the outer // input and walk through the pipeline. Set<Set<String>> outerSortAtts = getAllSortAtts(getOuterFile(), sortingAttsIn); System.out.println("OUTER INPUT SORTED ON: " + outerSortAtts); // then, we will examine the inner inputs and exclude those that are cross products. Map<String, Set<Set<String>>> mergedInputs = new HashMap<String, Set<Set<String>>>(); System.out.println("INNER INPUTS: "); for (String s : getInnerInputs()) { String innerF = getInnerFile(s); System.out.print(" " + s + " [" + innerF + "] "); String feederF = getNetworkOnInputSide().getFeeder(innerF); if (isCrossProduct(s)) { System.out.println("cross product, excluded."); if (!crossFiles.contains(feederF)) { crossFiles.add(getNetworkOnInputSide().getFeeder(innerF)); crossTypeCodes.add(getInnerTypeCode(s)); crossAttCounts.add(getInnerAttCount(s)); } } else { if (feederF.equals(getOuterFile())) { System.out.println("stems from outer, excluded"); } else { Set<Set<String>> innerSortAtts = sortedFDs(seedAtt, fds, getAllSortAtts(innerF, sortingAttsIn)); mergedInputs.put(s, innerSortAtts); System.out.println("merged on " + innerSortAtts + " " + getAllSortAtts(innerF, sortingAttsIn)); } } } // then, we will group the merged inputs into equivalence classes // based on their compatible sorting atts. // map from [sort orders] -> [set of inputs] Map<Set<Set<String>>, Set<String>> eqClasses = new HashMap<Set<Set<String>>, Set<String>>(); for (String s : mergedInputs.keySet()) { // first, add them all to the map. Set<Set<String>> innerOrders = mergedInputs.get(s); if (eqClasses.containsKey(innerOrders)) { // update sets. 
eqClasses.get(innerOrders).add(s); } else { // add singleton Set<String> ss = new HashSet<String>(); ss.add(s); eqClasses.put(innerOrders, ss); } } // now, continuously update the map boolean changedMap = true; while (changedMap) { changedMap = false; // maybe we'll have to break. if (eqClasses.size() < 2) { break; } // try a pair of sort orders for (Set<Set<String>> sx : eqClasses.keySet()) { for (Set<Set<String>> sy : eqClasses.keySet()) { // don't do equals. if (sx == sy) { continue; } // check if the sorts are compatible. if (equalSorts(sx, sy) != null) { // excellent! merge them. Set<Set<String>> newKey = new HashSet<Set<String>>(); newKey.addAll(sx); newKey.addAll(sy); Set<String> newValue = new HashSet<String>(); newValue.addAll(eqClasses.get(sx)); newValue.addAll(eqClasses.get(sy)); // change the map eqClasses.remove(sx); eqClasses.remove(sy); eqClasses.put(newKey, newValue); changedMap = true; // break out break; } } // did we find a match? then let's loop back. if (changedMap) { break; } } } System.out.println("Merged input equivalence classes: " + eqClasses); // at this point, we can check for the possibility of a pure. // map-side vgwrapper. // // first, check if there are no merged inputs. if (mergedInputs.isEmpty()) { System.out.println("No merged inner inputs to exclude! Map-side VGWrapper."); // set the sorting. if (!outerSortAtts.isEmpty()) { Set<String> someSort = usableSort(outerSortAtts, getValue("outerInput.inAtts").getIdentifierList()); if (someSort != null) { sortedOutputAtts = someSort; allSortedOutputAtts = outerSortAtts; } } runVGWrapperReducer = false; return; } // then, check if there is a sort order that covers every merged // inner input, and that the outer input is sorted like that too. // look at the map for (Set<Set<String>> ss : eqClasses.keySet()) { // check the two conditions. if (eqClasses.get(ss).containsAll(mergedInputs.keySet()) && equalSorts(ss, outerSortAtts) != null) { // bingo! generate our sorting attributes. 
allSortedOutputAtts.clear(); allSortedOutputAtts.addAll(outerSortAtts); allSortedOutputAtts.addAll(ss); sortedOutputAtts = usableSort(allSortedOutputAtts, getValue("outerInput.inAtts").getIdentifierList()); System.out.println("Compatible sorts across all merged inner inputs! Map-side VGWrapper."); // see if we want to swap the outer input for something larger String largestInput = null; long largestInputSize = -1; for (String s : mergedInputs.keySet()) { String feederF = getNetworkOnInputSide().getFeeder(getInnerFile(s)); long mySize = getPathsActualSize(new String[] { feederF }); if (mySize > largestInputSize) { largestInput = s; largestInputSize = mySize; } } long outerInputSize = getPathsActualSize(new String[] { getOuterFile() }); /*** DEBUG: I'm disabling this feature because it interferes with the GMM with imputation. */ largestInputSize = -1; if (outerInputSize > 0 && largestInputSize > 0 && largestInputSize > outerInputSize) { System.out.println("Excluding the outer input to make " + largestInput + " the main branch."); sortedInnerFiles.add(getOuterFile()); sortedTypeCodes.add("" + getValue("outerInput.typeCode").getInteger()); sortedAttCounts.add("" + getValue("outerInput.inAtts").getIdentifierList().size()); largestInput = getNetworkOnInputSide().getFeeder(getInnerFile(largestInput)); } else { largestInput = "<nix>"; } // exclude all the inputs. for (String s : mergedInputs.keySet()) { String feederF = getNetworkOnInputSide().getFeeder(getInnerFile(s)); if (!sortedInnerFiles.contains(feederF) && !feederF.equals(largestInput)) { sortedInnerFiles.add(feederF); sortedTypeCodes.add(getInnerTypeCode(s)); sortedAttCounts.add(getInnerAttCount(s)); } } // we're done. runVGWrapperReducer = false; return; } } // if we get here, then we have to run a reducer. System.out.println("VGWrapper has to reduce. Trying to exclude as many inner inputs as possible."); // we'll look at the map and find the sort with the most inputs. 
Set<Set<String>> biggest = null; int biggestSize = 0; // note: this cannot be done if the empty equivalence class is // present. that basically means there is at least one relation // that had no sorts with a functional dependency compatible with // the outer input, e.g. a little join. if (!eqClasses.containsKey(new HashSet<String>())) { for (Set<Set<String>> ss : eqClasses.keySet()) { // check the conditions Set<String> ssInners = eqClasses.get(ss); if (usableSort(ss, getValue("outerInput.inAtts").getIdentifierList()) != null && biggestSize < ssInners.size()) { biggest = ss; biggestSize = ssInners.size(); } } } // did we find one? good! exclude, then. if (biggest != null) { System.out.println("VGWrapper sorting on " + biggest + " with excluded inputs."); for (String s : eqClasses.get(biggest)) { String feederF = getNetworkOnInputSide().getFeeder(getInnerFile(s)); if (!sortedInnerFiles.contains(feederF)) { sortedInnerFiles.add(feederF); sortedTypeCodes.add(getInnerTypeCode(s)); sortedAttCounts.add(getInnerAttCount(s)); } } allSortedOutputAtts.clear(); allSortedOutputAtts.addAll(biggest); sortedOutputAtts = usableSort(allSortedOutputAtts, getValue("outerInput.inAtts").getIdentifierList()); return; } // if we could not find such a sort, then we'll have to use the // seed to merge. this is the worst case and default behavior. System.out.println("VGWrapper could not find any optimized sort orders. Using default seed-based merging."); } // returns the name of the outer input. private String getOuterFile() { String[] ofArr = getValue("outerInput.inFiles").getStringList().toArray(new String[0]); if (ofArr.length > 1) { throw new RuntimeException("Not supporting more than one outer file!"); } return getNetworkOnInputSide().getFeeder(ofArr[0]); } // true if a file is the outer. 
/**
 * Checks whether the given file is the outer input.
 *
 * @param fName file name to compare against getOuterFile().
 * @return true if both names are equal.
 */
private boolean isFileOuter(String fName) {
    String outerFile = getOuterFile();
    return outerFile.equals(fName);
}

/**
 * Checks whether an input file is pipelined, i.e. whether its feeder on
 * the input-side network is a different file.
 *
 * @param fName the input file.
 * @return true if the feeder of fName is not fName itself.
 */
private boolean isFilePipelined(String fName) {
    String feeder = getNetworkOnInputSide().getFeeder(fName);
    return !feeder.equals(fName);
}

/**
 * Looks up the file of an inner input by name.
 *
 * @param innerName name of the inner input.
 * @return the single file listed for it, or null when no inner inputs
 *         are declared at all.
 * @throws RuntimeException if more than one file is listed.
 */
private String getInnerFile(String innerName) {
    if (getValue("innerInputs") == null) {
        return null;
    }

    String[] files = getValue("innerInputs." + innerName + ".inFiles")
        .getStringList()
        .toArray(new String[0]);
    if (files.length > 1) {
        throw new RuntimeException("Not supporting more than one inner file in the array!");
    }

    return files[0];
}

/**
 * Type code of an inner input, as text. It is read from the database
 * entry of the table behind the feeder of the inner file.
 *
 * @param innerName name of the inner input.
 * @return the type code rendered as a string.
 */
private String getInnerTypeCode(String innerName) {
    return "" + getDB().getTypeCode(
        getDB().getTableName(
            getNetworkOnInputSide().getFeeder(
                getInnerFile(innerName))));
}

/**
 * Number of attributes of an inner input, as text. It is read from the
 * database entry of the table behind the feeder of the inner file.
 *
 * @param innerName name of the inner input.
 * @return the attribute count rendered as a string.
 */
private String getInnerAttCount(String innerName) {
    return "" + getDB().getNumAtts(
        getDB().getTableName(
            getNetworkOnInputSide().getFeeder(
                getInnerFile(innerName))));
}

/**
 * Finds the name of the inner input that reads a given file.
 *
 * @param fName the file to look for.
 * @return the matching inner input name, or null if there is none.
 */
private String getInnerName(String fName) {
    if (getValue("innerInputs") == null) {
        return null;
    }

    for (String candidate : getInnerInputs()) {
        String candidateFile = getInnerFile(candidate);
        if (candidateFile.equals(fName)) {
            return candidate;
        }
    }

    return null;
}

/**
 * Checks whether an inner input is a cross product, that is, whether
 * its input attributes do not include the seed attribute.
 *
 * @param innerName name of the inner input; may be null.
 * @return false when there are no inner inputs or the name is null;
 *         otherwise true if the seed attribute is missing.
 */
private boolean isCrossProduct(String innerName) {
    ParsedRHS declaredInners = getValue("innerInputs");
    if (declaredInners == null || innerName == null) {
        return false;
    }

    boolean hasSeed = getValue("innerInputs." + innerName + ".inAtts")
        .getIdentifierList()
        .contains(seedAtt);
    return !hasSeed;
}

// returns the set of all inner input names.
/**
 * Names of all the inner inputs.
 *
 * @return a new set with the names; empty when no inner inputs exist.
 */
private Set<String> getInnerInputs() {
    HashSet<String> names = new HashSet<String>();

    ParsedRHS declaredInners = getValue("innerInputs");
    if (declaredInners != null) {
        names.addAll(declaredInners.getVarList().keySet());
    }

    return names;
}

/**
 * @return true if the operator declares inner inputs.
 */
private boolean haveInnerInputs() {
    ParsedRHS declaredInners = getValue("innerInputs");
    return declaredInners != null;
}

/**
 * Filters a set of sort orders, keeping those that are functionally
 * determined by one attribute: orders fully contained in the FDs of
 * that attribute, or made of that attribute alone.
 *
 * @param whichAtt  the determining attribute.
 * @param fds       functional dependencies, keyed by attribute.
 * @param sortOrder candidate sort orders; may be null.
 * @return a new set with the accepted orders; empty if whichAtt has no
 *         FDs registered or sortOrder is null.
 */
private Set<Set<String>> sortedFDs(String whichAtt, Map<String, Set<String>> fds,
        Set<Set<String>> sortOrder) {

    Set<Set<String>> accepted = new HashSet<Set<String>>();
    if (!fds.containsKey(whichAtt) || sortOrder == null) {
        return accepted;
    }

    Set<String> determined = fds.get(whichAtt);
    for (Set<String> order : sortOrder) {
        boolean coveredByFDs = determined.containsAll(order);
        if (coveredByFDs || (order.size() == 1 && order.contains(whichAtt))) {
            accepted.add(order);
        }
    }

    return accepted;
}

/**
 * Looks for a sort order present in both sets.
 *
 * @param set1 first set of sort orders; may be null.
 * @param set2 second set of sort orders; may be null.
 * @return the first order of set1 also found in set2, or null if there
 *         is none or either argument is null.
 */
private Set<String> equalSorts(Set<Set<String>> set1, Set<Set<String>> set2) {
    if (set1 != null && set2 != null) {
        Iterator<Set<String>> orders = set1.iterator();
        while (orders.hasNext()) {
            Set<String> order = orders.next();
            if (set2.contains(order)) {
                return order;
            }
        }
    }

    return null;
}

/**
 * Looks for a sort order that can be obtained from a collection of
 * attributes.
 *
 * @param set1 candidate sort orders.
 * @param set2 available attributes.
 * @return the first order whose attributes are all in set2, or null.
 */
private Set<String> usableSort(Set<Set<String>> set1, Collection<String> set2) {
    Iterator<Set<String>> orders = set1.iterator();
    while (orders.hasNext()) {
        Set<String> order = orders.next();
        if (set2.containsAll(order)) {
            return order;
        }
    }

    return null;
}

/**
 * @return a copy of all the sort orders of the output.
 */
public Set<Set<String>> getSortedOutputAtts() {
    HashSet<Set<String>> copy = new HashSet<Set<String>>();
    copy.addAll(allSortedOutputAtts);
    return copy;
}

/**
 * This operator has no "possible" sorting attributes of its own to
 * offer the next operator; it just passes on whatever it gets.
 *
 * @return a new, empty set.
 */
public Set<Set<String>> getAllPossibleSortingAtts() {
    HashSet<Set<String>> nothing = new HashSet<Set<String>>();
    return nothing;
}

// our set of preferred attributes for each input is always dependent on whatever the
// seed FDs can give us and on the type of input that we have.
private Set<Set<String>> pastVotes = new LinkedHashSet<Set<String>>(); public Set<String> getPreferredSortOrder(String fName, Map<String, Set<String>> fds, Set<Set<String>> allPossibleSortingAtts) { System.out.println("==> VOTING ON PREFERRED SORT ORDERS THROUGH THE VGWRAPPER."); // is this the outer input? if (isFileOuter(fName)) { System.out.println("VGW OUTER INPUT: VOTING null"); return null; } // is this a cross product inner input? if (isCrossProduct(getInnerName(fName))) { System.out.println("VGW CROSS PRODUCT INPUT: VOTING null"); return null; } // if we're here, it's a merged inner input. Set<Set<String>> usefulSortingAtts = sortedFDs(seedAtt, fds, allPossibleSortingAtts); // did we find something? if not, then we can't do much. if (usefulSortingAtts.isEmpty()) { System.out.println("VGW: INNER INPUT. NO USEFUL SORTS FOUND! VOTING null"); return null; } // if we did, then check if there are previous votes. if (pastVotes.isEmpty()) { // if not, then we'll vote for the most likely... System.out.println("VGW: INNER INPUT. NO PREVIOUS VOTES, VOTING FOR MOST LIKELY... "); Set<String> mostLikely = findMostLikelyVote(usefulSortingAtts); System.out.println("...VOTED FOR: " + mostLikely); pastVotes.add(mostLikely); return mostLikely; } // if we did, check if we have a previous vote. for (Set<String> s : usefulSortingAtts) { if (pastVotes.contains(s)) { System.out.println("VGW: INNER INPUT. FOUND PREVIOUS VOTE ON " + s + ". RETURNING"); return s; } } // if we are here, then it means that none of the sort orders have // been seen before and that there will have to be a reduce. System.out.println("VGW: INNER INPUT. PREVIOUS VOTES NOT USEFUL, VOTING FOR MOST LIKELY..."); Set<String> mostLikely = findMostLikelyVote(usefulSortingAtts); System.out.println("...VOTED FOR: " + mostLikely); pastVotes.add(mostLikely); return mostLikely; } private Set<String> findMostLikelyVote(Set<Set<String>> possibleVotes) { // is the outer input already sorted? 
String outerInFile = getOuterFile(); System.out.println("PHIL: " + outerInFile); if (outerInFile != null && getDB() != null && getDB().isTableSorted(getDB().getTableName(outerInFile))) { // get the sorting attribute and check. String outerSortAtt = getValue("outerInput.inAtts").getIdentifierList() .get(getDB().getTableSortingAttribute(getDB().getTableName(outerInFile))); System.out.println("ATTE: " + outerSortAtt); for (Set<String> ss : possibleVotes) { if (ss.size() == 1 && ss.contains(outerSortAtt)) { System.out.println("OUTER SORT IS COMPATIBLE!"); return ss; } } } // otherwise, get the first one. // is there only one possible vote? if so, return it. if (possibleVotes.size() == 1) { System.out.println("SINGLE OPTION!"); } for (Set<String> ss : possibleVotes) { return ss; } return null; } // exclude the cross product and sorted input files. public String[] excludeAnyWhoWillNotBeMapped(String[] inFiles) { LinkedHashSet<String> files = new LinkedHashSet<String>(); for (int i = 0; i < inFiles.length; i++) { if (crossFiles.contains(inFiles[i]) || sortedInnerFiles.contains(inFiles[i])) { continue; } files.add(inFiles[i]); } System.out.println("excluded cross product: " + crossFiles); System.out.println("excluded merged: " + sortedInnerFiles); System.out.println("actual input: " + files); return files.toArray(new String[0]); } }