Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mrql; import org.apache.mrql.gen.*; import java.io.*; import java.net.URI; import java.util.List; import java.util.Vector; import org.apache.hadoop.fs.*; import org.apache.hadoop.io.*; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.filecache.DistributedCache; /** The CrossProduct physical operation (similar to block-nested loop) */ final public class CrossProductOperation extends MapReducePlan { /** The mapper for the CrossProduct operation */ private final static class crossProductMapper extends Mapper<MRContainer, MRContainer, MRContainer, MRContainer> { private static String counter; // a Hadoop user-defined counter used in the repeat operation private static Function reduce_fnc; // the reduce function private static Function map_fnc; // the mapper function private static DataSet cached_dataset; private final static List<MRData> outer = new Vector<MRData>(Config.map_cache_size); // fix-size cache for the outer private static int index; private static MRContainer last_key; private static URI[] uris; private static Path[] local_paths; private static Function acc_fnc; // aggregator private static MRData result; // aggregation result private static Tuple pair = new Tuple(2); private static MRContainer container = new MRContainer(new MR_int(0)); private void write(MRContainer key, MRData value, Context context) throws IOException, InterruptedException { if (result != null) { // aggregation pair.set(0, result); pair.set(1, value); result = acc_fnc.eval(pair); } else if (counter.equals("-")) { container.set(value); context.write(key, container); } else { // increment the repetition counter if the repeat condition is true Tuple t = (Tuple) value; if (((MR_bool) t.second()).get()) context.getCounter("mrql", counter).increment(1); container.set(t.first()); context.write(key, container); } } @Override public void map(MRContainer key, MRContainer value, Context context) throws IOException, InterruptedException { try { last_key = key; for (MRData x : (Bag) map_fnc.eval(value.data())) if (index++ == Config.map_cache_size) { for (MRData y : cached_data(context.getConfiguration())) { pair.set(1, y); for (MRData z : outer) { pair.set(0, z); for (MRData v : (Bag) reduce_fnc.eval(pair)) write(key, v, context); } } ; index = 0; outer.clear(); } else outer.add(x); } catch (Exception e) { throw new Error("Cannot perform the crossProduct: " + e); } } protected Bag cached_data(final Configuration conf) { try { Bag res = new Bag(); final FileSystem fs = FileSystem.getLocal(conf); for (int i = 0; i < local_paths.length; i++) { // hadoop 0.20.2 distributed cache doesn't work in stand-alone final Path path = (conf.get("mapred.job.tracker").equals("local")) ? new Path(uris[i].toString()) : local_paths[i]; if (path.getName().endsWith(".jar")) continue; res = res.union(new Bag(new BagIterator() { final SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); final MRContainer key = new MRContainer(new MR_int(0)); final MRContainer value = new MRContainer(new MR_int(0)); public boolean hasNext() { try { boolean done = reader.next(key, value); if (!done) reader.close(); return done; } catch (IOException e) { throw new Error("Cannot collect values from distributed cache"); } } public MRData next() { return value.data(); } })); } ; return res; } catch (Exception e) { throw new Error("Cannot setup the cross product: " + e); } } @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); try { conf = context.getConfiguration(); Plan.conf = conf; Config.read(Plan.conf); Tree code = Tree.parse(conf.get("mrql.reducer")); reduce_fnc = functional_argument(conf, code); code = Tree.parse(conf.get("mrql.mapper")); map_fnc = functional_argument(conf, code); if (conf.get("mrql.zero") != null) { code = Tree.parse(conf.get("mrql.zero")); result = Interpreter.evalE(code); code = Tree.parse(conf.get("mrql.accumulator")); acc_fnc = functional_argument(conf, code); } else result = null; counter = conf.get("mrql.counter"); uris = DistributedCache.getCacheFiles(conf); local_paths = DistributedCache.getLocalCacheFiles(conf); index = 0; } catch (Exception e) { throw new Error("Cannot setup the crossProduct: " + e); } } @Override protected void cleanup(Context context) throws IOException, InterruptedException { if (index > 0) try { for (MRData y : cached_data(context.getConfiguration())) { pair.set(1, y); for (MRData z : outer) { pair.set(0, z); for (MRData v : (Bag) reduce_fnc.eval(pair)) write(last_key, v, context); } } ; } catch (Exception e) { throw new Error("Cannot cleanup the crossProduct: " + e); } ; index = 0; outer.clear(); if (result != null) // emit the result of aggregation context.write(new MRContainer(new MR_int(0)), new MRContainer(result)); super.cleanup(context); } } /** The CrossProduct physical operator (similar to block-nested loop) * @param mx left mapper * @param my right mapper * @param reduce_fnc reducer * @param acc_fnc optional accumulator function * @param zero optional the zero value for the accumulator * @param X the left source * @param Y the right source (stored in distributed cache) * @param stop_counter optional counter used in repeat operation * @return a new data source that contains the result */ public final static DataSet crossProduct(Tree mx, // left mapper Tree my, // right mapper Tree reduce_fnc, // reducer Tree acc_fnc, // optional accumulator function Tree zero, // optional the zero value for the accumulator DataSet X, // the left source DataSet Y, // the right source (stored in distributed cache) String stop_counter) // optional counter used in repeat operation throws Exception { DataSet ds = MapOperation.cMap(my, null, null, Y, "-"); conf = MapReduceEvaluator.clear_configuration(conf); String newpath = new_path(conf); conf.set("mrql.reducer", reduce_fnc.toString()); conf.set("mrql.mapper", mx.toString()); if (zero != null) { conf.set("mrql.accumulator", acc_fnc.toString()); conf.set("mrql.zero", zero.toString()); } else conf.set("mrql.zero", ""); conf.set("mrql.counter", stop_counter); setupSplits(new DataSet[] { X, Y }, conf); Job job = new Job(conf, newpath); distribute_compiled_arguments(job.getConfiguration()); job.setJarByClass(MapReducePlan.class); job.setOutputKeyClass(MRContainer.class); job.setOutputValueClass(MRContainer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); PathFilter pf = new PathFilter() { public boolean accept(Path path) { return !path.getName().startsWith("_"); } }; for (DataSource p : ds.source) { Path path = new Path(p.path); for (FileStatus s : path.getFileSystem(conf).listStatus(path, pf)) DistributedCache.addCacheFile(s.getPath().toUri(), job.getConfiguration()); } ; for (DataSource p : X.source) MultipleInputs.addInputPath(job, new Path(p.path), (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, crossProductMapper.class); FileOutputFormat.setOutputPath(job, new Path(newpath)); job.setNumReduceTasks(0); job.waitForCompletion(true); long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue(); return new DataSet(new BinaryDataSource(newpath, conf), c, outputRecords(job)); } }