/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.mapReduceLayer;

import java.io.IOException;
import java.io.OutputStream;
import java.text.NumberFormat;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.pig.StoreConfig;
import org.apache.pig.StoreFunc;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStoreImpl;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.util.PlanHelper;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.util.ObjectSerializer;
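// A minimal usage sketch (assumed caller; the surrounding variables are
// hypothetical, not part of this class). From inside a map or reduce task,
// the wiring would look roughly like this:
//
//   MapReducePOStoreImpl storeImpl = new MapReducePOStoreImpl(jobConf);
//   storeImpl.setReporter(reporter);          // must precede createStoreFunc
//   StoreFunc storer = storeImpl.createStoreFunc(fileSpec, schema);
//   storer.putNext(tuple);                    // once per output tuple
//   storeImpl.tearDown();                     // closes the RecordWriter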
/**
 * This class is used to have a POStore write to DFS via an output
 * collector/record writer. It sets up a modified job configuration to
 * force a write to a specific subdirectory of the main output
 * directory. This is done so that multiple output directories can be
 * used in the same job. Since the Hadoop framework requires a
 * reporter to be available to create the record writer, the main
 * function (createStoreFunc) has to be called from within a map or
 * reduce function.
 */
public class MapReducePOStoreImpl extends POStoreImpl {

    private PigContext pc;
    private StoreFunc storer;
    private FileSpec sFile;
    private Reporter reporter;
    private RecordWriter writer;
    private JobConf job;

    private final Log log = LogFactory.getLog(getClass());

    public static final String PIG_STORE_CONFIG = "pig.store.config";

    public MapReducePOStoreImpl(JobConf job) {
        this.job = job;
    }

    public void setReporter(Reporter reporter) {
        this.reporter = reporter;
    }

    @Override
    public StoreFunc createStoreFunc(FileSpec sFile, Schema schema)
            throws IOException {
        // Set up a new job conf.
        JobConf outputConf = new JobConf(job);
        String tmpPath = PlanHelper.makeStoreTmpPath(sFile.getFileName());

        // If the StoreFunc associated with the POStore implements
        // getStorePreparationClass() and returns a non-null value, it may
        // want to supply its own OutputFormat for writing out to Hadoop.
        // Check if this is the case; if so, use the OutputFormat class the
        // StoreFunc gives us, else use our default PigOutputFormat.
        Object storeFunc = PigContext.instantiateFuncFromSpec(sFile.getFuncSpec());
        Class sPrepClass = null;
        try {
            sPrepClass = ((StoreFunc) storeFunc).getStorePreparationClass();
        } catch (AbstractMethodError e) {
            // This is for backward compatibility, wherein an old StoreFunc
            // which does not implement getStorePreparationClass() is being
            // used. In this case, we want to just use PigOutputFormat.
            sPrepClass = null;
        }
        if (sPrepClass != null && OutputFormat.class.isAssignableFrom(sPrepClass)) {
            outputConf.setOutputFormat(sPrepClass);
        } else {
            outputConf.setOutputFormat(PigOutputFormat.class);
        }

        // PigOutputFormat will look for pig.storeFunc to actually write
        // stuff out. Serialize the store func spec using ObjectSerializer.
        // ObjectSerializer.serialize() uses default Java serialization and
        // then further encodes the output so that control characters get
        // encoded as regular characters. Otherwise any control characters
        // in the store func spec would break the job.xml which is created
        // by Hadoop from the job conf.
        outputConf.set("pig.storeFunc",
                ObjectSerializer.serialize(sFile.getFuncSpec().toString()));

        // We set the output dir to the final location of the output. The
        // output dir set in the original job config points to the temp
        // location for the multi store.
        Path outputDir = new Path(sFile.getFileName()).makeQualified(FileSystem.get(outputConf));
        outputConf.set("mapred.output.dir", outputDir.toString());

        // Set the schema.
        outputConf.set(PIG_STORE_CONFIG,
                ObjectSerializer.serialize(new StoreConfig(outputDir.toString(), schema)));

        // The work path is set to a unique-per-store subdirectory of the
        // current working directory.
        String workPath = outputConf.get("mapred.work.output.dir");
        outputConf.set("mapred.work.output.dir",
                new Path(workPath, tmpPath).toString());
        OutputFormat outputFormat = outputConf.getOutputFormat();

        // Generate a unique part name (part-<task_partition_number>).
        String fileName = getPartName(outputConf);
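        // For illustration only (values are hypothetical): with
        // mapred.task.partition = 3, fileName is "part-00003", and the
        // record writer created below will typically place that file under
        // <original mapred.work.output.dir>/<tmpPath>, so each store in
        // the job writes into its own subdirectory.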
        // Create a new record writer.
        writer = outputFormat.getRecordWriter(FileSystem.get(outputConf),
                outputConf, fileName, reporter);

        // Return an output collector using the writer we just created.
        return new StoreFuncAdaptor(new OutputCollector() {
            @SuppressWarnings("unchecked")
            public void collect(Object key, Object value) throws IOException {
                writer.write(key, value);
            }
        });
    }

    @Override
    public void tearDown() throws IOException {
        if (writer != null) {
            writer.close(reporter);
            writer = null;
        }
    }

    @Override
    public void cleanUp() throws IOException {
        if (writer != null) {
            writer.close(reporter);
            writer = null;
        }
    }

    private String getPartName(JobConf conf) {
        int partition = conf.getInt("mapred.task.partition", -1);
        NumberFormat numberFormat = NumberFormat.getInstance();
        numberFormat.setMinimumIntegerDigits(5);
        numberFormat.setGroupingUsed(false);
        return "part-" + numberFormat.format(partition);
    }

    /**
     * This is a simple adaptor class to allow the physical store operator
     * to be used in the map-reduce case. It allows tuples to be written
     * through an output collector instead of an output stream.
     */
    private class StoreFuncAdaptor implements StoreFunc {
        private OutputCollector collector;

        public StoreFuncAdaptor(OutputCollector collector) {
            this.collector = collector;
        }

        @Override
        public void bindTo(OutputStream os) throws IOException {
        }

        @Override
        public void putNext(Tuple f) throws IOException {
            collector.collect(null, f);
        }

        @Override
        public void finish() throws IOException {
        }

        @Override
        public Class getStorePreparationClass() throws IOException {
            return null;
        }
    }
}
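// A minimal sketch (not part of the original file) of a StoreFunc that
// supplies its own OutputFormat via getStorePreparationClass(). The class
// name is hypothetical; createStoreFunc() above would see the non-null,
// OutputFormat-compatible return value and install it on the job conf in
// place of PigOutputFormat:
//
//   public class SequenceFileStorage implements StoreFunc {
//       public void bindTo(OutputStream os) throws IOException { }
//       public void putNext(Tuple f) throws IOException {
//           // In this code path tuples flow through the RecordWriter
//           // created in createStoreFunc(), not through this method.
//       }
//       public void finish() throws IOException { }
//       public Class getStorePreparationClass() throws IOException {
//           return org.apache.hadoop.mapred.SequenceFileOutputFormat.class;
//       }
//   }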