// UDTFTaskContextImpl.java — UDTF-backed TaskContext for the ODPS MapReduce bridge.
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.aliyun.odps.mapred.bridge;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.ArrayUtils;

import com.aliyun.odps.Column;
import com.aliyun.odps.OdpsType;
import com.aliyun.odps.counter.Counter;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.data.VolumeInfo;
import com.aliyun.odps.io.Text;
import com.aliyun.odps.io.Writable;
import com.aliyun.odps.mapred.Mapper;
import com.aliyun.odps.mapred.Reducer;
import com.aliyun.odps.mapred.TaskContext;
import com.aliyun.odps.mapred.TaskId;
import com.aliyun.odps.mapred.bridge.utils.ResourceUtils;
import com.aliyun.odps.mapred.conf.BridgeJobConf;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.pipeline.Pipeline;
import com.aliyun.odps.pipeline.Pipeline.TransformNode;
import com.aliyun.odps.udf.ExecutionContext;

/**
 * Base {@link TaskContext} implementation for the UDTF bridge.
 *
 * <p>Responsibilities visible in this class:
 * <ul>
 *   <li>Delegating counters, progress, resources and volume access to the
 *       underlying {@link ExecutionContext}.</li>
 *   <li>Supporting pipeline-mode MapReduce ({@link Pipeline}) by resolving the
 *       current {@link TransformNode} from the task id.</li>
 *   <li>Packing the output schemas of all declared output tables into a single
 *       "packaged" schema so multi-insert and inner output can be multiplexed
 *       through one UDTF output, with extra trailing label columns.</li>
 * </ul>
 */
public abstract class UDTFTaskContextImpl implements TaskContext {

  protected BridgeJobConf conf;
  protected TaskId taskId;
  protected ExecutionContext ctx;

  // pipeline mode
  protected Pipeline pipeline;
  // true when this job runs as a pipelined MapReduce
  protected boolean pipeMode;
  protected int pipeIndex = -1;
  protected TransformNode pipeNode;
  protected int reducerNum = 0;

  // for inner output
  protected boolean innerOutput;

  /**
   * Builds the context from the job configuration, detecting pipeline mode and
   * computing the packaged output schema eagerly.
   *
   * @param conf bridge job configuration
   */
  public UDTFTaskContextImpl(BridgeJobConf conf) {
    this.conf = conf;
    this.pipeline = Pipeline.fromJobConf(conf);
    if (this.pipeline != null) {
      this.pipeMode = true;
    }
    // Mappers additionally require the mapper-specific inner-output switch.
    innerOutput = isMapper()
        ? (conf.getInnerOutputEnable() && conf.getMapperInnerOutputEnable())
        : conf.getInnerOutputEnable();
    initOutputSchema();
  }

  /** Overridden by mapper-side contexts; the base class answers false. */
  boolean isMapper() {
    return false;
  }

  @Override
  public Class<? extends Reducer> getCombinerClass() {
    return conf.getCombinerClass();
  }

  @Override
  public Column[] getMapOutputKeySchema() {
    return conf.getMapOutputKeySchema();
  }

  @Override
  public Column[] getMapOutputValueSchema() {
    return conf.getMapOutputValueSchema();
  }

  @Override
  public Class<? extends Mapper> getMapperClass() {
    return conf.getMapperClass();
  }

  @Override
  public int getNumReduceTasks() {
    return reducerNum;
  }

  @Override
  public Class<? extends Reducer> getReducerClass() {
    return conf.getReducerClass();
  }

  @Override
  public Record createOutputRecord() throws IOException {
    return createOutputRecord(TableInfo.DEFAULT_LABEL);
  }

  /**
   * Creates a writable record matching the output schema of the given label.
   *
   * @throws IOException if the label was not declared as an output
   */
  @Override
  public Record createOutputRecord(String label) throws IOException {
    if (!hasLabel(label)) {
      throw new IOException(ErrorCode.NO_SUCH_LABEL.toString() + " " + label);
    }
    return new WritableRecord(conf.getOutputSchema(label));
  }

  @Override
  public Counter getCounter(Enum<?> key) {
    return ctx.getCounter(key);
  }

  @Override
  public Counter getCounter(String groupName, String counterName) {
    return ctx.getCounter(groupName, counterName);
  }

  @Override
  public TaskId getTaskID() {
    return taskId;
  }

  /** Signals liveness to the framework so the worker is not killed as stuck. */
  @Override
  public void progress() {
    ctx.claimAlive();
  }

  @Override
  public BufferedInputStream readResourceFileAsStream(String name) throws IOException {
    return ResourceUtils.readResourceFileAsStream(name);
  }

  @Override
  public Iterable<BufferedInputStream> readResourceArchiveAsStream(String resourceName)
      throws IOException {
    return ResourceUtils.readResourceArchiveFileAsStream(resourceName);
  }

  @Override
  public Iterable<BufferedInputStream> readResourceArchiveAsStream(String resourceName,
      String relativePath) throws IOException {
    return ResourceUtils.readResourceArchiveFileAsStream(resourceName, relativePath);
  }

  @Override
  public Iterator<Record> readResourceTable(String tbl) throws IOException {
    return ResourceUtils.readResourceTable(tbl);
  }

  @Override
  public JobConf getJobConf() {
    return conf;
  }

  /**
   * Creates an intermediate key record. In pipeline mode the current map node's
   * key schema wins over the plain job configuration.
   */
  @Override
  public Record createMapOutputKeyRecord() throws IOException {
    // "map".equals(...) is null-safe should the node report no type.
    if (pipeMode && pipeNode != null && "map".equals(pipeNode.getType())) {
      return new WritableRecord(pipeNode.getOutputKeySchema());
    } else {
      return new WritableRecord(conf.getMapOutputKeySchema());
    }
  }

  /**
   * Creates an intermediate value record. In pipeline mode the current map
   * node's value schema wins over the plain job configuration.
   */
  @Override
  public Record createMapOutputValueRecord() throws IOException {
    if (pipeMode && pipeNode != null && "map".equals(pipeNode.getType())) {
      return new WritableRecord(pipeNode.getOutputValueSchema());
    } else {
      return new WritableRecord(conf.getMapOutputValueSchema());
    }
  }

  /**
   * For pipeline mode only: key record of the current node's output.
   *
   * @return a new record, or {@code null} when not in pipeline mode
   */
  @Override
  public Record createOutputKeyRecord() throws IOException {
    if (pipeMode && pipeNode != null) {
      return new WritableRecord(pipeNode.getOutputKeySchema());
    } else {
      return null;
    }
  }

  /**
   * For pipeline mode only: value record of the current node's output.
   *
   * @return a new record, or {@code null} when not in pipeline mode
   */
  @Override
  public Record createOutputValueRecord() throws IOException {
    if (pipeMode && pipeNode != null) {
      return new WritableRecord(pipeNode.getOutputValueSchema());
    } else {
      return null;
    }
  }

  public boolean isPipelineMode() {
    return this.pipeMode;
  }

  public Pipeline getPipeline() {
    return this.pipeline;
  }

  public TransformNode getCurrentNode() {
    return this.pipeNode;
  }

  @Override
  public TableInfo[] getOutputTableInfo() {
    return OutputUtils.getTables(conf);
  }

  /**
   * ***********************************
   * Support for multi insert and inner output
   *
   * ************************************
   */

  // Maps each output label to the column offset of its schema inside the
  // packaged (concatenated) output schema.
  protected Map<String, Integer> label2offset = new HashMap<String, Integer>();
  protected Column[] packagedOutputSchema;
  private static final String MULTIDEST_LABEL = "MULTIDEST_LABEL";
  private static final String INNEROUTPUT_LABEL = "INNEROUTPUT_LABEL";
  protected int innerOutputIndex = 0;

  /**
   * Concatenates the schemas of all output tables into one packaged schema.
   * Tables whose column-type sequence already appears in the concatenation are
   * merged onto the existing offset. One trailing STRING label column is
   * appended for multi-insert and one for inner output, when enabled.
   */
  private void initOutputSchema() {
    TableInfo[] tables = getOutputTableInfo();
    if (tables == null) {
      // No outputs declared: emit a single placeholder column.
      packagedOutputSchema = new Column[] { new Column("nil", OdpsType.STRING) };
      return;
    }
    List<Column[]> schemas = new ArrayList<Column[]>();
    List<OdpsType> outputColumnTypes = new ArrayList<OdpsType>();
    boolean multiInsert = tables.length > 1;
    int length = 0;
    for (TableInfo t : tables) {
      Column[] output;
      if (t.getLabel() == null) {
        output = conf.getOutputSchema();
      } else {
        output = conf.getOutputSchema(t.getLabel());
      }
      List<OdpsType> tbColumnTypes = new ArrayList<OdpsType>();
      for (Column col : output) {
        tbColumnTypes.add(col.getType());
      }
      // Reuse the offset if the same column-type run already exists.
      int idx = Collections.indexOfSubList(outputColumnTypes, tbColumnTypes);
      if (idx >= 0) {
        // merge columns for tableinfos with the same schema
        label2offset.put(t.getLabel(), idx);
        continue;
      }
      label2offset.put(t.getLabel(), length);
      for (Column col : output) {
        outputColumnTypes.add(col.getType());
      }
      length += output.length;
      schemas.add(output);
    }
    // If multi insert, add 1 additional label field
    length += (multiInsert ? 1 : 0);
    // If inner output, add 1 additional label field
    length += (innerOutput ? 1 : 0);
    Column[] outputFields = new Column[length];
    length = 0;
    for (Column[] r : schemas) {
      for (Column f : r) {
        outputFields[length] = f;
        length++;
      }
    }
    if (multiInsert) {
      outputFields[length] = new Column(MULTIDEST_LABEL, OdpsType.STRING);
      length++;
    }
    if (innerOutput) {
      outputFields[length] = new Column(INNEROUTPUT_LABEL, OdpsType.STRING);
    }
    packagedOutputSchema = outputFields;
  }

  protected Column[] getPackagedOutputSchema() {
    return packagedOutputSchema;
  }

  /** Key schema followed by value schema, as one flat column array. */
  protected Column[] getIntermediateOutputSchema() {
    Column[] intermediateFields =
        (Column[]) ArrayUtils.addAll(getMapOutputKeySchema(), getMapOutputValueSchema());
    return intermediateFields;
  }

  @Override
  public String[] getGroupingColumns() {
    return conf.getOutputGroupingColumns();
  }

  /**
   * Binds this context to a running worker: resolves the task id and, in
   * pipeline mode, the current pipeline node and the downstream reducer count.
   */
  protected void configure(ExecutionContext ctx) {
    taskId = new TaskId(ctx.getStageID(), ctx.getWorkerID());
    this.ctx = ctx;
    if (this.pipeMode) {
      // TODO: hack task index here
      String tid = getTaskID().toString();
      System.out.println("Task ID: " + tid);
      // Task ids look like "M1_...": strip the leading letter, 1-based index.
      this.pipeIndex = Integer.parseInt(tid.split("_")[0].substring(1)) - 1;
      this.pipeNode = pipeline.getNode(pipeIndex);
    }
    if (pipeMode && pipeNode != null) {
      if (pipeNode.getNextNode() != null) {
        reducerNum = pipeNode.getNextNode().getNumTasks();
      } else if (pipeIndex > 0) {
        // the last but not the first node of pipeline, must be reduce node
        reducerNum = pipeNode.getNumTasks();
      } else {
        reducerNum = 0;
      }
    } else {
      reducerNum = conf.getNumReduceTasks();
    }
  }

  /** True when the label was registered during schema packing. */
  protected boolean hasLabel(String label) {
    // Offsets are never null, so containsKey is equivalent to get(...) != null.
    return label2offset.containsKey(label);
  }

  /**
   * Lays a record out into the packaged row: columns are copied to the label's
   * offset and the trailing label column(s) are filled. For a plain single
   * output the record's own array is returned untouched.
   */
  protected Writable[] createOutputRow(Record r, String label) {
    if (label2offset.size() == 1 && !innerOutput) {
      // not a multi-insert
      return ((WritableRecord) r).toWritableArray();
    } else {
      // assert validateLabel(label) == true;
      Writable[] row = new Writable[packagedOutputSchema.length];
      Writable[] output = ((WritableRecord) r).toWritableArray();
      System.arraycopy(output, 0, row, label2offset.get(label), output.length);
      // Fill trailing label columns from the end of the row backwards.
      int index = row.length - 1;
      if (innerOutput) {
        row[index] = new Text("INNER_OUTPUT");
        index--;
      }
      if (label2offset.size() > 1) {
        row[index] = new Text(label);
      }
      return row;
    }
  }

  /**
   * Like {@link #createOutputRow}, but for pre-extracted writable arrays and
   * with an explicit inner-output flag: inner-output rows are shifted by the
   * inner-output base index plus the label offset, normal rows start at 0.
   */
  protected Writable[] createInnerOutputRow(Writable[] w, boolean isInnerOutput,
      String innerOutputLabel, String multiOutputLabel) {
    if (label2offset.size() == 1 && !innerOutput) {
      // not a multi-insert
      return w;
    } else {
      // assert validateLabel(label) == true;
      Writable[] row = new Writable[packagedOutputSchema.length];
      Writable[] output = w;
      if (isInnerOutput) {
        System.arraycopy(output, 0, row,
            innerOutputIndex + label2offset.get(multiOutputLabel), output.length);
      } else {
        System.arraycopy(output, 0, row, 0, output.length);
      }
      int index = row.length - 1;
      if (innerOutput) {
        row[index] = new Text(innerOutputLabel);
        index--;
      }
      if (label2offset.size() > 1) {
        row[index] = new Text(multiOutputLabel);
      }
      return row;
    }
  }

  /**
   * Next progress-report threshold: grows 10x until the base is reached, then
   * linearly by the base (10k for memory stats, 1M otherwise).
   */
  protected long getNextCntr(long cntr, boolean isMem) {
    long statBase = isMem ? 10000 : 1000000;
    if (cntr >= statBase) {
      return cntr + statBase;
    }
    return 10 * cntr;
  }

  @Override
  public VolumeInfo getInputVolumeInfo() throws IOException {
    return ctx.getInputVolumeInfo();
  }

  @Override
  public VolumeInfo getInputVolumeInfo(String label) throws IOException {
    return ctx.getInputVolumeInfo(label);
  }

  @Override
  public VolumeInfo getOutputVolumeInfo() throws IOException {
    return ctx.getOutputVolumeInfo();
  }

  @Override
  public VolumeInfo getOutputVolumeInfo(String label) throws IOException {
    return ctx.getOutputVolumeInfo(label);
  }

  @Override
  public com.aliyun.odps.volume.FileSystem getInputVolumeFileSystem() throws IOException {
    return ctx.getInputVolumeFileSystem();
  }

  @Override
  public com.aliyun.odps.volume.FileSystem getInputVolumeFileSystem(String label)
      throws IOException {
    return ctx.getInputVolumeFileSystem(label);
  }

  @Override
  public com.aliyun.odps.volume.FileSystem getOutputVolumeFileSystem() throws IOException {
    return ctx.getOutputVolumeFileSystem();
  }

  @Override
  public com.aliyun.odps.volume.FileSystem getOutputVolumeFileSystem(String label)
      throws IOException {
    return ctx.getOutputVolumeFileSystem(label);
  }
}