Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.aliyun.odps.mapred.bridge.streaming;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.aliyun.odps.Column;
import com.aliyun.odps.OdpsType;
import com.aliyun.odps.conf.Configuration;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.io.Text;
import com.aliyun.odps.mapred.Mapper;
import com.aliyun.odps.mapred.Reducer;
import com.aliyun.odps.mapred.TaskContext;
import com.aliyun.odps.mapred.bridge.streaming.io.InputWriter;
import com.aliyun.odps.mapred.bridge.streaming.io.LineReader;
import com.aliyun.odps.mapred.bridge.streaming.io.OutputReader;
import com.aliyun.odps.mapred.bridge.streaming.io.RecordOutputReader;
import com.aliyun.odps.mapred.bridge.streaming.io.TextInputWriter;
import com.aliyun.odps.mapred.bridge.streaming.io.TextOutputReader;
import com.aliyun.odps.mapred.conf.BridgeJobConf;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.utils.ReflectionUtils;

/**
 * Shared functionality for PipeMapper, PipeReducer.
 */
public abstract class PipeMapRed {

  protected static final Log LOG = LogFactory.getLog(PipeMapRed.class.getName());

  /**
   * Returns the Configuration.
   */
  public Configuration getConfiguration() {
    return job_;
  }

  /**
   * Returns the DataOutput to which the client input is written.
   */
  public DataOutput getClientOutput() {
    return clientOut_;
  }

  /**
   * Returns the DataInput from which the client output is read.
   */
  public DataInput getClientInput() {
    return clientIn_;
  }

  /**
   * Returns the input separator to be used.
   */
  public abstract byte[] getInputSeparator();

  /**
   * Returns the field separator to be used.
   */
  public abstract byte[] getFieldSeparator();

  /**
   * Returns the number of key fields.
   */
  public abstract int getNumOfKeyFields();

  abstract boolean getDoPipe();

  /**
   * Returns the command to be spawned as a subprocess.
   * Mapper/Reducer operations will delegate to it
   */
  abstract String getPipeCommand(JobConf job);

  public int getNumOfOutputFields() {
    return job_.getOutputSchema().length;
  }

  final static int OUTSIDE = 1;
  final static int SINGLEQ = 2;
  final static int DOUBLEQ = 3;
  private final static int BUFFER_SIZE = 128 * 1024;

  static String[] splitArgs(String args) {
    ArrayList argList = new ArrayList();
    char[] ch = args.toCharArray();
    int clen = ch.length;
    int state = OUTSIDE;
    int argstart = 0;
    for (int c = 0; c <= clen; c++) {
      boolean last = (c == clen);
      int lastState = state;
      boolean endToken = false;
      if (!last) {
        if (ch[c] == '\'') {
          if (state == OUTSIDE) {
            state = SINGLEQ;
          } else if (state == SINGLEQ) {
            state = OUTSIDE;
          }
          endToken = (state != lastState);
        } else if (ch[c] == '"') {
          if (state == OUTSIDE) {
            state = DOUBLEQ;
          } else if (state == DOUBLEQ) {
            state = OUTSIDE;
          }
          endToken = (state != lastState);
        } else if (ch[c] == ' ') {
          if (state == OUTSIDE) {
            endToken = true;
          }
        }
      }
      if (last || endToken) {
        if (c == argstart) {
          // unquoted space
        } else {
          String a;
          a = args.substring(argstart, c);
          argList.add(a);
        }
        argstart = c + 1;
        lastState = state;
      }
    }
    return (String[]) argList.toArray(new String[0]);
  }

  public void configure(JobConf job) {
    try {
      String argv = getPipeCommand(job);
      if (argv == null) {
        throw new RuntimeException("streaming pipe cmd is null");
      }

      joinDelay_ = job.getLong("stream.joindelay.milli", 0);

      job_ = new BridgeJobConf(job);

      mapInputWriterClass_ =
          job_.getClass("stream.map.input.writer.class", TextInputWriter.class, InputWriter.class);
      mapOutputReaderClass_ =
          job_.getClass("stream.map.output.reader.class", TextOutputReader.class, OutputReader.class);
      reduceInputWriterClass_ =
          job_.getClass("stream.reduce.input.writer.class", TextInputWriter.class, InputWriter.class);
      reduceOutputReaderClass_ =
          job_.getClass("stream.reduce.output.reader.class", TextOutputReader.class, OutputReader.class);
      nonZeroExitIsFailure_ = job_.getBoolean("stream.non.zero.exit.is.failure", true);

      doPipe_ = getDoPipe();
      if (!doPipe_) {
        return;
      }

      setStreamJobDetails(job);

      String[] argvSplit = splitArgs(argv);
      String prog = argvSplit[0];
      //File currentDir = new File(".").getAbsoluteFile();
      //if (new File(prog).isAbsolute()) {
      //  // we don't own it. Hope it is executable
      //} else {
      //  FileUtil.chmod(new File(currentDir, prog).toString(), "a+x");
      //}

      //
      // argvSplit[0]:
      // An absolute path should be a preexisting valid path on all TaskTrackers
      // A relative path is converted into an absolute pathname by looking
      // up the PATH env variable.
      // If it still fails, look it up in the
      // tasktracker's local working directory
      //
      //if (!new File(argvSplit[0]).isAbsolute()) {
      //  PathFinder finder = new PathFinder("PATH");
      //  finder.prependPathComponent(currentDir.toString());
      //  File f = finder.getAbsolutePath(argvSplit[0]);
      //  if (f != null) {
      //    argvSplit[0] = f.getAbsolutePath();
      //  }
      //  f = null;
      //}
      LOG.info("PipeMapRed exec " + Arrays.asList(argvSplit));

      Properties childEnv = new Properties();
      addJobConfToEnvironment(job_, childEnv);
      addEnvironment(childEnv, job_.get("stream.addenvironment"));
      // add TMPDIR environment variable with the value of java.io.tmpdir
      // FIXME
      envPut(childEnv, "TMPDIR", System.getProperty("java.io.tmpdir"));
      envPut(childEnv, "TABLE_RESOURCE_READER", "../table_resource_reader");

      final Map<String, String> envMap = new HashMap<String, String>();
      for (String key : childEnv.stringPropertyNames()) {
        envMap.put(key, childEnv.getProperty(key));
      }

      // Start the process
      sim = StreamSecurityHelper.startChildProcess(argvSplit, envMap);

      clientOut_ = new DataOutputStream(new BufferedOutputStream(sim.getOutputStream(), BUFFER_SIZE));
      clientIn_ = new DataInputStream(new BufferedInputStream(sim.getInputStream(), BUFFER_SIZE));
      clientErr_ = new DataInputStream(new BufferedInputStream(sim.getErrorStream()));
      startTime_ = System.currentTimeMillis();
    } catch (IOException e) {
      LOG.error("configuration exception", e);
      throw new RuntimeException("configuration exception", e);
      //} catch (InterruptedException e) {
      //  LOG.error("configuration exception", e);
      //  throw new RuntimeException("configuration exception", e);
    }
  }

  void setStreamJobDetails(JobConf job) {
    String s = job.get("stream.minRecWrittenToEnableSkip_");
    if (s != null) {
      minRecWrittenToEnableSkip_ = Long.parseLong(s);
      LOG.info("JobConf set minRecWrittenToEnableSkip_ =" + minRecWrittenToEnableSkip_);
    }
  }

  void addJobConfToEnvironment(JobConf conf, Properties env) {
    Iterator it = conf.iterator();
    while (it.hasNext()) {
      Map.Entry en = (Map.Entry) it.next();
      String name = (String) en.getKey();
      //String value = (String)en.getValue(); // does not apply variable expansion
      String value = conf.get(name); // does variable expansion
      name = safeEnvVarName(name);
      envPut(env, name, value);
    }
  }

  String safeEnvVarName(String var) {
    StringBuffer safe = new StringBuffer();
    int len = var.length();
    for (int i = 0; i < len; i++) {
      char c = var.charAt(i);
      char s;
      if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
        s = c;
      } else {
        s = '_';
      }
      safe.append(s);
    }
    return safe.toString();
  }

  void addEnvironment(Properties env, String nameVals) {
    // encoding "a=b c=d" from StreamJob
    if (nameVals == null) {
      return;
    }
    String[] nv = nameVals.split(" ");
    for (int i = 0; i < nv.length; i++) {
      String[] pair = nv[i].split("=", 2);
      if (pair.length != 2) {
        LOG.info("Skip env entry:" + nv[i]);
      } else {
        envPut(env, pair[0], pair[1]);
      }
    }
  }

  void envPut(Properties env, String name, String value) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Add env entry:" + name + "=" + value);
    }
    env.put(name, value);
  }

  void startOutputThreads(TaskContext context) throws IOException {
    inWriter_ = createInputWriter();
    outReader_ = createOutputReader();
    outThread_ = new MROutputThread(outReader_, context);
    outThread_.start();
    errThread_ = new MRErrorThread();
    errThread_.setReporter(context);
    errThread_.start();
  }

  void waitOutputThreads() throws IOException {
    try {
      if (outThread_ == null) {
        // This happens only when the reducer has empty input (so reduce() is not
        // called at all in this task).
        // If the reducer still generates output, which is very uncommon, we may not
        // have to support this case. So we don't write this output to HDFS, but we
        // consume/collect it just to avoid the reducer hanging forever.
        //OutputCollector collector = new OutputCollector() {
        //  public void collect(Object key, Object value) throws IOException {
        //    //just consume it, no need to write the record anywhere
        //  }
        //};
        //Reporter reporter = null; //dummy reporter
        startOutputThreads(null);
      }
      int exitVal = sim.waitFor();
      System.err.println("Streaming subprocess exited with code " + exitVal);
      if (outThread_ != null) {
        outThread_.join(joinDelay_);
      }
      if (errThread_ != null) {
        errThread_.join(joinDelay_);
      }
      if (outerrThreadsThrowable != null) {
        throw new RuntimeException(outerrThreadsThrowable);
      }
      if (exitVal != 0 && nonZeroExitIsFailure_) {
        throw new RuntimeException("Streaming subprocess failed with code " + exitVal
            + ", see stderr of failed worker for perhaps more info.");
      }
    } catch (InterruptedException e) {
      //ignore
    }
  }

  abstract InputWriter createInputWriter() throws IOException;

  InputWriter createInputWriter(Class<? extends InputWriter> inputWriterClass) throws IOException {
    InputWriter inputWriter = ReflectionUtils.newInstance(inputWriterClass, job_);
    inputWriter.initialize(this);
    return inputWriter;
  }

  abstract OutputReader createOutputReader() throws IOException;

  OutputReader createOutputReader(Class<? extends OutputReader> outputReaderClass) throws IOException {
    OutputReader outputReader = ReflectionUtils.newInstance(outputReaderClass, job_);
    outputReader.initialize(this);
    return outputReader;
  }

  class MROutputThread extends Thread {

    MROutputThread(OutputReader outReader, TaskContext context) {
      setDaemon(true);
      this.outReader = outReader;
      this.context = context;
      if (this.context != null) {
        badDataBehavior = context.getJobConf().get("stream.bad.data.behavior", "strict");
        ignoreBadCast = !badDataBehavior.equals("strict");
      }
    }

    public void run() {
      try {
        // 3/4 Tool to Hadoop
        while (outReader.readKeyValue()) {
          Object key = outReader.getCurrentKey();
          Object value = outReader.getCurrentValue();
          // XXX dummy read and discard, as explained in waitOutputThreads()
          if (context == null || key == null) {
            continue;
          }
          if ((!context.getTaskID().isMap()) || context.getNumReduceTasks() == 0) {
            // TODO split key-value to fields
            // FIXME key-value to table record mapping
            // TODO convert string fields to output schema type
            Record record = context.createOutputRecord();
            if (outReader instanceof RecordOutputReader) {
              Text[] fields = (Text[]) value;
              fillOutputRecord(record, fields, ignoreBadCast);
            } else {
              record.setString(0, key.toString());
              record.setString(1, value.toString());
            }
            if (context.getTaskID().isMap()) {
              ((Mapper.TaskContext) context).write(record);
            } else {
              ((Reducer.TaskContext) context).write(record);
            }
          } else {
            // FIXME shuffle key-value both just single Text field now.
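            // The intermediate (shuffle) key and value records each have a single
            // string column: the whole streaming key goes into column 0 of the key
            // record, and the whole streaming value into column 0 of the value record.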
            Record keyRecord = context.createMapOutputKeyRecord();
            keyRecord.setString(0, key.toString());
            Record valueRecord = context.createMapOutputValueRecord();
            valueRecord.setString(0, value.toString());
            ((Mapper.TaskContext) context).write(keyRecord, valueRecord);
          }
          numRecWritten_++;
          long now = System.currentTimeMillis();
          if (now - lastStdoutReport > reporterOutDelay_) {
            lastStdoutReport = now;
            String hline = "Records R/W=" + numRecRead_ + "/" + numRecWritten_;
            if (!processProvidedStatus_) {
              //reporter.setStatus(hline);
            } else {
              context.progress();
            }
            LOG.info(hline);
          }
        }
      } catch (Throwable th) {
        outerrThreadsThrowable = th;
        LOG.warn(th);
      } finally {
        try {
          if (clientIn_ != null) {
            clientIn_.close();
            clientIn_ = null;
          }
        } catch (IOException io) {
          LOG.info(io);
        }
      }
    }

    /**
     * Fill Text fields into the table output record.
     * Type conversion rules:
     *   STRING: just set the Text, no NULL processing
     *   ALL OTHER: an empty field becomes NULL
     *   BIGINT, DOUBLE, BOOLEAN: parsed via the Java type; unparseable values become NULL
     *   DATETIME:
     *   DECIMAL:
     */
    void fillOutputRecord(Record record, Text[] fields, boolean ignoreBadCast) {
      if (fields.length != record.getColumnCount()) {
        // should never happen, have checked in RecordOutputReader
        throw new RuntimeException("output record not match output schema...");
      }
      Column[] columns = record.getColumns();
      for (int i = 0; i < fields.length; i++) {
        Column col = columns[i];
        Text field = fields[i];
        Object val = null;
        if (col.getType().equals(OdpsType.STRING)) {
          val = field;
        } else if (field.getLength() == 0) {
          val = null;
        } else {
          String fieldStr = null;
          try {
            fieldStr = field.toString();
            switch (col.getType()) {
              case BIGINT:
                val = Long.valueOf(fieldStr);
                break;
              case DOUBLE:
                val = Double.valueOf(fieldStr);
                break;
              case BOOLEAN:
                val = Boolean.valueOf(fieldStr);
                break;
              default:
                // FIXME should have checked at client side?
                throw new RuntimeException("output column " + col.getName() + "'s type "
                    + col.getType() + " not supported by streaming job");
            }
          } catch (Exception e) {
            if (!ignoreBadCast) {
              if (fieldStr == null) {
                // invalid utf-8
                // TODO provide escaped field content with error msg?
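                // fieldStr is still null only if field.toString() itself failed,
                // i.e. the raw bytes could not be decoded as UTF-8, so there is no
                // string form of the field to include in the error message.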
                throw new RuntimeException(
                    "Failed to decode streaming field as UTF-8 for column " + col.getName());
              }
              throw new RuntimeException("Invalid streaming field value for " + col.getType()
                  + " column " + col.getName() + ":" + fieldStr, e);
            }
          }
        }
        record.set(i, val);
      }
    }

    OutputReader outReader = null;
    TaskContext context = null;
    long lastStdoutReport = 0;
    String badDataBehavior;
    boolean ignoreBadCast;
  }

  class MRErrorThread extends Thread {

    public MRErrorThread() {
      this.reporterPrefix = job_.get("stream.stderr.reporter.prefix", "reporter:");
      this.counterPrefix = reporterPrefix + "counter:";
      this.statusPrefix = reporterPrefix + "status:";
      setDaemon(true);
    }

    public void setReporter(TaskContext reporter) {
      this.reporter = reporter;
    }

    public void run() {
      Text line = new Text();
      LineReader lineReader = null;
      try {
        lineReader = new LineReader((InputStream) clientErr_, job_);
        while (lineReader.readLine(line) > 0) {
          String lineStr = line.toString();
          if (matchesReporter(lineStr)) {
            if (reporter != null) {
              if (matchesCounter(lineStr)) {
                incrCounter(lineStr);
              } else if (matchesStatus(lineStr)) {
                processProvidedStatus_ = true;
                setStatus(lineStr);
              } else {
                LOG.warn("Cannot parse reporter line: " + lineStr);
              }
            }
          } else {
            System.err.println(lineStr);
          }
          long now = System.currentTimeMillis();
          if (reporter != null && now - lastStderrReport > reporterErrDelay_) {
            lastStderrReport = now;
            reporter.progress();
          }
          line.clear();
        }
        if (lineReader != null) {
          lineReader.close();
        }
        if (clientErr_ != null) {
          clientErr_.close();
          clientErr_ = null;
          LOG.info("MRErrorThread done");
        }
      } catch (Throwable th) {
        outerrThreadsThrowable = th;
        LOG.warn(th);
        try {
          if (lineReader != null) {
            lineReader.close();
          }
          if (clientErr_ != null) {
            clientErr_.close();
            clientErr_ = null;
          }
        } catch (IOException io) {
          LOG.info(io);
        }
      }
    }

    private boolean matchesReporter(String line) {
      return line.startsWith(reporterPrefix);
    }

    private boolean matchesCounter(String line) {
      return line.startsWith(counterPrefix);
    }

    private boolean matchesStatus(String line) {
      return line.startsWith(statusPrefix);
    }

    private void incrCounter(String line) {
      String trimmedLine = line.substring(counterPrefix.length()).trim();
      String[] columns = trimmedLine.split(",");
      if (columns.length == 3) {
        try {
          reporter.getCounter(columns[0], columns[1]).increment(Long.parseLong(columns[2]));
        } catch (NumberFormatException e) {
          LOG.warn("Cannot parse counter increment '" + columns[2] + "' from line: " + line);
        }
      } else {
        LOG.warn("Cannot parse counter line: " + line);
      }
    }

    private void setStatus(String line) {
      //reporter.setStatus(line.substring(statusPrefix.length()).trim());
    }

    long lastStderrReport = 0;
    volatile TaskContext reporter;
    private final String reporterPrefix;
    private final String counterPrefix;
    private final String statusPrefix;
  }

  public void mapRedFinished() {
    try {
      if (!doPipe_) {
        LOG.info("mapRedFinished");
        return;
      }
      if (clientOut_ != null) {
        try {
          clientOut_.flush();
          clientOut_.close();
        } catch (IOException io) {
          LOG.warn(io);
        }
      }
      try {
        waitOutputThreads();
      } catch (IOException io) {
        LOG.warn(io);
      }
      if (sim != null) {
        sim.destroy();
      }
      LOG.info("mapRedFinished");
    } catch (RuntimeException e) {
      LOG.info("PipeMapRed failed!", e);
      throw e;
    }
  }

  void maybeLogRecord() {
    if (numRecRead_ >= nextRecReadLog_) {
      String info = numRecInfo();
      LOG.info(info);
      if (nextRecReadLog_ < 100000) {
        nextRecReadLog_ *= 10;
      } else {
        nextRecReadLog_ += 100000;
      }
    }
  }

  public String getContext() {
    String s = numRecInfo() + "\n";
    if (outThread_ != null) {
      s += "last tool output: |" + outReader_.getLastOutput() + "|\n";
    }
    return s;
  }

  String envline(String var) {
    return var + "=" + StreamUtil.env().get(var) + "\n";
  }

  String numRecInfo() {
    long elapsed = (System.currentTimeMillis() - startTime_) / 1000;
    return "R/W/S=" + numRecRead_ + "/" + numRecWritten_ + "/" + numRecSkipped_
        + " in:" + safeDiv(numRecRead_, elapsed) + " [rec/s]"
        + " out:" + safeDiv(numRecWritten_, elapsed) + " [rec/s]";
  }

  String safeDiv(long n, long d) {
    return (d == 0) ? "NA" : "" + n / d + "=" + n + "/" + d;
  }

  long startTime_;
  long numRecRead_ = 0;
  long numRecWritten_ = 0;
  long numRecSkipped_ = 0;
  long nextRecReadLog_ = 1;

  long minRecWrittenToEnableSkip_ = Long.MAX_VALUE;

  long reporterOutDelay_ = 10 * 1000L;
  long reporterErrDelay_ = 10 * 1000L;
  long joinDelay_;
  BridgeJobConf job_;
  boolean doPipe_;

  Class<? extends InputWriter> mapInputWriterClass_;
  Class<? extends OutputReader> mapOutputReaderClass_;
  Class<? extends InputWriter> reduceInputWriterClass_;
  Class<? extends OutputReader> reduceOutputReaderClass_;
  boolean nonZeroExitIsFailure_;

  Process sim;
  InputWriter inWriter_;
  OutputReader outReader_;
  MROutputThread outThread_;
  MRErrorThread errThread_;

  DataOutputStream clientOut_;
  DataInputStream clientErr_;
  DataInputStream clientIn_;

  // set in PipeMapper/PipeReducer subclasses
  int numExceptions_;
  protected volatile Throwable outerrThreadsThrowable;

  volatile boolean processProvidedStatus_ = false;
}
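MRErrorThread above treats any stderr line from the child process that starts with the configurable prefix (default "reporter:") as a control message: "reporter:counter:group,counter,amount" increments a counter, "reporter:status:..." marks a process-provided status, and every other line is echoed to the task's own stderr. The sketch below is illustrative only and not part of this source file: a minimal word-count style streaming mapper, written as a plain Java program, that PipeMapRed could spawn as its subprocess. The class name is made up, and the tab between key and value assumes the default text output reader splits key from value on a tab, as in standard streaming setups.

import java.io.BufferedReader;
import java.io.InputStreamReader;

// Hypothetical streaming child process: reads task input from stdin, writes
// key<TAB>value records to stdout, and reports progress via the stderr
// protocol parsed by PipeMapRed.MRErrorThread.
public class WordCountStreamingMapper {
  public static void main(String[] args) throws Exception {
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    long lines = 0;
    String line;
    while ((line = in.readLine()) != null) {
      lines++;
      for (String word : line.trim().split("\\s+")) {
        if (!word.isEmpty()) {
          // key <TAB> value on stdout becomes one intermediate record
          System.out.println(word + "\t" + 1);
        }
      }
    }
    // Parsed by MRErrorThread.incrCounter(): group,counter,amount
    System.err.println("reporter:counter:WordCount,InputLines," + lines);
    // Parsed by MRErrorThread.setStatus()
    System.err.println("reporter:status:mapper finished");
  }
}

PipeMapRed.configure() would launch such a program via the command returned by getPipeCommand(JobConf), feed it input through clientOut_, and read its stdout back through the configured OutputReader on clientIn_.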