com.cloudera.sqoop.mapreduce.MySQLDumpMapper.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.sqoop.mapreduce.MySQLDumpMapper.java

Source

/**
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.sqoop.mapreduce;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.CharBuffer;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import com.cloudera.sqoop.lib.DelimiterSet;
import com.cloudera.sqoop.lib.FieldFormatter;
import com.cloudera.sqoop.lib.RecordParser;
import com.cloudera.sqoop.manager.MySQLUtils;
import com.cloudera.sqoop.util.AsyncSink;
import com.cloudera.sqoop.util.ErrorableAsyncSink;
import com.cloudera.sqoop.util.ErrorableThread;
import com.cloudera.sqoop.util.JdbcUrl;
import com.cloudera.sqoop.util.LoggingAsyncSink;
import com.cloudera.sqoop.util.PerfCounters;

/**
 * Mapper that opens up a pipe to mysqldump and pulls data directly.
 */
public class MySQLDumpMapper extends Mapper<String, NullWritable, String, NullWritable> {

    public static final Log LOG = LogFactory.getLog(MySQLDumpMapper.class.getName());

    private Configuration conf;

    // AsyncSinks used to import data from mysqldump directly into HDFS.

    /**
     * Copies data directly from mysqldump into HDFS, after stripping some
     * header and footer characters that are attached to each line in mysqldump.
     *
     * <p>The actual copy runs on a background {@code CopyingStreamThread}
     * started by {@link #processStream(InputStream)}.
     */
    static class CopyingAsyncSink extends ErrorableAsyncSink {
        private final MySQLDumpMapper.Context context;
        private final PerfCounters counters;

        /**
         * @param context the mapper context that receives the extracted records.
         * @param ctrs    counters updated with the number of characters written.
         */
        CopyingAsyncSink(final MySQLDumpMapper.Context context, final PerfCounters ctrs) {
            this.context = context;
            this.counters = ctrs;
        }

        /**
         * Starts a background thread that reads {@code is} until EOF and writes
         * each record to the mapper context.
         *
         * @param is the stream to read mysqldump output from.
         */
        public void processStream(InputStream is) {
            child = new CopyingStreamThread(is, context, counters);
            child.start();
        }

        /**
         * Thread that performs the line-by-line copy from the mysqldump stream
         * to the mapper context.
         */
        private static class CopyingStreamThread extends ErrorableThread {
            public static final Log LOG = LogFactory.getLog(CopyingStreamThread.class.getName());

            /** Text that immediately precedes the first value of a record. */
            private static final String RECORD_START_MARK = "VALUES (";

            /** Length of the trailing ");" that closes each record line. */
            private static final int RECORD_END_LEN = 2;

            private final MySQLDumpMapper.Context context;
            private final InputStream stream;
            private final PerfCounters counters;

            CopyingStreamThread(final InputStream is, final Context c, final PerfCounters ctrs) {
                this.context = c;
                this.stream = is;
                this.counters = ctrs;
            }

            /**
             * Reads the stream line by line, strips the "INSERT .. VALUES ("
             * preamble and the trailing ");" from each line, and writes the
             * remaining text followed by a newline to the context. Any failure
             * is recorded via {@code setError()}.
             */
            public void run() {
                BufferedReader r = null;

                try {
                    r = new BufferedReader(new InputStreamReader(this.stream));

                    // Actually do the read/write transfer loop here.
                    int preambleLen = -1; // set to this for "undefined"
                    while (true) {
                        String inLine = r.readLine();
                        if (null == inLine) {
                            break; // EOF.
                        }

                        // Blank lines and SQL comments carry no record data.
                        if (inLine.trim().length() == 0 || inLine.startsWith("--")) {
                            continue;
                        }

                        // this line is of the form "INSERT .. VALUES ( actual value text
                        // );" strip the leading preamble up to the '(' and the trailing
                        // ');'.
                        if (preambleLen == -1) {
                            // we haven't determined how long the preamble is. It's constant
                            // across all lines, so just figure this out once -- but only
                            // from a line that really contains the marker; otherwise
                            // indexOf() yields -1 and every later record would be cut at
                            // the wrong offset.
                            int markPos = inLine.indexOf(RECORD_START_MARK);
                            if (markPos < 0) {
                                LOG.warn("Unexpected line from mysqldump (no record start marker); line skipped");
                                continue;
                            }
                            preambleLen = markPos + RECORD_START_MARK.length();
                        }

                        // A line too short to hold both the preamble and the trailer
                        // would make substring() throw; skip it instead.
                        int recordEnd = inLine.length() - RECORD_END_LEN;
                        if (recordEnd < preambleLen) {
                            LOG.warn("Truncated line from mysqldump (length " + inLine.length()
                                    + "); line skipped");
                            continue;
                        }

                        // chop off the leading and trailing text as we write the
                        // output to HDFS.
                        int len = recordEnd - preambleLen;
                        context.write(inLine.substring(preambleLen, recordEnd), null);
                        context.write("\n", null);
                        counters.addBytes(1 + len);
                    }
                } catch (IOException ioe) {
                    LOG.error("IOException reading from mysqldump: " + ioe.toString(), ioe);
                    // flag this error so we get an error status back in the caller.
                    setError();
                } catch (InterruptedException ie) {
                    LOG.error("InterruptedException reading from mysqldump: " + ie.toString(), ie);
                    // flag this error so we get an error status back in the caller.
                    setError();
                    // keep the interrupt visible to anything that inspects this thread.
                    Thread.currentThread().interrupt();
                } catch (RuntimeException re) {
                    // An unchecked failure would otherwise end this thread without
                    // the error flag set, leaving a partial import looking successful.
                    LOG.error("Unexpected error reading from mysqldump: " + re.toString(), re);
                    setError();
                } finally {
                    if (null != r) {
                        try {
                            r.close();
                        } catch (IOException ioe) {
                            LOG.info("Error closing FIFO stream: " + ioe.toString());
                        }
                    }
                }
            }
        }
    }

    /**
     * The ReparsingAsyncSink will instantiate a RecordParser to read mysqldump's
     * output, and re-emit the text in the user's specified output format.
     *
     * <p>The actual work runs on a background {@code ReparsingStreamThread}
     * started by {@link #processStream(InputStream)}.
     */
    static class ReparsingAsyncSink extends ErrorableAsyncSink {
        private final MySQLDumpMapper.Context context;
        private final Configuration conf;
        private final PerfCounters counters;

        /**
         * @param c    the mapper context that receives the re-formatted records.
         * @param conf configuration holding the user's output delimiter settings.
         * @param ctrs counters updated with the number of characters written.
         */
        ReparsingAsyncSink(final MySQLDumpMapper.Context c, final Configuration conf, final PerfCounters ctrs) {
            this.context = c;
            this.conf = conf;
            this.counters = ctrs;
        }

        /**
         * Starts a background thread that reads {@code is} until EOF, parses
         * each record and writes it to the mapper context.
         *
         * @param is the stream to read mysqldump output from.
         */
        public void processStream(InputStream is) {
            child = new ReparsingStreamThread(is, context, conf, counters);
            child.start();
        }

        /**
         * Thread that parses each mysqldump record and re-emits its fields using
         * the delimiters found in the configuration.
         */
        private static class ReparsingStreamThread extends ErrorableThread {
            public static final Log LOG = LogFactory.getLog(ReparsingStreamThread.class.getName());

            /** Text that immediately precedes the first value of a record. */
            private static final String RECORD_START_MARK = "VALUES (";

            /** Length of the trailing ");" that closes each record line. */
            private static final int RECORD_END_LEN = 2;

            /** Parser for mysqldump's own field format. */
            private static final RecordParser MYSQLDUMP_PARSER =
                    new RecordParser(DelimiterSet.MYSQL_DELIMITERS);

            private final MySQLDumpMapper.Context context;
            private final Configuration conf;
            private final InputStream stream;
            private final PerfCounters counters;

            ReparsingStreamThread(final InputStream is, final MySQLDumpMapper.Context c, Configuration conf,
                    final PerfCounters ctrs) {
                this.context = c;
                this.conf = conf;
                this.stream = is;
                this.counters = ctrs;
            }

            /**
             * Reads the stream line by line, strips the "INSERT .. VALUES ("
             * preamble and the trailing ");", parses the remainder into fields
             * and writes them with the configured output delimiters. Records
             * that fail to parse are logged and skipped; any other failure is
             * recorded via {@code setError()}.
             */
            public void run() {
                BufferedReader r = null;

                try {
                    r = new BufferedReader(new InputStreamReader(this.stream));

                    // Configure the output with the user's delimiters.
                    char outputFieldDelim = (char) conf.getInt(MySQLUtils.OUTPUT_FIELD_DELIM_KEY,
                            DelimiterSet.NULL_CHAR);
                    String outputFieldDelimStr = String.valueOf(outputFieldDelim);
                    char outputRecordDelim = (char) conf.getInt(MySQLUtils.OUTPUT_RECORD_DELIM_KEY,
                            DelimiterSet.NULL_CHAR);
                    String outputRecordDelimStr = String.valueOf(outputRecordDelim);
                    char outputEnclose = (char) conf.getInt(MySQLUtils.OUTPUT_ENCLOSED_BY_KEY,
                            DelimiterSet.NULL_CHAR);
                    char outputEscape = (char) conf.getInt(MySQLUtils.OUTPUT_ESCAPED_BY_KEY,
                            DelimiterSet.NULL_CHAR);
                    boolean outputEncloseRequired = conf.getBoolean(MySQLUtils.OUTPUT_ENCLOSE_REQUIRED_KEY, false);

                    DelimiterSet delimiters = new DelimiterSet(outputFieldDelim, outputRecordDelim, outputEnclose,
                            outputEscape, outputEncloseRequired);

                    // Actually do the read/write transfer loop here.
                    int preambleLen = -1; // set to this for "undefined"
                    while (true) {
                        String inLine = r.readLine();
                        if (null == inLine) {
                            break; // EOF.
                        }

                        // Blank lines and SQL comments carry no record data.
                        if (inLine.trim().length() == 0 || inLine.startsWith("--")) {
                            continue;
                        }

                        // this line is of the form "INSERT .. VALUES ( actual value text
                        // );" strip the leading preamble up to the '(' and the trailing
                        // ');'.
                        if (preambleLen == -1) {
                            // we haven't determined how long the preamble is. It's constant
                            // across all lines, so just figure this out once -- but only
                            // from a line that really contains the marker; otherwise
                            // indexOf() yields -1 and every later record would be cut at
                            // the wrong offset.
                            int markPos = inLine.indexOf(RECORD_START_MARK);
                            if (markPos < 0) {
                                LOG.warn("Unexpected line from mysqldump (no record start marker); line skipped");
                                continue;
                            }
                            preambleLen = markPos + RECORD_START_MARK.length();
                        }

                        // A line too short to hold both the preamble and the trailer
                        // would make CharBuffer.wrap() throw; skip it instead.
                        int recordEnd = inLine.length() - RECORD_END_LEN;
                        if (recordEnd < preambleLen) {
                            LOG.warn("Truncated line from mysqldump (length " + inLine.length()
                                    + "); line skipped");
                            continue;
                        }

                        // Wrap the input string in a char buffer that ignores the leading
                        // and trailing text.
                        CharBuffer charbuf = CharBuffer.wrap(inLine, preambleLen, recordEnd);

                        // Pass this along to the parser
                        List<String> fields = null;
                        try {
                            fields = MYSQLDUMP_PARSER.parseRecord(charbuf);
                        } catch (RecordParser.ParseError pe) {
                            LOG.warn("ParseError reading from mysqldump: " + pe.toString() + "; record skipped");
                            continue; // Skip emitting this row.
                        }

                        // For all of the output fields, emit them using the delimiters
                        // the user chooses.
                        boolean first = true;
                        int recordLen = outputRecordDelimStr.length(); // for the record delimiter.
                        for (String field : fields) {
                            if (!first) {
                                context.write(outputFieldDelimStr, null);
                                // field delimiters are written too, so count them.
                                recordLen += outputFieldDelimStr.length();
                            } else {
                                first = false;
                            }

                            String fieldStr = FieldFormatter.escapeAndEnclose(field, delimiters);
                            context.write(fieldStr, null);
                            recordLen += fieldStr.length();
                        }

                        context.write(outputRecordDelimStr, null);
                        counters.addBytes(recordLen);
                    }
                } catch (IOException ioe) {
                    LOG.error("IOException reading from mysqldump: " + ioe.toString(), ioe);
                    // flag this error so the parent can handle it appropriately.
                    setError();
                } catch (InterruptedException ie) {
                    LOG.error("InterruptedException reading from mysqldump: " + ie.toString(), ie);
                    // flag this error so we get an error status back in the caller.
                    setError();
                    // keep the interrupt visible to anything that inspects this thread.
                    Thread.currentThread().interrupt();
                } catch (RuntimeException re) {
                    // An unchecked failure would otherwise end this thread without
                    // the error flag set, leaving a partial import looking successful.
                    LOG.error("Unexpected error reading from mysqldump: " + re.toString(), re);
                    setError();
                } finally {
                    if (null != r) {
                        try {
                            r.close();
                        } catch (IOException ioe) {
                            LOG.info("Error closing FIFO stream: " + ioe.toString());
                        }
                    }
                }
            }
        }
    }

    // CHECKSTYLE:OFF
    /**
     * Import the table into HDFS by using mysqldump to pull out the data from
     * the database and upload the files directly to HDFS.
     *
     * <p>Starts mysqldump as an external process, attaches one sink to its
     * stdout (a straight copy or a reparsing sink, depending on the configured
     * output delimiters) and a logging sink to its stderr, then blocks until
     * the process and the sinks have finished.
     *
     * @param splitConditions SQL condition selecting this split's rows; it is
     *        ANDed with the configured WHERE clause.
     * @param val unused.
     * @param context the mapper context that receives the imported records.
     * @throws IOException if the database name cannot be determined, if
     *         mysqldump cannot be started or exits with a non-zero status, or
     *         if the stdout sink reports an error.
     * @throws InterruptedException declared by the overridden signature.
     */
    public void map(String splitConditions, NullWritable val, Context context)
            throws IOException, InterruptedException {

        LOG.info("Beginning mysqldump fast path import");

        String tableName = conf.get(MySQLUtils.TABLE_NAME_KEY);

        // The database name, host and port are all extracted from the JDBC
        // connect string by the JdbcUrl helper.
        String connectString = conf.get(MySQLUtils.CONNECT_STRING_KEY);
        String databaseName = JdbcUrl.getDatabaseName(connectString);
        String hostname = JdbcUrl.getHostName(connectString);
        int port = JdbcUrl.getPort(connectString);

        if (null == databaseName) {
            throw new IOException("Could not determine database name");
        }

        LOG.info("Performing import of table " + tableName + " from database " + databaseName);

        String password = conf.get(MySQLUtils.PASSWORD_KEY);
        String passwordFile = null;

        Process p = null;
        AsyncSink sink = null;
        AsyncSink errSink = null;
        PerfCounters counters = new PerfCounters();
        int result = 0;
        int streamResult = 0;
        boolean started = false; // true once both sinks are draining the process.
        try {
            if (null != password && password.length() > 0) {
                passwordFile = MySQLUtils.writePasswordFile(conf);
            }

            List<String> args = buildDumpArgs(splitConditions, passwordFile, hostname, port,
                    databaseName, tableName);

            // begin the import in an external process.
            LOG.debug("Starting mysqldump with arguments:");
            for (String arg : args) {
                LOG.debug("  " + arg);
            }

            // Actually start the mysqldump.
            p = Runtime.getRuntime().exec(args.toArray(new String[0]));

            // read from the stdout pipe into the HDFS writer.
            InputStream is = p.getInputStream();

            if (MySQLUtils.outputDelimsAreMySQL(conf)) {
                LOG.debug("Output delimiters conform to mysqldump; using straight copy");
                sink = new CopyingAsyncSink(context, counters);
            } else {
                LOG.debug("User-specified delimiters; using reparsing import");
                LOG.info("Converting data to use specified delimiters.");
                LOG.info("(For the fastest possible import, use");
                LOG.info("--mysql-delimiters to specify the same field");
                LOG.info("delimiters as are used by mysqldump.)");
                sink = new ReparsingAsyncSink(context, conf, counters);
            }

            // Start an async thread to read and upload the whole stream.
            counters.startClock();
            sink.processStream(is);

            // Start an async thread to send stderr to log4j.
            errSink = new LoggingAsyncSink(LOG);
            errSink.processStream(p.getErrorStream());
            started = true;
        } finally {
            // Interrupts received while waiting are remembered here and the
            // thread's interrupt status is restored once all waiting is done.
            boolean interrupted = false;

            // block until the process is done.
            if (null != p) {
                if (!started) {
                    // Setup failed after the process was launched, so its output
                    // may never be drained; kill it so waitFor() cannot hang.
                    p.destroy();
                }
                while (true) {
                    try {
                        result = p.waitFor();
                        break;
                    } catch (InterruptedException ie) {
                        // interrupted; loop around.
                        interrupted = true;
                    }
                }
            }

            // Remove the password file.
            if (null != passwordFile) {
                if (!new File(passwordFile).delete()) {
                    LOG.error("Could not remove mysql password file " + passwordFile);
                    LOG.error("You should remove this file to protect your credentials.");
                }
            }

            // block until the stream sink is done too.
            if (null != sink) {
                while (true) {
                    try {
                        streamResult = sink.join();
                        break;
                    } catch (InterruptedException ie) {
                        // interrupted; loop around.
                        interrupted = true;
                    }
                }
            }

            // Try to wait for stderr to finish, but regard any errors as advisory.
            if (null != errSink) {
                try {
                    if (0 != errSink.join()) {
                        LOG.info("Encountered exception reading stderr stream");
                    }
                } catch (InterruptedException ie) {
                    interrupted = true;
                    LOG.info("Thread interrupted waiting for stderr to complete: " + ie.toString());
                }
            }

            LOG.info("Transfer loop complete.");

            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }

        // These checks run after the finally block so that they never replace an
        // exception thrown while setting up the import.
        if (0 != result) {
            throw new IOException("mysqldump terminated with status " + Integer.toString(result));
        }

        if (0 != streamResult) {
            throw new IOException("Encountered exception in stream sink");
        }

        counters.stopClock();
        LOG.info("Transferred " + counters.toString());
    }

    /**
     * Builds the mysqldump command line for one split.
     *
     * @param splitConditions SQL condition for this split, ANDed with the
     *        configured WHERE clause.
     * @param passwordFile path passed as {@code --defaults-file}, or null to
     *        omit that option.
     * @param hostname value for {@code --host}.
     * @param port value for {@code --port}; -1 omits the option.
     * @param databaseName database to dump from.
     * @param tableName table to dump.
     * @return the command and its arguments, in the order to be executed.
     */
    private List<String> buildDumpArgs(String splitConditions, String passwordFile, String hostname,
            int port, String databaseName, String tableName) {
        List<String> args = new ArrayList<String>();

        args.add(MySQLUtils.MYSQL_DUMP_CMD); // requires that this is on the path.

        // --defaults-file must be the first argument.
        if (null != passwordFile) {
            args.add("--defaults-file=" + passwordFile);
        }

        // Don't use the --where="<whereClause>" version because spaces in it can
        // confuse Java, and adding in surrounding quotes confuses Java as well.
        String whereClause = conf.get(MySQLUtils.WHERE_CLAUSE_KEY, "(1=1)") + " AND (" + splitConditions + ")";
        args.add("-w");
        args.add(whereClause);

        args.add("--host=" + hostname);
        if (-1 != port) {
            args.add("--port=" + Integer.toString(port));
        }
        args.add("--skip-opt");
        args.add("--compact");
        args.add("--no-create-db");
        args.add("--no-create-info");
        args.add("--quick"); // no buffering
        args.add("--single-transaction");

        String username = conf.get(MySQLUtils.USERNAME_KEY);
        if (null != username) {
            args.add("--user=" + username);
        }

        // If the user supplied extra args, add them here.
        String[] extra = conf.getStrings(MySQLUtils.EXTRA_ARGS_KEY);
        if (null != extra) {
            for (String arg : extra) {
                args.add(arg);
            }
        }

        args.add(databaseName);
        args.add(tableName);
        return args;
    }
    // CHECKSTYLE:ON

    /**
     * Stores the job configuration obtained from the context in the
     * {@code conf} field, which {@code map()} reads its settings from.
     *
     * @param context the mapper context supplying the configuration.
     */
    @Override
    protected void setup(Context context) {
        final Configuration jobConf = context.getConfiguration();
        conf = jobConf;
    }
}