com.splout.db.engine.MySQLOutputFormat.java Source code

Introduction

Here is the source code for com.splout.db.engine.MySQLOutputFormat.java. The class is a Splout SQL output format for Hadoop: it writes each partition's tuples into its own embedded MySQL instance and, on close, packages the resulting database files into a per-partition .db zip.

Source

package com.splout.db.engine;

/*
 * #%L
 * Splout SQL Hadoop library
 * %%
 * Copyright (C) 2012 - 2013 Datasalt Systems S.L.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema.Field;
import com.datasalt.pangool.io.Schema.Field.Type;
import com.splout.db.common.CompressorUtil;
import com.splout.db.common.PortUtils;
import com.splout.db.common.PortUtils.PortLock;
import com.splout.db.engine.EmbeddedMySQL.EmbeddedMySQLConfig;
import com.splout.db.hadoop.TableSpec;
import com.splout.db.hadoop.engine.SploutSQLOutputFormat;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.io.filefilter.WildcardFileFilter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.Statement;
import java.util.HashMap;
import java.util.Map;

@SuppressWarnings("serial")
public class MySQLOutputFormat extends SploutSQLOutputFormat implements Serializable {

    public static final Log LOG = LogFactory.getLog(MySQLOutputFormat.class);

    public static final String STRING_FIELD_SIZE_PANGOOL_FIELD_PROP = "com.splout.db.engine.MySQLOutputFormat.string.field.size";
    public static final String AUTO_TRIM_STRING_PANGOOL_FIELD_PROP = "com.splout.db.engine.MySQLOutputFormat.auto.trim.string";

    public static final String GLOBAL_ENGINE_CONF_PROP = "com.splout.db.engine.MySQLOutputFormat.engine";
    public static final String GLOBAL_CHARSET_CONF_PROP = "com.splout.db.engine.MySQLOutputFormat.charset";
    public static final String GLOBAL_MEMORY_AVAILABLE_FOR_INDEXING = "com.splout.db.engine.MySQLOutputFormat.memory.indexing";

    public static final String GLOBAL_STRING_FIELD_SIZE = "com.splout.db.engine.MySQLOutputFormat.string.field.size";
    public static final String GLOBAL_AUTO_TRIM_STRING = "com.splout.db.engine.MySQLOutputFormat.auto.trim.string";

    public static final String GENERATED_DB_NAME = "splout";

    // Keep track of all opened Mysqlds so we can kill them in any case
    private Map<Integer, EmbeddedMySQL> mySQLs = new HashMap<Integer, EmbeddedMySQL>();

    protected Integer globalStringFieldSize;
    protected Boolean globalAutoTrim = null;

    public MySQLOutputFormat(Integer batchSize, TableSpec... dbSpec) throws SploutSQLOutputFormatException {
        super(batchSize, dbSpec);
    }

    @Override
    public String getCreateTable(TableSpec tableSpec) throws SploutSQLOutputFormatException {
        loadGlobalConf();
        String engine = getConf().get(GLOBAL_ENGINE_CONF_PROP, "MyIsam");
        String charset = getConf().get(GLOBAL_CHARSET_CONF_PROP, "UTF8");

        String createTable = "CREATE TABLE " + tableSpec.getSchema().getName() + " (";
        for (Field field : tableSpec.getSchema().getFields()) {
            if (field.getName().equals(PARTITION_TUPLE_FIELD)) {
                continue;
            }
            int fieldSize = fixedSizeStringField(field);
            createTable += "`" + field.getName() + "` ";
            switch (field.getType()) {
            case INT:
                createTable += "INTEGER, ";
                break;
            case LONG:
                // Note: in MySQL, "LONG" is a synonym for MEDIUMTEXT; BIGINT is the 8-byte integer type.
                createTable += "BIGINT, ";
                break;
            case DOUBLE:
                createTable += "DOUBLE, ";
                break;
            case FLOAT:
                createTable += "FLOAT, ";
                break;
            case STRING:
                if (fieldSize > -1) {
                    createTable += "VARCHAR(" + fieldSize + "), ";
                } else {
                    createTable += "TEXT, ";
                }
                break;
            case BOOLEAN:
                createTable += "BOOLEAN, ";
                break;
            default:
                throw new SploutSQLOutputFormatException("Unsupported field type: " + field.getType());
            }
        }
        createTable = createTable.substring(0, createTable.length() - 2);
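        // Illustrative example: a schema "mytable" with fields (id: INT, name: STRING) yields
        //   CREATE TABLE mytable (`id` INTEGER, `name` TEXT) ENGINE=MyIsam DEFAULT CHARSET=UTF8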
        return createTable + ") ENGINE=" + engine + " DEFAULT CHARSET=" + charset;
    }

    // Map of prepared statements per Schema and per Partition
    private Map<Integer, Map<String, PreparedStatement>> stCache = new HashMap<Integer, Map<String, PreparedStatement>>();
    private Map<Integer, Connection> connCache = new HashMap<Integer, Connection>();

    private long records = 0;

    // This method is called once per partition
    public void initPartition(int partition, Path local) throws IOException {

        Path mysqlDb = new Path(local.getParent(), partition + "");

        LOG.info("Initializing SQL connection [" + partition + "]");
        try {
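            // Grab a lock on the next free port (starting at the default MySQL port) so that
            // concurrent tasks on the same machine don't try to bind the same port.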
            PortLock portLock = PortUtils.getNextAvailablePort(EmbeddedMySQLConfig.DEFAULT_PORT);

            EmbeddedMySQL mySQL = null;
            EmbeddedMySQLConfig config = null;
            HashMap<String, Object> customConfig = new HashMap<String, Object>();

            // Fix the amount of memory used for indexing. The most important parameters are
            // myisam_sort_buffer_size and key_buffer_size.
            // See http://dev.mysql.com/doc/refman/5.0/en/server-system-variables.html
            long totalMem = getConf().getLong(GLOBAL_MEMORY_AVAILABLE_FOR_INDEXING, 100 * 1024 * 1024);
            double shareForSortBuffer = 0.9;
            customConfig.put("myisam_sort_buffer_size", (long) (shareForSortBuffer * totalMem));
            customConfig.put("key_buffer_size", (long) ((1 - shareForSortBuffer) * totalMem));
            customConfig.put("myisam_max_sort_file_size", 9223372036854775807l);

            try {
                File mysqlDir = new File(mysqlDb.toString());
                LOG.info("Going to instantiate a MySQLD in: " + mysqlDir + ", port [" + portLock.getPort()
                        + "] (partition: " + partition + ")");

                config = new EmbeddedMySQLConfig(portLock.getPort(), EmbeddedMySQLConfig.DEFAULT_USER,
                        EmbeddedMySQLConfig.DEFAULT_PASS, mysqlDir, customConfig);

                mySQL = new EmbeddedMySQL(config);
                mySQL.start(true);
            } finally {
                portLock.release();
            }

            mySQLs.put(partition, mySQL);

            // MySQL is successfully started at this point, or an Exception would have been thrown.
            Class.forName(EmbeddedMySQL.DRIVER);
            Connection conn = DriverManager.getConnection(config.getLocalJDBCConnection(GENERATED_DB_NAME),
                    config.getUser(), config.getPass());
            conn.setAutoCommit(false);
            connCache.put(partition, conn);
            Statement st = conn.createStatement();

            // Init transaction
            for (String sql : getPreSQL()) {
                LOG.info("Executing: " + sql);
                st.execute(sql);
            }
            st.execute("BEGIN");
            st.close();

            Map<String, PreparedStatement> stMap = new HashMap<String, PreparedStatement>();
            stCache.put(partition, stMap);
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    /**
     * Loads the global string field size and auto-trim settings from the job configuration, if present.
     */
    protected void loadGlobalConf() {
        Configuration conf = getConf();
        if (conf.get(GLOBAL_AUTO_TRIM_STRING) != null) {
            globalAutoTrim = conf.getBoolean(GLOBAL_AUTO_TRIM_STRING, true);
        }
        if (conf.get(GLOBAL_STRING_FIELD_SIZE) != null) {
            globalStringFieldSize = conf.getInt(GLOBAL_STRING_FIELD_SIZE, 255);
        }
    }

    protected boolean fixedSizedStringField(Field f) {
        return fixedSizeStringField(f) != -1;
    }

    protected int fixedSizeStringField(Field f) {
        if (globalStringFieldSize != null)
            return globalStringFieldSize;
        else {
            if (f.getProp(STRING_FIELD_SIZE_PANGOOL_FIELD_PROP) != null) {
                return Integer.parseInt(f.getProp(STRING_FIELD_SIZE_PANGOOL_FIELD_PROP));
            }
        }
        return -1;
    }

    protected boolean autoTrim(Field f) {
        if (globalAutoTrim != null) {
            return globalAutoTrim;
        } else {
            if (f.getProp(AUTO_TRIM_STRING_PANGOOL_FIELD_PROP) != null) {
                return Boolean.parseBoolean(f.getProp(AUTO_TRIM_STRING_PANGOOL_FIELD_PROP));
            }
        }
        return false;
    }

    @Override
    public void write(ITuple tuple) throws IOException, InterruptedException {
        int partition = (Integer) tuple.get(PARTITION_TUPLE_FIELD);

        try {
            /*
             * Key performance trick: cache PreparedStatements when possible. We keep one
             * PreparedStatement per distinct Tuple Schema (table).
             */
            Map<String, PreparedStatement> stMap = stCache.get(partition);

            PreparedStatement pS = stMap.get(tuple.getSchema().getName());
            if (pS == null) {
                Connection conn = connCache.get(partition);
                // Create a PreparedStatement according to the received Tuple
                String preparedStatement = "INSERT INTO " + tuple.getSchema().getName() + " VALUES (";
                // NOTE: getFields().size() - 1 is a quick way of skipping the "_partition" field,
                // which is assumed to be the last field of the schema.
                for (int i = 0; i < tuple.getSchema().getFields().size() - 1; i++) {
                    preparedStatement += "?, ";
                }
                preparedStatement = preparedStatement.substring(0, preparedStatement.length() - 2) + ");";
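                // e.g. for a table "users" with fields (id, name, _partition) this builds:
                //   INSERT INTO users VALUES (?, ?);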
                pS = conn.prepareStatement(preparedStatement);
                stMap.put(tuple.getSchema().getName(), pS);
            }

            int count = 1, tupleCount = 0;
            for (Field field : tuple.getSchema().getFields()) {
                if (field.getName().equals(PARTITION_TUPLE_FIELD)) {
                    tupleCount++;
                    continue;
                }
                if (field.getType().equals(Type.STRING)) {
                    boolean autoTrim = autoTrim(field);
                    int fieldSize = fixedSizeStringField(field);
                    String str = tuple.getString(tupleCount);
                    if (fieldSize > -1 && autoTrim && str != null && str.length() > fieldSize) {
                        str = str.substring(0, fieldSize);
                    }
                    pS.setObject(count, str);
                } else {
                    pS.setObject(count, tuple.get(tupleCount));
                }
                count++;
                tupleCount++;
            }
            pS.execute();

            records++;
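            // Commit in batches rather than per record; BEGIN opens the next transaction.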
            if (records == getBatchSize()) {
                Connection conn = connCache.get(partition);
                Statement st = conn.createStatement();
                st.execute("COMMIT");
                st.execute("BEGIN");
                st.close();
                records = 0;
            }
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    @Override
    public void close() throws IOException, InterruptedException {
        try {
            for (Map.Entry<Integer, Connection> entry : connCache.entrySet()) {
                LOG.info("Closing SQL connection [" + entry.getKey() + "]");
                Connection conn = entry.getValue();
                Statement st = conn.createStatement();
                st.execute("COMMIT");
                if (getPostSQL() != null) {
                    LOG.info("Executing end SQL statements.");
                    for (String sql : getPostSQL()) {
                        LOG.info("Executing: " + sql);
                        st.execute(sql);
                    }
                }
                st.close();
                conn.close();
                // close MySQL before copying files (so mysql.sock disappears!)
                EmbeddedMySQL msql = mySQLs.get(entry.getKey());

                msql.stop();
                File resident = msql.getConfig().getResidentFolder();
                File zipDest = new File(resident.getParentFile(), entry.getKey() + ".db");

                // Create a "partition.db" zip with the needed files.
                CompressorUtil.createZip(resident, zipDest,
                        new WildcardFileFilter(
                                new String[] { "ib*", "*.frm", "*.MYD", "*.MYI", "db.opt", "*.ibd" }),
                        FileFilterUtils.or(FileFilterUtils.nameFileFilter("data"),
                                FileFilterUtils.nameFileFilter("splout")));
                // Delete all files except the generated zip "partition.db"
                FileUtils.deleteDirectory(new File(resident, "bin"));
                FileUtils.deleteDirectory(new File(resident, "data"));
                FileUtils.deleteDirectory(new File(resident, "share"));
            }
        } catch (Exception e) {
            throw new IOException(e);
        } finally { // in any case, make sure all embedded mysqlds are stopped
            for (Map.Entry<Integer, EmbeddedMySQL> entry : mySQLs.entrySet()) {
                entry.getValue().stop();
            }
        }
    }
}
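
For context, here is a minimal sketch of how the configuration knobs defined above might be wired up before running a Splout SQL indexing job. It assumes that SploutSQLOutputFormat follows Hadoop's usual Configurable convention (a setConf() counterpart to the getConf() calls seen in the source); the TableSpec construction is elided because it depends on the rest of the Splout Hadoop API, and the values shown are illustrative, not recommendations.

import org.apache.hadoop.conf.Configuration;

import com.splout.db.engine.MySQLOutputFormat;
import com.splout.db.hadoop.TableSpec;

public class MySQLOutputFormatUsage {

    public static MySQLOutputFormat configure(TableSpec... tableSpecs) throws Exception {
        Configuration conf = new Configuration();

        // Storage engine and charset for the generated tables (defaults: MyIsam / UTF8).
        conf.set(MySQLOutputFormat.GLOBAL_ENGINE_CONF_PROP, "MyISAM");
        conf.set(MySQLOutputFormat.GLOBAL_CHARSET_CONF_PROP, "UTF8");

        // Memory available to index creation; the class splits it 90/10 between
        // myisam_sort_buffer_size and key_buffer_size (default: 100 MB).
        conf.setLong(MySQLOutputFormat.GLOBAL_MEMORY_AVAILABLE_FOR_INDEXING, 512L * 1024 * 1024);

        // Map every STRING field to VARCHAR(255) and trim longer values on insert.
        conf.setInt(MySQLOutputFormat.GLOBAL_STRING_FIELD_SIZE, 255);
        conf.setBoolean(MySQLOutputFormat.GLOBAL_AUTO_TRIM_STRING, true);

        // Commit every 10,000 inserted records (see write()).
        MySQLOutputFormat outputFormat = new MySQLOutputFormat(10000, tableSpecs);
        outputFormat.setConf(conf); // assumption: inherited from a Configurable base class
        return outputFormat;
    }
}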