madgik.exareme.master.queryProcessor.analyzer.stat.ExternalStat.java Source code

Java tutorial

Introduction

Here is the source code for madgik.exareme.master.queryProcessor.analyzer.stat.ExternalStat.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

package madgik.exareme.master.queryProcessor.analyzer.stat;

import madgik.exareme.master.queryProcessor.analyzer.fanalyzer.OptiqueAnalyzer;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.log4j.Logger;

import java.sql.*;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;

/**
 * @author jim
 */
public class ExternalStat implements StatExtractor {
    // public static final int LIMIT = 10000;
    // public static final double LIMIT_FACTOR = 0.3;
    private static final int BLOB_SIZE = 500000;
    private static final int NUM_SIZE = 8;
    private static final int MAX_STRING_SAMPLE = 20;
    // public static final String SAMPLE = "_sample";
    private static final Logger log = Logger.getLogger(ExternalStat.class);

    private final Connection con;
    private String sch;
    private String tblName;

    public ExternalStat(Connection con, String tbl, String schName) {
        sch = schName;
        this.con = con;
        this.tblName = tbl;
    }

    // schema map
    private Map<String, Table> schema = new HashMap<String, Table>();

    @Override
    public Map<String, Table> extractStats() throws Exception {

        DatabaseMetaData dbmd = con.getMetaData(); // dtabase metadata object

        // listing tables and columns
        String catalog = null;
        String schemaPattern = sch;
        String tableNamePattern = tblName;
        String columnNamePattern = "%";
        if (con.getClass().getName().contains("postgresql")) {
            // tableNamePattern="\""+tableNamePattern+"\"";
            schemaPattern = "public";
        }

        // ResultSet resultTables = dbmd.getTables(catalog, "public",
        // tableNamePattern, types);
        ResultSet resultColumns = dbmd.getColumns(catalog, schemaPattern, tableNamePattern, columnNamePattern);
        if (con.getClass().getName().contains("postgresql")) {
            tableNamePattern = "\"" + tableNamePattern + "\"";
        } else if (con.getClass().getName().contains("oracle")) {
            tableNamePattern = schemaPattern + "." + tableNamePattern;
        }
        log.debug("Starting extracting stats");
        // while (resultTables.next()) {
        Map<String, Column> columnMap = new HashMap<String, Column>();
        // StringEscapeUtils.escapeJava(resultTables.getString(3));
        log.debug("Analyzing table " + tblName);

        int toupleSize = 0; // in bytes

        // tableNamePattern = tableName;

        int columnCount = resultColumns.getMetaData().getColumnCount();
        Statement st = con.createStatement();
        ResultSet rs = st.executeQuery("select count(*) from " + tableNamePattern);
        int count = 0;
        if (rs.next()) {
            count = rs.getInt(1);
        } else {
            log.error("could not get count for table " + tableNamePattern);
        }
        rs.close();
        st.close();

        ResultSet pkrs = dbmd.getExportedKeys("", "", tblName);
        String pkey = "DEFAULT_KEY";

        while (pkrs.next()) {
            pkey = pkrs.getString("PKCOLUMN_NAME");
            break;
        }
        pkrs.close();
        if (count == 0) {
            log.debug("Empty table");
            Table t = new Table(tblName, columnCount, toupleSize, columnMap, count, pkey);
            schema.put(tblName, t);
            return schema;
        }

        while (resultColumns.next()) {

            String columnName = StringEscapeUtils.escapeJava(resultColumns.getString(4));
            try {
                String colNamePattern = columnName;
                if (con.getClass().getName().contains("postgresql")) {
                    colNamePattern = "\"" + columnName + "\"";
                }
                int columnType = resultColumns.getInt(5);

                // computing column's size in bytes
                int columnSize = computeColumnSize(colNamePattern, columnType, tableNamePattern);
                toupleSize += columnSize;

                // execute queries for numberOfDiffValues, minVal, maxVal
                // Map<String, Integer> diffValFreqMap = new HashMap<String,
                // Integer>();

                // computing column's min and max values
                String minVal = "0";
                String maxVal = "0";
                if (columnType != Types.BLOB) {
                    MinMax mm = computeMinMax(tableNamePattern, colNamePattern);
                    minVal = mm.getMin();
                    maxVal = mm.getMax();
                }
                Map<String, Integer> diffValFreqMap = new HashMap<String, Integer>();
                //only for equidepth!

                // for (ValFreq k : freqs) {
                // diffValFreqMap.put(k.getVal(), k.getFreq());

                // }

                // /add min max diff vals in the sampling values

                int minOcc = 1;
                int maxOcc = 1;
                int diffVals = 0;
                boolean equidepth = false;
                if (equidepth) {

                    //diffValFreqMap is used only in equidepth, do not compute it
                    //if we have primitive
                    diffValFreqMap = computeDistinctValuesFrequency(tableNamePattern, colNamePattern);

                    String minValChar = minVal;
                    String maxValChar = maxVal;
                    if (columnType == Types.VARCHAR || columnType == Types.CHAR || columnType == Types.LONGNVARCHAR
                            || columnType == Types.DATE) {
                        minValChar = "\'" + minVal + "\'";
                        maxValChar = "\'" + maxVal + "\'";
                    }
                    try {
                        minOcc = computeValOccurences(tableNamePattern, colNamePattern, minValChar);
                    } catch (Exception e) {
                        log.error("Could not compute value occurences for column:" + colNamePattern + " and value:"
                                + minValChar);
                    }
                    if (equidepth && !diffValFreqMap.containsKey(minVal))
                        diffValFreqMap.put(minVal, minOcc);

                    try {
                        maxOcc = computeValOccurences(tableNamePattern, colNamePattern, maxValChar);
                    } catch (Exception e) {
                        log.error("Could not compute value occurences for column:" + colNamePattern + " and value:"
                                + maxValChar);
                    }
                    if (diffValFreqMap.containsKey(maxVal))
                        diffValFreqMap.put(maxVal, maxOcc);

                    diffVals = diffValFreqMap.size();
                } else {
                    diffVals = computeDiffVals(tableNamePattern, colNamePattern, columnType);
                }
                if (diffVals == 0) {
                    //all values are null!
                    continue;
                }
                Column c = new Column(columnName, columnType, columnSize, diffVals, minVal, maxVal, diffValFreqMap);
                columnMap.put(columnName, c);
            } catch (Exception ex) {
                log.error("could not analyze column " + columnName + ":" + ex.getMessage());
            }

        }

        Table t = new Table(tblName, columnCount, toupleSize, columnMap, count, pkey);
        schema.put(tblName, t);

        // }
        // resultTables.close();
        resultColumns.close();
        return schema;

    }

    private int computeDiffVals(String tableNamePattern, String colNamePattern, int columnType)
            throws SQLException {
        String query = "select count(*) as freq from (select distinct " + colNamePattern + " from "
                + tableNamePattern + " where " + colNamePattern + " is not null) A";
        Statement stmt = con.createStatement();
        if (columnType == Types.BLOB) {
            query = "select count(*) as freq from (select " + colNamePattern + " from " + tableNamePattern
                    + " where " + colNamePattern + " is not null) A";
        }
        log.debug("executing distinct values query:" + query);
        ResultSet rs = stmt.executeQuery(query);
        int result = 0;
        while (rs.next()) {
            result = rs.getInt("freq");

        }

        rs.close();
        stmt.close();

        return result;
    }

    /* private-helper methods */
    private int computeColumnSize(String columnName, int columnType, String table_sample) throws Exception {
        int columnSize = 0;
        if (columnType == Types.INTEGER || columnType == Types.REAL || columnType == Types.DOUBLE
                || columnType == Types.DECIMAL || columnType == Types.FLOAT || columnType == Types.NUMERIC) {
            columnSize = NUM_SIZE;
        } else if (columnType == Types.VARCHAR) {
            String query0 = "select max(length(" + columnName + ")) as length from (select " + columnName + " from "
                    + table_sample + ") A" + " where " + columnName + " is not null limit " + MAX_STRING_SAMPLE;

            if (con.getClass().getName().contains("oracle")) {
                query0 = "select max(length(" + columnName + ")) as length from (select " + columnName + " from "
                        + table_sample + ") A" + " where " + columnName + " is not null and ROWNUM< "
                        + MAX_STRING_SAMPLE;
            }
            log.debug("executing col size query:" + query0);
            Statement stmt0 = con.createStatement();
            ResultSet rs0 = stmt0.executeQuery(query0);

            while (rs0.next()) {
                columnSize = rs0.getInt("length");
            }
            rs0.close();
            stmt0.close();

        } else if (columnType == Types.BLOB)
            columnSize = BLOB_SIZE;

        return columnSize;
    }

    private MinMax computeMinMax(String tableName, String columnName) throws Exception {
        String query1 = "select min(" + columnName + ") as minVal, max(" + columnName + ") " + "as maxVal  from "
                + tableName + " where " + columnName + " is not null";

        String minVal = "", maxVal = "";
        log.debug("executing minmax query:" + query1);
        Statement stmt1 = con.createStatement();
        ResultSet rs1 = stmt1.executeQuery(query1);
        while (rs1.next()) {
            minVal = rs1.getString("minVal");
            maxVal = rs1.getString("maxVal");
        }
        rs1.close();
        stmt1.close();

        return new MinMax(minVal, maxVal);
    }

    private int computeValOccurences(String tableName, String columnName, String value) throws Exception {
        String queryDf = "select count(*) as valCount " + "from " + tableName + " where " + columnName
                + " is not null and  " + columnName + " = " + value + "";
        log.debug("executing value occur. query:" + queryDf);
        Statement stmt = con.createStatement();
        ResultSet rs = stmt.executeQuery(queryDf);
        int diffValCount = 0;
        while (rs.next()) {
            diffValCount = rs.getInt("valCount");
        }
        rs.close();
        stmt.close();

        return diffValCount;
    }

    private Map<String, Integer> computeDistinctValuesFrequency(String table_sample, String columnName)
            throws Exception {
        Map<String, Integer> freqs = new HashMap<String, Integer>();

        String query = "select " + columnName + " as val, count(*) as freq from " + table_sample + " where "
                + columnName + " is not null group by " + columnName + "";
        log.debug("executing distinct values freq. query:" + query);
        Statement stmt = con.createStatement();
        ResultSet rs = stmt.executeQuery(query);

        while (rs.next()) {

            freqs.put(rs.getString("val"), rs.getInt("freq"));

        }

        rs.close();
        stmt.close();

        return freqs;
    }

    /* inner - helper classes */
    private final class MinMax {
        private final String min;
        private final String max;

        public MinMax(String min, String max) {
            this.min = min;
            this.max = max;
        }

        public String getMin() {
            return min;
        }

        public String getMax() {
            return max;
        }

        @Override
        public String toString() {
            return "MinMax{" + "min=" + min + ", max=" + max + '}';
        }

    }

    public void setSch(String s) {
        this.sch = s;
    }
}