org.cartoweb.stats.imports.Import.java Source code

Introduction

Here is the source code for org.cartoweb.stats.imports.Import.java, the command-line tool that converts CartoWeb, WMS and Tilecache log files into the database tables used by the statistics system.

Source

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * @copyright 2008 Camptocamp SA
 */

package org.cartoweb.stats.imports;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.cartoweb.stats.BaseStats;
import org.cartoweb.stats.Utils;
import org.pvalsecc.jdbc.BeanDbMapper;
import org.pvalsecc.jdbc.BeansDbMapper;
import org.pvalsecc.jdbc.JdbcUtilities;
import org.pvalsecc.jdbc.StatementUtils;
import org.pvalsecc.log.Progress;
import org.pvalsecc.misc.UnitUtilities;
import org.pvalsecc.opts.Option;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.security.MessageDigest;
import java.sql.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Converts the log files into DB tables usable by the statistics system.
 */
public class Import extends BaseStats {
    public static final Log LOGGER = LogFactory.getLog(Import.class);

    public static final BeanDbMapper<StatsRecord> MAPPER = BeansDbMapper.getMapper(StatsRecord.class);

    /**
     * If true, cache hits are resolved with a single SQL UPDATE that joins the table on itself;
     * otherwise (the default) they are resolved row by row with batched updates in fillCacheHits().
     */
    private static final boolean DB_SOLVE_HITS = false;

    @Option(desc = "Directory where the log files are stored", mandatory = true, environment = "STATS_LOG_DIR")
    private String logDir = "";

    @Option(desc = "Regular expression to be matched (on the absolute path) for a log file to be taken", environment = "STATS_LOG_REGEXP")
    private String logRegexp = "\\.log(\\.gz)?$";

    @Option(desc = "If true, will delete everything and start from scratch", environment = "STATS_INITIALIZE")
    private boolean initialize = false;

    @Option(desc = "If true, double imports will be ignored", environment = "STATS_IGNORE")
    private boolean ignore = false;

    @Option(desc = "If true, a layers table is generated", environment = "STATS_WANT_LAYERS")
    private boolean wantLayers = false;

    @Option(desc = "Format of the log files ('CartoWeb', 'SyslogCartoWeb', 'WMS', 'SecureWMS', 'HaproxyWMS', 'SquidTilecache', 'VarnishTilecache')", mandatory = true, environment = "STATS_FORMAT")
    private String format = "CartoWeb";

    @Option(desc = "Used only if format='WMS'. Regular expression to capture the mapId (project) from the log file.", environment = "STATS_FORMAT")
    private String mapIdRegExp = null;

    @Option(desc = "Used only if format='WMS'. Configuration file name (.ini) that contains the strings to search for the map IDs and the name to use.", environment = "STATS_FORMAT")
    private String mapIdConfig = null;

    @Option(desc = "Used only if format='*WMS'. Map resolution.", environment = "STATS_FORMAT")
    private Integer resolution = null;

    @Option(desc = "Used only if format='*Tilecache'. Configuration file name (.ini) that contains the options for referers and tilecache.", environment = "STATS_FORMAT")
    private String tilecacheConfig = null;

    @Option(desc = "Continue the import in case of parsing error", environment = "STATS_SKIP_ERRORS")
    private boolean skipErrors = false;

    /**
     * List of all the side tables used to categorize some fields.
     */
    private final SideTables sideTables;

    /**
     * Used for the primary key of the main table.
     */
    private long curId = 0;

    /**
     * Cache hits still to be resolved (see fillCacheHits).
     */
    private final Set<String> hits = new HashSet<String>();

    public Import(String[] args) {
        super();
        parseArgs(args);

        if (format.equalsIgnoreCase("WMS")) {
            if (mapIdRegExp == null && mapIdConfig == null) {
                printUsage("Missing parameter 'mapIdRegExp' or 'mapIdConfig'");
            }
            if (mapIdRegExp != null && mapIdConfig != null) {
                printUsage("You cannot set both 'mapIdRegExp' and 'mapIdConfig'");
            }
        }
        if (format.equalsIgnoreCase("WMS") || format.equalsIgnoreCase("SecureWMS")
                || format.equalsIgnoreCase("HaproxyWMS")) {
            if (resolution == null) {
                resolution = 96;
            }
        }
        if (format.equalsIgnoreCase("SquidTilecache") || format.equalsIgnoreCase("VarnishTilecache")) {
            if (tilecacheConfig == null) {
                printUsage("Missing parameter 'tilecacheConfig'");
            }
        }
        sideTables = new SideTables(tableName);
    }
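
    // Illustrative note (added for clarity, not from the original source): with format='WMS',
    // exactly one of 'mapIdRegExp' or 'mapIdConfig' must be supplied; a hypothetical expression
    // such as "/wms/([a-z0-9]+)" would capture the mapId from the log line. For the WMS family
    // of formats, 'resolution' defaults to 96 (presumably dpi) when left unset, and the
    // tilecache formats require 'tilecacheConfig'.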

    private void findLastId(Connection con) throws SQLException {
        JdbcUtilities.runSelectQuery("finding the last id in table " + tableName,
                "SELECT max(id) FROM " + tableName, con, new JdbcUtilities.SelectTask() {
                    public void setupStatement(PreparedStatement stmt) throws SQLException {
                    }

                    public void run(ResultSet rs) throws SQLException {
                        while (rs.next()) {
                            curId = rs.getLong(1) + 1;
                        }
                    }
                });
    }

    protected void runImpl() throws ClassNotFoundException, SQLException, IOException {
        long startTime = System.currentTimeMillis();
        Connection con = getConnection();
        if (initialize) {
            dropStructure(con);
            createStructure(con);
            con.commit();
        }

        //get the list of files to load
        List<File> files = getFiles(new File(logDir));
        files = checkFiles(con, files);

        if (!files.isEmpty()) {
            if (!initialize) {
                findLastId(con);
                sideTables.load(con);
                sideTables.dropForeignKeys(con, tableName);
            }
            Progress progress = new Progress(10 * 1000, files.size(), "Files reading", LOGGER);
            for (int i = 0; i < files.size(); i++) {
                File file = files.get(i);
                progress.update(i);
                if (LOGGER.isDebugEnabled()) {
                    LOGGER.debug("Reading file " + (i + 1) + "/" + files.size() + ": " + file);
                }
                convertFile(con, file);
            }

            sideTables.save(con);

            LOGGER.info("Time to import " + files.size() + " files: "
                    + UnitUtilities.toElapsedTime(System.currentTimeMillis() - startTime));
            if (initialize) {
                createIndexes(con);
            } else {
                Progress progressI = new Progress(10 * 1000, sideTables.size(), "Foreign key creation", LOGGER);
                sideTables.createForeignKeys(con, progressI, 0, tableName);
            }
            fillCacheHits(con);
            vacuum(con);
        } else {
            LOGGER.info("No new file to import");
        }

        con.close();
    }
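
    // Overview of the pipeline implemented in runImpl(): optionally drop and recreate the schema
    // (initialize), collect the candidate log files and discard those already imported, stream each
    // remaining file into the main table, persist the side tables, then either create the indexes
    // (fresh import) or recreate the foreign keys (incremental import), resolve cache hits and
    // finally vacuum the tables.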

    private List<File> checkFiles(Connection con, List<File> files) throws SQLException {
        LOGGER.info("Checking files to import.");

        List<File> result;
        final Map<String, File> fileByMd5 = new HashMap<String, File>();
        final Map<File, String> md5ByFile = new HashMap<File, String>();
        final boolean hasNoHistory = initialize
                || JdbcUtilities.countTable(con, tableName + "_all_files", null) == 0;
        result = new ArrayList<File>(files.size());

        Progress progress = new Progress(10 * 1000, files.size(), "Files checking", LOGGER);
        for (int i = 0; i < files.size(); i++) {
            progress.update(i);
            final File file = files.get(i);
            final long fileSize = file.length();
            final boolean toImport = hasNoHistory || checkFileNeedsImport(con, file, fileSize);
            if (toImport) {
                if (checkHasNotSameFile(con, file, fileByMd5, md5ByFile, hasNoHistory)) {
                    result.add(file);
                }
            }
        }

        JdbcUtilities.runInsertQuery("inserting entries in " + tableName + "_all_files",
                "INSERT INTO " + tableName
                        + "_all_files (imported_date, size, path, checksum) VALUES (now(),?,?,?)",
                con, result, 500, new JdbcUtilities.InsertTask<File>() {
                    public boolean marshall(PreparedStatement stmt, File item) throws SQLException {
                        stmt.setLong(1, item.length());
                        stmt.setString(2, item.getAbsolutePath());
                        stmt.setString(3, md5ByFile.get(item));
                        return true;
                    }
                });

        LOGGER.info("Files checking done.");

        return result;
    }

    private boolean checkHasNotSameFile(Connection con, final File file, Map<String, File> fileByMd5,
            Map<File, String> md5ByFile, boolean hasNoHistory) throws SQLException {
        final String md5 = getMD5(file);
        final File known = fileByMd5.get(md5);
        if (known != null) {
            LOGGER.warn("The same file is twice in the directory. It's name has now changed! (old path=" + known
                    + ", new path=" + file + ")");
            if (!ignore) {
                throw new RuntimeException();
            }
            return false;
        }
        fileByMd5.put(md5, file);
        md5ByFile.put(file, md5);
        final boolean[] sameFileFound = new boolean[] { false };
        if (!hasNoHistory) {
            JdbcUtilities.runSelectQuery("checking no file has checksum=" + md5,
                    "SELECT imported_date, path FROM " + tableName + "_all_files WHERE checksum=?", con,
                    new JdbcUtilities.SelectTask() {
                        public void setupStatement(PreparedStatement stmt) throws SQLException {
                            stmt.setString(1, md5);
                        }

                        public void run(ResultSet rs) throws SQLException {
                            while (rs.next()) {
                                Timestamp importedWhen = rs.getTimestamp(1);
                                String path = rs.getString(2);
                                LOGGER.warn("On " + importedWhen
                                        + ", same file was already imported. It's name has now changed! (old path="
                                        + path + ", new path=" + file + ")");
                                if (!ignore) {
                                    throw new RuntimeException();
                                }
                                sameFileFound[0] = true;
                            }
                        }
                    });
        }
        return !sameFileFound[0];
    }

    private boolean checkFileNeedsImport(Connection con, final File file, final long fileSize) throws SQLException {
        final boolean[] toImport = new boolean[] { true };
        JdbcUtilities.runSelectQuery("checking file " + file.getAbsolutePath() + " is known",
                "SELECT imported_date, size FROM " + tableName + "_all_files WHERE path=?", con,
                new JdbcUtilities.SelectTask() {
                    public void setupStatement(PreparedStatement stmt) throws SQLException {
                        stmt.setString(1, file.getAbsolutePath());
                    }

                    public void run(ResultSet rs) throws SQLException {
                        while (rs.next()) {
                            Timestamp importedWhen = rs.getTimestamp(1);
                            long size = rs.getLong(2);
                            if (size != fileSize) {
                                LOGGER.warn("On " + importedWhen + " file " + file
                                        + " was already imported and it's size has changed");
                                if (!ignore) {
                                    throw new RuntimeException();
                                }
                            } else {
                                LOGGER.debug("On " + importedWhen + " file " + file + " was already imported");
                            }
                            toImport[0] = false;
                        }
                    }
                });
        return toImport[0];
    }

    private String getMD5(File file) {
        try {
            MessageDigest md5 = MessageDigest.getInstance("MD5");
            InputStream fin = new FileInputStream(file);
            byte[] buffer = new byte[1024];
            int read;
            do {
                read = fin.read(buffer);
                if (read > 0) {
                    md5.update(buffer, 0, read);
                }
            } while (read != -1);
            fin.close();
            byte[] digest = md5.digest();
            if (digest == null) {
                return "";
            }
            String strDigest = "0x";
            for (int i = 0; i < digest.length; i++) {
                strDigest += Integer.toString((digest[i] & 0xff) + 0x100, 16).substring(1).toUpperCase();
            }
            return strDigest;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
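
    // Illustrative example (not part of the original source): the digest is rendered as an
    // uppercase hex string prefixed with "0x"; a hypothetical two-byte digest {0x1a, 0x2b} would
    // become "0x1A2B", the "+ 0x100 ... substring(1)" idiom padding each byte to two hex digits.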

    private List<File> getFiles(File dir) {
        Pattern filter = Pattern.compile(logRegexp);
        if (!dir.exists() || !dir.isDirectory()) {
            throw new RuntimeException(logDir + " is not a valid directory");
        }
        final File[] list = dir.listFiles();
        final List<File> result = new ArrayList<File>(list.length);
        for (int i = 0; i < list.length; ++i) {
            File file = list[i];
            if (file.isDirectory()) {
                result.addAll(getFiles(file));
            } else {
                Matcher matcher = filter.matcher(file.getAbsolutePath());
                if (matcher.find()) {
                    result.add(file);
                }
            }
        }
        return result;
    }
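
    // Illustrative example (not part of the original source): with the default logRegexp
    // "\.log(\.gz)?$", files such as access.log and access.log.gz are selected while access.log.1
    // is skipped; directories below logDir are scanned recursively.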

    /**
     * Imports one file into the DB.
     */
    private void convertFile(final Connection con, File file) throws IOException, SQLException {
        try {
            final String query = "INSERT INTO " + tableName + " (" + MAPPER.getFieldNames() + ") VALUES ("
                    + MAPPER.getInsertPlaceHolders() + ")";
            final PreparedStatement layerStmt = wantLayers
                    ? con.prepareStatement("INSERT INTO " + tableName + "_layers (id, layer) VALUES (?,?)")
                    : null;

            StatsReader reader = createReader(file);

            JdbcUtilities.runInsertQuery("inserting stats", query, con, reader, 500,
                    new JdbcUtilities.InsertTask<StatsRecord>() {
                        private int cptLayers = 0;

                        public boolean marshall(PreparedStatement stmt, StatsRecord item) throws SQLException {
                            if (item != null) {
                                item.setId(curId++);
                                MAPPER.saveToDb(stmt, item, 1);

                                if (wantLayers && item.getLayerArray() != null) {
                                    for (int i = 0; i < item.getLayerArray().size(); i++) {
                                        Integer val = item.getLayerArray().get(i);
                                        layerStmt.setLong(1, item.getId());
                                        layerStmt.setInt(2, val);
                                        layerStmt.addBatch();
                                        if ((++cptLayers % 500) == 0) {
                                            layerStmt.executeBatch();
                                        }
                                    }
                                }
                                return true;
                            } else {
                                return false;
                            }
                        }
                    });

            if (layerStmt != null) {
                layerStmt.executeBatch();
                layerStmt.close();
            }
        } catch (BatchUpdateException ex) {
            ex.getNextException().printStackTrace();
            throw ex;
        }
    }
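
    // Descriptive note: records are inserted in batches of 500; when wantLayers is set, each
    // record's layer ids are additionally written to the <tableName>_layers table, flushed in its
    // own 500-row batches and once more after the whole file has been read.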

    private StatsReader createReader(File file) throws IOException {
        if (format.equalsIgnoreCase("WMS")) {
            MapIdExtractor mapIdExtractor = createMapIdExtractor();
            return new WmsReader(file, sideTables, wantLayers, resolution, mapIdExtractor, skipErrors);
        } else if (format.equalsIgnoreCase("SecureWMS")) {
            return new SecureWmsReader(file, sideTables, wantLayers, resolution, skipErrors);
        } else if (format.equalsIgnoreCase("HaproxyWMS")) {
            return new HaproxyWmsReader(file, sideTables, wantLayers, resolution, skipErrors);
        } else if (format.equalsIgnoreCase("CartoWeb")) {
            return new CartoWebReader(file, sideTables, wantLayers, skipErrors);
        } else if (format.equalsIgnoreCase("SyslogCartoWeb")) {
            return new SyslogCartoWebReader(file, sideTables, wantLayers, skipErrors);
        } else if (format.equalsIgnoreCase("SquidTilecache")) {
            TilecacheExtractor tilecacheExtractor = new TilecacheExtractor(tilecacheConfig);
            return new SquidTilecacheReader(file, sideTables, wantLayers, tilecacheExtractor, skipErrors);
        } else if (format.equalsIgnoreCase("VarnishTilecache")) {
            TilecacheExtractor tilecacheExtractor = new TilecacheExtractor(tilecacheConfig);
            return new VarnishTilecacheReader(file, sideTables, wantLayers, tilecacheExtractor, skipErrors);
        } else {
            throw new RuntimeException("Format not supported: " + format);
        }
    }

    private MapIdExtractor createMapIdExtractor() throws IOException {
        if (mapIdRegExp != null) {
            return new RegExpMapIdExtractor(mapIdRegExp);
        } else {
            return new ConfigMapIdExtractor(mapIdConfig);
        }
    }

    public void dropStructure(Connection con) throws SQLException {
        Utils.dropAllReportTables(con, tableName);
        Utils.dropTable(con, tableName + "_all_files", true);
        Utils.dropTable(con, tableName, true);
        Utils.dropTable(con, tableName + "_layers", true);
        Utils.dropSequence(con, tableName + "_id_seq", true);

        sideTables.dropStructure(con);
    }

    public void createStructure(Connection con) throws SQLException {
        JdbcUtilities.runDeleteQuery("creating table " + tableName, "CREATE TABLE " + tableName
                + " (id bigint NOT NULL, general_browser_info integer, exportpdf_format integer, general_ip text, general_mapid integer, images_mainmap_height integer, images_mainmap_size integer, general_direct_access boolean, general_security_user integer, general_cache_id text, general_elapsed_time real, general_export_plugin integer, general_ua integer, query_results_count integer, general_cache_hit text, location_scale real, general_sessid int, images_mainmap_width integer, exportpdf_resolution integer, general_time timestamp without time zone, bbox_minx real, bbox_miny real, bbox_maxx real, bbox_maxy real, layers_switch_id integer, general_client_version integer, layers text, query_results_table_count text, general_request_id text)",
                con, null);
        if (wantLayers) {
            JdbcUtilities.runDeleteQuery("creating table " + tableName + "_layers",
                    "CREATE TABLE " + tableName + "_layers (id bigint NOT NULL, layer integer NOT NULL)", con,
                    null);
        }
        JdbcUtilities.runDeleteQuery("creating table all_files", "CREATE TABLE " + tableName
                + "_all_files (imported_date timestamp NOT NULL, size bigint NOT NULL, path text NOT NULL, checksum text NOT NULL)",
                con, null);
        JdbcUtilities.runDeleteQuery("creating sequence " + tableName + "_id_seq",
                "CREATE SEQUENCE " + tableName + "_id_seq", con, null);

        sideTables.createStructure(con);
    }

    private void createIndexes(Connection con) throws SQLException {
        Progress progress = new Progress(10 * 1000, 6 + sideTables.size(), "Index creation", LOGGER);
        JdbcUtilities.runDeleteQuery("creating primary key", "ALTER TABLE " + tableName + " ADD PRIMARY KEY (id)",
                con, null);
        progress.update(1);
        JdbcUtilities.runDeleteQuery("indexing general_mapid",
                "CREATE INDEX i_" + tableName + "_general_mapid ON " + tableName + " (general_mapid)", con, null);
        progress.update(2);
        JdbcUtilities.runDeleteQuery("indexing general_time",
                "CREATE INDEX i_" + tableName + "_general_time ON " + tableName + " (general_time)", con, null);
        progress.update(3);
        JdbcUtilities.runDeleteQuery("indexing layers_switch_id",
                "CREATE INDEX i_" + tableName + "_layers_switch_id ON " + tableName + " (layers_switch_id)", con,
                null);
        progress.update(4);
        JdbcUtilities.runDeleteQuery("indexing location_scale",
                "CREATE INDEX i_" + tableName + "_location_scale ON " + tableName + " (location_scale)", con, null);
        progress.update(5);
        JdbcUtilities.runDeleteQuery("indexing general_cache_id",
                "CREATE INDEX i_" + tableName + "_general_cache_id ON " + tableName + " (general_cache_id)", con,
                null);
        if (!DB_SOLVE_HITS) {
            JdbcUtilities.runDeleteQuery("indexing general_cache_hit",
                    "CREATE INDEX i_" + tableName + "_general_cache_hit ON " + tableName + " (general_cache_hit)",
                    con, null);
        }
        progress.update(6);
        sideTables.createForeignKeys(con, progress, 7, tableName);
        con.commit();
        LOGGER.info("Indexes created");
    }

    private void fillCacheHits(Connection con) throws SQLException {
        con.commit();
        con.setAutoCommit(true);
        JdbcUtilities.runDeleteQuery("vacuuming " + tableName, "VACUUM ANALYZE " + tableName, con, null);
        con.setAutoCommit(false);

        if (DB_SOLVE_HITS) {
            // Takes around 55 minutes for 4M records and is not great for incremental updates...
            JdbcUtilities.runDeleteQuery("solving cache hits", "UPDATE " + tableName
                    + " f SET general_elapsed_time=s.general_elapsed_time, images_mainmap_width=s.images_mainmap_width, images_mainmap_height=s.images_mainmap_height, layers=s.layers, layers_switch_id=s.layers_switch_id, bbox_minx=s.bbox_minx, bbox_miny=s.bbox_miny, bbox_maxx=s.bbox_maxx, bbox_maxy=s.bbox_maxy, location_scale=s.location_scale, query_results_count=s.query_results_count, query_results_table_count=s.query_results_table_count FROM "
                    + tableName
                    + " s WHERE s.general_cache_id=f.general_cache_hit AND f.general_cache_hit IS NOT NULL AND f.general_elapsed_time IS NULL AND f.layers IS NULL",
                    con, null);
        } else {
            // Takes around 21 minutes for the same 4M records and is optimal for incremental updates...
            try {
                final PreparedStatement updateStmt = con.prepareStatement("UPDATE " + tableName
                        + " SET general_elapsed_time=?, images_mainmap_width=?, images_mainmap_height=?, layers=?, layers_switch_id=?, bbox_minx=?, bbox_miny=?, bbox_maxx=?, bbox_maxy=?, location_scale=?, query_results_count=?, query_results_table_count=? WHERE general_cache_hit=?");
                if (hits.size() == 0) {
                    return;
                }

                JdbcUtilities.runSelectQuery("reading cached values",
                        "SELECT general_cache_id, general_elapsed_time, images_mainmap_width, images_mainmap_height, layers, layers_switch_id, bbox_minx, bbox_miny, bbox_maxx, bbox_maxy, location_scale, query_results_count, query_results_table_count FROM "
                                + tableName + " WHERE general_cache_id IS NOT NULL",
                        con, new JdbcUtilities.SelectTask() {
                            private int cpt = 0;

                            public void setupStatement(PreparedStatement stmt) throws SQLException {
                            }

                            public void run(ResultSet rs) throws SQLException {
                                int count = 0;
                                final int todo = hits.size();
                                Progress progress = new Progress(10 * 1000, todo, "Cache hit record updating",
                                        LOGGER);
                                while (rs.next()) {
                                    String cacheId = rs.getString(1);
                                    //We can have the same general_cache_id multiple times.
                                    //So we have to remove it from the set.
                                    if (hits.remove(cacheId)) {
                                        StatementUtils.copyFloat(rs, 2, updateStmt, 1);
                                        StatementUtils.copyInt(rs, 3, updateStmt, 2);
                                        StatementUtils.copyInt(rs, 4, updateStmt, 3);
                                        StatementUtils.copyString(rs, 5, updateStmt, 4);
                                        StatementUtils.copyInt(rs, 6, updateStmt, 5);
                                        StatementUtils.copyFloat(rs, 7, updateStmt, 6);
                                        StatementUtils.copyFloat(rs, 8, updateStmt, 7);
                                        StatementUtils.copyFloat(rs, 9, updateStmt, 8);
                                        StatementUtils.copyFloat(rs, 10, updateStmt, 9);
                                        StatementUtils.copyFloat(rs, 11, updateStmt, 10);
                                        StatementUtils.copyInt(rs, 12, updateStmt, 11);
                                        StatementUtils.copyString(rs, 13, updateStmt, 12);
                                        updateStmt.setString(13, cacheId);
                                        updateStmt.addBatch();

                                        if (++cpt % 50 == 0) {
                                            int[] counts = updateStmt.executeBatch();
                                            for (int i = 0; i < counts.length; ++i) {
                                                count += counts[i];
                                            }
                                        }

                                        progress.update(todo - hits.size());
                                    }
                                }
                                ++cpt;
                                int[] counts = updateStmt.executeBatch();
                                for (int i = 0; i < counts.length; ++i) {
                                    count += counts[i];
                                }

                                LOGGER.info(count + " cache hit records updated from " + cpt + " cached values");
                            }
                        });

                updateStmt.close();
            } catch (BatchUpdateException ex) {
                LOGGER.error(ex.getNextException());
                throw ex;
            }
        }
        con.commit();
    }

    private void vacuum(Connection con) throws SQLException {
        con.setAutoCommit(true);
        LOGGER.info("Doing full vacuum.");
        JdbcUtilities.runDeleteQuery("vacuuming " + tableName, "VACUUM FULL ANALYZE " + tableName, con, null);
        sideTables.vacuum(con);
        LOGGER.info("Vacuum done.");
    }

    public SideTables getSideTables() {
        return sideTables;
    }

    public static void main(String[] args) {
        Import imports = new Import(args);
        imports.run();
    }
}
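
Example usage

The sketch below is illustrative only and not part of the original source: it assumes that the
option parser behind parseArgs() (org.pvalsecc.opts) accepts "-name value" style arguments and that
the database connection options inherited from BaseStats are supplied the same way. Alternatively,
the environment variables named in the @Option annotations (STATS_LOG_DIR, STATS_FORMAT,
STATS_INITIALIZE, ...) can be used to configure the import.

import org.cartoweb.stats.imports.Import;

public class ImportExample {
    public static void main(String[] args) {
        // Hypothetical invocation: rebuild the statistics tables from the CartoWeb logs
        // stored under /var/log/cartoweb (flag syntax assumed, see note above).
        Import.main(new String[]{
                "-logDir", "/var/log/cartoweb",
                "-format", "CartoWeb",
                "-initialize", "true"
        });
    }
}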