qa.qcri.nadeef.core.pipeline.Updater.java Source code

Java tutorial

Introduction

Here is the source code for qa.qcri.nadeef.core.pipeline.Updater.java

Source

/*
 * QCRI, NADEEF LICENSE
 * NADEEF is an extensible, generalized and easy-to-deploy data cleaning platform built at QCRI.
 * NADEEF means "Clean" in Arabic
 *
 * Copyright (c) 2011-2013, Qatar Foundation for Education, Science and Community Development (on
 * behalf of Qatar Computing Research Institute) having its principle place of business in Doha,
 * Qatar with the registered address P.O box 5825 Doha, Qatar (hereinafter referred to as "QCRI")
 *
 * NADEEF has patent pending nevertheless the following is granted.
 * NADEEF is released under the terms of the MIT License, (http://opensource.org/licenses/MIT).
 */

package qa.qcri.nadeef.core.pipeline;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import qa.qcri.nadeef.core.datamodel.Cell;
import qa.qcri.nadeef.core.datamodel.Column;
import qa.qcri.nadeef.core.datamodel.Fix;
import qa.qcri.nadeef.core.datamodel.NadeefConfiguration;
import qa.qcri.nadeef.core.util.sql.DBConnectionPool;
import qa.qcri.nadeef.tools.CommonTools;
import qa.qcri.nadeef.tools.PerfReport;
import qa.qcri.nadeef.tools.Tracer;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.Statement;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.ConcurrentMap;

/**
 * Updater fixes the source data and exports it in the database.
 */
public class Updater extends Operator<Collection<Fix>, Collection<Fix>> {
    private static Tracer tracer = Tracer.getTracer(Updater.class);
    private ConcurrentMap<Cell, String> updateHistory;
    private ConcurrentMap<Cell, Boolean> unknownTag;

    /**
     * Constructor.
     */
    public Updater(ExecutionContext context) {
        super(context);
        updateHistory = Maps.newConcurrentMap();
        unknownTag = Maps.newConcurrentMap();
    }

    /**
     * Apply the fixes from EQ and modify the original source.
     *
     * @param fixes real fix collection.
     * @return real fix collection.
     */
    @Override
    public Collection<Fix> execute(Collection<Fix> fixes) throws Exception {
        int count = 0;
        Connection sourceConn = null;
        Connection nadeefConn = null;
        Statement sourceStat = null;
        PreparedStatement auditStat = null;
        String auditTableName = NadeefConfiguration.getAuditTableName();
        String rightValue;
        String oldValue;
        ExecutionContext context = getCurrentContext();
        DBConnectionPool connectionPool = context.getConnectionPool();
        List<Fix> realFixes = Lists.newArrayList();
        try {
            nadeefConn = connectionPool.getNadeefConnection();
            sourceConn = connectionPool.getSourceConnection();
            sourceStat = sourceConn.createStatement();
            auditStat = nadeefConn.prepareStatement(
                    "INSERT INTO " + auditTableName + " VALUES (default, ?, ?, ?, ?, ?, ?, current_timestamp)");
            for (Fix fix : fixes) {
                Cell cell = fix.getLeft();
                Object oldValue_ = cell.getValue();
                if (oldValue_ == null) {
                    oldValue = null;
                } else {
                    oldValue = oldValue_.toString();
                }

                // this cell has already been changed to unknown
                if (unknownTag.containsKey(cell)) {
                    continue;
                }

                realFixes.add(fix);
                // check whether this cell has been changed before
                if (updateHistory.containsKey(cell)) {
                    String value = updateHistory.get(cell);
                    if (value.equals(fix.getRightValue())) {
                        continue;
                    }
                    // when a cell is set twice with different value,
                    // we set it to null for ambiguous value.
                    unknownTag.put(cell, true);
                    rightValue = "?";
                } else {
                    rightValue = fix.getRightValue();
                    updateHistory.put(cell, rightValue);
                }

                // check for numerical type.
                if (rightValue != null && !CommonTools.isNumericalString(rightValue)) {
                    rightValue = '\'' + rightValue + '\'';
                }

                if (oldValue != null && !CommonTools.isNumericalString(oldValue)) {
                    oldValue = '\'' + oldValue + '\'';
                }

                Column column = cell.getColumn();
                String tableName = column.getTableName();
                String updateSql = "UPDATE " + tableName + " SET " + column.getColumnName() + " = " + rightValue
                        + " WHERE tid = " + cell.getTid();
                tracer.verbose(updateSql);
                sourceStat.addBatch(updateSql);
                auditStat.setInt(1, fix.getVid());
                auditStat.setInt(2, cell.getTid());
                auditStat.setString(3, column.getTableName());
                auditStat.setString(4, column.getColumnName());
                auditStat.setString(5, oldValue);
                auditStat.setString(6, rightValue);
                auditStat.addBatch();
                if (count % 4096 == 0) {
                    auditStat.executeBatch();
                    nadeefConn.commit();
                    sourceStat.executeBatch();
                    sourceConn.commit();
                }
                count++;
                setPercentage(count / fixes.size());
            }
            sourceStat.executeBatch();
            auditStat.executeBatch();
            sourceConn.commit();
            nadeefConn.commit();
            PerfReport.appendMetric(PerfReport.Metric.UpdatedCellNumber, count);
        } finally {
            if (auditStat != null) {
                auditStat.close();
            }

            if (sourceStat != null) {
                sourceStat.close();
            }

            if (nadeefConn != null) {
                nadeefConn.close();
            }

            if (sourceConn != null) {
                sourceConn.close();
            }
        }
        return realFixes;
    }
}