/*
 * Copyright 2014-2015, Armenak Grigoryan, and individual contributors as indicated
 * by the @authors tag. See the copyright.txt in the distribution for a
 * full listing of individual contributors.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 */
package com.strider.datadefender;

import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.DateFormat;
import java.text.DecimalFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;

import static java.lang.Double.parseDouble;
import static java.util.regex.Pattern.compile;

import org.apache.commons.collections.ListUtils;
import org.apache.log4j.Logger;
import static org.apache.log4j.Logger.getLogger;

import com.strider.datadefender.database.IDBFactory;
import com.strider.datadefender.database.metadata.IMetaData;
import com.strider.datadefender.database.metadata.MatchMetaData;
import com.strider.datadefender.database.sqlbuilder.ISQLBuilder;
import com.strider.datadefender.functions.Utils;
import com.strider.datadefender.report.ReportUtil;
import com.strider.datadefender.specialcase.SpecialCase;
import com.strider.datadefender.utils.CommonUtils;
import com.strider.datadefender.utils.Score;

import opennlp.tools.util.Span;

/**
 * @author Armenak Grigoryan
 */
public class DatabaseDiscoverer extends Discoverer {

    private static final Logger log = getLogger(DatabaseDiscoverer.class);
    private static final String YES = "yes";

    private static String[] modelList;

    /**
     * Calls a function defined as an extension.
     *
     * @param function fully qualified "class.method" name of the extension function
     * @param data     column metadata passed to the extension
     * @param text     column value passed to the extension
     * @return the value returned by the extension, or null if the function is not defined
     * @throws SQLException
     * @throws NoSuchMethodException
     * @throws SecurityException
     * @throws IllegalAccessException
     * @throws IllegalArgumentException
     * @throws InvocationTargetException
     */
    private Object callExtention(final String function, final MatchMetaData data, final String text)
            throws SQLException, NoSuchMethodException, SecurityException, IllegalAccessException,
                   IllegalArgumentException, InvocationTargetException {
        if ((function == null) || function.equals("")) {
            log.warn("Function " + function + " is not defined");
            return null;
        }

        Object value = null;

        try {
            final String className  = Utils.getClassName(function);
            final String methodName = Utils.getMethodName(function);
            final Method method     = Class.forName(className)
                    .getMethod(methodName, new Class[] { MatchMetaData.class, String.class });
            final SpecialCase instance = (SpecialCase) Class.forName(className).newInstance();

            final Map<String, Object> paramValues = new HashMap<>(2);
            paramValues.put("metadata", data);
            paramValues.put("text", text);

            value = method.invoke(instance, data, text);
        } catch (InstantiationException | ClassNotFoundException ex) {
            log.error(ex.toString());
        }

        return value;
    }

    @SuppressWarnings("unchecked")
    public List<MatchMetaData> discover(final IDBFactory factory, final Properties dataDiscoveryProperties,
            final String vendor)
            throws ParseException, DatabaseDiscoveryException {
        log.info("Data discovery in process");

        // Get the probability threshold from the property file
        final double probabilityThreshold =
                parseDouble(dataDiscoveryProperties.getProperty("probability_threshold"));
        String calculate_score = dataDiscoveryProperties.getProperty("score_calculation");

        if (CommonUtils.isEmptyString(calculate_score)) {
            calculate_score = "false";
        }

        log.info("Probability threshold [" + probabilityThreshold + "]");

        // Get the list of models used in data discovery
        final String models = dataDiscoveryProperties.getProperty("models");

        modelList = models.split(",");
        log.info("Model list [" + Arrays.toString(modelList) + "]");

        List<MatchMetaData> finalList = new ArrayList<>();

        for (final String model : modelList) {
            log.info("********************************");
            log.info("Processing model " + model);
            log.info("********************************");

            final Model modelPerson = createModel(dataDiscoveryProperties, model);

            matches = discoverAgainstSingleModel(factory, dataDiscoveryProperties, modelPerson,
                    probabilityThreshold, vendor);
            finalList = ListUtils.union(finalList, matches);
        }

        final DecimalFormat decimalFormat = new DecimalFormat("#.##");

        log.info("List of suspects:");
        log.info(String.format("%20s %20s %20s %20s", "Table*", "Column*", "Probability*", "Model*"));

        final Score score = new Score();
        int highRiskColumns = 0;
        int rowCount = 0;

        for (final MatchMetaData data : finalList) {
            // Row count is only collected when score calculation is enabled
            if (YES.equals(calculate_score)) {
                log.debug("Counting rows in table " + data.getTableName() + "...");
                rowCount = ReportUtil.rowCount(factory, data.getTableName(),
                        Integer.valueOf(dataDiscoveryProperties.getProperty("limit")));
            }

            // Getting 5 sample values
            final List<String> sampleDataList = ReportUtil.sampleData(factory, data.getTableName(),
                    data.getColumnName());

            // Output
            log.info("Column : " + data.toString());
            log.info(CommonUtils.fixedLengthString('=', data.toString().length() + 30));
            log.info("Model : " + data.getModel());

            if (YES.equals(calculate_score)) {
                log.info("Number of rows in the table : " + rowCount);
                log.info("Score : " + score.columnScore(rowCount));
            } else {
                log.info("Number of rows in the table : N/A");
                log.info("Score : N/A");
            }

            log.info("Sample data");
            log.info(CommonUtils.fixedLengthString('-', 11));

            for (final String sampleData : sampleDataList) {
                log.info(sampleData);
            }

            log.info("");

            // Log up to the five highest-probability matches for this column
            final List<Probability> probabilityList = data.getProbabilityList();

            Collections.sort(probabilityList,
                    Comparator.comparingDouble(Probability::getProbabilityValue).reversed());

            final int y = Math.min(data.getProbabilityList().size(), 5);

            for (int i = 0; i < y; i++) {
                final Probability p = data.getProbabilityList().get(i);
                log.info(p.getSentence() + ":" + p.getProbabilityValue());
            }

            log.info("");

            // Score calculation is controlled by the score_calculation parameter
            if (YES.equals(calculate_score) && score.columnScore(rowCount).equals("High")) {
                highRiskColumns++;
            }
        }

        // Only applicable when score_calculation=yes; otherwise score calculation is skipped
        if (YES.equals(calculate_score)) {
            log.info("Overall score: " + score.dataStoreScore());
            log.info("");

            if ((finalList != null) && (finalList.size() > 0)) {
log.info("============================================"); final int threshold_count = Integer.valueOf(dataDiscoveryProperties.getProperty("threshold_count")); if (finalList.size() > threshold_count) { log.info("Number of PI [" + finalList.size() + "] columns is higher than defined threashold [" + threshold_count + "]"); } else { log.info("Number of PI [" + finalList.size() + "] columns is lower or equal than defined threashold [" + threshold_count + "]"); } final int threshold_highrisk = Integer .valueOf(dataDiscoveryProperties.getProperty("threshold_highrisk")); if (highRiskColumns > threshold_highrisk) { log.info("Number of High risk PI [" + highRiskColumns + "] columns is higher than defined threashold [" + threshold_highrisk + "]"); } else { log.info("Number of High risk PI [" + highRiskColumns + "] columns is lower or equal than defined threashold [" + threshold_highrisk + "]"); } } } else { log.info("Overall score: N/A"); } log.info("matches: " + matches.toString()); return matches; } private List<MatchMetaData> discoverAgainstSingleModel(final IDBFactory factory, final Properties dataDiscoveryProperties, final Model model, final double probabilityThreshold, final String vendor) throws ParseException, DatabaseDiscoveryException { final IMetaData metaData = factory.fetchMetaData(); final List<MatchMetaData> map = metaData.getMetaData(vendor); // Start running NLP algorithms for each column and collect percentage matches = new ArrayList<>(); MatchMetaData specialCaseData = null; final List<MatchMetaData> specialCaseDataList = new ArrayList(); boolean specialCase = false; final String extentionList = dataDiscoveryProperties.getProperty("extentions"); String[] specialCaseFunctions = null; log.info("Extention list: " + extentionList); if (!CommonUtils.isEmptyString(extentionList)) { specialCaseFunctions = extentionList.split(","); if ((specialCaseFunctions != null) && (specialCaseFunctions.length > 0)) { specialCase = true; } } final ISQLBuilder sqlBuilder = factory.createSQLBuilder(); List<Probability> probabilityList; for (final MatchMetaData data : map) { final String tableName = data.getTableName(); final String columnName = data.getColumnName(); log.info(data.getPkeys().toString()); if (data.getPkeys().contains(columnName.toLowerCase(Locale.ENGLISH))) { log.info("Column [" + columnName + "] is Primary Key. Slipping this column."); continue; } log.info(data.getFkeys().toString()); if (data.getFkeys().contains(columnName.toLowerCase(Locale.ENGLISH))) { log.info("Column [" + columnName + "] is Foreign Key. 
Slipping this column."); continue; } log.debug("Column type: [" + data.getColumnType() + "]"); probabilityList = new ArrayList<>(); log.info("Analyzing column [" + tableName + "].[" + columnName + "]"); final String tableNamePattern = dataDiscoveryProperties.getProperty("table_name_pattern"); if (!CommonUtils.isEmptyString(tableNamePattern)) { final Pattern p = compile(tableNamePattern); if (!p.matcher(tableName).matches()) { continue; } } final String table = sqlBuilder.prefixSchema(tableName); final int limit = Integer.parseInt(dataDiscoveryProperties.getProperty("limit")); final String query = sqlBuilder.buildSelectWithLimit( "SELECT " + columnName + " FROM " + table + " WHERE " + columnName + " IS NOT NULL ", limit); log.debug("Executing query against database: " + query); try (Statement stmt = factory.getConnection().createStatement(); ResultSet resultSet = stmt.executeQuery(query);) { while (resultSet.next()) { if (data.getColumnType().equals("BLOB") || data.getColumnType().equals("GEOMETRY")) { continue; } if (model.getName().equals("location") && data.getColumnType().contains("INT")) { continue; } final String sentence = resultSet.getString(1); log.debug(sentence); log.debug("special case:" + specialCase); if (specialCase) { try { for (int i = 0; i < specialCaseFunctions.length; i++) { if ((sentence != null) && !sentence.isEmpty()) { log.debug("sentence: " + sentence); log.debug("data: " + data); specialCaseData = (MatchMetaData) callExtention(specialCaseFunctions[i], data, sentence); if (specialCaseData != null) { log.info("Adding new special case data: " + specialCaseData.toString()); specialCaseDataList.add(specialCaseData); } else { log.debug("No special case data found"); } } } } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) { log.error(e.toString()); } } if ((sentence != null) && !sentence.isEmpty()) { String processingValue = ""; if (data.getColumnType().equals("DATE") || data.getColumnType().equals("TIMESTAMP") || data.getColumnType().equals("DATETIME")) { final DateFormat originalFormat = new SimpleDateFormat(sentence, Locale.ENGLISH); final DateFormat targetFormat = new SimpleDateFormat("MMM d, yy", Locale.ENGLISH); final java.util.Date date = originalFormat.parse(sentence); processingValue = targetFormat.format(date); } else { processingValue = sentence; } // log.debug(sentence); // Convert sentence into tokens final String tokens[] = model.getTokenizer().tokenize(processingValue); // Find names final Span nameSpans[] = model.getNameFinder().find(tokens); // find probabilities for names final double[] spanProbs = model.getNameFinder().probs(nameSpans); // Collect top X tokens with highest probability // display names for (int i = 0; i < nameSpans.length; i++) { final String span = nameSpans[i].toString(); if (span.length() > 2) { log.debug("Span: " + span); log.debug("Covered text is: " + tokens[nameSpans[i].getStart()]); log.debug("Probability is: " + spanProbs[i]); probabilityList.add(new Probability(tokens[nameSpans[i].getStart()], spanProbs[i])); } } // From OpenNLP documentation: // After every document clearAdaptiveData must be called to clear the adaptive data in the feature generators. // Not calling clearAdaptiveData can lead to a sharp drop in the detection rate after a few documents. 
                        model.getNameFinder().clearAdaptiveData();
                    }
                }
            } catch (SQLException sqle) {
                log.error(sqle.toString());
            }

            final double averageProbability = calculateAverage(probabilityList);

            if (averageProbability >= probabilityThreshold) {
                data.setAverageProbability(averageProbability);
                data.setModel(model.getName());
                data.setProbabilityList(probabilityList);
                matches.add(data);
            }
        }

        // Special processing
        if ((specialCaseDataList != null) && !specialCaseDataList.isEmpty()) {
            log.info("Special case data is processed :" + specialCaseDataList.toString());

            for (final MatchMetaData specialData : specialCaseDataList) {
                matches.add(specialData);
            }
        }

        return matches;
    }
}
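
To see how this class is meant to be driven, here is a minimal caller sketch. It is illustrative only: it assumes an IDBFactory has already been constructed elsewhere, that DatabaseDiscoverer can be instantiated with its default constructor, and that createModel() can resolve the listed model names from additional, model-specific properties (for example paths to OpenNLP model files) that are not shown here. The property keys are the ones discover() and discoverAgainstSingleModel() read above; the values, model names, and the commented-out extension function are placeholders.

package com.strider.datadefender;

import java.util.List;
import java.util.Properties;

import com.strider.datadefender.database.IDBFactory;
import com.strider.datadefender.database.metadata.MatchMetaData;

public class DiscoveryExample {

    public static void runDiscovery(final IDBFactory factory, final String vendor) throws Exception {
        final Properties props = new Properties();

        // Property keys below are the ones DatabaseDiscoverer reads; the values are placeholders.
        props.setProperty("probability_threshold", "0.5");  // minimum average probability for a column to be reported
        props.setProperty("score_calculation", "yes");      // "yes" enables row counts and risk scoring
        props.setProperty("limit", "1000");                 // maximum number of rows sampled per column
        props.setProperty("models", "person,location");     // comma-separated model names (illustrative)
        props.setProperty("threshold_count", "10");         // reporting threshold for the number of PI columns
        props.setProperty("threshold_highrisk", "3");       // reporting threshold for high-risk PI columns
        // props.setProperty("extentions", "com.example.MyCase.detect");  // optional, hypothetical extension function
        // props.setProperty("table_name_pattern", "customer.*");         // optional table filter

        final DatabaseDiscoverer discoverer = new DatabaseDiscoverer();
        final List<MatchMetaData> suspects = discoverer.discover(factory, props, vendor);

        // Each match carries the table/column it was found in plus its probability list.
        for (final MatchMetaData match : suspects) {
            System.out.println(match.getTableName() + "." + match.getColumnName());
        }
    }
}

Note that the extension property key is spelled "extentions", matching the lookup in discoverAgainstSingleModel(), and that any extension function named there must follow the contract enforced by callExtention(): a class implementing SpecialCase with a method that accepts a MatchMetaData and a String.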