Java tutorial
/*L * Copyright SAIC * * Distributed under the OSI-approved BSD 3-Clause License. * See http://ncip.github.com/cabio/LICENSE.txt for details. */ package gov.nih.nci.grididloader; import gov.nih.nci.grididloader.HandleInterfaceFactory.HandleInterfaceType; import java.io.FileWriter; import java.io.InputStream; import java.net.URI; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.Map; import java.util.Properties; import java.util.Queue; import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import javax.sql.DataSource; import net.handle.server.HandleRepositoryIDInterface; import net.handle.server.ResourceIdInfo; import oracle.jdbc.pool.OracleConnectionPoolDataSource; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.PosixParser; /** * Creates and assigns Big Id's to grid entities. For each entity mapping * defined in the configuration file, up to 50 threads are dispatched to process * all its rows. For each row, a Big Id is generated using the ID Service * (which registers the id in another table), and then the Big Id is saved into * the entity in the "BIG_ID" column. */ /** * @author caBIO Team * @version 1.0 */ public class BigIdCreator { /** Entity mapping */ private final Config config = new Config(); /** The data source where BigEntity's are stored */ private final DataSource dataSource; /** Factory that is provided to BatchUpdates */ private final HandleInterfaceFactory hiFactory; /** Number of threads to use when updating an entity */ private final int numThreads; /** Executor for running batch updates in parallel */ private final ExecutorService parallelExecutor; /** Executor for running batch updates in serial */ private final ExecutorService serialExecutor; /** Filter what classes to process */ private Set<String> classFilter = new HashSet<String>(); /** If using a filter, include or exclude the entities in the filter? */ private boolean include = true; /** * Reads properties from a file called "loader.properties" in the classpath * and configures a new BigIdCreator. * @throws Exception If loader.properties cannot be found, or a property * is missing. */ public BigIdCreator() throws Exception { // Load the configuration files final Properties props = new Properties(); final Properties dbprops = new Properties(); final InputStream is = Thread.currentThread().getContextClassLoader() .getResourceAsStream("loader.properties"); final InputStream dis = Thread.currentThread().getContextClassLoader().getResourceAsStream("db.properties"); try { props.load(is); dbprops.load(dis); } catch (Exception e) { throw new Exception("Can't read the properties file. " + "Make sure loader.properties and db.properties " + "are in the CLASSPATH"); } finally { } // Setup database connection pool final OracleConnectionPoolDataSource pool = new OracleConnectionPoolDataSource(); pool.setURL(dbprops.getProperty("oracleloader.url")); pool.setUser(dbprops.getProperty("oracleloader.user")); pool.setPassword(dbprops.getProperty("oracleloader.password")); this.dataSource = pool; // load entities and mappings config.loadXMLMapping(props.getProperty("mapping.file")); // setup the handle interface factory this.hiFactory = new HandleInterfaceFactory(props, dataSource); // init the job executor this.numThreads = Integer.parseInt(props.getProperty("loader.threads")); this.parallelExecutor = Executors.newFixedThreadPool(numThreads); this.serialExecutor = Executors.newSingleThreadExecutor(); } /** * Same as createAndUpdate(), but only process entities that have class * names matching the entries in the specified classFilter array. */ public void createAndUpdate(String[] classFilterArray, boolean include) throws Exception { this.include = include; this.classFilter.clear(); if (classFilterArray != null) { for (String className : classFilterArray) { classFilter.add(className); } } createAndUpdate(); } /** * Create Big Id's for each entity and save them into the database. * Each entity is updated in parallel by several threads, but the entities * are processed in a serial fashion. */ public void createAndUpdate() throws Exception { if (hiFactory.getSystemType() == HandleInterfaceType.CLASSIC) { // Create site handle, if the database is empty. // This is necessary because otherwise 50 threads will try to create it // at once, resulting in duplicates and a subsequent avalanche of collisions final HandleRepositoryIDInterface idSvc = (HandleRepositoryIDInterface) hiFactory.getHandleInterface(); // create dummy id (also creates site handle) ResourceIdInfo rid = new ResourceIdInfo(new URI("urn://ncicb"), "dummy"); idSvc.createOrGetGlobalID(rid); // remove the id we created, the site handle will remain idSvc.removeGlobalID(rid); } Connection conn = null; FileWriter benchmarkFile = null; try { benchmarkFile = new FileWriter("timings.txt"); conn = dataSource.getConnection(); for (BigEntity entity : config.getEntities()) { final String className = entity.getClassName(); if (!classFilter.isEmpty() && ((include && !classFilter.contains(className)) || (!include && classFilter.contains(className)))) { System.err.println("Filtered out " + className); continue; } long start = System.currentTimeMillis(); final String table = entity.getTableName(); final String id = entity.getPrimaryKey(); Statement stmt = null; ResultSet rs = null; long numRows = 0; long minId = 0; long maxId = 0; try { // get number of rows and id space for the current entity stmt = conn.createStatement(); rs = stmt.executeQuery( "SELECT MIN(" + id + ") minId, MAX(" + id + ") maxId, COUNT(*) rowCount FROM " + table); rs.next(); numRows = rs.getLong("rowCount"); minId = rs.getLong("minId"); maxId = rs.getLong("maxId"); } catch (SQLException e) { System.err.println("Error processing " + table); e.printStackTrace(); continue; } finally { try { if (rs != null) rs.close(); if (stmt != null) stmt.close(); } catch (SQLException e) { e.printStackTrace(); } } /* This is an overly complicated formula to figure out the best * chunk size possible. * * First we determine the idealChunkSize for the amount of rows * we are dealing with, based on a linear step equation: *10000| ______ * 9500| : * | /: * | / : * 500 |/ : * ____|___:_____ * 0 500,000 * * In other words, the minimum chunk is 500. As the number of rows * increases, the chunk size grows up to 9500. But after 500000 * rows, the chunk size jumps to 10000 and stays constant so that * we don't overload each thread. Therefore, the chunk size is * always between 500 and 10000. * * Secondly, the identifier spread is calculated and multiplied by * the idealChunkSize to get the final chunkSize. If the ids are * equal to the row numbers, the spread is 1 and the chunk size is * ok. If, however, the id space is gigantic, then the chunk size * will be increased proportionally to the average distance between * ids (assuming the ids are uniformally distributed). * * This actually works perfectly only if the ids ARE uniformally * distributed. In other corner cases, where the ids are clustered * together within a huge id space, the id space must be * partitioned recursively. */ final float idealChunkSize = (numRows > 500000) ? 10000 : .018f * numRows + 500; final float spread = (float) (maxId - minId + 1) / (float) numRows; final long chunkSize = Math.round(idealChunkSize * spread); System.out.println("Processing " + entity + " (" + entity.getTableName() + ") rows(" + numRows + ") range(" + minId + "," + maxId + ") parallel(" + entity.isParallelLoadable() + ")"); System.out.println("Parameters: spread(" + spread + ") chunkSize(ideal=" + idealChunkSize + " actual=" + chunkSize + ")"); final Map<BatchUpdate, Future<Boolean>> futures = new HashMap<BatchUpdate, Future<Boolean>>(); final Queue<BatchUpdate> updates = new LinkedList<BatchUpdate>(); // start each chunk as a task on the executor for (long i = minId; i <= maxId; i += chunkSize) { BatchUpdate update = new BatchUpdate(dataSource, hiFactory, entity, i, i + chunkSize - 1); updates.add(update); Future<Boolean> future = entity.isParallelLoadable() ? parallelExecutor.submit(update) : serialExecutor.submit(update); futures.put(update, future); } // wait for all updates to finish while (!updates.isEmpty()) { final BatchUpdate update = updates.remove(); final Future<Boolean> future = futures.remove(update); try { // this get() blocks until the future is available Boolean success = future.get(); if (success == null || !success.booleanValue()) { System.err.println("FAILED: " + update); } else { int n = update.getNumUpdated(); if (n == 0) { System.out.println(" done " + update + " (no rows found)"); } else { int ut = (int) update.getAverageUpdateTime(); int ht = (int) update.getAverageHandleTime(); System.out.println(" done " + update + " rows(" + n + " rows) avg(handle=" + ht + "ms, update=" + ut + "ms)"); } } } catch (ExecutionException e) { System.err.println("Updated failed for entity: " + entity); e.printStackTrace(); } catch (InterruptedException e) { System.err.println("Updated failed for entity: " + entity); e.printStackTrace(); } } float time = System.currentTimeMillis() - start; System.out.println("Done " + entity + " (" + (time / 1000) + " sec)\n"); benchmarkFile.write(entity.getClassName() + "\t" + numRows + "\t" + time + "\n"); benchmarkFile.flush(); } } finally { try { if (conn != null) conn.close(); if (benchmarkFile != null) benchmarkFile.close(); } catch (SQLException e) { e.printStackTrace(); } } // Done parallelExecutor.shutdown(); serialExecutor.shutdown(); } /** * Debug method to print all SQL that will be used to query the entities * for unique keys. */ public void printSQL() { for (BigEntity entity : config.getEntities()) { StringBuffer sql = new StringBuffer("SELECT "); sql.append(entity.getSelectSQL()); sql.append("\n FROM "); sql.append(entity.getFromSQL()); sql.append("\n WHERE 1=1"); if (!entity.getJoins().isEmpty()) { sql.append("\n AND "); sql.append(entity.getWhereSQL()); } System.out.println(entity.getClassName() + "\n" + sql + "\n"); } } public static void argError() { System.err.println("Error: Include and exclude are mutually " + "exclusive parameters."); System.exit(1); } /** * Main program. * Two mutually exclusive command line options are possible: * -I include the following entities * -E exclude the following entities * The entities are passed as a space delimited list like: * -I Taxon Cytoband * @param args */ public static void main(String[] args) throws Exception { String[] classFilter = null; boolean isInclude = true; Options options = new Options(); options.addOption(OptionBuilder.withLongOpt("include").withDescription("classes to include") .hasOptionalArgs().create("I")); options.addOption(OptionBuilder.withLongOpt("exclude").withDescription("classes to exclude") .hasOptionalArgs().create("E")); CommandLineParser parser = new PosixParser(); CommandLine cmd = parser.parse(options, args); String[] include = cmd.getOptionValues("I"); String[] exclude = cmd.getOptionValues("E"); if (include != null) { if (exclude != null) argError(); isInclude = true; classFilter = include; } else if (exclude != null) { if (include != null) argError(); isInclude = false; classFilter = exclude; } long start = System.currentTimeMillis(); BigIdCreator loader = new BigIdCreator(); loader.createAndUpdate(classFilter, isInclude); //loader.printSQL(); long stop = System.currentTimeMillis(); System.err.println("All updates took " + (float) (stop - start) / 60000 + " min."); } }