List of usage examples for the java.util.concurrent.ForkJoinPool constructor
private ForkJoinPool(byte forCommonPoolOnly)
From source file:com.hygenics.parser.BreakMultiple.java
/**
 * Pulls rows from the source table in id-ranged chunks, fans each row out as a
 * {@code Break} task on a ForkJoinPool, and posts the resulting JSON strings
 * back to the database in commit-sized batches, looping until a query window
 * returns no rows.
 *
 * NOTE(review): SQL conditions are built by string concatenation from
 * {@code idcolumn}/{@code extracondition}; safe only if those are trusted
 * configuration values — confirm they never carry user input.
 */
public void run() {
    int j = 0; // iteration counter; only incremented, never read
    checkTable();
    rows = new ArrayList<String>();
    log.info("Starting Break");
    // the pool
    ForkJoinPool fjp = new ForkJoinPool(Runtime.getRuntime().availableProcessors() * procnum);
    // for returned results
    List<Future<ArrayList<String>>> futures = new ArrayList<Future<ArrayList<String>>>();
    // for parsing
    Set<Callable<ArrayList<String>>> collect = new HashSet<Callable<ArrayList<String>>>();
    // for querying
    Set<Callable<ArrayList<String>>> qcollect = new HashSet<Callable<ArrayList<String>>>();
    // results
    ArrayList<String> jsons = new ArrayList<String>();
    String condition = null;
    // NOTE(review): pullsize / qnum is integer division, so the ceil is a no-op;
    // also `size` is never used below — looks vestigial.
    int size = (int) Math.ceil(pullsize / qnum);
    // get initial data from user: one id-range query per connection slot
    for (int i = 0; i < qnum; i++) {
        condition = " WHERE " + idcolumn + " > "
                + Integer.toString(offset + (Math.round(pullsize / qnum) * i)) + " AND " + idcolumn + " <= "
                + Integer.toString(offset + (Math.round(pullsize / qnum) * (i + 1)));
        if (extracondition != null) {
            condition += " " + extracondition.trim();
        }
        qcollect.add(new GetFromDB((select + condition), template));
        log.info("SELECTING " + select + " " + condition);
    }
    log.info("Getting From DB @" + Calendar.getInstance().getTime().toString());
    futures = fjp.invokeAll(qcollect);
    int w = 0;
    // NOTE(review): busy-wait; invokeAll already blocks until all tasks finish,
    // so this spin normally exits immediately. Same pattern recurs below.
    while (fjp.getActiveThreadCount() > 0 && fjp.isQuiescent() == false) {
        w++;
    }
    log.info("Waited for " + w + "Cycles");
    for (Future<ArrayList<String>> f : futures) {
        try {
            rows.addAll(f.get());
            f.cancel(true);
        } catch (InterruptedException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (ExecutionException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }
    qcollect = new HashSet<Callable<ArrayList<String>>>();
    futures = null;
    log.info("Breaking");
    // process while there is still data to process
    while (rows.size() > 0) {
        log.info("Iteration Contains " + rows.size() + " Rows");
        // add to the commit size for future processing
        offset += pullsize;
        log.info("Submitting Tasks");
        // submit for breaking apart; the four branches below are the
        // mustcontain x cannotcontain filter combinations, each building the
        // same Break task from the row's JSON map
        for (String r : rows) {
            if (fjp.isShutdown()) {
                fjp = new ForkJoinPool(Runtime.getRuntime().availableProcessors() * procnum);
            }
            if (r != null) {
                if (mustcontain != null) {
                    if (r.contains(mustcontain)) {
                        if (cannotcontain != null) {
                            if (r.contains(cannotcontain) == false) {
                                Map<String, Json> rowmap = Json.read(r).asJsonMap();
                                // Break(template, row, token, replacementPattern,
                                // positions, date, table, offenderhash, ...)
                                if (rowmap.size() > 0) {
                                    collect.add(new Break(unescape, repeatkeys, template,
                                            rowmap.get(rowcolumn).asString(), token, replacementPattern,
                                            positions, (Calendar.getInstance().getTime().toString()),
                                            targettable, rowmap.get("offenderhash").asString(), maxpos,
                                            genhash));
                                }
                            }
                        } else {
                            Map<String, Json> rowmap = Json.read(r).asJsonMap();
                            // mustcontain matched, no cannotcontain filter
                            if (rowmap.size() > 0) {
                                collect.add(new Break(unescape, repeatkeys, template,
                                        rowmap.get(rowcolumn).asString(), token, replacementPattern,
                                        positions, (Calendar.getInstance().getTime().toString()),
                                        targettable, rowmap.get("offenderhash").asString(), maxpos,
                                        genhash));
                            }
                        }
                    }
                } else {
                    if (cannotcontain != null) {
                        if (r.contains(cannotcontain) == false) {
                            Map<String, Json> rowmap = Json.read(r).asJsonMap();
                            // no mustcontain filter; cannotcontain did not match
                            if (rowmap.size() > 0) {
                                collect.add(new Break(unescape, repeatkeys, template,
                                        rowmap.get(rowcolumn).asString(), token, replacementPattern,
                                        positions, (Calendar.getInstance().getTime().toString()),
                                        targettable, rowmap.get("offenderhash").asString(), maxpos,
                                        genhash));
                            }
                        }
                    } else {
                        Map<String, Json> rowmap = Json.read(r).asJsonMap();
                        // no filters configured: submit every non-null row
                        if (rowmap.size() > 0) {
                            collect.add(new Break(unescape, repeatkeys, template,
                                    rowmap.get(rowcolumn).asString(), token, replacementPattern, positions,
                                    (Calendar.getInstance().getTime().toString()), targettable,
                                    rowmap.get("offenderhash").asString(), maxpos, genhash));
                        }
                    }
                }
            }
        }
        log.info("SUBMITTED " + collect.size() + " tasks");
        futures = fjp.invokeAll(collect);
        w = 0;
        while (fjp.getActiveThreadCount() > 0 && fjp.isQuiescent() == false) {
            w++;
        }
        log.info("Waited for " + w + " Cycles");
        jsons.clear();
        log.info("Getting Strings");
        try {
            for (Future<ArrayList<String>> p : futures) {
                ArrayList<String> retlist = p.get();
                if (retlist != null) {
                    if (retlist.size() > 0) {
                        jsons.addAll(retlist);
                    }
                    // flush early once a full commit batch has accumulated
                    if (jsons.size() >= commit_size) {
                        // send to db
                        if (jsons.size() > SPLITSIZE) {
                            log.info("Split True: Sending to DB @ "
                                    + Calendar.getInstance().getTime().toString());
                            postToDb(jsons, true);
                            jsons = new ArrayList<String>();
                            log.info("Posted to DB @ " + Calendar.getInstance().getTime().toString());
                        } else {
                            log.info("Split False: Sending to DB @ "
                                    + Calendar.getInstance().getTime().toString());
                            postToDb(jsons, false);
                            jsons = new ArrayList<String>();
                            log.info("Posted to DB @ " + Calendar.getInstance().getTime().toString());
                        }
                    }
                }
                p.cancel(true);
            }
        } catch (InterruptedException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (ExecutionException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        futures = null;
        collect = new HashSet<Callable<ArrayList<String>>>();
        // send to db: flush whatever remained below the commit threshold
        if (jsons.size() > SPLITSIZE) {
            log.info("Split True: Sending to DB @" + Calendar.getInstance().getTime().toString());
            postToDb(jsons, true);
            jsons = new ArrayList<String>();
            log.info("Posted to DB @ " + Calendar.getInstance().getTime().toString());
        } else {
            log.info("Split False: Sending to DB @" + Calendar.getInstance().getTime().toString());
            postToDb(jsons, false);
            jsons = new ArrayList<String>();
            log.info("Posted to DB @ " + Calendar.getInstance().getTime().toString());
        }
        // get more information
        rows = new ArrayList<String>();
        // NOTE(review): `|` and `&` are the non-short-circuit operators; they
        // work on booleans but `||` / `&&` were almost certainly intended.
        if (Runtime.getRuntime().freeMemory() < 500000 | ((loops % waitloops) == 0 & waitloops != 0)) {
            log.info("Paused Free Memory Left: " + Runtime.getRuntime().freeMemory());
            System.gc();
            Runtime.getRuntime().gc();
            try {
                Thread.sleep(2000);
            } catch (InterruptedException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            // block until the heap recovers; relies on the GC hints above
            while (Runtime.getRuntime().freeMemory() < 500000) {
                try {
                    Thread.sleep(2000);
                } catch (InterruptedException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
            log.info("Restart Free Memory Left: " + Runtime.getRuntime().freeMemory());
        }
        rows = new ArrayList<String>();
        // attempt to query the database from multiple threads
        // NOTE(review): this loop runs conn = 1..qnum while the initial pull
        // above used i = 0..qnum-1, so the two windows differ by one chunk —
        // confirm against the data whether rows are skipped between pulls.
        for (int conn = 1; conn <= qnum; conn++) {
            // change condition
            condition = " WHERE " + idcolumn + " > "
                    + Integer.toString(offset + (Math.round(pullsize / qnum) * conn)) + " AND " + idcolumn
                    + " <= " + Integer.toString(offset + (Math.round(pullsize / qnum) * (conn + 1)));
            if (extracondition != null) {
                condition += " " + extracondition.trim();
            }
            qcollect.add(new GetFromDB((select + condition), template));
            log.info("SELECTING " + select + " " + condition);
        }
        futures = fjp.invokeAll(qcollect);
        w = 0;
        while (fjp.getActiveThreadCount() > 0 && fjp.isQuiescent() == false) {
            w++;
        }
        log.info("Waited for " + w + " Cycles");
        for (Future<ArrayList<String>> f : futures) {
            try {
                ArrayList<String> test = f.get();
                if (test != null) {
                    if (test.size() > 0) {
                        rows.addAll(test);
                    }
                }
                f.cancel(true);
            } catch (InterruptedException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            } catch (ExecutionException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
        futures = null;
        qcollect = new HashSet<Callable<ArrayList<String>>>(4);
        j++;
        Runtime.getRuntime().gc();
        System.gc();
    }
    // send to db: final flush after the loop drains
    if (jsons.size() > SPLITSIZE) {
        log.info("Split True: Sending to DB @" + Calendar.getInstance().getTime().toString());
        postToDb(jsons, true);
        jsons = new ArrayList<String>();
    } else if (jsons.size() > 0) {
        log.info("Split False: Sending to DB @" + Calendar.getInstance().getTime().toString());
        postToDb(jsons, false);
        jsons = new ArrayList<String>();
    }
    Runtime.getRuntime().gc();
    System.gc();
    log.info("Shutting Down Forkjoin Pool");
    if (fjp.isShutdown() == false) {
        fjp.shutdownNow();
    }
}
From source file:com.hygenics.parser.GetImages.java
private void getImages() { // controls the web process from a removed method log.info("Setting Up Pull"); String[] proxyarr = (proxies == null) ? null : proxies.split(","); // cleanup//w ww . ja v a 2s .c o m if (cleanup) { cleanupDir(fpath); } // image grab CookieManager cm = new CookieManager(); cm.setCookiePolicy(CookiePolicy.ACCEPT_ALL); CookieHandler.setDefault(cm); int numimages = 0; InputStream is; byte[] bytes; int iter = 0; int found = 0; // set proxy if needed if (proxyuser != null) { proxy(proxyhost, proxyport, https, proxyuser, proxypass); } int i = 0; ArrayList<String> postImages = new ArrayList<String>(); ForkJoinPool fjp = new ForkJoinPool(Runtime.getRuntime().availableProcessors()); Set<Callable<String>> pulls = new HashSet<Callable<String>>(); Set<Callable<ArrayList<String>>> sqls = new HashSet<Callable<ArrayList<String>>>(); List<Future<String>> imageFutures; ArrayList<String> images; int chunksize = (int) Math.ceil(commitsize / numqueries); log.info("Chunksize: " + chunksize); if (baseurl != null || baseurlcolumn != null) { do { log.info("Offset: " + offset); log.info("Getting Images"); images = new ArrayList<String>(commitsize); log.info("Getting Columns"); for (int n = 0; n < numqueries; n++) { String tempsql = sql + " WHERE " + idString + " >= " + offset + " AND " + idString + " < " + (offset + chunksize); if (conditions != null) { tempsql += conditions; } sqls.add(new QueryDatabase( ((extracondition != null) ? 
tempsql + " " + extracondition : tempsql))); offset += chunksize; } List<Future<ArrayList<String>>> futures = fjp.invokeAll(sqls); int w = 0; while (fjp.isQuiescent() && fjp.getActiveThreadCount() > 0) { w++; } for (Future<ArrayList<String>> f : futures) { try { ArrayList<String> fjson; fjson = f.get(); if (fjson.size() > 0) { images.addAll(fjson); } if (f.isDone() == false) { f.cancel(true); } } catch (InterruptedException e) { e.printStackTrace(); } catch (ExecutionException e) { e.printStackTrace(); } } log.info(Integer.toString(images.size()) + " image links found. Pulling."); ArrayList<String> tempproxies = new ArrayList<String>(); if (proxyarr != null) { for (String proxy : proxyarr) { tempproxies.add(proxy.trim()); } } if (maxproxies > 0) { maxproxies -= 1;// 0 and 1 should be equivalent conditions // --num is not like most 0 based still due // to >= } // get images for (int num = 0; num < images.size(); num++) { String icols = images.get(num); int proxnum = (int) Math.random() * (tempproxies.size() - 1); String proxy = (tempproxies.size() == 0) ? 
null : tempproxies.get(proxnum); // add grab pulls.add(new ImageGrabber(icols, proxy)); if (proxy != null) { tempproxies.remove(proxy); } // check for execution if (num + 1 == images.size() || pulls.size() >= commitsize || tempproxies.size() == 0) { if (tempproxies.size() == 0 && proxies != null) { tempproxies = new ArrayList<String>(proxyarr.length); for (String p : proxyarr) { tempproxies.add(p.trim()); } } imageFutures = fjp.invokeAll(pulls); w = 0; while (fjp.isQuiescent() == false && fjp.getActiveThreadCount() > 0) { w++; } for (Future<String> f : imageFutures) { String add; try { add = f.get(); if (add != null) { postImages.add(add); } } catch (InterruptedException e) { e.printStackTrace(); } catch (ExecutionException e) { e.printStackTrace(); } } imageFutures = null;// garbage collect elligible pulls = new HashSet<Callable<String>>(commitsize); } if (postImages.size() >= commitsize && addtoDB == true) { if (addtoDB) { log.info("Posting to Database"); log.info("Found " + postImages.size() + " images"); numimages += postImages.size(); int size = (int) Math.floor(postImages.size() / numqueries); for (int n = 0; n < numqueries; n++) { if (((n + 1) * size) < postImages.size() && (n + 1) < numqueries) { fjp.execute(new ImagePost(postImages.subList(n * size, (n + 1) * size))); } else { fjp.execute(new ImagePost(postImages.subList(n * size, postImages.size() - 1))); } } w = 0; while (fjp.isQuiescent() && fjp.getActiveThreadCount() > 0) { w++; } } found += postImages.size(); postImages.clear(); } } if (postImages.size() > 0 && addtoDB == true) { log.info("Posting to Database"); numimages += postImages.size(); int size = (int) Math.floor(postImages.size() / numqueries); for (int n = 0; n < numqueries; n++) { if (((n + 1) * size) < postImages.size()) { fjp.execute(new ImagePost(postImages.subList(n * size, (n + 1) * size))); } else { fjp.execute(new ImagePost(postImages.subList(n * size, postImages.size()))); } } w = 0; while (fjp.isQuiescent() && 
fjp.getActiveThreadCount() > 0) { w++; } found += postImages.size(); postImages.clear(); } // handle iterations specs iter += 1; log.info("Iteration: " + iter); if ((iter < iterations && found < images.size()) || tillfound == true) { log.info("Not All Images Obtained Trying Iteration " + iter + " of " + iterations); offset -= commitsize; } else if ((iter < iterations && found >= images.size()) && tillfound == false) { log.info("Images Obtained in " + iter + " iterations. Continuing."); iter = 0; } else { // precautionary log.info("Images Obtained in " + iter + " iterations. Continuing"); iter = 0; } } while (images.size() > 0 && iter < iterations); if (fjp.isShutdown()) { fjp.shutdownNow(); } } log.info("Complete. Check for Errors \n " + numimages + " Images Found"); }
From source file:com.hygenics.parser.ParseJSoup.java
/** * Runs the Program//from w ww . ja va 2 s . co m */ public void run() { int its = 0; this.select = Properties.getProperty(this.select); this.extracondition = Properties.getProperty(this.extracondition); this.column = Properties.getProperty(this.column); createTables(); log.info("Starting Parse via JSoup @ " + Calendar.getInstance().getTime().toString()); ForkJoinPool fjp = new ForkJoinPool(Runtime.getRuntime().availableProcessors() * procs); Set<Callable<ArrayList<String>>> collection; List<Future<ArrayList<String>>> futures; ArrayList<String> data = new ArrayList<String>((commitsize + 10)); ArrayList<String> outdata = new ArrayList<String>(((commitsize + 10) * 3)); int offenderhash = offset; boolean run = true; int iteration = 0; int currpos = 0; do { collection = new HashSet<Callable<ArrayList<String>>>(qnums); log.info("Getting Data"); // get data currpos = iteration * commitsize + offset; iteration += 1; String query = select; if (extracondition != null) { query += " " + extracondition; } if (extracondition != null) { query += " WHERE " + extracondition + " AND "; } else { query += " WHERE "; } for (int i = 0; i < qnums; i++) { if (currpos + (Math.round(commitsize / qnums * (i + 1))) < currpos + commitsize) { collection.add(new SplitQuery((query + pullid + " >= " + Integer.toString(currpos + (Math.round(commitsize / qnums * (i)))) + " AND " + pullid + " < " + Integer.toString(currpos + (Math.round(commitsize / qnums * (i + 1))))))); } else { collection.add(new SplitQuery((query + pullid + " >= " + Integer.toString(currpos + (Math.round(commitsize / qnums * (i)))) + " AND " + pullid + " < " + Integer.toString(currpos + commitsize)))); } } if (collection.size() > 0) { futures = fjp.invokeAll(collection); int w = 0; while (fjp.isQuiescent() == false && fjp.getActiveThreadCount() > 0) { w++; } for (Future<ArrayList<String>> f : futures) { try { // TODO Get Pages to Parse data.addAll(f.get()); } catch (NullPointerException e) { log.info("Some Data Returned 
Null"); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (ExecutionException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } collection = new HashSet<Callable<ArrayList<String>>>(data.size()); // checkstring if (data.size() == 0 && checkstring != null && its <= maxchecks) { its++; collection.add(new SplitQuery(checkstring)); futures = fjp.invokeAll(collection); int w = 0; while (fjp.isQuiescent() == false && fjp.getActiveThreadCount() > 0) { w++; } for (Future<ArrayList<String>> f : futures) { try { // TODO Get Pages to Parse data.addAll(f.get()); } catch (NullPointerException e) { log.info("Some Data Returned Null"); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (ExecutionException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } if (data.size() == 0) { // set to stop if size is0 log.info("No Pages to Parse. Will Terminate"); run = false; } else { // parse log.info("Starting JSoup Parse @ " + Calendar.getInstance().getTime().toString()); for (String json : data) { // faster json reader is minimal json but faster parser is // Simple Json Map<String, Json> jMap = Json.read(json).asJsonMap(); if (jMap.containsKey("offenderhash")) { // string to int in case it is a string and has some // extra space offenderhash = Integer.parseInt(jMap.get("offenderhash").asString().trim()); } boolean allow = true; if (mustcontain != null) { if (jMap.get(column).asString().contains(mustcontain) == false) { allow = false; } } if (cannotcontain != null) { if (jMap.get(column).asString().contains(cannotcontain)) { allow = false; } } // this is the fastest way. I was learning before and will // rewrite when time permits. 
if (allow == true) { if (jMap.containsKey("offenderhash")) { if (this.singlepaths != null) { collection.add(new ParseSingle(Integer.toString(offenderhash), header, footer, pagenarrow, singlepaths, StringEscapeUtils.unescapeXml(jMap.get(column).asString()), replace, replaceSequence)); } if (this.multipaths != null) { collection.add(new ParseRows(Integer.toString(offenderhash), header, footer, pagenarrow, multipaths, StringEscapeUtils.unescapeXml(jMap.get(column).asString()), replace, replaceSequence)); } if (this.recordpaths != null) { collection.add(new ParseLoop(Integer.toString(offenderhash), header, footer, pagenarrow, recordpaths, StringEscapeUtils.unescapeXml(jMap.get(column).asString()), replace, replaceSequence)); } } } offenderhash += 1; } // complete parse log.info("Waiting for Parsing to Complete."); if (collection.size() > 0) { futures = fjp.invokeAll(collection); int w = 0; while (fjp.isQuiescent() && fjp.getActiveThreadCount() > 0) { w++; } log.info("Waited for " + Integer.toString(w) + " Cycles!"); for (Future<ArrayList<String>> f : futures) { try { outdata.addAll(f.get()); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (ExecutionException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } log.info("Finished Parsing @ " + Calendar.getInstance().getTime().toString()); int cp = 0; // post data log.info("Posting Data @ " + Calendar.getInstance().getTime().toString()); if (outdata.size() > 0) { for (int i = 0; i < qnums; i++) { ArrayList<String> od = new ArrayList<String>( ((cp + (Math.round(outdata.size() / qnums) - cp)))); if (cp + (Math.round(outdata.size() / qnums)) < outdata.size()) { od.addAll(outdata.subList(cp, (cp + (Math.round(outdata.size() / qnums))))); } else { od.addAll(outdata.subList(cp, (outdata.size() - 1))); } fjp.execute(new SplitPost(template, od)); cp += Math.round(outdata.size() / qnums); } int w = 0; while (fjp.getActiveThreadCount() > 0 && fjp.isQuiescent() == 
false) { w++; } log.info("Waited for " + Integer.toString(w) + " cycles!"); } log.info("Finished Posting to DB @ " + Calendar.getInstance().getTime().toString()); // size should remain same with 10 slot buffer room data.clear(); outdata.clear(); } // my favorite really desperate attempt to actually invoke garbage // collection because of MASSIVE STRINGS System.gc(); Runtime.getRuntime().gc(); } while (run); log.info("Shutting Down FJP"); // shutdown fjp if (fjp.isShutdown() == false) { fjp.shutdownNow(); } log.info("Finished Parsing @ " + Calendar.getInstance().getTime().toString()); }
From source file:com.hygenics.parser.KVParser.java
public void run() { log.info("Starting Parse @ " + Calendar.getInstance().getTime().toString()); ForkJoinPool fjp = new ForkJoinPool(Runtime.getRuntime().availableProcessors() * procs); Set<Callable<ArrayList<String>>> collection; List<Future<ArrayList<String>>> futures; ArrayList<String> data = new ArrayList<String>((commitsize + 10)); ArrayList<String> outdata = new ArrayList<String>(((commitsize + 10) * 3)); int currpos = 0; boolean run = true; while (run) { log.info("Getting Pages"); // get pages String query = select;/*from ww w .ja v a2 s . com*/ if (data.size() > 0) { data.clear(); } if (extracondition != null) { query += " " + extracondition; } if (extracondition != null) { query += " WHERE " + extracondition + " AND "; } else { query += " WHERE "; } collection = new HashSet<Callable<ArrayList<String>>>(qnums); for (int i = 0; i < qnums; i++) { if (currpos + (Math.round(commitsize / qnums * (i + 1))) < currpos + commitsize) { collection.add(new SplitQuery((query + pullid + " >= " + Integer.toString(currpos + (Math.round(commitsize / qnums * (i)))) + " AND " + pullid + " < " + Integer.toString(currpos + (Math.round(commitsize / qnums * (i + 1))))))); } else { collection.add(new SplitQuery((query + pullid + " >= " + Integer.toString(currpos + (Math.round(commitsize / qnums * (i)))) + " AND " + pullid + " < " + Integer.toString(currpos + commitsize)))); } } currpos += commitsize; if (collection.size() > 0) { futures = fjp.invokeAll(collection); int w = 0; while (fjp.isQuiescent() == false && fjp.getActiveThreadCount() > 0) { w++; } for (Future<ArrayList<String>> f : futures) { try { ArrayList<String> darr = f.get(); if (darr != null && darr.size() > 0) { data.addAll(darr); } } catch (NullPointerException e) { log.info("Some Data Returned Null"); } catch (InterruptedException e) { e.printStackTrace(); } catch (ExecutionException e) { e.printStackTrace(); } } } if (data.size() == 0 && checkString != null) { collection = new 
HashSet<Callable<ArrayList<String>>>(1); collection.add(new SplitQuery(checkString)); futures = fjp.invokeAll(collection); int w = 0; while (fjp.isQuiescent() == false && fjp.getActiveThreadCount() > 0) { w++; } for (Future<ArrayList<String>> f : futures) { try { ArrayList<String> arr = f.get(); if (arr != null) { for (String a : arr) { if (a != null) { data.add(a); } } } if (!f.isDone()) { f.cancel(true); } f = null; } catch (NullPointerException e) { log.info("Some Data Returned Null"); } catch (InterruptedException e) { e.printStackTrace(); } catch (ExecutionException e) { e.printStackTrace(); } } } // parse pages if (data.size() > 0) { log.info("Parsing " + Integer.toString(data.size()) + " Records"); collection = new HashSet<Callable<ArrayList<String>>>(data.size()); for (String json : data) { Map<String, Object> jmap = Json.read(json).asMap(); // for each table in the tags Map which is a key for (String k : tags.keySet()) { collection.add(new Parser(tags.get(k), jmap.get(htmlColumn).toString(), replacePattern, replacement, jmap.get(hashColumn).toString(), hashColumn, k)); if (collection.size() + 1 == data.size() || (collection.size() % commitsize == 0 && collection.size() >= commitsize)) { log.info("Waiting for Tasks to Complete"); futures = fjp.invokeAll(collection); // post data int w = 0; while (fjp.isQuiescent() == false && fjp.getActiveThreadCount() > 0) { w++; } for (Future<ArrayList<String>> future : futures) { try { outdata.addAll(future.get()); } catch (NullPointerException e) { log.info("Some Data Returned Null"); } catch (InterruptedException e) { e.printStackTrace(); } catch (ExecutionException e) { e.printStackTrace(); } } log.info("Parsed " + outdata.size() + " records!"); // post data int cp = 0; if (outdata.size() > 0) { checkTables(outdata); this.sendToDb(outdata, true); outdata = new ArrayList<String>(commitsize); } } } } data = new ArrayList<String>(commitsize); } else { log.info("No Records Found. 
Terminating!"); run = false; } } if (outdata.size() > 0) { log.info("Posting Last Records"); // post remaining pages for the iteration if (outdata.size() > 0) { int cp = 0; if (outdata.size() > 0) { checkTables(outdata); this.sendToDb(outdata, true); } data.clear(); outdata.clear(); } } // shutdown log.info("Complete! Shutting Down FJP."); fjp.shutdownNow(); log.info("Finished Parse @ " + Calendar.getInstance().getTime().toString()); }
From source file:com.hygenics.parser.GetImages.java
private void addFromFile() { File f = new File(fpath); ForkJoinPool fjp = new ForkJoinPool(Runtime.getRuntime().availableProcessors()); ArrayList<String> imageData = new ArrayList<String>(); int size = (int) Math.ceil(commitsize / numqueries); if (f.exists()) { // get the image data File[] list = f.listFiles(); int curr = 0; if (list != null) { for (File img : list) { curr += 1;/*from w w w. j ava2 s . com*/ if (img.isDirectory() == false && (img.getName().contains(".bmp") || img.getName().toLowerCase().contains(".jpg") || img.getName().toLowerCase().contains(".png") || img.getName().toLowerCase().contains("jpeg"))) { try { if (dbcondition == null || template .getJsonData(dbcondition.replace("$IMAGE$", img.getName().replaceAll("(?mis)" + imagepostfix, ""))) .size() > 0) { BufferedImage bi = ImageIO.read(img);// only // used // to // ensure // that // this // is an // image JsonObject jobj = new JsonObject(); jobj.add("offenderhash", img.getName().replaceAll("(?mis)" + imagepostfix, ""));// assumes // hash // is // file // name+postfix jobj.add("image", img.getName().replaceAll("(?mis)" + imagepostfix, "")); jobj.add("image_path", img.getName()); jobj.add("table", table); jobj.add("date", Calendar.getInstance().getTime().toString()); imageData.add(jobj.toString()); } } catch (IOException e) { log.info(img.getName() + " is not an Image!"); e.printStackTrace(); } catch (Exception e) { log.info("Error in Posting to Database."); e.printStackTrace(); } } // post if > commitsize if (imageData.size() >= commitsize || curr == list.length) { log.info("Posting to DB @ " + Calendar.getInstance().getTime().toString()); for (int i = 0; i < numqueries; i++) { if (((i + 1) * size) < imageData.size()) { fjp.execute(new ImagePost(imageData.subList((i * size), ((i + 1) * size)))); } else { fjp.execute(new ImagePost(imageData.subList((i * size), imageData.size()))); } } int w = 0; while (fjp.isQuiescent() == false && fjp.getActiveThreadCount() > 0) { w++; } log.info("Waited for " + w 
+ " cycles"); imageData.clear(); log.info("Finished Posting to DB @ " + Calendar.getInstance().getTime().toString()); } } } } else { log.error("File Path does Not Exist.Please Check Image Pull!"); } fjp.shutdown(); fjp = null; }
From source file:com.hygenics.parser.ParseDispatcher.java
private void spl(ArrayList<String> json, boolean split) { if (json.size() > 0) log.info("Records to Add: " + json.size()); if (split) {/*from w w w.j a v a 2 s . c o m*/ ForkJoinPool f2 = new ForkJoinPool( (Runtime.getRuntime().availableProcessors() + ((int) Math.ceil(procnum * sqlnum)))); ArrayList<String> l; int size = (int) Math.ceil(json.size() / qnum); for (int conn = 0; conn < qnum; conn++) { l = new ArrayList<String>(); if (((conn + 1) * size) < json.size()) { l.addAll(json.subList((conn * size), ((conn + 1) * size))); } else { l.addAll(json.subList((conn * size), (json.size() - 1))); f2.execute(new SplitPost(template, l)); break; } f2.execute(new SplitPost(template, l)); } try { f2.awaitTermination(termtime, TimeUnit.MILLISECONDS); } catch (InterruptedException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } f2.shutdown(); int incrementor = 0; while (f2.isShutdown() == false && f2.getActiveThreadCount() > 0 && f2.isQuiescent() == false) { incrementor++; try { Thread.sleep(100); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } log.info("Shutting Down" + incrementor); } l = null; f2 = null; } else { for (String j : json) { boolean valid = false; try { Json.read(j); valid = true; } catch (Exception e) { log.info("ERROR: JSON NOT FORMATTED PROPERLY"); System.out.println(j); } try { this.template.postSingleJson(j); } catch (Exception e) { log.info("Failed to Post"); log.error(j); e.printStackTrace(); } } } }
From source file:com.hygenics.parser.ParseDispatcher.java
private void sendToDb(ArrayList<String> json, boolean split) { if (json.size() > 0) log.info("Records to Add: " + json.size()); if (split) {//from www . j ava2s . com ForkJoinPool f2 = new ForkJoinPool( (Runtime.getRuntime().availableProcessors() + ((int) Math.ceil(procnum * sqlnum)))); ArrayList<String> l; int size = (int) Math.ceil(json.size() / qnum); for (int conn = 0; conn < qnum; conn++) { l = new ArrayList<String>(); if (((conn + 1) * size) < json.size()) { l.addAll(json.subList((conn * size), ((conn + 1) * size))); } else { l.addAll(json.subList((conn * size), (json.size() - 1))); f2.execute(new SplitPost(template, l)); break; } f2.execute(new SplitPost(template, l)); } try { f2.awaitTermination(termtime, TimeUnit.MILLISECONDS); } catch (InterruptedException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } f2.shutdown(); int incrementor = 0; while (f2.isShutdown() == false && f2.getActiveThreadCount() > 0 && f2.isQuiescent() == false) { incrementor++; try { Thread.sleep(100); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } log.info("Shutting Down" + incrementor); } l = null; f2 = null; } else { for (String j : json) { boolean valid = false; try { Json.read(j); valid = true; } catch (Exception e) { log.info("ERROR: JSON NOT FORMATTED PROPERLY"); System.out.println(j); } try { this.template.postSingleJson(j); } catch (Exception e) { log.info("Failed to Post"); log.error(j); e.printStackTrace(); } } } }
From source file:com.hygenics.parser.ParseDispatcher.java
/**
 * Main parsing driver. Pulls page rows from the database in id-ranged chunks,
 * optionally hashes them, applies the configured single/multi/looped regex
 * pattern sets as tasks on a Fork/Join pool, and commits the parsed JSON back
 * to the database in batches. The do/while loop repeats until a pull round
 * returns no pages.
 *
 * NOTE(review): the "while getActiveThreadCount > 0 && !isQuiescent" loops
 * below are busy-spins; 'w' counts loop iterations, not elapsed time.
 */
public void run() {
    log.info("Starting Clock and Parsing @" + Calendar.getInstance().getTime().toString());
    long t = Calendar.getInstance().getTimeInMillis();
    int pid = 0; // monotonically increasing id handed to CreateHash tasks
    int id = 0; // NOTE(review): never read after initialization
    int checkattempts = 0; // drop-check retries performed so far (capped at reattempts)
    String add = null; // scratch holder for each future's result
    // resolve configured property keys to their concrete values
    this.schema = Properties.getProperty(this.schema);
    this.select = Properties.getProperty(this.select);
    this.extracondition = Properties.getProperty(this.extracondition);
    this.column = Properties.getProperty(this.column);
    ArrayList<String> parsedrows = new ArrayList<String>(); // parsed output awaiting commit
    Set<Callable<String>> collect = new HashSet<Callable<String>>(); // pending parse tasks
    List<Future<String>> futures;
    List<Future<ArrayList<String>>> qfutures;
    Set<Callable<ArrayList<String>>> qcollect = new HashSet<Callable<ArrayList<String>>>(4); // pending query tasks
    ForkJoinPool fjp = new ForkJoinPool((int) Math.ceil(Runtime.getRuntime().availableProcessors() * procnum));
    if (schema != null) {
        createTables();
    }
    boolean run = true; // NOTE(review): never read
    String condition;
    int w = 0; // spin-wait cycle counter, reused throughout
    int start = offset; // lower bound of the current id window
    // NOTE(review): integer division — ceil is a no-op here; confirm intended
    int chunksize = (int) Math.ceil(pullsize / qnum);
    // attempt to query the database from multiple threads
    do {
        // query for the next window of pages, one id-range per connection
        pages = new ArrayList<String>(pullsize);
        log.info("Looking for Pages.");
        for (int conn = 0; conn < qnum; conn++) {
            // build this connection's half-open id range [start+conn*chunk, start+(conn+1)*chunk)
            condition = " WHERE " + pullid + " >= " + (start + (conn * chunksize)) + " AND " + pullid + " < " + Integer.toString(start + (chunksize * (conn + 1)));
            if (extracondition != null) {
                condition += " " + extracondition.trim();
            }
            qcollect.add(new SplitQuery(template, (select + condition)));
            log.info("Fetching " + select + condition);
        }
        start += (chunksize * qnum);
        qfutures = fjp.invokeAll(qcollect);
        w = 0;
        while (fjp.getActiveThreadCount() > 0 && fjp.isQuiescent() == false) {
            w++;
        }
        log.info("Waited for " + w + " cycles");
        // gather query results into pages
        for (Future<ArrayList<String>> f : qfutures) {
            try {
                ArrayList<String> test = f.get();
                if (test != null) {
                    if (test.size() > 0) {
                        pages.addAll(test);
                    }
                }
                if (f.isDone() == false) {
                    f.cancel(true);
                }
                f = null;
            } catch (Exception e) {
                log.warn("Encoding Error!");
                e.printStackTrace();
            }
        }
        qcollect = new HashSet<Callable<ArrayList<String>>>(4);
        qfutures = null;
        log.info("Finished Getting Pages");
        // if no records were found, re-query for rows that may have been dropped
        // (at most reattempts times)
        if (pages.size() == 0 && checkstring != null && checkstring.trim().length() > 0 && checkattempts < reattempts) {
            checkattempts += 1;
            log.info("Checking for Drops");
            qcollect.add(new SplitQuery(template, (checkstring)));
            qfutures = fjp.invokeAll(qcollect);
            w = 0;
            while (fjp.getActiveThreadCount() > 0 && fjp.isQuiescent() == false) {
                w++;
            }
            log.info("Waited for " + w + " cycles");
            for (Future<ArrayList<String>> f : qfutures) {
                try {
                    ArrayList<String> test = f.get();
                    if (test != null) {
                        if (test.size() > 0) {
                            pages.addAll(test);
                        }
                    }
                    if (f.isDone() == false) {
                        f.cancel(true);
                    }
                    f = null;
                } catch (Exception e) {
                    log.warn("Encoding Error!");
                    e.printStackTrace();
                }
            }
            qfutures = null;
            qcollect = new HashSet<Callable<ArrayList<String>>>(4);
        } else if (checkattempts >= reattempts) {
            // retry budget exhausted: drop whatever is left so the loop can end
            pages.clear();
        }
        log.info("Found " + pages.size() + " records!");
        // compute hashes for each page when configured; pages is replaced by the
        // hashed versions
        if (getHash) {
            log.info("Hashing " + pages.size() + " Records");
            ArrayList<String> hashedrows = new ArrayList<String>();
            for (String row : pages) {
                collect.add(new CreateHash(row, pid));
                pid++;
            }
            log.info("Invoking");
            futures = fjp.invokeAll(collect);
            w = 0;
            while (fjp.getActiveThreadCount() > 0 && fjp.isQuiescent() == false) {
                w++;
            }
            log.info("Waited " + w + " Cycles!");
            for (Future<String> f : futures) {
                if (f != null) {
                    String json;
                    try {
                        json = f.get(termtime, TimeUnit.MILLISECONDS);
                        if (json != null) {
                            hashedrows.add(json);
                        }
                    } catch (Exception e) {
                        log.warn("Encoding Error!");
                        e.printStackTrace();
                    }
                }
            }
            log.info("Hashed " + hashedrows.size() + " Records!");
            pages = hashedrows;
            collect = new HashSet<Callable<String>>(pullsize);
            futures.clear();
            log.info("Completed Hashing");
        }
        log.info("Performing Regex");
        // shared submission counter across all three pattern sections below
        int i = 0;
        // --- single patterns: one ParsePage task per qualifying row ---
        if (singlepats != null) {
            log.info("Found Singlepats");
            int subs = 0;
            int rows = 0;
            for (String row : pages) {
                rows += 1;
                String inrow = row;
                try {
                    inrow = inrow.replaceAll("\t|\r|\r\n|\n", "");
                    Map<String, Json> jmap = Json.read(inrow).asJsonMap();
                    if (singlepats.containsKey("table")) {
                        subs += 1;
                        // the pool may have been shut down by a prior iteration; recreate it
                        if (fjp.isShutdown()) {
                            fjp = new ForkJoinPool((Runtime.getRuntime().availableProcessors() * procnum));
                        }
                        if (jmap.get(column) != null) {
                            if (test) {
                                System.out.println("//////////////////////HTML////////////////////////\n" + jmap.get(column).asString() + "\n///////////////////////////////END///////////////////////////\n\n");
                            }
                            // mustcontain/cannotcontain filtering: submit only rows that
                            // contain the required text and lack the forbidden text
                            if (mustcontain != null) {
                                if (jmap.get(column).asString().contains(mustcontain)) {
                                    if (cannotcontain != null) {
                                        if (jmap.get(column).asString().contains(cannotcontain) == false)
                                            collect.add(new ParsePage(unescape, replacementPattern, singlepats.get("table"), jmap.get(column).asString().replaceAll("\\s\\s", " "), singlepats, Calendar.getInstance().getTime().toString(), jmap.get("offenderhash").asString()));
                                    } else {
                                        collect.add(new ParsePage(unescape, replacementPattern, singlepats.get("table"), jmap.get(column).asString().replaceAll("\\s\\s", " "), singlepats, Calendar.getInstance().getTime().toString(), jmap.get("offenderhash").asString()));
                                    }
                                }
                            } else if (cannotcontain != null) {
                                if (jmap.get(column).asString().contains(cannotcontain) == false) {
                                    collect.add(new ParsePage(unescape, replacementPattern, singlepats.get("table"), jmap.get(column).asString().replaceAll("\\s\\s", " "), singlepats, Calendar.getInstance().getTime().toString(), jmap.get("offenderhash").asString()));
                                }
                            } else {
                                collect.add(new ParsePage(unescape, replacementPattern, singlepats.get("table"), jmap.get(column).asString().replaceAll("\\s\\s", " "), singlepats, Calendar.getInstance().getTime().toString(), jmap.get("offenderhash").asString()));
                            }
                        }
                    }
                    i++;
                    // run the accumulated tasks every commit_size submissions or at end of pages
                    if (((i % commit_size) == 0 & i != 0) || i == pages.size() || pages.size() == 1 && singlepats != null) {
                        log.info("Getting Regex Results");
                        log.info("Getting Tasks");
                        futures = fjp.invokeAll(collect);
                        w = 0;
                        while (fjp.getActiveThreadCount() > 0 && fjp.isQuiescent() == false) {
                            w++;
                        }
                        log.info("Waited for " + w + " cycles");
                        for (Future<String> r : futures) {
                            try {
                                add = r.get();
                                if (add.contains("No Data") == false) {
                                    parsedrows.add(add);
                                }
                                add = null;
                            } catch (Exception e) {
                                log.warn("Encoding Error!");
                                e.printStackTrace();
                            }
                        }
                        futures = null;
                        collect = new HashSet<Callable<String>>();
                        // commit parsed output once a full batch has accumulated
                        if (parsedrows.size() >= commit_size) {
                            log.info("INSERTING " + parsedrows.size() + " records!");
                            if (parsedrows.size() >= SPLITSIZE) {
                                sendToDb(parsedrows, true);
                            } else {
                                sendToDb(parsedrows, false);
                            }
                            parsedrows = new ArrayList<String>(pullsize);
                        }
                        // hint to the gc in case it actually pays off
                        System.gc();
                        Runtime.getRuntime().gc();
                    }
                } catch (Exception e) {
                    log.warn("Encoding Error!");
                    e.printStackTrace();
                }
            }
            log.info("Submitted " + subs + " records. Found " + rows + " rows");
        }
        log.info("REMAINING ROWS TO COMMIT " + parsedrows.size());
        log.info("Rows Left" + parsedrows.size());
        if (parsedrows.size() > 0) {
            if (parsedrows.size() >= SPLITSIZE) {
                sendToDb(parsedrows, true);
            } else {
                sendToDb(parsedrows, false);
            }
            parsedrows = new ArrayList<String>();
        }
        // --- multi patterns: one ParseMultiPage task per row per pattern key ---
        if (multipats != null) {
            int subs = 0;
            for (String row : pages) {
                try {
                    for (String k : multipats.keySet()) {
                        if (fjp.isShutdown()) {
                            fjp = new ForkJoinPool(Runtime.getRuntime().availableProcessors());
                        }
                        Map<String, Json> jmap = Json.read(row).asJsonMap();
                        if (jmap.get(column) != null) {
                            subs += 1;
                            if (test) {
                                System.out.println("//////////////////////HTML////////////////////////\n" + jmap.get(column).asString() + "\n///////////////////////////////END///////////////////////////\n\n");
                            }
                            if (mustcontain != null) {
                                if (jmap.get(column).asString().contains(mustcontain)) {
                                    if (cannotcontain != null) {
                                        if (jmap.get(column).asString().contains(cannotcontain) == false) {
                                            collect.add(new ParseMultiPage(unescape, replacementPattern, k, jmap.get(column).asString().replaceAll("\\s\\s", " "), jmap.get("offenderhash").asString(), Calendar.getInstance().getTime().toString(), multipats.get(k)));
                                        }
                                    } else {
                                        // NOTE(review): this branch applies replaceAll to the hash
                                        // instead of the column text, unlike its siblings — confirm
                                        collect.add(new ParseMultiPage(unescape, replacementPattern, k, jmap.get(column).asString(), jmap.get("offenderhash").asString().replaceAll("\\s\\s", " "), Calendar.getInstance().getTime().toString(), multipats.get(k)));
                                    }
                                }
                            } else if (cannotcontain != null) {
                                if (jmap.get(column).asString().contains(cannotcontain) == false) {
                                    collect.add(new ParseMultiPage(unescape, replacementPattern, k, jmap.get(column).asString().replaceAll("\\s\\s", " "), jmap.get("offenderhash").asString(), Calendar.getInstance().getTime().toString(), multipats.get(k)));
                                }
                            } else {
                                collect.add(new ParseMultiPage(unescape, replacementPattern, k, jmap.get(column).asString().replaceAll("\\s\\s", " "), jmap.get("offenderhash").asString(), Calendar.getInstance().getTime().toString(), multipats.get(k)));
                            }
                        }
                        i++;
                        if (((i % commit_size) == 0 & i != 0) || i == pages.size() || pages.size() == 1 && multipats != null) {
                            futures = fjp.invokeAll(collect);
                            w = 0;
                            while (fjp.getActiveThreadCount() > 0 && fjp.isQuiescent() == false) {
                                w++;
                            }
                            log.info("Waited " + w + " Cycles");
                            for (Future<String> r : futures) {
                                try {
                                    add = r.get();
                                    if (add.contains("No Data") == false) {
                                        // multi-page results are '~'-delimited bundles
                                        for (String js : add.split("~")) {
                                            parsedrows.add(js);
                                        }
                                    }
                                    add = null;
                                    if (r.isDone() == false) {
                                        r.cancel(true);
                                    }
                                    r = null;
                                } catch (InterruptedException e) {
                                    e.printStackTrace();
                                } catch (ExecutionException e) {
                                    e.printStackTrace();
                                }
                            }
                            futures = null;
                            collect = new HashSet<Callable<String>>();
                            if (parsedrows.size() >= commit_size) {
                                log.info("INSERTING " + parsedrows.size() + " records!");
                                if (parsedrows.size() >= SPLITSIZE) {
                                    sendToDb(parsedrows, true);
                                } else {
                                    sendToDb(parsedrows, false);
                                }
                                parsedrows = new ArrayList<String>(pullsize);
                            }
                            // hint to the gc in case it actually pays off
                            System.gc();
                            Runtime.getRuntime().gc();
                        }
                    }
                } catch (Exception e) {
                    log.warn("Encoding Error!");
                }
            }
            log.info("Submitted " + subs + " records.");
        }
        // --- looped patterns: one LoopRegex task per row per pattern key ---
        if (loopedpats != null) {
            log.info("Looped Patterns Found");
            int subs = 0;
            if (fjp.isShutdown()) {
                fjp = new ForkJoinPool(Runtime.getRuntime().availableProcessors() * procnum);
            }
            for (String row : pages) {
                try {
                    for (String k : loopedpats.keySet()) {
                        if (fjp.isShutdown()) {
                            fjp = new ForkJoinPool(Runtime.getRuntime().availableProcessors() * procnum);
                        }
                        Map<String, Json> jmap = Json.read(row).asJsonMap();
                        if (jmap.get(column) != null) {
                            subs += 1;
                            if (mustcontain != null) {
                                if (jmap.get(column).asString().contains(mustcontain)) {
                                    if (cannotcontain != null) {
                                        if (jmap.get(column).asString().contains(cannotcontain) == false) {
                                            collect.add(new LoopRegex(unescape, jmap.get(column).asString().replaceAll("\\s\\s", " "), jmap.get("offenderhash").asString(), Calendar.getInstance().getTime().toString(), k, replacementPattern, loopedpats.get(k), test));
                                        }
                                    } else {
                                        collect.add(new LoopRegex(unescape, jmap.get(column).asString().replaceAll("\\s\\s", " "), jmap.get("offenderhash").asString(), Calendar.getInstance().getTime().toString(), k, replacementPattern, loopedpats.get(k), test));
                                    }
                                }
                            } else if (cannotcontain != null) {
                                if (jmap.get(column).asString().contains(cannotcontain) == false) {
                                    collect.add(new LoopRegex(unescape, jmap.get(column).asString().replaceAll("\\s\\s", " "), jmap.get("offenderhash").asString(), Calendar.getInstance().getTime().toString(), k, replacementPattern, loopedpats.get(k), test));
                                }
                            } else {
                                collect.add(new LoopRegex(unescape, jmap.get(column).asString().replaceAll("\\s\\s", " "), jmap.get("offenderhash").asString(), Calendar.getInstance().getTime().toString(), k, replacementPattern, loopedpats.get(k), test));
                            }
                            jmap.remove(k);
                        }
                        i++;
                        // NOTE(review): this section's flush condition uses modulo on
                        // (pages.size() - 1) unlike the other two sections — confirm intended
                        if (((i % commit_size) == 0 & i != 0) || (i % (pages.size() - 1)) == 0 || pages.size() == 1) {
                            futures = fjp.invokeAll(collect);
                            w = 0;
                            while (fjp.getActiveThreadCount() > 0 && fjp.isQuiescent() == false) {
                                w++;
                            }
                            log.info("Waited " + w + " Cycles");
                            for (Future<String> r : futures) {
                                try {
                                    add = r.get();
                                    if (add.contains("No Data") == false) {
                                        for (String toarr : add.split("~")) {
                                            parsedrows.add(toarr);
                                        }
                                    }
                                    if (r.isDone() == false) {
                                        r.cancel(true);
                                    }
                                    add = null;
                                } catch (Exception e) {
                                    log.warn("Encoding Error!");
                                    e.printStackTrace();
                                }
                            }
                            futures = null;
                            collect = new HashSet<Callable<String>>();
                            // hint to the gc in case it actually pays off
                            System.gc();
                            Runtime.getRuntime().gc();
                        }
                    }
                    if (parsedrows.size() >= this.commit_size) {
                        log.info("INSERTING " + parsedrows.size() + " records!");
                        if (parsedrows.size() >= SPLITSIZE) {
                            sendToDb(parsedrows, true);
                        } else {
                            sendToDb(parsedrows, false);
                        }
                        parsedrows = new ArrayList<String>(pullsize);
                    }
                } catch (Exception e) {
                    log.warn("Encoding Error!");
                }
            }
            log.info("Submitted " + subs + " records.");
        }
        // drain any tasks still queued at the end of this iteration
        if (collect.size() > 0) {
            log.info("Getting Last Regex Results for Iteration");
            log.info("Getting Tasks");
            futures = fjp.invokeAll(collect);
            w = 0;
            while (fjp.getActiveThreadCount() > 0 && fjp.isQuiescent() == false) {
                w++;
            }
            log.info("Waited for " + w + " cycles");
            for (Future<String> r : futures) {
                try {
                    add = r.get();
                    if (add.contains("No Data") == false) {
                        parsedrows.add(add);
                    }
                    add = null;
                } catch (Exception e) {
                    log.warn("Encoding Error!");
                    e.printStackTrace();
                }
            }
            futures = null;
            collect = new HashSet<Callable<String>>(pullsize);
            // hint to the gc in case it actually pays off
            System.gc();
            Runtime.getRuntime().gc();
        }
        log.info("REMAINING ROWS TO COMMIT " + parsedrows.size());
        log.info("Rows Left" + parsedrows.size());
        if (parsedrows.size() > 0) {
            if (parsedrows.size() >= SPLITSIZE) {
                sendToDb(parsedrows, true);
            } else {
                sendToDb(parsedrows, false);
            }
            parsedrows = new ArrayList<String>();
        }
    } while (pages != null && pages.size() > 0);
    // ensure that nothing is still caught in limbo: run any tasks left after
    // the final loop iteration
    if (collect.size() > 0) {
        log.info("More Rows Caught in FJP, Completing Process");
        futures = fjp.invokeAll(collect);
        w = 0;
        while (fjp.getActiveThreadCount() > 0 && fjp.isQuiescent() == false) {
            w++;
        }
        log.info("Waited " + w + " Cycles");
        for (Future<String> r : futures) {
            try {
                add = r.get();
                if (add.contains("No Data") == false) {
                    for (String js : add.split("~")) {
                        parsedrows.add(js);
                    }
                }
                add = null;
                if (r.isDone() == false) {
                    r.cancel(true);
                }
                r = null;
            } catch (InterruptedException e) {
                e.printStackTrace();
            } catch (ExecutionException e) {
                e.printStackTrace();
            }
        }
        futures = null;
        collect = null;
    }
    // send any remaining parsed rows to the db
    if (parsedrows.size() > 0) {
        if (parsedrows.size() >= SPLITSIZE) {
            sendToDb(parsedrows, true);
        } else {
            sendToDb(parsedrows, false);
        }
        parsedrows = new ArrayList<String>();
    }
    log.info("Shutting Down Fork Join Pool");
    if (fjp.isShutdown() == false) {
        fjp.shutdownNow();
    }
    fjp = null;
    log.info("Complete @" + Calendar.getInstance().getTime().toString());
    log.info("Total Runtime(seconds): " + Double.toString((double) (Calendar.getInstance().getTimeInMillis() - t) / 1000));
    // hint to the gc in case it actually pays off
    System.gc();
    Runtime.getRuntime().gc();
}