List of usage examples for java.util.Iterator.next()

Method signature:

E next();

Returns the next element in the iteration and advances the cursor; throws NoSuchElementException if the iteration has no more elements.
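As a baseline for the project-specific examples that follow, here is a minimal, self-contained sketch of the canonical hasNext()/next() loop. The list contents are placeholders chosen for illustration; only java.util types are used.

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

public class IteratorNextBasics {
    public static void main(String[] args) {
        List<String> names = Arrays.asList("alpha", "beta", "gamma");
        Iterator<String> it = names.iterator();
        // Standard pattern: always guard next() with hasNext().
        while (it.hasNext()) {
            String name = it.next(); // returns the next element and advances the cursor
            System.out.println(name);
        }
        // Calling next() once the iteration is exhausted throws NoSuchElementException.
        try {
            it.next();
        } catch (NoSuchElementException e) {
            System.out.println("iterator exhausted: " + e);
        }
    }
}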
From source file:edu.umn.cs.spatialHadoop.operations.KNN.java
public static void main(String[] args) throws IOException { final OperationsParams params = new OperationsParams(new GenericOptionsParser(args)); Path[] paths = params.getPaths(); if (paths.length <= 1 && !params.checkInput()) { printUsage();/* w ww . ja v a 2 s .c o m*/ System.exit(1); } if (paths.length > 1 && !params.checkInputOutput()) { printUsage(); System.exit(1); } final Path inputFile = params.getInputPath(); int count = params.getInt("count", 1); double closeness = params.getFloat("closeness", -1.0f); final Point[] queryPoints = closeness < 0 ? params.getShapes("point", new Point()) : new Point[count]; final FileSystem fs = inputFile.getFileSystem(params); final int k = params.getInt("k", 1); int concurrency = params.getInt("concurrency", 100); if (k == 0) { LOG.warn("k = 0"); } if (queryPoints.length == 0) { printUsage(); throw new RuntimeException("Illegal arguments"); } final Path outputPath = paths.length > 1 ? paths[1] : null; if (closeness >= 0) { // Get query points according to its closeness to grid intersections GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inputFile); long seed = params.getLong("seed", System.currentTimeMillis()); Random random = new Random(seed); for (int i = 0; i < count; i++) { int i_block = random.nextInt(gindex.size()); int direction = random.nextInt(4); // Generate a point in the given direction // Get center point (x, y) Iterator<Partition> iterator = gindex.iterator(); while (i_block-- >= 0) iterator.next(); Partition partition = iterator.next(); double cx = (partition.x1 + partition.x2) / 2; double cy = (partition.y1 + partition.y2) / 2; double cw = partition.x2 - partition.x1; double ch = partition.y2 - partition.y1; int signx = ((direction & 1) == 0) ? 1 : -1; int signy = ((direction & 2) == 1) ? 
1 : -1; double x = cx + cw * closeness / 2 * signx; double y = cy + ch * closeness / 2 * signy; queryPoints[i] = new Point(x, y); } } final BooleanWritable exceptionHappened = new BooleanWritable(); Thread.UncaughtExceptionHandler h = new Thread.UncaughtExceptionHandler() { public void uncaughtException(Thread th, Throwable ex) { ex.printStackTrace(); exceptionHappened.set(true); } }; // Run each query in a separate thread final Vector<Thread> threads = new Vector<Thread>(); for (int i = 0; i < queryPoints.length; i++) { Thread thread = new Thread() { @Override public void run() { try { Point query_point = queryPoints[threads.indexOf(this)]; OperationsParams newParams = new OperationsParams(params); OperationsParams.setShape(newParams, "point", query_point); Job job = knn(inputFile, outputPath, params); } catch (IOException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } catch (ClassNotFoundException e) { e.printStackTrace(); } } }; thread.setUncaughtExceptionHandler(h); threads.add(thread); } long t1 = System.currentTimeMillis(); do { // Ensure that there is at least MaxConcurrentThreads running int i = 0; while (i < concurrency && i < threads.size()) { Thread.State state = threads.elementAt(i).getState(); if (state == Thread.State.TERMINATED) { // Thread already terminated, remove from the queue threads.remove(i); } else if (state == Thread.State.NEW) { // Start the thread and move to next one threads.elementAt(i++).start(); } else { // Thread is still running, skip over it i++; } } if (!threads.isEmpty()) { try { // Sleep for 10 seconds or until the first thread terminates threads.firstElement().join(10000); } catch (InterruptedException e) { e.printStackTrace(); } } } while (!threads.isEmpty()); long t2 = System.currentTimeMillis(); if (exceptionHappened.get()) throw new RuntimeException("Not all jobs finished correctly"); System.out.println("Time for " + queryPoints.length + " jobs is " + (t2 - t1) + " millis"); System.out.println("Total iterations: " + TotalIterations); }
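The example above uses next() to skip ahead to a randomly chosen partition of the global index (while (i_block-- >= 0) iterator.next(); followed by one more iterator.next()). Note that this advances the iterator i_block + 2 positions in total, so it can throw NoSuchElementException when the last block index is drawn. Below is a minimal sketch of the same skip-ahead idiom with the count made explicit, using a plain list of placeholder partition names in place of GlobalIndex<Partition>.

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

public class SkipToRandomElement {
    public static void main(String[] args) {
        List<String> partitions = Arrays.asList("p0", "p1", "p2", "p3");
        Random random = new Random();
        int target = random.nextInt(partitions.size()); // index of the element to fetch
        Iterator<String> it = partitions.iterator();
        // Skip the first 'target' elements ...
        for (int skipped = 0; skipped < target; skipped++) {
            it.next();
        }
        // ... so this call returns the element at position 'target'.
        String chosen = it.next();
        System.out.println("chosen partition: " + chosen);
    }
}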
From source file:edu.umn.cs.sthadoop.trajectory.KNNDTW.java
public static void main(String[] args) throws IOException { args = new String[10]; args[0] = "/export/scratch/mntgData/geolifeGPS/geolife_Trajectories_1.3/HDFS/index_geolife/yyyy-MM/2008-05"; args[1] = "/export/scratch/mntgData/geolifeGPS/geolife_Trajectories_1.3/HDFS/knn-dis-result"; args[2] = "shape:edu.umn.cs.sthadoop.trajectory.GeolifeTrajectory"; args[3] = "interval:2008-05-01,2008-05-30"; args[4] = "time:month"; args[5] = "k:1"; args[6] = "traj:39.9119983,116.606835;39.9119783,116.6065483;39.9119599,116.6062649;39.9119416,116.6059899;39.9119233,116.6057282;39.9118999,116.6054783;39.9118849,116.6052366;39.9118666,116.6050099;39.91185,116.604775;39.9118299,116.604525;39.9118049,116.6042649;39.91177,116.6040166;39.9117516,116.6037583;39.9117349,116.6035066;39.9117199,116.6032666;39.9117083,116.6030232;39.9117,116.6027566;39.91128,116.5969383;39.9112583,116.5966766;39.9112383,116.5964232;39.9112149,116.5961699;39.9111933,116.5959249;39.9111716,116.5956883"; args[7] = "-overwrite"; args[8] = "-local";// "-no-local"; args[9] = "point:39.9119983,116.606835"; final OperationsParams params = new OperationsParams(new GenericOptionsParser(args)); Path[] paths = params.getPaths(); if (paths.length <= 1 && !params.checkInput()) { printUsage();// w w w .ja va2 s.co m System.exit(1); } if (paths.length > 1 && !params.checkInputOutput()) { printUsage(); System.exit(1); } final Path inputFile = params.getInputPath(); int count = params.getInt("count", 1); double closeness = params.getFloat("closeness", -1.0f); final Point[] queryPoints = closeness < 0 ? params.getShapes("point", new Point()) : new Point[count]; final FileSystem fs = inputFile.getFileSystem(params); final int k = params.getInt("k", 1); int concurrency = params.getInt("concurrency", 100); if (k == 0) { LOG.warn("k = 0"); } if (queryPoints.length == 0) { printUsage(); throw new RuntimeException("Illegal arguments"); } final Path outputPath = paths.length > 1 ? paths[1] : null; if (closeness >= 0) { // Get query points according to its closeness to grid intersections GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inputFile); long seed = params.getLong("seed", System.currentTimeMillis()); Random random = new Random(seed); for (int i = 0; i < count; i++) { int i_block = random.nextInt(gindex.size()); int direction = random.nextInt(4); // Generate a point in the given direction // Get center point (x, y) Iterator<Partition> iterator = gindex.iterator(); while (i_block-- >= 0) iterator.next(); Partition partition = iterator.next(); double cx = (partition.x1 + partition.x2) / 2; double cy = (partition.y1 + partition.y2) / 2; double cw = partition.x2 - partition.x1; double ch = partition.y2 - partition.y1; int signx = ((direction & 1) == 0) ? 1 : -1; int signy = ((direction & 2) == 1) ? 
1 : -1; double x = cx + cw * closeness / 2 * signx; double y = cy + ch * closeness / 2 * signy; queryPoints[i] = new Point(x, y); } } final BooleanWritable exceptionHappened = new BooleanWritable(); Thread.UncaughtExceptionHandler h = new Thread.UncaughtExceptionHandler() { public void uncaughtException(Thread th, Throwable ex) { ex.printStackTrace(); exceptionHappened.set(true); } }; // Run each query in a separate thread final Vector<Thread> threads = new Vector<Thread>(); for (int i = 0; i < queryPoints.length; i++) { Thread thread = new Thread() { @Override public void run() { try { Point query_point = queryPoints[threads.indexOf(this)]; OperationsParams newParams = new OperationsParams(params); OperationsParams.setShape(newParams, "point", query_point); Job job = knn(inputFile, outputPath, params); } catch (IOException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } catch (ClassNotFoundException e) { e.printStackTrace(); } } }; thread.setUncaughtExceptionHandler(h); threads.add(thread); } long t1 = System.currentTimeMillis(); do { // Ensure that there is at least MaxConcurrentThreads running int i = 0; while (i < concurrency && i < threads.size()) { Thread.State state = threads.elementAt(i).getState(); if (state == Thread.State.TERMINATED) { // Thread already terminated, remove from the queue threads.remove(i); } else if (state == Thread.State.NEW) { // Start the thread and move to next one threads.elementAt(i++).start(); } else { // Thread is still running, skip over it i++; } } if (!threads.isEmpty()) { try { // Sleep for 10 seconds or until the first thread terminates threads.firstElement().join(10000); } catch (InterruptedException e) { e.printStackTrace(); } } } while (!threads.isEmpty()); long t2 = System.currentTimeMillis(); if (exceptionHappened.get()) throw new RuntimeException("Not all jobs finished correctly"); System.out.println("Time for " + queryPoints.length + " jobs is " + (t2 - t1) + " millis"); System.out.println("Total iterations: " + TotalIterations); }
From source file:org.ala.dao.CassandraPelopsHelper.java
public static void main(String[] args) throws Exception { CassandraPelopsHelper helper = new CassandraPelopsHelper(); helper.init();//from ww w .j a v a 2s. c o m Map<String, Object> map = helper.getSubColumnsByGuid("tc", "103067807"); Set<String> keys = map.keySet(); Iterator<String> it = keys.iterator(); while (it.hasNext()) { String key = it.next(); ColumnType type = ColumnType.getColumnType(key); Object o = map.get(type.getColumnName()); if (o instanceof List) { List l = (List) o; } else { Comparable c = (Comparable) o; } } /* TaxonConcept t = null; List<Comparable> l = new ArrayList<Comparable>(); for(int i=0; i< 10; i++){ t = new TaxonConcept(); t.setId(i); t.setGuid("urn:lsid:"+i); t.setNameString("Aus bus"); t.setAuthor("Smith"); t.setAuthorYear("2008"); t.setInfoSourceName("AFD"); t.setInfoSourceURL("http://afd.org.au"); helper.putSingle("taxonConcept", "tc", "taxonConcept", t.getGuid(), t); l.add(t); if(i % 1000==0){ System.out.println("id: "+i); } } helper.putList("taxonConcept", "tc", "taxonConcept", "128", l, true); CommonName c1 = new CommonName(); c1.setNameString("Dave"); CommonName c2 = new CommonName(); c2.setNameString("Frank"); helper.putSingle("taxonConcept", "tc", "taxonConcept", "123", t); helper.put("taxonConcept", "tc", "commonName", "123", c1); helper.put("taxonConcept", "tc", "commonName", "123", c2); helper.putSingle("taxonConcept", "tc", "taxonConcept", "124", t); TaxonConcept tc = (TaxonConcept) helper.get("taxonConcept", "tc", "taxonConcept", "123", TaxonConcept.class); System.out.println("Retrieved: "+tc.getNameString()); List<CommonName> cns = (List) helper.getList("taxonConcept", "tc", "commonName", "123", CommonName.class); System.out.println("Retrieved: "+cns); */ //cassandra scanning Scanner scanner = helper.getScanner("taxonConcept", "tc", "taxonConcept"); for (int i = 0; i < 10; i++) { System.out.println(new String(scanner.getNextGuid())); } System.exit(0); }
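Here next() drives a pass over the key set of the map returned by getSubColumnsByGuid. A stripped-down sketch of that key-set loop with placeholder keys and values follows; unlike the original, it looks each value up by the key that next() just returned (the original resolves the value via ColumnType), and for a HashMap the iteration order is unspecified.

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

public class KeySetIteration {
    public static void main(String[] args) {
        Map<String, Object> map = new HashMap<String, Object>();
        map.put("scientificName", "Aus bus");
        map.put("author", "Smith");
        Iterator<String> it = map.keySet().iterator();
        while (it.hasNext()) {
            String key = it.next();       // next key in the set
            Object value = map.get(key);  // look the value up by the key just returned
            System.out.println(key + " -> " + value);
        }
    }
}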
From source file:edu.nyu.vida.data_polygamy.relationship_computation.Relationship.java
/** * @param args/*from w w w . j a v a2 s . c o m*/ * @throws ParseException */ @SuppressWarnings({ "deprecation" }) public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Options options = new Options(); Option forceOption = new Option("f", "force", false, "force the computation of the relationship " + "even if files already exist"); forceOption.setRequired(false); options.addOption(forceOption); Option scoreOption = new Option("sc", "score", true, "set threhsold for relationship score"); scoreOption.setRequired(false); scoreOption.setArgName("SCORE THRESHOLD"); options.addOption(scoreOption); Option strengthOption = new Option("st", "strength", true, "set threhsold for relationship strength"); strengthOption.setRequired(false); strengthOption.setArgName("STRENGTH THRESHOLD"); options.addOption(strengthOption); Option completeRandomizationOption = new Option("c", "complete-randomization", false, "use complete randomization when performing significance tests"); completeRandomizationOption.setRequired(false); options.addOption(completeRandomizationOption); Option idOption = new Option("id", "ids", false, "output id instead of names for datasets and attributes"); idOption.setRequired(false); options.addOption(idOption); Option g1Option = new Option("g1", "first-group", true, "set first group of datasets"); g1Option.setRequired(true); g1Option.setArgName("FIRST GROUP"); g1Option.setArgs(Option.UNLIMITED_VALUES); options.addOption(g1Option); Option g2Option = new Option("g2", "second-group", true, "set second group of datasets"); g2Option.setRequired(false); g2Option.setArgName("SECOND GROUP"); g2Option.setArgs(Option.UNLIMITED_VALUES); options.addOption(g2Option); Option machineOption = new Option("m", "machine", true, "machine identifier"); machineOption.setRequired(true); machineOption.setArgName("MACHINE"); machineOption.setArgs(1); options.addOption(machineOption); Option nodesOption = new Option("n", "nodes", true, "number of nodes"); nodesOption.setRequired(true); nodesOption.setArgName("NODES"); nodesOption.setArgs(1); options.addOption(nodesOption); Option s3Option = new Option("s3", "s3", false, "data on Amazon S3"); s3Option.setRequired(false); options.addOption(s3Option); Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true, "aws access key id; " + "this is required if the execution is on aws"); awsAccessKeyIdOption.setRequired(false); awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID"); awsAccessKeyIdOption.setArgs(1); options.addOption(awsAccessKeyIdOption); Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true, "aws secrect access key; " + "this is required if the execution is on aws"); awsSecretAccessKeyOption.setRequired(false); awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY"); awsSecretAccessKeyOption.setArgs(1); options.addOption(awsSecretAccessKeyOption); Option bucketOption = new Option("b", "s3-bucket", true, "bucket on s3; " + "this is required if the execution is on aws"); bucketOption.setRequired(false); bucketOption.setArgName("S3-BUCKET"); bucketOption.setArgs(1); options.addOption(bucketOption); Option helpOption = new Option("h", "help", false, "display this message"); helpOption.setRequired(false); options.addOption(helpOption); Option removeOption = new Option("r", "remove-not-significant", false, "remove relationships that are not" + "significant from the final output"); removeOption.setRequired(false); options.addOption(removeOption); HelpFormatter formatter = new 
HelpFormatter(); CommandLineParser parser = new PosixParser(); CommandLine cmd = null; try { cmd = parser.parse(options, args); } catch (ParseException e) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true); System.exit(0); } if (cmd.hasOption("h")) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true); System.exit(0); } boolean s3 = cmd.hasOption("s3"); String s3bucket = ""; String awsAccessKeyId = ""; String awsSecretAccessKey = ""; if (s3) { if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) { System.out.println( "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS."); formatter.printHelp( "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.relationship_computation.Relationship", options, true); System.exit(0); } s3bucket = cmd.getOptionValue("b"); awsAccessKeyId = cmd.getOptionValue("aws_id"); awsSecretAccessKey = cmd.getOptionValue("aws_key"); } boolean snappyCompression = false; boolean bzip2Compression = false; String machine = cmd.getOptionValue("m"); int nbNodes = Integer.parseInt(cmd.getOptionValue("n")); Configuration s3conf = new Configuration(); if (s3) { s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); s3conf.set("bucket", s3bucket); } Path path = null; FileSystem fs = FileSystem.get(new Configuration()); ArrayList<String> shortDataset = new ArrayList<String>(); ArrayList<String> firstGroup = new ArrayList<String>(); ArrayList<String> secondGroup = new ArrayList<String>(); HashMap<String, String> datasetAgg = new HashMap<String, String>(); boolean removeNotSignificant = cmd.hasOption("r"); boolean removeExistingFiles = cmd.hasOption("f"); boolean completeRandomization = cmd.hasOption("c"); boolean hasScoreThreshold = cmd.hasOption("sc"); boolean hasStrengthThreshold = cmd.hasOption("st"); boolean outputIds = cmd.hasOption("id"); String scoreThreshold = hasScoreThreshold ? cmd.getOptionValue("sc") : ""; String strengthThreshold = hasStrengthThreshold ? cmd.getOptionValue("st") : ""; // all datasets ArrayList<String> all_datasets = new ArrayList<String>(); if (s3) { path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir); } BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path))); String line = br.readLine(); while (line != null) { all_datasets.add(line.split("\t")[0]); line = br.readLine(); } br.close(); if (s3) fs.close(); String[] all_datasets_array = new String[all_datasets.size()]; all_datasets.toArray(all_datasets_array); String[] firstGroupCmd = cmd.getOptionValues("g1"); String[] secondGroupCmd = cmd.hasOption("g2") ? 
cmd.getOptionValues("g2") : all_datasets_array; addDatasets(firstGroupCmd, firstGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket); addDatasets(secondGroupCmd, secondGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket); if (shortDataset.size() == 0) { System.out.println("No datasets to process."); System.exit(0); } if (firstGroup.isEmpty()) { System.out.println("No indices from datasets in G1."); System.exit(0); } if (secondGroup.isEmpty()) { System.out.println("No indices from datasets in G2."); System.exit(0); } // getting dataset ids String datasetNames = ""; String datasetIds = ""; HashMap<String, String> datasetId = new HashMap<String, String>(); Iterator<String> it = shortDataset.iterator(); while (it.hasNext()) { datasetId.put(it.next(), null); } if (s3) { path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir); } br = new BufferedReader(new InputStreamReader(fs.open(path))); line = br.readLine(); while (line != null) { String[] dt = line.split("\t"); all_datasets.add(dt[0]); if (datasetId.containsKey(dt[0])) { datasetId.put(dt[0], dt[1]); datasetNames += dt[0] + ","; datasetIds += dt[1] + ","; } line = br.readLine(); } br.close(); if (s3) fs.close(); datasetNames = datasetNames.substring(0, datasetNames.length() - 1); datasetIds = datasetIds.substring(0, datasetIds.length() - 1); it = shortDataset.iterator(); while (it.hasNext()) { String dataset = it.next(); if (datasetId.get(dataset) == null) { System.out.println("No dataset id for " + dataset); System.exit(0); } } String firstGroupStr = ""; String secondGroupStr = ""; for (String dataset : firstGroup) { firstGroupStr += datasetId.get(dataset) + ","; } for (String dataset : secondGroup) { secondGroupStr += datasetId.get(dataset) + ","; } firstGroupStr = firstGroupStr.substring(0, firstGroupStr.length() - 1); secondGroupStr = secondGroupStr.substring(0, secondGroupStr.length() - 1); String relationshipsDir = ""; if (outputIds) { relationshipsDir = FrameworkUtils.relationshipsIdsDir; } else { relationshipsDir = FrameworkUtils.relationshipsDir; } FrameworkUtils.createDir(s3bucket + relationshipsDir, s3conf, s3); String random = completeRandomization ? 
"complete" : "restricted"; String indexInputDirs = ""; String noRelationship = ""; HashSet<String> dirs = new HashSet<String>(); String dataset1; String dataset2; String datasetId1; String datasetId2; for (int i = 0; i < firstGroup.size(); i++) { for (int j = 0; j < secondGroup.size(); j++) { if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer .parseInt(datasetId.get(secondGroup.get(j)))) { dataset1 = firstGroup.get(i); dataset2 = secondGroup.get(j); } else { dataset1 = secondGroup.get(j); dataset2 = firstGroup.get(i); } datasetId1 = datasetId.get(dataset1); datasetId2 = datasetId.get(dataset2); if (dataset1.equals(dataset2)) continue; String correlationOutputFileName = s3bucket + relationshipsDir + "/" + dataset1 + "-" + dataset2 + "/"; if (removeExistingFiles) { FrameworkUtils.removeFile(correlationOutputFileName, s3conf, s3); } if (!FrameworkUtils.fileExists(correlationOutputFileName, s3conf, s3)) { dirs.add(s3bucket + FrameworkUtils.indexDir + "/" + dataset1); dirs.add(s3bucket + FrameworkUtils.indexDir + "/" + dataset2); } else { noRelationship += datasetId1 + "-" + datasetId2 + ","; } } } if (dirs.isEmpty()) { System.out.println("All the relationships were already computed."); System.out.println("Use -f in the beginning of the command line to force the computation."); System.exit(0); } for (String dir : dirs) { indexInputDirs += dir + ","; } Configuration conf = new Configuration(); Machine machineConf = new Machine(machine, nbNodes); String jobName = "relationship" + "-" + random; String relationshipOutputDir = s3bucket + relationshipsDir + "/tmp/"; FrameworkUtils.removeFile(relationshipOutputDir, s3conf, s3); for (int i = 0; i < shortDataset.size(); i++) { conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg", datasetAgg.get(shortDataset.get(i))); } for (int i = 0; i < shortDataset.size(); i++) { conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg-size", Integer.toString(datasetAgg.get(shortDataset.get(i)).split(",").length)); } conf.set("dataset-keys", datasetIds); conf.set("dataset-names", datasetNames); conf.set("first-group", firstGroupStr); conf.set("second-group", secondGroupStr); conf.set("complete-random", String.valueOf(completeRandomization)); conf.set("output-ids", String.valueOf(outputIds)); conf.set("complete-random-str", random); conf.set("main-dataset-id", datasetId.get(shortDataset.get(0))); conf.set("remove-not-significant", String.valueOf(removeNotSignificant)); if (noRelationship.length() > 0) { conf.set("no-relationship", noRelationship.substring(0, noRelationship.length() - 1)); } if (hasScoreThreshold) { conf.set("score-threshold", scoreThreshold); } if (hasStrengthThreshold) { conf.set("strength-threshold", strengthThreshold); } conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1"); conf.set("mapreduce.reduce.shuffle.parallelcopies", "20"); conf.set("mapreduce.input.fileinputformat.split.minsize", "0"); conf.set("mapreduce.task.io.sort.mb", "200"); conf.set("mapreduce.task.io.sort.factor", "100"); conf.set("mapreduce.task.timeout", "2400000"); if (s3) { machineConf.setMachineConfiguration(conf); conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); conf.set("bucket", s3bucket); } if (snappyCompression) { conf.set("mapreduce.map.output.compress", "true"); 
conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); } if (bzip2Compression) { conf.set("mapreduce.map.output.compress", "true"); conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); } Job job = new Job(conf); job.setJobName(jobName); job.setMapOutputKeyClass(PairAttributeWritable.class); job.setMapOutputValueClass(TopologyTimeSeriesWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setMapperClass(CorrelationMapper.class); job.setReducerClass(CorrelationReducer.class); job.setNumReduceTasks(machineConf.getNumberReduces()); job.setInputFormatClass(SequenceFileInputFormat.class); //job.setOutputFormatClass(TextOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); FileInputFormat.setInputDirRecursive(job, true); FileInputFormat.setInputPaths(job, indexInputDirs.substring(0, indexInputDirs.length() - 1)); FileOutputFormat.setOutputPath(job, new Path(relationshipOutputDir)); job.setJarByClass(Relationship.class); long start = System.currentTimeMillis(); job.submit(); job.waitForCompletion(true); System.out.println(jobName + "\t" + (System.currentTimeMillis() - start)); // moving files to right place for (int i = 0; i < firstGroup.size(); i++) { for (int j = 0; j < secondGroup.size(); j++) { if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer .parseInt(datasetId.get(secondGroup.get(j)))) { dataset1 = firstGroup.get(i); dataset2 = secondGroup.get(j); } else { dataset1 = secondGroup.get(j); dataset2 = firstGroup.get(i); } if (dataset1.equals(dataset2)) continue; String from = s3bucket + relationshipsDir + "/tmp/" + dataset1 + "-" + dataset2 + "/"; String to = s3bucket + relationshipsDir + "/" + dataset1 + "-" + dataset2 + "/"; FrameworkUtils.renameFile(from, to, s3conf, s3); } } }
From source file:com.amalto.workbench.utils.XSDParser.java
/** * Print a simple type definition for the document. * /* ww w . j a v a 2s . co m*/ * @param xsdSimpleTypeDefinition a simple type definition in the schema for schema. */ /* * public void printSimpleTypeDefinition( XSDSimpleTypeDefinition xsdSimpleTypeDefinition) { if * (xsdSimpleTypeDefinition == null) { } else if (xsdSimpleTypeDefinition.getEffectiveEnumerationFacet() != null) { * List value = xsdSimpleTypeDefinition.getEffectiveEnumerationFacet() .getValue(); if (value.size() > 1) { * System.out.print("("); } for (Iterator enumerators = value.iterator(); enumerators.hasNext();) { String * enumerator = enumerators.next().toString(); System.out.print("<em>"); System.out.print(enumerator); * System.out.print("</em>"); if (enumerators.hasNext()) { System.out.print(" | "); } } if (value.size() > * 1) { System.out.print(")"); } } else if (xsdSimpleTypeDefinition.getElement() != null && * xsdSimpleTypeDefinition.getElement().hasAttribute( XSDConstants.ID_ATTRIBUTE)) { System.out.print("<a href='#" + * xsdSimpleTypeDefinition.getName() + "-simple-type'>"); System.out.print(xsdSimpleTypeDefinition.getName()); * System.out.print("</a>"); } else if (XSDVariety.UNION_LITERAL == xsdSimpleTypeDefinition .getVariety()) { * System.out.print("("); for (Iterator members = xsdSimpleTypeDefinition .getMemberTypeDefinitions().iterator(); * members.hasNext();) { XSDSimpleTypeDefinition memberTypeDefinition = (XSDSimpleTypeDefinition) members .next(); * printSimpleTypeDefinition(memberTypeDefinition); if (members.hasNext()) { System.out.print(" | "); } } * System.out.print(")"); } else if (XSDVariety.UNION_LITERAL == xsdSimpleTypeDefinition .getVariety()) { * System.out.print("List of "); printSimpleTypeDefinition(xsdSimpleTypeDefinition * .getItemTypeDefinition()); } else if (xsdSimpleTypeDefinition.getName() != null) { if * ("public".equals(xsdSimpleTypeDefinition.getName())) { System.out.print("<a target='Part2' href='" + * XSDConstants.PART2 + "#anyURI'>anyURI</a> "); System.out.print("<a target='Errata' href='" + errata + * "#pfipublic'><em>public</em></a>"); } else { System.out.print("<b><em>"); * System.out.print(xsdSimpleTypeDefinition.getName()); System.out.print("</em></b>"); } } else if * (xsdSimpleTypeDefinition.getEffectivePatternFacet() != null) { // * System.out.print(xsdSimpleTypeDefinition.getEffectivePatternFacet().getLexicalValue()); * * System.out.print("<em>"); System.out.print("<a target='Part1' href='" + XSDConstants.PART1 + * "#coss-identity-constraint'>"); System.out.print("a restricted xpath expression"); System.out.print("</a>"); * System.out.print("</em>"); } else { System.out.print("***"); } } */ public static void main(String args[]) { try { /* * String xsd = ""; FileInputStream fis = new FileInputStream( * "/home/bgrieder/workspace/XCBL35/XCBL35.xsd"); BufferedReader br = new BufferedReader(new * InputStreamReader(fis, "utf-8")); String line; while ((line = br.readLine()) != null) xsd += line + "\n"; * * XSDParser parser = new XSDParser(); parser.loadAndPrint(xsd); */ FileWriter fw = new FileWriter("/tmp/xcb35sr.xsd"); //$NON-NLS-1$ Resource.Factory.Registry.INSTANCE.getExtensionToFactoryMap().put("xsd", new XSDResourceFactoryImpl()); //$NON-NLS-1$ String xsdFile = "/home/bgrieder/workspace/XCBL35/XCBL35.xsd"; //$NON-NLS-1$ ResourceSet resourceSet = new ResourceSetImpl(); XSDResourceImpl xsdResource = (XSDResourceImpl) resourceSet.getResource(URI.createFileURI(xsdFile), true); /* * XSDResourceImpl res = new XSDResourceImpl(URI.createFileURI(xsdFile)); */ XSDSchema xsdSchema = 
xsdResource.getSchema(); String header = "<xsd:schema " + "elementFormDefault=\"qualified\" " //$NON-NLS-1$ //$NON-NLS-2$ + "targetNamespace=\"rrn:org.xcbl:schemas/xcbl/v3_5/xcbl35.xsd\" " //$NON-NLS-1$ + "xmlns=\"rrn:org.xcbl:schemas/xcbl/v3_5/xcbl35.xsd\" " //$NON-NLS-1$ + "xmlns:xsd=\"http://www.w3.org/2001/XMLSchema\">"; //$NON-NLS-1$ fw.write(header); Iterator it = xsdSchema.getElementDeclarations().iterator(); for (; it.hasNext();) { XSDElementDeclaration elementDeclaration = (XSDElementDeclaration) it.next(); // if ("Order".equals(elementDeclaration.getName())) { fw.write(Util.nodeToString(elementDeclaration.getElement()) .replaceAll("xmlns:xsd=\"http:\\/\\/www\\.w3\\.org\\/2001\\/XMLSchema\"", "")); //$NON-NLS-1$ //$NON-NLS-2$ // } } it = xsdSchema.getTypeDefinitions().iterator(); for (; it.hasNext();) { XSDTypeDefinition typedef = (XSDTypeDefinition) it.next(); fw.write(Util.nodeToString(typedef.getElement())); } String footer = "</xsd:schema>"; //$NON-NLS-1$ fw.write(footer); fw.close(); } catch (Exception e) { log.error(e.getMessage(), e); } }
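This example calls next() on a raw, pre-generics Iterator inside a for header and casts every element to XSDElementDeclaration or XSDTypeDefinition. The same pattern sketched with plain JDK types, so it runs without the EMF/XSD dependencies; the element names are placeholders.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class RawIteratorCast {
    public static void main(String[] args) {
        List elements = new ArrayList();          // raw type, as in the original; expect unchecked warnings
        elements.add("Order");
        elements.add("Invoice");
        for (Iterator it = elements.iterator(); it.hasNext();) {
            String name = (String) it.next();     // a raw next() returns Object, so a cast is required
            System.out.println("element declaration: " + name);
        }
    }
}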
From source file:edu.umn.cs.sthadoop.operations.HSPKNNQ.java
public static void main(String[] args) throws IOException { //./hadoop jar /export/scratch/louai/idea-stHadoop/st-hadoop-uber.jar pknn /mntgIndex/yyyy-MM-dd/2017-08-03 /pknn k:2 point:-78.9659,35.7998 shape:edu.umn.cs.sthadoop.mntg.STPointMntg -overwrite // args = new String[8]; // args[0] = "/export/scratch/mntgData/mntgIndex"; // args[1] = "/export/scratch/mntgData/pknn"; // args[2] = "-overwrite"; // args[3] = "k:10"; // args[4] = "point:-78.9659063204100,35.7903907684998"; // args[5] = "shape:edu.umn.cs.sthadoop.trajectory.STPointTrajectory"; // args[6] = "interval:2017-08-03,2017-08-04"; // args[7] = "-overwrite"; final OperationsParams params = new OperationsParams(new GenericOptionsParser(args)); Path[] paths = params.getPaths(); if (paths.length <= 1 && !params.checkInput()) { printUsage();// w w w . j av a2 s . c om System.exit(1); } if (paths.length > 1 && !params.checkInputOutput()) { printUsage(); System.exit(1); } if (params.get("interval") == null) { System.err.println("Temporal range missing"); printUsage(); System.exit(1); } TextSerializable inObj = params.getShape("shape"); if (!(inObj instanceof STPoint)) { if (!(inObj instanceof STRectangle)) { LOG.error("Shape is not instance of STPoint or instance of STRectangle"); printUsage(); System.exit(1); } } // path to the spatio-temporal index. List<Path> STPaths = new ArrayList<Path>(); try { STPaths = STRangeQuery.getIndexedSlices(params); } catch (Exception e1) { // TODO Auto-generated catch block e1.printStackTrace(); } for (Path input : STPaths) { final Path inputFile = input; int count = params.getInt("count", 1); double closeness = params.getFloat("closeness", -1.0f); final Point[] queryPoints = closeness < 0 ? params.getShapes("point", new Point()) : new Point[count]; final FileSystem fs = inputFile.getFileSystem(params); final int k = params.getInt("k", 1); int concurrency = params.getInt("concurrency", 100); if (k == 0) { LOG.warn("k = 0"); } if (queryPoints.length == 0) { printUsage(); throw new RuntimeException("Illegal arguments"); } final Path outputPath = paths.length > 1 ? new Path(paths[1].toUri() + "-" + input.getName()) : null; if (closeness >= 0) { // Get query points according to its closeness to grid intersections GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inputFile); long seed = params.getLong("seed", System.currentTimeMillis()); Random random = new Random(seed); for (int i = 0; i < count; i++) { int i_block = random.nextInt(gindex.size()); int direction = random.nextInt(4); // Generate a point in the given direction // Get center point (x, y) Iterator<Partition> iterator = gindex.iterator(); while (i_block-- >= 0) iterator.next(); Partition partition = iterator.next(); double cx = (partition.x1 + partition.x2) / 2; double cy = (partition.y1 + partition.y2) / 2; double cw = partition.x2 - partition.x1; double ch = partition.y2 - partition.y1; int signx = ((direction & 1) == 0) ? 1 : -1; int signy = ((direction & 2) == 1) ? 
1 : -1; double x = cx + cw * closeness / 2 * signx; double y = cy + ch * closeness / 2 * signy; queryPoints[i] = new Point(x, y); } } final BooleanWritable exceptionHappened = new BooleanWritable(); Thread.UncaughtExceptionHandler h = new Thread.UncaughtExceptionHandler() { public void uncaughtException(Thread th, Throwable ex) { ex.printStackTrace(); exceptionHappened.set(true); } }; // Run each query in a separate thread final Vector<Thread> threads = new Vector<Thread>(); for (int i = 0; i < queryPoints.length; i++) { Thread thread = new Thread() { @Override public void run() { try { Point query_point = queryPoints[threads.indexOf(this)]; OperationsParams newParams = new OperationsParams(params); OperationsParams.setShape(newParams, "point", query_point); Job job = knn(inputFile, outputPath, params); } catch (IOException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } catch (ClassNotFoundException e) { e.printStackTrace(); } } }; thread.setUncaughtExceptionHandler(h); threads.add(thread); } long t1 = System.currentTimeMillis(); do { // Ensure that there is at least MaxConcurrentThreads running int i = 0; while (i < concurrency && i < threads.size()) { Thread.State state = threads.elementAt(i).getState(); if (state == Thread.State.TERMINATED) { // Thread already terminated, remove from the queue threads.remove(i); } else if (state == Thread.State.NEW) { // Start the thread and move to next one threads.elementAt(i++).start(); } else { // Thread is still running, skip over it i++; } } if (!threads.isEmpty()) { try { // Sleep for 10 seconds or until the first thread terminates threads.firstElement().join(10000); } catch (InterruptedException e) { e.printStackTrace(); } } } while (!threads.isEmpty()); long t2 = System.currentTimeMillis(); if (exceptionHappened.get()) throw new RuntimeException("Not all jobs finished correctly"); System.out.println("Time for " + queryPoints.length + " jobs is " + (t2 - t1) + " millis"); System.out.println("Total iterations: " + TotalIterations); } }
From source file:net.iiit.siel.analysis.lang.LanguageIdentifier.java
/** * The main method./*from w w w. j a v a2 s .c om*/ * * @param args the arguments */ public static void main(String args[]) { String usage = "Usage: LanguageIdentifier " + "[-identifyrows filename maxlines] " + "[-identifyfile charset filename] " + "[-identifyfileset charset files] " + "[-identifytext text] " + "[-identifyurl url]"; int command = 0; final int IDFILE = 1; final int IDTEXT = 2; final int IDURL = 3; final int IDFILESET = 4; final int IDROWS = 5; Vector fileset = new Vector(); String filename = ""; String charset = ""; String url = ""; String text = ""; int max = 0; // TODO niket writing test args here.. /* args = new String[2]; args[0] = "-identifyurl"; args[1] = "file:/home1/niket/TamilSamplePage.html"; //args[2] = "/home1/niket/nutch-clia/input.txt"; */ // TODO niket end here if (args.length == 0) { System.err.println(usage); System.exit(-1); } for (int i = 0; i < args.length; i++) { // parse command line if (args[i].equals("-identifyfile")) { command = IDFILE; charset = args[++i]; filename = args[++i]; } if (args[i].equals("-identifyurl")) { command = IDURL; filename = args[++i]; } if (args[i].equals("-identifyrows")) { command = IDROWS; filename = args[++i]; max = Integer.parseInt(args[++i]); } if (args[i].equals("-identifytext")) { command = IDTEXT; for (i++; i < args.length - 1; i++) text += args[i] + " "; } if (args[i].equals("-identifyfileset")) { command = IDFILESET; charset = args[++i]; for (i++; i < args.length; i++) { File[] files = null; File f = new File(args[i]); if (f.isDirectory()) { files = f.listFiles(); } else { files = new File[] { f }; } for (int j = 0; j < files.length; j++) { fileset.add(files[j].getAbsolutePath()); } } } } Configuration conf = NutchConfiguration.create(); String lang = null; LanguageIdentifier idfr = new LanguageIdentifier(conf); File f; FileInputStream fis; try { switch (command) { case IDTEXT: lang = idfr.identify(text); System.out.println("Lang :" + lang); break; case IDFILE: f = new File(filename); fis = new FileInputStream(f); lang = idfr.identify(fis, charset); fis.close(); break; case IDURL: lang = LangIdentifierUtility.IdentifyLangFromURLDirectly(filename); /* * our url identifier is confused or couldn't identify lang from * URL */ if (lang == null || lang.equalsIgnoreCase("en")) { System.out.println("Ambuguity in identifying language from URL"); } else { System.out.println("Lang was identified(using URL) as: " + lang); } break; case IDROWS: f = new File(filename); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f))); String line; while (max > 0 && (line = br.readLine()) != null) { line = line.trim(); if (line.length() > 2) { max--; lang = idfr.identify(line); System.out.println("R=" + lang + ":" + line); } } br.close(); System.exit(0); break; case IDFILESET: /* * used for benchs for (int j=128; j<=524288; j*=2) { long start * = System.currentTimeMillis(); idfr.analyzeLength = j; */ System.out.println("FILESET"); Iterator i = fileset.iterator(); while (i.hasNext()) { try { filename = (String) i.next(); f = new File(filename); fis = new FileInputStream(f); lang = idfr.identify(fis, charset); fis.close(); } catch (Exception e) { System.out.println(e); } System.out.println(filename + " was identified as " + lang); } /* * used for benchs System.out.println(j + "/" + * (System.currentTimeMillis()-start)); } */ System.exit(0); break; } } catch (Exception e) { System.out.println(e); System.out.println("lang could not be identified properly"); e.printStackTrace(); } System.out.println("text was 
identified as " + lang); /* * DONOT delete the next few lines, they should be enabled, when a lang. * mapping map needs to be generated. TODO this is for printing * the hashMapRangeLangIDTable only * * idfr.langMarkerObject.printHashmapTableWithFormatting(); * * System.out * .println("\n\n\n Printing english text contents in this file:\n"); * System.out.println(idfr.langMarkerObject.getLangCharacters( * LanguageIdentifierConstants.LangShortNames.ENGLISH * .langShortName()).toString()); * * System.out * .println("\n\n\n Printing telugu text contents in this file:\n"); * System.out.println(idfr.langMarkerObject.getLangCharacters( * LanguageIdentifierConstants.LangShortNames.TELUGU * .langShortName()).toString()); * * System.out * .println("\n\n\n Printing unknown text contents in this file:\n"); * System.out.println(idfr.langMarkerObject.getLangCharacters( * LanguageIdentifierConstants.LangShortNames.UNKNOWN_LANG * .langShortName()).toString()); */ }
From source file:grnet.filter.XMLFiltering.java
public static void main(String[] args) throws IOException { // TODO Auto-generated method ssstub Enviroment enviroment = new Enviroment(args[0]); if (enviroment.envCreation) { Core core = new Core(); XMLSource source = new XMLSource(args[0]); File sourceFile = source.getSource(); if (sourceFile.exists()) { Collection<File> xmls = source.getXMLs(); System.out.println("Filtering repository:" + enviroment.dataProviderFilteredIn.getName()); System.out.println("Number of files to filter:" + xmls.size()); Iterator<File> iterator = xmls.iterator(); FilteringReport report = null; if (enviroment.getArguments().getProps().getProperty(Constants.createReport) .equalsIgnoreCase("true")) { report = new FilteringReport(enviroment.getArguments().getDestFolderLocation(), enviroment.getDataProviderFilteredIn().getName()); }//from w ww. j a v a 2 s . co m ConnectionFactory factory = new ConnectionFactory(); factory.setHost(enviroment.getArguments().getQueueHost()); factory.setUsername(enviroment.getArguments().getQueueUserName()); factory.setPassword(enviroment.getArguments().getQueuePassword()); while (iterator.hasNext()) { StringBuffer logString = new StringBuffer(); logString.append(enviroment.dataProviderFilteredIn.getName()); File xmlFile = iterator.next(); String name = xmlFile.getName(); name = name.substring(0, name.indexOf(".xml")); logString.append(" " + name); boolean xmlIsFilteredIn = core.filterXML(xmlFile, enviroment.getArguments().getQueries()); if (xmlIsFilteredIn) { logString.append(" " + "FilteredIn"); slf4jLogger.info(logString.toString()); Connection connection = factory.newConnection(); Channel channel = connection.createChannel(); channel.queueDeclare(QUEUE_NAME, false, false, false, null); channel.basicPublish("", QUEUE_NAME, null, logString.toString().getBytes()); channel.close(); connection.close(); try { if (report != null) { report.appendXMLFileNameNStatus(xmlFile.getPath(), Constants.filteredInData); report.raiseFilteredInFilesNum(); } FileUtils.copyFileToDirectory(xmlFile, enviroment.getDataProviderFilteredIn()); } catch (IOException e) { // TODO Auto-generated catch block // e.printStackTrace(); e.printStackTrace(); System.out.println("Filtering failed."); } } else { logString.append(" " + "FilteredOut"); slf4jLogger.info(logString.toString()); Connection connection = factory.newConnection(); Channel channel = connection.createChannel(); channel.queueDeclare(QUEUE_NAME, false, false, false, null); channel.basicPublish("", QUEUE_NAME, null, logString.toString().getBytes()); channel.close(); connection.close(); try { if (report != null) { report.appendXMLFileNameNStatus(xmlFile.getPath(), Constants.filteredOutData); report.raiseFilteredOutFilesNum(); } FileUtils.copyFileToDirectory(xmlFile, enviroment.getDataProviderFilteredOuT()); } catch (IOException e) { // TODO Auto-generated catch block // e.printStackTrace(); e.printStackTrace(); System.out.println("Filtering failed."); } } } if (report != null) { report.appendXPathExpression(enviroment.getArguments().getQueries()); report.appendGeneralInfo(); } System.out.println("Filtering is done."); } } }
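The filtering loop pulls one File per iteration from the Collection<File> returned by source.getXMLs() via iterator.next(). A stripped-down sketch of that file-by-file loop, keeping only the name handling from the original and replacing the filtering, reporting, and queue publishing with a print statement; the file names are placeholders.

import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;

public class XmlFileLoop {
    public static void main(String[] args) {
        Collection<File> xmls = new ArrayList<File>();
        xmls.add(new File("record1.xml"));
        xmls.add(new File("record2.xml"));
        Iterator<File> iterator = xmls.iterator();
        while (iterator.hasNext()) {
            File xmlFile = iterator.next();
            String name = xmlFile.getName();
            name = name.substring(0, name.indexOf(".xml"));   // strip the extension, as in the original
            System.out.println("filtering " + name);
        }
    }
}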
From source file:Batch.java
static public void main(String[] args) {
    Connection conn = null;
    try {
        ArrayList breakable = new ArrayList();
        PreparedStatement stmt;
        Iterator users;
        ResultSet rs;
        Class.forName(args[0]).newInstance();
        conn = DriverManager.getConnection(args[1], args[2], args[3]);
        stmt = conn.prepareStatement("SELECT user_id, password " + "FROM user");
        rs = stmt.executeQuery();
        while (rs.next()) {
            String uid = rs.getString(1);
            String pw = rs.getString(2);
            // Assume PasswordCracker is some class that provides
            // a single static method called crack() that attempts
            // to run password cracking routines on the password
            // if( PasswordCracker.crack(uid, pw) ) {
            //     breakable.add(uid);
            // }
        }
        stmt.close();
        if (breakable.size() < 1) {
            return;
        }
        stmt = conn.prepareStatement("UPDATE user " + "SET bad_password = 'Y' " + "WHERE uid = ?");
        users = breakable.iterator();
        while (users.hasNext()) {
            String uid = (String) users.next();
            stmt.setString(1, uid);
            stmt.addBatch();
        }
        stmt.executeBatch();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (conn != null) {
            try {
                conn.close();
            } catch (Exception e) {
            }
        }
    }
}
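The second while loop above is the Iterator.next() usage of interest: each user id returned by next() is bound to the PreparedStatement and queued with addBatch(). A minimal sketch of that batching idiom as a standalone helper; the table and column names mirror the example, and the Connection is assumed to be opened and closed by the caller.

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Iterator;
import java.util.List;

public class BatchUpdate {
    static void flagBadPasswords(Connection conn, List<String> breakable) throws SQLException {
        PreparedStatement stmt = conn.prepareStatement(
                "UPDATE user SET bad_password = 'Y' WHERE uid = ?");
        try {
            Iterator<String> users = breakable.iterator();
            while (users.hasNext()) {
                stmt.setString(1, users.next());  // bind the id returned by next()
                stmt.addBatch();                  // queue the update instead of executing it immediately
            }
            stmt.executeBatch();                  // send all queued updates in one round trip
        } finally {
            stmt.close();
        }
    }
}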
From source file:de.unileipzig.ub.indexer.App.java
public static void main(String[] args) throws IOException { // create Options object Options options = new Options(); options.addOption("h", "help", false, "display this help"); options.addOption("f", "filename", true, "name of the JSON file whose content should be indexed"); options.addOption("i", "index", true, "the name of the target index"); options.addOption("d", "doctype", true, "the name of the doctype (title, local, ...)"); options.addOption("t", "host", true, "elasticsearch hostname (default: 0.0.0.0)"); options.addOption("p", "port", true, "transport port (that's NOT the http port, default: 9300)"); options.addOption("c", "cluster", true, "cluster name (default: elasticsearch_mdma)"); options.addOption("b", "bulksize", true, "number of docs sent in one request (default: 3000)"); options.addOption("v", "verbose", false, "show processing speed while indexing"); options.addOption("s", "status", false, "only show status of index for file"); options.addOption("r", "repair", false, "attempt to repair recoverable inconsistencies on the go"); options.addOption("e", "debug", false, "set logging level to debug"); options.addOption("l", "logfile", true, "logfile - in not specified only log to stdout"); options.addOption("m", "memcached", true, "host and port of memcached (default: localhost:11211)"); options.addOption("z", "latest-flag-on", true, "enable latest flag according to field (within content, e.g. 001)"); options.addOption("a", "flat", false, "flat-mode: do not check for inconsistencies"); CommandLineParser parser = new PosixParser(); CommandLine cmd = null;/*from w w w . j a va2 s.c om*/ try { cmd = parser.parse(options, args); } catch (ParseException ex) { logger.error(ex); System.exit(1); } // setup logging Properties systemProperties = System.getProperties(); systemProperties.put("net.spy.log.LoggerImpl", "net.spy.memcached.compat.log.Log4JLogger"); System.setProperties(systemProperties); Logger.getLogger("net.spy.memcached").setLevel(Level.ERROR); Properties props = new Properties(); props.load(props.getClass().getResourceAsStream("/log4j.properties")); if (cmd.hasOption("debug")) { props.setProperty("log4j.logger.de.unileipzig", "DEBUG"); } if (cmd.hasOption("logfile")) { props.setProperty("log4j.rootLogger", "INFO, stdout, F"); props.setProperty("log4j.appender.F", "org.apache.log4j.FileAppender"); props.setProperty("log4j.appender.F.File", cmd.getOptionValue("logfile")); props.setProperty("log4j.appender.F.layout", "org.apache.log4j.PatternLayout"); props.setProperty("log4j.appender.F.layout.ConversionPattern", "%5p | %d | %F | %L | %m%n"); } PropertyConfigurator.configure(props); InetAddress addr = InetAddress.getLocalHost(); String memcachedHostAndPort = addr.getHostAddress() + ":11211"; if (cmd.hasOption("m")) { memcachedHostAndPort = cmd.getOptionValue("m"); } // setup caching try { if (memcachedClient == null) { memcachedClient = new MemcachedClient( new ConnectionFactoryBuilder().setFailureMode(FailureMode.Cancel).build(), AddrUtil.getAddresses("0.0.0.0:11211")); try { // give client and server 500ms Thread.sleep(300); } catch (InterruptedException ex) { } Collection availableServers = memcachedClient.getAvailableServers(); logger.info(availableServers); if (availableServers.size() == 0) { logger.info("no memcached servers found"); memcachedClient.shutdown(); memcachedClient = null; } else { logger.info(availableServers.size() + " memcached server(s) detected, fine."); } } } catch (IOException ex) { logger.warn("couldn't create a connection, bailing out: " + 
ex.getMessage()); } // process options if (cmd.hasOption("h")) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("indexer", options, true); quit(0); } boolean verbose = false; if (cmd.hasOption("verbose")) { verbose = true; } // ES options String[] hosts = new String[] { "0.0.0.0" }; int port = 9300; String clusterName = "elasticsearch_mdma"; int bulkSize = 3000; if (cmd.hasOption("host")) { hosts = cmd.getOptionValues("host"); } if (cmd.hasOption("port")) { port = Integer.parseInt(cmd.getOptionValue("port")); } if (cmd.hasOption("cluster")) { clusterName = cmd.getOptionValue("cluster"); } if (cmd.hasOption("bulksize")) { bulkSize = Integer.parseInt(cmd.getOptionValue("bulksize")); if (bulkSize < 1 || bulkSize > 100000) { logger.error("bulksize must be between 1 and 100,000"); quit(1); } } // ES Client final Settings settings = ImmutableSettings.settingsBuilder().put("cluster.name", "elasticsearch_mdma") .build(); final TransportClient client = new TransportClient(settings); for (String host : hosts) { client.addTransportAddress(new InetSocketTransportAddress(host, port)); } if (cmd.hasOption("filename") && cmd.hasOption("index") && cmd.hasOption("doctype")) { final String filename = cmd.getOptionValue("filename"); final File _file = new File(filename); if (_file.length() == 0) { logger.info(_file.getAbsolutePath() + " is empty, skipping"); quit(0); // file is empty } // for flat mode: leave a stampfile beside the json to // indicate previous successful processing File directory = new File(filename).getParentFile(); File stampfile = new File(directory, DigestUtils.shaHex(filename) + ".indexed"); long start = System.currentTimeMillis(); long lineCount = 0; final String indexName = cmd.getOptionValue("index"); final String docType = cmd.getOptionValue("doctype"); BulkRequestBuilder bulkRequest = client.prepareBulk(); try { if (cmd.hasOption("flat")) { // flat mode // ......... 
if (stampfile.exists()) { logger.info("SKIPPING, since it seems this file has already " + "been imported (found: " + stampfile.getAbsolutePath() + ")"); quit(0); } } else { final String srcSHA1 = extractSrcSHA1(filename); logger.debug(filename + " srcsha1: " + srcSHA1); long docsInIndex = getIndexedRecordCount(client, indexName, srcSHA1); logger.debug(filename + " indexed: " + docsInIndex); long docsInFile = getLineCount(filename); logger.debug(filename + " lines: " + docsInFile); // in non-flat-mode, indexing would take care // of inconsistencies if (docsInIndex == docsInFile) { logger.info("UP-TO DATE: " + filename + " (" + docsInIndex + ", " + srcSHA1 + ")"); client.close(); quit(0); } if (docsInIndex > 0) { logger.warn("INCONSISTENCY DETECTED: " + filename + ": indexed:" + docsInIndex + " lines:" + docsInFile); if (!cmd.hasOption("r")) { logger.warn( "Please re-run indexer with --repair flag or delete residues first with: $ curl -XDELETE " + hosts[0] + ":9200/" + indexName + "/_query -d ' {\"term\" : { \"meta.srcsha1\" : \"" + srcSHA1 + "\" }}'"); client.close(); quit(1); } else { logger.info("Attempting to clear residues..."); // attempt to repair once DeleteByQueryResponse dbqr = client.prepareDeleteByQuery(indexName) .setQuery(termQuery("meta.srcsha1", srcSHA1)).execute().actionGet(); Iterator<IndexDeleteByQueryResponse> it = dbqr.iterator(); long deletions = 0; while (it.hasNext()) { IndexDeleteByQueryResponse response = it.next(); deletions += 1; } logger.info("Deleted residues of " + filename); logger.info("Refreshing [" + indexName + "]"); RefreshResponse refreshResponse = client.admin().indices() .refresh(new RefreshRequest(indexName)).actionGet(); long indexedAfterDelete = getIndexedRecordCount(client, indexName, srcSHA1); logger.info(indexedAfterDelete + " docs remained"); if (indexedAfterDelete > 0) { logger.warn("Not all residues cleaned. Try to fix this manually: $ curl -XDELETE " + hosts[0] + ":9200/" + indexName + "/_query -d ' {\"term\" : { \"meta.srcsha1\" : \"" + srcSHA1 + "\" }}'"); quit(1); } else { logger.info("Residues are gone. Now trying to reindex: " + filename); } } } } logger.info("INDEXING-REQUIRED: " + filename); if (cmd.hasOption("status")) { quit(0); } HashSet idsInBatch = new HashSet(); String idField = null; if (cmd.hasOption("z")) { idField = cmd.getOptionValue("z"); } final FileReader fr = new FileReader(filename); final BufferedReader br = new BufferedReader(fr); String line; // one line is one document while ((line = br.readLine()) != null) { // "Latest-Flag" machine // This gets obsolete with a "flat" index if (cmd.hasOption("z")) { // flag that indicates, whether the document // about to be indexed will be the latest boolean willBeLatest = true; // check if there is a previous (lower meta.timestamp) document with // the same identifier (whatever that may be - queried under "content") final String contentIdentifier = getContentIdentifier(line, idField); idsInBatch.add(contentIdentifier); // assumed in meta.timestamp final Long timestamp = Long.parseLong(getTimestamp(line)); logger.debug("Checking whether record is latest (line: " + lineCount + ")"); logger.debug(contentIdentifier + ", " + timestamp); // get all docs, which match the contentIdentifier // by filter, which doesn't score final TermFilterBuilder idFilter = new TermFilterBuilder("content." 
+ idField, contentIdentifier); final TermFilterBuilder kindFilter = new TermFilterBuilder("meta.kind", docType); final AndFilterBuilder afb = new AndFilterBuilder(); afb.add(idFilter).add(kindFilter); final FilteredQueryBuilder fb = filteredQuery(matchAllQuery(), afb); final SearchResponse searchResponse = client.prepareSearch(indexName) .setSearchType(SearchType.DFS_QUERY_THEN_FETCH).setQuery(fb).setFrom(0) .setSize(1200) // 3 years and 105 days assuming daily updates at the most .setExplain(false).execute().actionGet(); final SearchHits searchHits = searchResponse.getHits(); logger.debug("docs with this id in the index: " + searchHits.getTotalHits()); for (final SearchHit hit : searchHits.getHits()) { final String docId = hit.id(); final Map<String, Object> source = hit.sourceAsMap(); final Map meta = (Map) source.get("meta"); final Long docTimestamp = Long.parseLong(meta.get("timestamp").toString()); // if the indexed doc timestamp is lower the the current one, // remove any latest flag if (timestamp >= docTimestamp) { source.remove("latest"); final ObjectMapper mapper = new ObjectMapper(); // put the updated doc back // IndexResponse response = client.prepareIndex(indexName, docType).setCreate(false).setId(docId) .setSource(mapper.writeValueAsBytes(source)) .execute(new ActionListener<IndexResponse>() { public void onResponse(IndexResponse rspns) { logger.debug("Removed latest flag from " + contentIdentifier + ", " + docTimestamp + ", " + hit.id() + " since (" + timestamp + " > " + docTimestamp + ")"); } public void onFailure(Throwable thrwbl) { logger.error("Could not remove flag from " + hit.id() + ", " + contentIdentifier); } }); // .execute() //.actionGet(); } else { logger.debug("Doc " + hit.id() + " is newer (" + docTimestamp + ")"); willBeLatest = false; } } if (willBeLatest) { line = setLatestFlag(line); logger.info("Setting latest flag on " + contentIdentifier + ", " + timestamp); } // end of latest-flag machine // beware - this will be correct as long as there // are no dups within one bulk! } bulkRequest.add(client.prepareIndex(indexName, docType).setSource(line)); lineCount++; logger.debug("Added line " + lineCount + " to BULK"); logger.debug(line); if (lineCount % bulkSize == 0) { if (idsInBatch.size() != bulkSize && cmd.hasOption("z")) { logger.error( "This batch has duplications in the ID. That's not bad for the index, just makes the latest flag fuzzy"); logger.error( "Bulk size was: " + bulkSize + ", but " + idsInBatch.size() + " IDs (only)"); } idsInBatch.clear(); logger.debug("Issuing BULK request"); final long actionCount = bulkRequest.numberOfActions(); final BulkResponse bulkResponse = bulkRequest.execute().actionGet(); final long tookInMillis = bulkResponse.getTookInMillis(); if (bulkResponse.hasFailures()) { logger.fatal("FAILED, bulk not indexed. 
exiting now."); Iterator<BulkItemResponse> it = bulkResponse.iterator(); while (it.hasNext()) { BulkItemResponse bir = it.next(); if (bir.isFailed()) { Failure failure = bir.getFailure(); logger.fatal("id: " + failure.getId() + ", message: " + failure.getMessage() + ", type: " + failure.getType() + ", index: " + failure.getIndex()); } } quit(1); } else { if (verbose) { final double elapsed = System.currentTimeMillis() - start; final double speed = (lineCount / elapsed * 1000); logger.info("OK (" + filename + ") " + lineCount + " docs indexed (" + actionCount + "/" + tookInMillis + "ms" + "/" + String.format("%.2f", speed) + "r/s)"); } } bulkRequest = client.prepareBulk(); } } // handle the remaining items final long actionCount = bulkRequest.numberOfActions(); if (actionCount > 0) { final BulkResponse bulkResponse = bulkRequest.execute().actionGet(); final long tookInMillis = bulkResponse.getTookInMillis(); if (bulkResponse.hasFailures()) { logger.fatal("FAILED, bulk not indexed. exiting now."); Iterator<BulkItemResponse> it = bulkResponse.iterator(); while (it.hasNext()) { BulkItemResponse bir = it.next(); if (bir.isFailed()) { Failure failure = bir.getFailure(); logger.fatal("id: " + failure.getId() + ", message: " + failure.getMessage() + ", type: " + failure.getType() + ", index: " + failure.getIndex()); } } quit(1); } else { // trigger update now RefreshResponse refreshResponse = client.admin().indices() .refresh(new RefreshRequest(indexName)).actionGet(); if (verbose) { final double elapsed = System.currentTimeMillis() - start; final double speed = (lineCount / elapsed * 1000); logger.info("OK (" + filename + ") " + lineCount + " docs indexed (" + actionCount + "/" + tookInMillis + "ms" + "/" + String.format("%.2f", speed) + "r/s)"); } } } br.close(); client.close(); final double elapsed = (System.currentTimeMillis() - start) / 1000; final double speed = (lineCount / elapsed); logger.info("indexing (" + filename + ") " + lineCount + " docs took " + elapsed + "s (speed: " + String.format("%.2f", speed) + "r/s)"); if (cmd.hasOption("flat")) { try { FileUtils.touch(stampfile); } catch (IOException ioe) { logger.warn(".indexed files not created. Will reindex everything everytime."); } } } catch (IOException e) { client.close(); logger.error(e); quit(1); } finally { client.close(); } } quit(0); }