Java tutorial
package general;

/*-
 * #%L
 * sparqlQueryTester
 * %%
 * Copyright (C) 2016 QueryAnalysis
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.univocity.parsers.common.ParsingContext;
import com.univocity.parsers.common.processor.ObjectRowProcessor;
import com.univocity.parsers.tsv.TsvParser;
import com.univocity.parsers.tsv.TsvParserSettings;
import input.InputHandlerParquet;
import input.InputHandlerTSV;
import logging.LoggingHandler;
import org.apache.commons.cli.*;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.openrdf.query.parser.ParsedQuery;
import org.openrdf.queryrender.sparql.SPARQLQueryRenderer;
import query.OpenRDFQueryHandler;
import scala.Tuple2;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import static java.nio.file.Files.readAllBytes;

/**
 * @author jgonsior
 */
public final class Main
{
  /**
   * Saves the encountered queryTypes.
   */
  public static final Map<ParsedQuery, String> queryTypes =
      Collections.synchronizedMap(new HashMap<ParsedQuery, String>());

  /**
   * Saves the mapping of query type and user agent to tool name and version.
   */
  public static final Map<Tuple2<String, String>, Tuple2<String, String>> queryTypeToToolMapping =
      new HashMap<>();

  /**
   * Saves if metrics should be calculated for bot queries.
   */
  public static boolean withBots;

  /**
   * Saves if the input files should be modified with additional prefixes.
   */
  public static boolean readPreprocessed;

  /**
   * Define a static logger variable.
   */
  private static final Logger logger = Logger.getLogger(Main.class);

  /**
   * Since this is a utility class, it should not be instantiated.
   */
  private Main()
  {
    throw new AssertionError("Instantiating utility class Main");
  }
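  /*
   * Illustrative aside (hypothetical helper, not part of the original class):
   * Collections.synchronizedMap only makes single calls such as put() and get()
   * thread-safe; iterating over the map still requires synchronizing on the map
   * object itself. This sketches the safe iteration pattern.
   */
  private static void printQueryTypesSafely()
  {
    synchronized (queryTypes) {
      // Without this synchronized block, a worker thread calling put() during
      // iteration could trigger a ConcurrentModificationException.
      for (Map.Entry<ParsedQuery, String> entry : queryTypes.entrySet()) {
        logger.debug("Query type " + entry.getValue() + ": " + entry.getKey());
      }
    }
  }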
  /**
   * Selects the files to be processed and specifies the files to write to.
   *
   * @param args Arguments to specify runtime behavior.
   */
  public static void main(String[] args)
      throws InvocationTargetException, NoSuchMethodException, InstantiationException, IllegalAccessException
  {
    Options options = new Options();
    options.addOption("l", "logging", false, "enables file logging");
    options.addOption("j", "jena", false, "uses the Jena SPARQL Parser");
    options.addOption("o", "openrdf", false, "uses the OpenRDF SPARQL Parser");
    options.addOption("f", "file", true, "defines the input file prefix");
    options.addOption("h", "help", false, "displays this help");
    options.addOption("t", "tsv", false, "reads from .tsv-files");
    // The parquet option is currently disabled; its short flag -p is reused for
    // readPreprocessed below, so the hasOption("parquet") branch is dead code.
    // options.addOption("p", "parquet", false, "read from .parquet-files");
    options.addOption("n", "numberOfThreads", true, "number of used threads, default 1");
    options.addOption("b", "withBots", false, "enables metric calculation for bot queries");
    options.addOption("p", "readPreprocessed", false, "enables reading of preprocessed files");

    //some parameters which can be changed through parameters
    //QueryHandler queryHandler = new OpenRDFQueryHandler();
    String inputFilePrefix;
    String inputFileSuffix = ".tsv";
    String queryParserName = "OpenRDF";
    Class inputHandlerClass = null;
    Class queryHandlerClass = null;
    int numberOfThreads = 1;

    CommandLineParser parser = new DefaultParser();
    CommandLine cmd;
    try {
      cmd = parser.parse(options, args);
      if (cmd.hasOption("help")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("help", options);
        return;
      }
      if (cmd.hasOption("openrdf")) {
        queryHandlerClass = OpenRDFQueryHandler.class;
      }
      if (cmd.hasOption("tsv")) {
        inputFileSuffix = ".tsv";
        inputHandlerClass = InputHandlerTSV.class;
      }
      if (cmd.hasOption("parquet")) {
        inputFileSuffix = ".parquet";
        Logger.getLogger("org").setLevel(Level.WARN);
        Logger.getLogger("akka").setLevel(Level.WARN);
        SparkConf conf = new SparkConf().setAppName("SPARQLQueryAnalyzer").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        inputHandlerClass = InputHandlerParquet.class;
      }
      if (inputHandlerClass == null) {
        System.out.println("Please specify which input format to use, e.g. -t for TSV.");
        return;
      }
      if (cmd.hasOption("file")) {
        inputFilePrefix = cmd.getOptionValue("file").trim();
      } else {
        System.out.println("Please specify at least the file which we should work on "
            + "using the option '--file PREFIX' or '-f PREFIX'");
        return;
      }
      if (cmd.hasOption("logging")) {
        LoggingHandler.initFileLog(queryParserName, inputFilePrefix);
      }
      if (cmd.hasOption("numberOfThreads")) {
        numberOfThreads = Integer.parseInt(cmd.getOptionValue("numberOfThreads"));
      }
      if (cmd.hasOption("withBots")) {
        withBots = true;
      }
      if (cmd.hasOption("readPreprocessed")) {
        readPreprocessed = true;
      }
    } catch (UnrecognizedOptionException e) {
      System.out.println("Unrecognized commandline option: " + e.getOption());
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("help", options);
      return;
    } catch (ParseException e) {
      System.out.println("There was an error while parsing your command line input. "
          + "Did you recheck your syntax before running?");
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("help", options);
      return;
    }

    LoggingHandler.initConsoleLog();

    loadPreBuildQueryTypes();

    long startTime = System.nanoTime();

    ExecutorService executor = Executors.newFixedThreadPool(numberOfThreads);
    for (int day = 1; day <= 31; day++) {
      String inputFile = inputFilePrefix + String.format("%02d", day) + inputFileSuffix;
      Runnable parseOneMonthWorker = new ParseOneMonthWorker(
          inputFile, inputFilePrefix, inputHandlerClass, queryParserName, queryHandlerClass, day);
      executor.execute(parseOneMonthWorker);
    }
    executor.shutdown();

    while (!executor.isTerminated()) {
      //wait until all workers are finished
    }

    writeQueryTypes(inputFilePrefix);

    long stopTime = System.nanoTime();
    long millis = TimeUnit.MILLISECONDS.convert(stopTime - startTime, TimeUnit.NANOSECONDS);
    Date date = new Date(millis);
    System.out.println("Finished executing with all threads: "
        + new SimpleDateFormat("mm-dd HH:mm:ss:SSSSSSS").format(date));
  }
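  /*
   * Illustrative alternative (hypothetical helper, not called by main() above):
   * instead of the busy-wait loop on executor.isTerminated(), which spins a CPU
   * core, the thread pool can block until all workers have finished.
   */
  private static void shutdownAndWait(ExecutorService executor)
  {
    executor.shutdown();
    try {
      // Blocks until all submitted workers have completed.
      executor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      logger.warn("Interrupted while waiting for the workers to finish.", e);
    }
  }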
  /**
   * Loads all pre-build query types.
   */
  private static void loadPreBuildQueryTypes()
  {
    try (DirectoryStream<Path> directoryStream = Files.newDirectoryStream(Paths.get("preBuildQueryTypeFiles"))) {
      for (Path filePath : directoryStream) {
        if (Files.isRegularFile(filePath)) {
          if (filePath.toString().endsWith(".preBuildQueryType")) {
            String queryString = new String(readAllBytes(filePath));

            OpenRDFQueryHandler queryHandler = new OpenRDFQueryHandler();
            //queryHandler.setValidityStatus(1);
            queryHandler.setQueryString(queryString);
            if (queryHandler.getValidityStatus() != 1) {
              logger.info("The pre-build query " + filePath + " is not valid SPARQL");
              continue;
            }
            ParsedQuery normalizedPreBuildQuery = queryHandler.getNormalizedQuery();
            String queryTypeName = filePath.toString().substring(
                filePath.toString().lastIndexOf("/") + 1, filePath.toString().lastIndexOf("."));
            if (normalizedPreBuildQuery != null) {
              queryTypes.put(normalizedPreBuildQuery, queryTypeName);
            } else {
              logger.info("Pre-build query " + queryTypeName + " could not be parsed.");
            }
          }
          if (filePath.toString().endsWith(".tsv")) {
            TsvParserSettings parserSettings = new TsvParserSettings();
            parserSettings.setLineSeparatorDetectionEnabled(true);
            parserSettings.setHeaderExtractionEnabled(true);
            parserSettings.setSkipEmptyLines(true);
            parserSettings.setReadInputOnSeparateThread(true);

            ObjectRowProcessor rowProcessor = new ObjectRowProcessor()
            {
              @Override
              public void rowProcessed(Object[] row, ParsingContext parsingContext)
              {
                if (row.length <= 1) {
                  logger.warn("Ignoring line without tab while parsing.");
                  return;
                }
                if (row.length == 5) {
                  // Maps (query type, user agent) to (tool name, tool version);
                  // the fifth column is ignored here.
                  queryTypeToToolMapping.put(
                      new Tuple2<>(row[0].toString(), row[1].toString()),
                      new Tuple2<>(row[2].toString(), row[3].toString()));
                  return;
                }
                logger.warn("Line with row length " + row.length
                    + " found. Is the formatting of toolMapping.tsv correct?");
              }
            };

            parserSettings.setProcessor(rowProcessor);

            TsvParser parser = new TsvParser(parserSettings);
            parser.parse(filePath.toFile());
          }
        }
      }
    } catch (IOException e) {
      logger.error("Could not read from directory preBuildQueryTypeFiles", e);
    }
  }
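  /*
   * Illustrative round trip (hypothetical helper, mirroring the calls used in
   * loadPreBuildQueryTypes() and writeQueryTypes()): parse a raw SPARQL string
   * into a normalized ParsedQuery and render it back to a string.
   */
  private static String normalizeAndRender(String queryString) throws Exception
  {
    OpenRDFQueryHandler queryHandler = new OpenRDFQueryHandler();
    queryHandler.setQueryString(queryString);
    if (queryHandler.getValidityStatus() != 1) {
      // A validity status of 1 marks a successfully parsed query,
      // matching the check in loadPreBuildQueryTypes().
      return null;
    }
    ParsedQuery normalized = queryHandler.getNormalizedQuery();
    if (normalized == null) {
      return null;
    }
    // SPARQLQueryRenderer.render is declared to throw Exception,
    // which is why writeQueryTypes() catches Exception around it.
    return new SPARQLQueryRenderer().render(normalized);
  }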
  /**
   * Writes all found query types to queryType/queryTypeFiles/.
   *
   * @param inputFilePrefix The location of the input data
   */
  private static void writeQueryTypes(String inputFilePrefix)
  {
    String outputFolderName = inputFilePrefix.substring(0, inputFilePrefix.lastIndexOf('/') + 1) + "queryType/";
    new File(outputFolderName).mkdir();
    outputFolderName += "queryTypeFiles/";
    File outputFolderFile = new File(outputFolderName);
    FileUtils.deleteQuietly(outputFolderFile);
    new File(outputFolderName).mkdir();

    SPARQLQueryRenderer renderer = new SPARQLQueryRenderer();

    String currentOutputFolderName = outputFolderName;
    for (Map.Entry<ParsedQuery, String> entry : queryTypes.entrySet()) {
      ParsedQuery parsedQuery = entry.getKey();
      String queryType = entry.getValue();
      try (BufferedWriter bw = new BufferedWriter(
          new FileWriter(currentOutputFolderName + queryType + ".queryType"))) {
        bw.write(renderer.render(parsedQuery));
        bw.write("\n" + parsedQuery.toString());
      } catch (IOException e) {
        logger.error("Could not write the query type " + queryType + ".", e);
      } catch (Exception e) {
        logger.error("Error while rendering query type " + queryType + ".", e);
      }
    }
  }
}
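A typical run, assuming TSV input (the jar name sparqlQueryTester.jar and the inputData/ directory layout are illustrative assumptions, not taken from the code):

    java -cp sparqlQueryTester.jar general.Main -t -o -f inputData/queryLog- -n 4 -l

With the prefix inputData/queryLog-, main() builds the input file names inputData/queryLog-01.tsv through inputData/queryLog-31.tsv and hands each day to one ParseOneMonthWorker in the thread pool; -o selects the OpenRDF parser, -n 4 uses four worker threads, and -l enables file logging.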