// MarmottaLoader: command-line bulk loader for RDF data (stray "Java tutorial" header text removed)
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.marmotta.loader.core;

import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.commons.cli.*;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
import org.apache.commons.compress.archivers.sevenz.SevenZArchiveEntry;
import org.apache.commons.compress.archivers.sevenz.SevenZFile;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2Utils;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipUtils;
import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
import org.apache.commons.compress.compressors.xz.XZUtils;
import org.apache.commons.configuration.Configuration;
import org.apache.commons.configuration.MapConfiguration;
import org.apache.commons.lang3.StringUtils;
import org.apache.marmotta.loader.api.LoaderBackend;
import org.apache.marmotta.loader.api.LoaderHandler;
import org.apache.marmotta.loader.api.LoaderOptions;
import org.apache.marmotta.loader.context.ContextHandler;
import org.apache.marmotta.loader.functions.BackendIdentifierFunction;
import org.apache.marmotta.loader.rio.GeonamesFormat;
import org.apache.marmotta.loader.statistics.StatisticsHandler;
import org.apache.marmotta.loader.util.DirectoryFilter;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.rio.*;
import org.openrdf.rio.helpers.BasicParserSettings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.*;

/**
 * Command-line bulk loader for RDF data. Reads configuration options (files, directories,
 * archives, format, compression, target backend, ...) from a commons-configuration
 * {@link Configuration}, selects a {@link LoaderBackend} via {@link ServiceLoader}, and streams
 * the parsed RDF statements into the backend's {@link LoaderHandler}.
 *
 * @author Sebastian Schaffert (sschaffert@apache.org)
 */
public class MarmottaLoader {

    // all loader backends available on the classpath, discovered via META-INF/services
    private static ServiceLoader<LoaderBackend> backends = ServiceLoader.load(LoaderBackend.class);

    private static Logger log = LoggerFactory.getLogger(MarmottaLoader.class);

    private Configuration configuration;

    public MarmottaLoader(Configuration configuration) {
        this.configuration = configuration;
    }

    /**
     * Load data according to the specification given in the configuration from all directories,
     * archives and files specified in the configuration. Returns the handler used for importing
     * (in case further operations should be performed).
     *
     * @return the (possibly wrapped) handler that received the data
     * @throws RDFHandlerException if the underlying handler fails during initialise/shutdown
     */
    public LoaderHandler load() throws RDFHandlerException {
        LoaderHandler handler = getLoader();

        // optionally wrap the backend handler: fixed context URI and/or statistics collection
        if (configuration.containsKey(LoaderOptions.CONTEXT)) {
            handler = new ContextHandler(handler, new URIImpl(configuration.getString(LoaderOptions.CONTEXT)));
        }
        if (configuration.containsKey(LoaderOptions.STATISTICS_ENABLED)) {
            handler = new StatisticsHandler(handler, configuration);
        }

        handler.initialise();

        if (configuration.containsKey(LoaderOptions.DIRS)) {
            for (String dirname : configuration.getStringArray(LoaderOptions.DIRS)) {
                File dir = new File(dirname);
                try {
                    loadDirectory(dir, handler, getRDFFormat(configuration.getString(LoaderOptions.FORMAT)),
                            configuration.getString(LoaderOptions.COMPRESSION));
                } catch (RDFParseException | IOException e) {
                    // best-effort: log and continue with the remaining directories
                    log.warn("error importing directory {}: {}", dir, e.getMessage(), e);
                }
            }
        }

        if (configuration.containsKey(LoaderOptions.ARCHIVES)) {
            for (String archiveName : configuration.getStringArray(LoaderOptions.ARCHIVES)) {
                File archive = new File(archiveName);
                try {
                    loadArchive(archive, handler, getRDFFormat(configuration.getString(LoaderOptions.FORMAT)));
                } catch (RDFParseException | IOException | ArchiveException e) {
                    // fixed copy-paste bug: previously logged "error importing directory"
                    log.warn("error importing archive {}: {}", archive, e.getMessage(), e);
                }
            }
        }

        if (configuration.containsKey(LoaderOptions.FILES)) {
            for (String fname : configuration.getStringArray(LoaderOptions.FILES)) {
                File f = new File(fname);
                try {
                    loadFile(f, handler, getRDFFormat(configuration.getString(LoaderOptions.FORMAT)),
                            configuration.getString(LoaderOptions.COMPRESSION));
                } catch (RDFParseException | IOException e) {
                    log.warn("error importing file {}: {}", f, e.getMessage(), e);
                }
            }
        }

        handler.shutdown();

        return handler;
    }

    /**
     * Load data from the inputstream given as first argument into the handler given as second argument.
     *
     * @param inStream input byte stream to read the data from; must be plain data in the format given as argument
     * @param handler  handler to add the data to
     * @param format   format to use for creating the parser
     * @throws RDFParseException
     * @throws IOException
     */
    public void load(InputStream inStream, LoaderHandler handler, RDFFormat format)
            throws RDFParseException, IOException {
        try {
            RDFParser parser = createParser(format);
            parser.setRDFHandler(handler);
            parser.parse(inStream, configuration.getString(LoaderOptions.BASE_URI, "http://localhost/"));
        } catch (RDFHandlerException e) {
            // handler failures are logged, not propagated, so a single bad stream does not abort the whole import
            log.error("error loading stream data in format {}: {}", format, e.getMessage(), e);
        }
    }

    /**
     * Load data from the reader given as first argument into the handler given as second argument.
     *
     * @param reader  character stream to read the data from; must be plain data in the format given as argument
     * @param handler handler to add the data to
     * @param format  format to use for creating the parser
     * @throws RDFParseException
     * @throws IOException
     */
    public void load(Reader reader, LoaderHandler handler, RDFFormat format)
            throws RDFParseException, IOException {
        try {
            RDFParser parser = createParser(format);
            parser.setRDFHandler(handler);
            parser.parse(reader, configuration.getString(LoaderOptions.BASE_URI, "http://localhost/"));
        } catch (RDFHandlerException e) {
            log.error("error loading stream data in format {}: {}", format, e.getMessage(), e);
        }
    }

    /**
     * Load data from the file given as first argument into the handler given as second argument.
     *
     * @param file        file to read the data from; in case a compression format is not explicitly given,
     *                    the method will try to decide from the file name if the file is in a compressed format
     * @param handler     handler to add the data to
     * @param format      format to use for creating the parser or null for auto-detection
     * @param compression compression format to use, or null for auto-detection
     *                    (see formats in org.apache.commons.compress.compressors.CompressorStreamFactory)
     * @throws RDFParseException if neither the user nor auto-detection can determine the RDF format
     * @throws IOException
     */
    public void loadFile(File file, LoaderHandler handler, RDFFormat format, String compression)
            throws RDFParseException, IOException {
        log.info("loading file {} ...", file);

        CompressorStreamFactory cf = new CompressorStreamFactory();
        cf.setDecompressConcatenated(true);

        // detect the file compression from the file name; user-specified compression wins
        String detectedCompression = detectCompression(file);
        if (compression == null) {
            if (detectedCompression != null) {
                log.info("using auto-detected compression ({})", detectedCompression);
                compression = detectedCompression;
            }
        } else {
            if (detectedCompression != null && !compression.equals(detectedCompression)) {
                log.warn("user-specified compression ({}) overrides auto-detected compression ({})",
                        compression, detectedCompression);
            } else {
                log.info("using user-specified compression ({})", compression);
            }
        }

        // detect the RDF format from the (uncompressed) file name; user-specified format wins
        RDFFormat detectedFormat = Rio.getParserFormatForFileName(uncompressedName(file));
        if (format == null) {
            if (detectedFormat != null) {
                log.info("using auto-detected format ({})", detectedFormat.getName());
                format = detectedFormat;
            } else {
                throw new RDFParseException("could not detect input format of file " + file);
            }
        } else {
            if (detectedFormat != null && !format.equals(detectedFormat)) {
                log.warn("user-specified format ({}) overrides auto-detected format ({})",
                        format.getName(), detectedFormat.getName());
            }
        }

        // create input stream from file and wrap in compressor stream
        InputStream in;
        InputStream fin = new BufferedInputStream(new FileInputStream(file));
        try {
            if (compression != null) {
                if (CompressorStreamFactory.GZIP.equalsIgnoreCase(compression)) {
                    in = new GzipCompressorInputStream(fin, true);
                } else if (CompressorStreamFactory.BZIP2.equalsIgnoreCase(compression)) {
                    in = new BZip2CompressorInputStream(fin, true);
                } else if (CompressorStreamFactory.XZ.equalsIgnoreCase(compression)) {
                    in = new XZCompressorInputStream(fin, true);
                } else {
                    // does not honour decompressConcatenated
                    in = cf.createCompressorInputStream(compression, fin);
                }
            } else {
                in = cf.createCompressorInputStream(fin);
            }
        } catch (CompressorException ex) {
            log.info("no compression detected, using plain input stream");
            in = fin;
        }

        // load using the input stream; close it afterwards (fixes a file-handle leak —
        // closing the compressor stream also closes the underlying file stream)
        try {
            load(in, handler, format);
        } finally {
            in.close();
        }
    }

    /**
     * Load all files contained in the given directory into the handler given as second argument.
     *
     * @param directory   directory to read the data from; in case a compression format is not explicitly given,
     *                    the method will try to decide from the file name if the file is in a compressed format
     * @param handler     handler to add the data to
     * @param format      format to use for creating the parser or null for auto-detection
     * @param compression compression format to use, or null for auto-detection
     *                    (see formats in org.apache.commons.compress.compressors.CompressorStreamFactory)
     * @throws RDFParseException if the argument does not exist or is not a directory
     * @throws IOException
     */
    public void loadDirectory(File directory, LoaderHandler handler, RDFFormat format, String compression)
            throws RDFParseException, IOException {
        log.info("loading files in directory {} ...", directory);
        if (directory.exists() && directory.isDirectory()) {
            for (File f : directory.listFiles(new DirectoryFilter())) {
                try {
                    if (isArchive(f)) {
                        loadArchive(f, handler, format);
                    } else {
                        loadFile(f, handler, format, compression);
                    }
                } catch (RDFParseException | IOException | ArchiveException e) {
                    // best-effort: continue with the remaining files in the directory
                    log.warn("error importing file {}: {}", f, e.getMessage(), e);
                }
            }
        } else {
            throw new RDFParseException("could not load files from directory " + directory
                    + ": it does not exist or is not a directory");
        }
    }

    /**
     * Load all entries of the given archive (7z, zip, tar, tar.gz, tar.bz2, tar.xz, cpio)
     * into the handler given as second argument.
     *
     * @param archive archive file to read; format and compression are auto-detected
     * @param handler handler to add the data to
     * @param format  format to use for creating the parser or null for auto-detection per entry
     * @throws RDFParseException if an entry's input format cannot be determined
     * @throws IOException
     * @throws ArchiveException if commons-compress cannot recognize the archive format
     */
    public void loadArchive(File archive, LoaderHandler handler, RDFFormat format)
            throws RDFParseException, IOException, ArchiveException {
        log.info("loading files in archive {} ...", archive);

        if (archive.exists() && archive.canRead()) {

            if (archive.getName().endsWith("7z")) {
                // 7-Zip is not supported by ArchiveStreamFactory and needs its own reader
                log.info("auto-detected archive format: 7Z");

                final SevenZFile sevenZFile = new SevenZFile(archive);
                try {
                    SevenZArchiveEntry entry;
                    while ((entry = sevenZFile.getNextEntry()) != null) {
                        if (!entry.isDirectory()) {
                            log.info("loading entry {} ...", entry.getName());

                            // detect the file format from the entry name
                            RDFFormat detectedFormat = Rio.getParserFormatForFileName(entry.getName());
                            if (format == null) {
                                if (detectedFormat != null) {
                                    log.info("auto-detected entry format: {}", detectedFormat.getName());
                                    format = detectedFormat;
                                } else {
                                    throw new RDFParseException(
                                            "could not detect input format of entry " + entry.getName());
                                }
                            } else {
                                if (detectedFormat != null && !format.equals(detectedFormat)) {
                                    log.warn("user-specified entry format ({}) overrides auto-detected format ({})",
                                            format.getName(), detectedFormat.getName());
                                } else {
                                    log.info("user-specified entry format: {}", format.getName());
                                }
                            }

                            // adapt SevenZFile's read methods to the InputStream the parser expects
                            load(new InputStream() {
                                @Override
                                public int read() throws IOException {
                                    return sevenZFile.read();
                                }

                                @Override
                                public int read(byte[] b) throws IOException {
                                    return sevenZFile.read(b);
                                }

                                @Override
                                public int read(byte[] b, int off, int len) throws IOException {
                                    return sevenZFile.read(b, off, len);
                                }
                            }, handler, format);
                        }
                    }
                } finally {
                    sevenZFile.close();
                }
            } else {
                InputStream in;

                // unwrap an optional outer compression layer (e.g. the .gz of a .tar.gz)
                String archiveCompression = detectCompression(archive);
                InputStream fin = new BufferedInputStream(new FileInputStream(archive));
                if (archiveCompression != null) {
                    if (CompressorStreamFactory.GZIP.equalsIgnoreCase(archiveCompression)) {
                        log.info("auto-detected archive compression: GZIP");
                        in = new GzipCompressorInputStream(fin, true);
                    } else if (CompressorStreamFactory.BZIP2.equalsIgnoreCase(archiveCompression)) {
                        log.info("auto-detected archive compression: BZIP2");
                        in = new BZip2CompressorInputStream(fin, true);
                    } else if (CompressorStreamFactory.XZ.equalsIgnoreCase(archiveCompression)) {
                        log.info("auto-detected archive compression: XZ");
                        in = new XZCompressorInputStream(fin, true);
                    } else {
                        in = fin;
                    }
                } else {
                    in = fin;
                }

                ArchiveInputStream zipStream = new ArchiveStreamFactory()
                        .createArchiveInputStream(new BufferedInputStream(in));
                logArchiveType(zipStream);

                // close the archive stream when done (fixes a file-handle leak)
                try {
                    ArchiveEntry entry;
                    while ((entry = zipStream.getNextEntry()) != null) {
                        if (!entry.isDirectory()) {
                            log.info("loading entry {} ...", entry.getName());

                            // detect the file format from the entry name
                            RDFFormat detectedFormat = Rio.getParserFormatForFileName(entry.getName());
                            if (format == null) {
                                if (detectedFormat != null) {
                                    log.info("auto-detected entry format: {}", detectedFormat.getName());
                                    format = detectedFormat;
                                } else {
                                    throw new RDFParseException(
                                            "could not detect input format of entry " + entry.getName());
                                }
                            } else {
                                if (detectedFormat != null && !format.equals(detectedFormat)) {
                                    log.warn("user-specified entry format ({}) overrides auto-detected format ({})",
                                            format.getName(), detectedFormat.getName());
                                } else {
                                    log.info("user-specified entry format: {}", format.getName());
                                }
                            }

                            load(zipStream, handler, format);
                        }
                    }
                } finally {
                    zipStream.close();
                }
            }
        } else {
            throw new RDFParseException("could not load files from archive " + archive
                    + ": it does not exist or is not readable");
        }
    }

    // log which concrete archive implementation commons-compress auto-detected
    private void logArchiveType(ArchiveInputStream stream) {
        if (log.isInfoEnabled()) {
            if (stream instanceof ZipArchiveInputStream) {
                log.info("auto-detected archive format: ZIP");
            } else if (stream instanceof TarArchiveInputStream) {
                log.info("auto-detected archive format: TAR");
            } else if (stream instanceof CpioArchiveInputStream) {
                log.info("auto-detected archive format: CPIO");
            } else {
                log.info("unknown archive format, relying on commons-compress");
            }
        }
    }

    // decide from the file name whether the file should be treated as an archive
    private boolean isArchive(File file) {
        String name = file.getName();
        return name.endsWith(".zip")
                || name.endsWith(".7z")
                || name.endsWith(".tar.gz")
                || name.endsWith(".tar.bz2")
                || name.endsWith(".tar.xz")
                || name.endsWith(".tar");
    }

    /**
     * Detect the compression format from the filename, or null in case auto-detection failed.
     *
     * @param file
     * @return one of the CompressorStreamFactory constants, or null
     */
    private String detectCompression(File file) {
        if (BZip2Utils.isCompressedFilename(file.getName())) {
            return CompressorStreamFactory.BZIP2;
        } else if (GzipUtils.isCompressedFilename(file.getName())) {
            return CompressorStreamFactory.GZIP;
        } else if (XZUtils.isCompressedFilename(file.getName())) {
            return CompressorStreamFactory.XZ;
        } else {
            return null;
        }
    }

    // strip a compression suffix (.bz2/.gz/.xz) from the file name so the RDF format can be guessed
    private String uncompressedName(File file) {
        if (BZip2Utils.isCompressedFilename(file.getAbsolutePath())) {
            return BZip2Utils.getUncompressedFilename(file.getName());
        } else if (GzipUtils.isCompressedFilename(file.getAbsolutePath())) {
            return GzipUtils.getUncompressedFilename(file.getName());
        } else if (XZUtils.isCompressedFilename(file.getAbsolutePath())) {
            return XZUtils.getUncompressedFilename(file.getName());
        } else {
            return file.getName();
        }
    }

    /**
     * Create a parser for the given format, turning off some of the stricter configuration settings so we
     * can handle more messy data without errors.
     *
     * @param format
     * @return
     */
    private RDFParser createParser(RDFFormat format) {
        RDFParser parser = Rio.createParser(format);
        parser.getParserConfig().addNonFatalError(BasicParserSettings.VERIFY_DATATYPE_VALUES);
        parser.getParserConfig().addNonFatalError(BasicParserSettings.FAIL_ON_UNKNOWN_DATATYPES);
        parser.getParserConfig().addNonFatalError(BasicParserSettings.NORMALIZE_DATATYPE_VALUES);
        return parser;
    }

    /**
     * Determine loader from configuration options.
     *
     * @return a loader handler created by the selected (or only) backend
     * @throws IllegalStateException if no backend is found, or several are available and none was selected
     */
    private LoaderHandler getLoader() {
        // explicit backend selection via -B takes precedence
        if (configuration.containsKey(LoaderOptions.BACKEND)) {
            for (LoaderBackend backend : backends) {
                if (StringUtils.equalsIgnoreCase(backend.getIdentifier(),
                        configuration.getString(LoaderOptions.BACKEND))) {
                    return backend.createLoader(configuration);
                }
            }
        }

        List<LoaderBackend> backendSet = Lists.newArrayList(backends);
        if (backendSet.size() == 1) {
            return backendSet.get(0).createLoader(configuration);
        }

        // previously the "multiple backends" message was also shown when NO backend was found
        if (backendSet.isEmpty()) {
            throw new IllegalStateException("no loader backend found on the classpath");
        }
        throw new IllegalStateException(
                "there are multiple backends available, please select one explicitly using -B");
    }

    /**
     * Get the RDF format from a user specfication (either mime type or short cut)
     *
     * @param spec
     * @return
     */
    private static RDFFormat getRDFFormat(String spec) {
        if (StringUtils.equalsIgnoreCase(spec, "turtle")) {
            return RDFFormat.TURTLE;
        } else if (StringUtils.equalsIgnoreCase(spec, "n3")) {
            return RDFFormat.N3;
        } else if (StringUtils.equalsIgnoreCase(spec, "rdf")) {
            return RDFFormat.RDFXML;
        } else if (StringUtils.equalsIgnoreCase(spec, "xml")) {
            return RDFFormat.RDFXML;
        } else if (StringUtils.equalsIgnoreCase(spec, "geonames")) {
            return GeonamesFormat.FORMAT;
        } else if (spec != null) {
            return Rio.getParserFormatForMIMEType(spec);
        } else {
            return null;
        }
    }

    private static void printHelp(Options options) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(MarmottaLoader.class.getSimpleName(), options, true);
    }

    /**
     * Build command line options. Base options are:
     * <ul>
     *     <li>-h | --help: show help</li>
     *     <li>-B | --backend: backend to use (kiwi, hbase, berkeleydb)</li>
     *     <li>-z | --gzip: input is GZIP encoded</li>
     *     <li>-j | --bzip2: input is BZIP2 encoded</li>
     *     <li>-c | --context: URI of a context to add the statement to</li>
     *     <li>-b | --base: URI to use as base for resolving relative URIs</li>
     *     <li>-f | --file: input file to use for loading</li>
     *     <li>-d | --dir: input directory containing files to use for loading</li>
     *     <li>-t | --type: input format to use for parsing (MIME type)</li>
     *     <li>-s | --statistics: collect statistics and write a graph into the file given</li>
     * </ul>
     *
     * In addition, loader backends can provide their own additional command line options.
     *
     * @return
     */
    private static Options buildOptions() {
        final Options options = new Options();

        options.addOption("h", "help", false, "print this help");

        // -z and -j are mutually exclusive
        OptionGroup compression = new OptionGroup();
        compression.addOption(new Option("z", "gzip", false, "input is gzip compressed"));
        compression.addOption(new Option("j", "bzip2", false, "input is bzip2 compressed"));
        options.addOptionGroup(compression);

        final Option backend = OptionBuilder.withArgName("backend").hasArgs(1)
                .withDescription("backend to use (" + StringUtils.join(
                        Iterators.transform(backends.iterator(), new BackendIdentifierFunction()), ", ") + ")")
                .withLongOpt("backend").create('B');
        options.addOption(backend);

        final Option base = OptionBuilder.withArgName("base").hasArgs(1)
                .withDescription("base URI to use for resolving relative URIs").withLongOpt("base").create('b');
        options.addOption(base);

        final Option context = OptionBuilder.withArgName("context").hasArgs(1)
                .withDescription("URI of a context to add the statement to").withLongOpt("context").create('c');
        options.addOption(context);

        final Option format = OptionBuilder.withArgName("type").hasArgs(1)
                .withDescription("input format to use for parsing (MIME type) in case auto-guessing does not work")
                .withLongOpt("type").create('t');
        options.addOption(format);

        // input options: exactly one of file/dir/archive is required
        OptionGroup input = new OptionGroup();
        input.setRequired(true);

        final Option file = OptionBuilder.withArgName("file").hasArgs(Option.UNLIMITED_VALUES)
                .withDescription("input file(s) to load").withLongOpt("file").create('f');
        input.addOption(file);

        final Option directories = OptionBuilder.withArgName("dir").hasArgs(Option.UNLIMITED_VALUES)
                .withDescription("input directories(s) to load").withLongOpt("dir").create('d');
        input.addOption(directories);

        final Option archives = OptionBuilder.withArgName("archive").hasArgs(Option.UNLIMITED_VALUES)
                .withDescription("input archives(s) to load (zip, tar.gz)").withLongOpt("archive").create('a');
        input.addOption(archives);

        options.addOptionGroup(input);

        final Option statistics = OptionBuilder.withArgName("statistics")
                .withDescription("collect statistics and write a graph into the file given")
                .withLongOpt("statistics").hasOptionalArg().create('s');
        options.addOption(statistics);

        // generic -Dproperty=value configuration settings
        final Option property = OptionBuilder.withArgName("property=value").hasArgs(2).withValueSeparator()
                .withDescription("set configuration property to value").create("D");
        options.addOption(property);

        // backend-specific options contributed by each discovered backend
        for (LoaderBackend b : backends) {
            for (Option o : b.getOptions()) {
                options.addOption(o);
            }
        }

        return options;
    }

    /**
     * Parse the command line arguments into a loader configuration.
     *
     * @param args raw command line arguments
     * @return a configuration with LoaderOptions.* and backend-specific properties set
     * @throws ParseException if the arguments are invalid (unknown backend, unknown MIME type, ...)
     */
    public static Configuration parseOptions(String[] args) throws ParseException {
        Options options = buildOptions();

        CommandLineParser parser = new PosixParser();
        CommandLine cmd = parser.parse(options, args);

        Configuration result = new MapConfiguration(new HashMap<String, Object>());

        if (cmd.hasOption('B')) {
            // check backends
            Set<String> existing = Sets
                    .newHashSet(Iterators.transform(backends.iterator(), new BackendIdentifierFunction()));
            if (!existing.contains(cmd.getOptionValue('B'))) {
                throw new ParseException("the backend " + cmd.getOptionValue('B') + " does not exist");
            }
            result.setProperty(LoaderOptions.BACKEND, cmd.getOptionValue('B'));
        }

        if (cmd.hasOption('b')) {
            result.setProperty(LoaderOptions.BASE_URI, cmd.getOptionValue('b'));
        }

        if (cmd.hasOption('z')) {
            result.setProperty(LoaderOptions.COMPRESSION, CompressorStreamFactory.GZIP);
        }

        if (cmd.hasOption('j')) {
            result.setProperty(LoaderOptions.COMPRESSION, CompressorStreamFactory.BZIP2);
        }

        if (cmd.hasOption('c')) {
            result.setProperty(LoaderOptions.CONTEXT, cmd.getOptionValue('c'));
        }

        if (cmd.hasOption('t')) {
            RDFFormat fmt = getRDFFormat(cmd.getOptionValue('t'));
            if (fmt == null) {
                throw new ParseException("unrecognized MIME type: " + cmd.getOptionValue('t'));
            }
            result.setProperty(LoaderOptions.FORMAT, fmt.getDefaultMIMEType());
        }

        if (cmd.hasOption('f')) {
            result.setProperty(LoaderOptions.FILES, Arrays.asList(cmd.getOptionValues('f')));
        }

        if (cmd.hasOption('d')) {
            result.setProperty(LoaderOptions.DIRS, Arrays.asList(cmd.getOptionValues('d')));
        }

        if (cmd.hasOption('a')) {
            result.setProperty(LoaderOptions.ARCHIVES, Arrays.asList(cmd.getOptionValues('a')));
        }

        if (cmd.hasOption('s')) {
            result.setProperty(LoaderOptions.STATISTICS_ENABLED, true);
            result.setProperty(LoaderOptions.STATISTICS_GRAPH, cmd.getOptionValue('s'));
        }

        if (cmd.hasOption('D')) {
            // was a raw Map.Entry before; getOptionProperties returns java.util.Properties
            for (Map.Entry<Object, Object> e : cmd.getOptionProperties("D").entrySet()) {
                result.setProperty(e.getKey().toString(), e.getValue());
            }
        }

        // map backend-specific options to "backend.<id>.<longOpt|opt>" properties
        for (LoaderBackend b : backends) {
            for (Option option : b.getOptions()) {
                if (cmd.hasOption(option.getOpt())) {
                    String key = String.format("backend.%s.%s", b.getIdentifier(),
                            option.getLongOpt() != null ? option.getLongOpt() : option.getOpt());
                    if (option.hasArg()) {
                        if (option.hasArgs()) {
                            result.setProperty(key, Arrays.asList(cmd.getOptionValues(option.getOpt())));
                        } else {
                            result.setProperty(key, cmd.getOptionValue(option.getOpt()));
                        }
                    } else {
                        result.setProperty(key, true);
                    }
                }
            }
        }

        return result;
    }

    /**
     * Parse command line arguments and start import
     *
     * @param args
     */
    public static void main(String[] args) {
        try {
            Configuration cfg = parseOptions(args);

            MarmottaLoader loader = new MarmottaLoader(cfg);
            loader.load();
        } catch (Exception e) {
            // covers both argument and import failures (message previously claimed only option parsing)
            System.err.println("error importing data: " + e.getMessage());
            log.warn("Exception Details:", e);

            printHelp(buildOptions());

            System.exit(1);
        }
    }
}